In [19]:
# set up environment by importing all necessary libraries.

# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

%matplotlib inline

sns.set()

In [2]:
# Read the video-game sales CSV, then discard every row that contains a missing value
df = pd.read_csv('./data/vgsales.csv')
df = df.dropna()
print(df.shape)

(16291, 11)


In [3]:
# Print each column's name, non-null count and dtype for the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16291 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16291 non-null int64
Name            16291 non-null object
Platform        16291 non-null object
Year            16291 non-null float64
Genre           16291 non-null object
Publisher       16291 non-null object
NA_Sales        16291 non-null float64
EU_Sales        16291 non-null float64
JP_Sales        16291 non-null float64
Other_Sales     16291 non-null float64
Global_Sales    16291 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.5+ MB


In [4]:
# Year was parsed as float64; cast it to its proper integer type
df = df.astype({'Year': 'int64'})

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.265647,0.147731,0.078833,0.048426,0.54091
std,4792.65445,5.832412,0.822432,0.509303,0.311879,0.190083,1.567345
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4132.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8292.0,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12439.5,2010.0,0.24,0.11,0.04,0.04,0.48
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [6]:
# Show the first 10 rows of the data
display(df.head(10))

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


# Processing Data

In [7]:
# Add one '<region>_share' column per region: that region's sales as a
# fraction of the game's global sales
for region in ('NA', 'EU', 'JP', 'Other'):
    df[region + '_share'] = df[region + '_Sales'] / df['Global_Sales']

In [8]:
# Build the model targets: the four regional sales-share columns.
# The columns are selected explicitly rather than by dropping everything else;
# the previous drop-list named 'Platform' twice and would have let any column
# added to df later leak into the targets.
regionalShare = df[['NA_share', 'EU_share', 'JP_share', 'Other_share']]

In [9]:
# Show the first 5 rows of the target columns
display(regionalShare.head(5))

Unnamed: 0,NA_share,EU_share,JP_share,Other_share
0,0.50145,0.350737,0.045564,0.102248
1,0.722664,0.088966,0.169235,0.019135
2,0.44249,0.359576,0.105807,0.092406
3,0.477273,0.333636,0.099394,0.089697
4,0.35926,0.283392,0.325789,0.031878


In [16]:
# Pull each region's share out as its own Series, one target per region
naShare, euShare, jpShare, otherShare = (
    regionalShare[name]
    for name in ('NA_share', 'EU_share', 'JP_share', 'Other_share')
)

In [12]:
# Build the model features: the descriptive columns that remain once the
# identifiers (Rank, Name), raw sales and share columns are left out
features = df[['Platform', 'Year', 'Genre', 'Publisher']]

In [13]:
# Show the first 5 rows of the feature columns
display(features.head(5))

Unnamed: 0,Platform,Year,Genre,Publisher
0,Wii,2006,Sports,Nintendo
1,NES,1985,Platform,Nintendo
2,Wii,2008,Racing,Nintendo
3,Wii,2009,Sports,Nintendo
4,GB,1996,Role-Playing,Nintendo


#### This code is inspired by NikhilRaman's project 'Predicting Game Sales in NA!' on Kaggle
#### https://www.kaggle.com/antoshachekhonte/predicting-game-sales-in-na

In [14]:
# Convert the features to numeric values, since most models require numeric input.

num_features = pd.DataFrame(index = features.index)

# DataFrame.items() replaces DataFrame.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0; it yields the same (name, Series) pairs.
for col, col_data in features.items():

    # Object-dtype columns are expanded into one indicator column per category,
    # named '<column>_<category>'; every other column is joined unchanged.
    if col_data.dtype == object:
        col_data = pd.get_dummies(col_data, prefix = col)
    num_features = num_features.join(col_data)

features = num_features
display(features[:5])

Unnamed: 0,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,Platform_GEN,Platform_GG,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Building

#### This model building is inspired by NikhilRaman's project 'Predicting Game Sales in NA!' on Kaggle
#### https://www.kaggle.com/antoshachekhonte/predicting-game-sales-in-na

In [17]:
# The first model targets the NA sales share.
# Split the data into training and testing sets with a 7:3 ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    naShare,
    test_size=0.3,
    random_state=2,
)

##### Decision Tree Regression and K-Neighbors Regression will be used to build our models, and we will use the R-squared score to measure their performance

In [20]:
# Decision Tree Regressor for the NA share: fit on the training split,
# predict the held-out split, and report the R-squared score
regDTR = DecisionTreeRegressor(random_state=4).fit(X_train, y_train)
y_regDTR = regDTR.predict(X_test)
print('The r2_score on the Decision Tree Regressor model predicting NA share...')
print(r2_score(y_test, y_regDTR))

The r2_score on the Decision Tree Regressor model predicting NA share...
0.2735763381782613


In [21]:
# K-Neighbors Regressor for the NA share: fit on the training split,
# predict the held-out split, and report the R-squared score
regKNR = KNeighborsRegressor().fit(X_train, y_train)
y_regKNR = regKNR.predict(X_test)
print('The r2_score on the K Neighbors Regressor predicting NA share....')
print(r2_score(y_test, y_regKNR))

The r2_score on the K Neighbors Regressor predicting NA share....
0.44142074964888955
