In [24]:
# set up environment by importing all necessary libraries.

# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

sns.set()

In [2]:
# Read the sales CSV, then discard every row that contains a missing value.
raw_sales = pd.read_csv('./data/vgsales.csv')
df = raw_sales.dropna()

In [3]:
# see what's in the dataset
df.sample(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
3598,3600,Lips: Number One Hits,X360,2009.0,Misc,Microsoft Game Studios,0.15,0.35,0.0,0.07,0.56
3464,3466,Harvest Moon: The Tale of Two Towns,DS,2010.0,Simulation,Natsume,0.27,0.06,0.22,0.03,0.58
7683,7685,Tom Clancy's Splinter Cell: Essentials,PSP,2006.0,Shooter,Ubisoft,0.17,0.01,0.0,0.02,0.2
501,502,Zumba Fitness 2,Wii,2011.0,Sports,Majesco Entertainment,1.54,1.07,0.0,0.28,2.88
9431,9433,Cabela's Dangerous Hunts 2009,X360,2008.0,Sports,Activision,0.12,0.0,0.0,0.01,0.13
7545,7547,Deal or No Deal: The Banker is Back!,DS,2008.0,Misc,Mindscape,0.0,0.2,0.0,0.0,0.2
6362,6364,LEGO The Hobbit,XOne,2014.0,Action,Warner Bros. Interactive Entertainment,0.13,0.12,0.0,0.02,0.27
5787,5789,NASCAR Thunder 2003,GC,2002.0,Racing,Electronic Arts,0.24,0.06,0.0,0.01,0.31
15725,15728,Disciples III: Renaissance,PC,2010.0,Strategy,Kalypso Media,0.0,0.01,0.0,0.0,0.02
9836,9838,Mahou Shoujo Lyrical Nanoha A's Portable: The ...,PSP,2011.0,Fighting,Namco Bandai Games,0.0,0.0,0.12,0.0,0.12


In [11]:
# Feature matrix: keep only the two categorical predictors used below.
# Selecting them by name (instead of dropping every other column) means a new or
# renamed column in the CSV -- e.g. another sales figure -- can never leak into X.
X = df[['Platform', 'Genre']]
# Eyeball ten randomly drawn feature rows.
X.sample(10)

Unnamed: 0,Platform,Genre
1901,X360,Action
7486,3DS,Action
16497,GBA,Puzzle
10973,PSP,Action
9613,DS,Platform
13362,DS,Action
7500,GBA,Misc
11785,PSV,Sports
2305,PS,Action
4794,GB,Role-Playing


In [7]:
X['Platform'].value_counts()

DS      2131
PS2     2127
PS3     1304
Wii     1290
X360    1234
PSP     1197
PS      1189
PC       938
XB       803
GBA      786
GC       542
3DS      499
PSV      410
PS4      336
N64      316
SNES     239
XOne     213
SAT      173
WiiU     143
2600     116
NES       98
GB        97
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
PCFX       1
GG         1
Name: Platform, dtype: int64

In [8]:
X['Genre'].value_counts()

Action          3251
Sports          2304
Misc            1686
Role-Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Platform         875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: Genre, dtype: int64

In [10]:
# Regression target: the Global_Sales column, taken as a Series.
y = df.loc[:, 'Global_Sales']
# Eyeball ten randomly drawn target values.
y.sample(10)

9507     0.13
715      2.25
8068     0.18
2118     0.98
10139    0.11
3950     0.50
8318     0.17
11732    0.08
8382     0.17
16313    0.01
Name: Global_Sales, dtype: float64

In [12]:
# Now we have our independent variables in "X"
# and our target variable in "y".
# Before we do any modeling, we need to process "X" first.

In [17]:
# One-hot encode each categorical feature into its own indicator frame
# (one column per distinct platform / genre value).
platform_one_hot, genre_one_hot = (
    pd.get_dummies(X[feature]) for feature in ('Platform', 'Genre')
)

In [18]:
platform_one_hot.head(10)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,SAT,SCD,SNES,TG16,WS,Wii,WiiU,X360,XB,XOne
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
genre_one_hot.head(10)

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,0,0


In [26]:
# Combine the platform and genre indicator columns side by side into the final
# feature matrix. `join_axes` was removed from pd.concat in pandas 1.0 (passing it
# raises TypeError there); reindexing the result on the platform frame's index is
# the documented replacement and yields the same rows in the same order.
X = pd.concat([platform_one_hot, genre_one_hot], axis=1).reindex(platform_one_hot.index)
X.head(10)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [27]:
# (Interactive `train_test_split?` help lookup removed: it only opens the docstring
# pager and produces nothing the analysis depends on.)

In [28]:
# Hold out 30% of the rows for testing and train on the other 70%;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [29]:
X_train.head(5)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
3510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13189,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4602,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [30]:
y_train.head(5)

3510     0.57
13189    0.05
4602     0.42
1029     1.73
8773     0.15
Name: Global_Sales, dtype: float64

In [31]:
X_test.head(5)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
7482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
719,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
y_test.head(5)

7482     0.21
719      2.25
5218     0.36
10121    0.11
14904    0.03
Name: Global_Sales, dtype: float64

In [37]:
# Baseline model: k-nearest-neighbours regression with k = 15.
from sklearn.neighbors import KNeighborsRegressor

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsRegressor(n_neighbors=15).fit(X_train, y_train)

# Keep the held-out predictions so they can be inspected afterwards.
y_pred = knn.predict(X_test)

# Last expression of the cell: the model's score on the test split.
knn.score(X_test, y_test)

-0.416641505678911

In [35]:
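# the R^2 score is VERY VERY LOW (facepalm)
# How can it even be negative? R^2 = 1 - SS_res / SS_tot, so it drops below 0
# whenever the model's squared error on the test set is larger than the error of
# simply predicting the mean of y_test for every row.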

In [41]:
# Compare the first ten predictions with the first ten true test targets.
print(y_pred[:10], y_test[:10], sep='\n')

[0.71666667 0.29466667 0.44466667 0.16466667 0.23533333 0.92533333
 0.37266667 0.29533333 0.45733333 0.42      ]
7482     0.21
719      2.25
5218     0.36
10121    0.11
14904    0.03
6924     0.24
5295     0.35
13875    0.04
2011     1.04
9879     0.12
Name: Global_Sales, dtype: float64


In [47]:
# Sweep n_neighbors over 10, 20, ..., 190 and report each model's score.
# NOTE: the score is computed on the test split, so picking k from this table
# tunes the hyper-parameter on test data.
for i in range(10, 200, 10):
    # fit() returns the estimator, so `knn` still ends up bound to the last model.
    knn = KNeighborsRegressor(n_neighbors=i).fit(X_train, y_train)
    # One format template instead of mixing "+" concatenation with .format();
    # the printed text is unchanged.
    print("Score of knn regression at {} is: {}".format(i, knn.score(X_test, y_test)))

Score of knn regression at 10 is: -0.9768367852521755
Score of knn regression at 20 is: -0.23488106140405304
Score of knn regression at 30 is: -0.09034620278876337
Score of knn regression at 40 is: -0.06527234001744198
Score of knn regression at 50 is: -0.027537548119076494
Score of knn regression at 60 is: 0.005922347997617816
Score of knn regression at 70 is: -0.001030053736475578
Score of knn regression at 80 is: 0.007246137772097039
Score of knn regression at 90 is: 0.0101438114704272
Score of knn regression at 100 is: 0.015233119457709552
Score of knn regression at 110 is: 0.018642148637798006
Score of knn regression at 120 is: 0.02250405531522559
Score of knn regression at 130 is: 0.0005823130794057052
Score of knn regression at 140 is: -0.0029797919597844835
Score of knn regression at 150 is: 0.00039550109744224127
Score of knn regression at 160 is: 0.001041696006370385
Score of knn regression at 170 is: 0.0061300878300499795
Score of knn regression at 180 is: 0.0071367455002783

In [48]:
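# Among the values tried, the KNN regressor scores best on the test set
# when n_neighbors is set to 120 (R^2 of about 0.02).
# The result becoming worse after 120 is not overfitting: a larger k averages
# over more neighbors, which makes the model smoother (higher bias, lower
# variance) and pulls R^2 back toward 0.
# Caveat: k was chosen by looking at test-set scores, so even this best value is
# an optimistic estimate; a validation split or cross-validation would be fairer.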