In [43]:
# set up environment by importing all necessary libraries.

# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

sns.set()

In [44]:
# Read the raw video-game sales table, then discard every row that has a missing value.
df = pd.read_csv('./data/vgsales.csv')
df = df.dropna()

In [45]:
# see what's in the dataset
df.sample(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
6969,6971,Super Robot Taisen D,GBA,2003.0,Strategy,Banpresto,0.0,0.0,0.23,0.01,0.23
10954,10956,The Idolmaster: Platinum Stars,PS4,2016.0,Simulation,Namco Bandai Games,0.0,0.0,0.09,0.0,0.09
5479,5481,Jeanne d'Arc,PSP,2006.0,Role-Playing,Sony Computer Entertainment,0.21,0.0,0.1,0.02,0.33
2765,2767,Resonance of Fate,PS3,2010.0,Role-Playing,Sega,0.2,0.22,0.24,0.09,0.74
4112,4114,Secret Weapons Over Normandy,PS2,2003.0,Simulation,LucasArts,0.24,0.18,0.0,0.06,0.48
3869,3871,Petz: Hamsterz Life 2,DS,2007.0,Misc,Ubisoft,0.48,0.0,0.0,0.04,0.52
214,215,Monster Hunter Freedom 3,PSP,2010.0,Role-Playing,Capcom,0.0,0.0,4.87,0.0,4.87
8900,8902,Cardinal Syn,PS,1998.0,Fighting,Sony Computer Entertainment,0.08,0.06,0.0,0.01,0.15
14096,14098,Detective Conan: Prelude from the Past,PSP,2012.0,Action,Namco Bandai Games,0.0,0.0,0.04,0.0,0.04
7906,7908,Jewel Master: Cradle of Athena,DS,2010.0,Puzzle,Storm City Games,0.18,0.0,0.0,0.01,0.19


In [46]:
# Predictors: drop the identifier and sales columns so that only the categorical
# Platform and Genre columns remain.
non_feature_columns = [
    'Rank', 'Name', 'Year', 'Publisher',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales',
]
X = df.drop(columns=non_feature_columns)
X.sample(10)

Unnamed: 0,Platform,Genre
8498,PS2,Adventure
15544,DS,Misc
504,PS,Racing
10432,XB,Fighting
11255,PS2,Role-Playing
6541,Wii,Action
13867,3DS,Puzzle
3154,PC,Action
14649,PSP,Fighting
127,NES,Action


In [47]:
X['Platform'].value_counts()

DS      2131
PS2     2127
PS3     1304
Wii     1290
X360    1234
PSP     1197
PS      1189
PC       938
XB       803
GBA      786
GC       542
3DS      499
PSV      410
PS4      336
N64      316
SNES     239
XOne     213
SAT      173
WiiU     143
2600     116
NES       98
GB        97
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
PCFX       1
GG         1
Name: Platform, dtype: int64

In [48]:
X['Genre'].value_counts()

Action          3251
Sports          2304
Misc            1686
Role-Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Platform         875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: Genre, dtype: int64

In [49]:
# Regression target: the Global_Sales column, aligned with X through the shared index.
y = df.loc[:, 'Global_Sales']
y.sample(10)

12306    0.06
11894    0.07
13485    0.04
1673     1.20
4980     0.38
13147    0.05
4336     0.45
8536     0.16
9064     0.14
6328     0.27
Name: Global_Sales, dtype: float64

In [50]:
# now we have our independent variables in "X"
# and our target variable in "y".
# before we do any modeling, we need to process "X" first

In [51]:
# One-hot encode each categorical column into its own frame of indicator columns
# (one column per distinct platform / genre value).
platform_one_hot, genre_one_hot = (
    pd.get_dummies(X[column_name]) for column_name in ('Platform', 'Genre')
)

In [52]:
platform_one_hot.head(10)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,SAT,SCD,SNES,TG16,WS,Wii,WiiU,X360,XB,XOne
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
genre_one_hot.head(10)

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,0,0


In [54]:
# Merge the two indicator frames side by side into the final feature matrix.
# Both frames were built from the same X, so they already share an identical index and
# a plain column-wise concat lines them up row for row. The former `join_axes` argument
# was deprecated in pandas 0.25 and removed in 1.0, where passing it raises TypeError.
X = pd.concat([platform_one_hot, genre_one_hot], axis=1)
X.head(10)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [55]:
# train_test_split?

In [56]:
# Hold out 30% of the rows for testing (7:3 train/test); the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [57]:
X_train.head(5)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
3510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13189,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4602,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [58]:
y_train.head(5)

3510     0.57
13189    0.05
4602     0.42
1029     1.73
8773     0.15
Name: Global_Sales, dtype: float64

In [59]:
X_test.head(5)

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
7482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
719,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
y_test.head(5)

7482     0.21
719      2.25
5218     0.36
10121    0.11
14904    0.03
Name: Global_Sales, dtype: float64

In [61]:
# Baseline model: k-nearest-neighbours regression on the one-hot features.
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=15).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# score() reports the coefficient of determination (R^2) on the held-out rows.
knn.score(X_test, y_test)

-0.416641505678911

In [62]:
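# the R^2 score is VERY VERY LOW (facepalm)
# Why can it even be negative? R^2 = 1 - SS_res / SS_tot, so it drops below zero
# whenever the model's squared error on the test set is larger than the error of
# simply predicting the mean of y_test for every row.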

In [63]:
# Show the first ten predictions followed by the matching actual values.
print(y_pred[:10], y_test[:10], sep='\n')

[0.71666667 0.29466667 0.44466667 0.16466667 0.23533333 0.92533333
 0.37266667 0.29533333 0.45733333 0.42      ]
7482     0.21
719      2.25
5218     0.36
10121    0.11
14904    0.03
6924     0.24
5295     0.35
13875    0.04
2011     1.04
9879     0.12
Name: Global_Sales, dtype: float64


In [64]:
# Try to improve the fit: sweep n_neighbors from 10 to 190 in steps of 10 and
# report the test-set score of each model.
# NOTE(review): k is being compared on the test split itself; consider a separate
# validation split or cross-validation before trusting the selected value.
for k in range(10, 200, 10):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    print("Score of knn regression at {} is: {}".format(k, test_score))

Score of knn regression at 10 is: -0.9768367852521755
Score of knn regression at 20 is: -0.23488106140405304
Score of knn regression at 30 is: -0.09034620278876337
Score of knn regression at 40 is: -0.06527234001744198
Score of knn regression at 50 is: -0.027537548119076494
Score of knn regression at 60 is: 0.005922347997617816
Score of knn regression at 70 is: -0.001030053736475578
Score of knn regression at 80 is: 0.007246137772097039
Score of knn regression at 90 is: 0.0101438114704272
Score of knn regression at 100 is: 0.015233119457709552
Score of knn regression at 110 is: 0.018642148637798006
Score of knn regression at 120 is: 0.02250405531522559
Score of knn regression at 130 is: 0.0005823130794057052
Score of knn regression at 140 is: -0.0029797919597844835
Score of knn regression at 150 is: 0.00039550109744224127
Score of knn regression at 160 is: 0.001041696006370385
Score of knn regression at 170 is: 0.0061300878300499795
Score of knn regression at 180 is: 0.0071367455002783

In [65]:
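# The best result for the KNN regressor is at n_neighbors = 120 (R^2 ≈ 0.0225).
# Beyond 120 the score drops again: averaging over that many neighbours
# over-smooths the prediction toward the overall mean, i.e. the model underfits
# (small n_neighbors is the overfitting end of the range, not large).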

In [66]:
# Second attempt: use a different categorical encoding instead of one-hot.
# Start again from the cleaned frame to rebuild the target and the raw predictors.
y = df['Global_Sales']
X = df.drop(columns=[
    'Rank', 'Name', 'Year', 'Publisher',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales',
])


In [67]:
# Collect the distinct platform and genre labels present in the predictors.
unique_platform, unique_genre = X['Platform'].unique(), X['Genre'].unique()

In [68]:
# Replace each label with its integer category code, shifted by one so that the
# codes start at 1 instead of 0.
# NOTE(review): these codes put an arbitrary numeric order on unordered categories,
# and the KNN distance will treat that order as meaningful.
for column_name in ('Platform', 'Genre'):
    X[column_name] = X[column_name].astype('category').cat.codes + 1
X.describe()

Unnamed: 0,Platform,Genre
count,16291.0,16291.0
mean,16.812841,5.928611
std,8.369998,3.762844
min,1.0,1.0
25%,8.0,2.0
50%,17.0,6.0
75%,22.0,9.0
max,31.0,12.0


In [69]:
# Same 70/30 split and seed as before, now on the integer-coded features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [70]:
X_train.head(5)

Unnamed: 0,Platform,Genre
3510,17,6
13189,14,1
4602,7,4
1029,27,1
8773,16,7


In [71]:
y_train.head(5)

3510     0.57
13189    0.05
4602     0.42
1029     1.73
8773     0.15
Name: Global_Sales, dtype: float64

In [72]:
# Repeat the n_neighbors sweep (10 to 190, step 10) on the integer-coded features
# and report the test-set score of each model.
for k in range(10, 200, 10):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    print("Score of knn regression at {} is: {}".format(k, test_score))

Score of knn regression at 10 is: -0.057523695784102324
Score of knn regression at 20 is: 0.0415612301450774
Score of knn regression at 30 is: 0.04506284603113486
Score of knn regression at 40 is: 0.0325489360606207
Score of knn regression at 50 is: 0.032035034310105015
Score of knn regression at 60 is: 0.036639507322938525
Score of knn regression at 70 is: 0.036862359593643035
Score of knn regression at 80 is: 0.0385725361213034
Score of knn regression at 90 is: 0.034895039851696175
Score of knn regression at 100 is: 0.03651840587507371
Score of knn regression at 110 is: 0.035425023506861386
Score of knn regression at 120 is: 0.03849262636684858
Score of knn regression at 130 is: 0.03631760487107593
Score of knn regression at 140 is: 0.03374953493765975
Score of knn regression at 150 is: 0.030073583141058124
Score of knn regression at 160 is: 0.030919373769158542
Score of knn regression at 170 is: 0.029539325174868125
Score of knn regression at 180 is: 0.029453045003150602
Score of kn

In [73]:
# n_neighbors=30 gave the highest test score in the sweep, so refit with that value
# and keep its test-set predictions to compare against the actual sales.
knn = KNeighborsRegressor(n_neighbors=30).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [74]:
# Side-by-side table of actual and predicted sales for the test rows.
# 'difference' is actual minus predicted, so a negative value means over-prediction.
data = {}
data['Actual Sales'] = y_test
data['Predicted Sales'] = y_pred
data['difference'] = y_test - y_pred
result = pd.DataFrame(data)
result.head(10)

Unnamed: 0,Actual Sales,Predicted Sales,difference
7482,0.21,0.335333,-0.125333
719,2.25,0.243667,2.006333
5218,0.36,0.988,-0.628
10121,0.11,0.205333,-0.095333
14904,0.03,0.318333,-0.288333
6924,0.24,0.878667,-0.638667
5295,0.35,0.431,-0.081
13875,0.04,0.49,-0.45
2011,1.04,0.532333,0.507667
9879,0.12,0.310333,-0.190333


In [75]:
# Keep only the descriptive columns (Name, Platform, Genre) so they can be attached
# to the prediction table.
rank_year_publisher_sales = [
    'Rank', 'Year', 'Publisher',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales',
]
temp = df.drop(columns=rank_year_publisher_sales)
temp.head(10)

Unnamed: 0,Name,Platform,Genre
0,Wii Sports,Wii,Sports
1,Super Mario Bros.,NES,Platform
2,Mario Kart Wii,Wii,Racing
3,Wii Sports Resort,Wii,Sports
4,Pokemon Red/Pokemon Blue,GB,Role-Playing
5,Tetris,GB,Puzzle
6,New Super Mario Bros.,DS,Platform
7,Wii Play,Wii,Misc
8,New Super Mario Bros. Wii,Wii,Platform
9,Duck Hunt,NES,Shooter


In [76]:
# Attach each test row's Name/Platform/Genre to its prediction columns.
# An inner join on the index keeps exactly the rows present in both frames, which is
# what the former outer-join-then-dropna produced, without first building NaN-padded
# rows for every training title. Selecting the three prediction columns from `result`
# makes the cell safe to re-run: the old version merged `result` with itself on a
# second run and produced duplicated, suffixed Name/Platform/Genre columns.
prediction_columns = ['Actual Sales', 'Predicted Sales', 'difference']
result = pd.merge(
    temp,
    result[prediction_columns],
    left_index=True,
    right_index=True,
    how='inner',
).dropna()

In [77]:
result.sample(10)

Unnamed: 0,Name,Platform,Genre,Actual Sales,Predicted Sales,difference
12947,Choujikuu Yousai Macross: Do You Remember Love,PS3,Shooter,0.05,1.099667,-1.049667
1139,Pokemon Battle Revolution,Wii,Role-Playing,1.61,0.408,1.202
7309,Ride,PS4,Racing,0.22,0.555667,-0.335667
14550,Pimp My Ride: Street Racing,DS,Racing,0.03,0.235667,-0.205667
8345,Major League Baseball 2K9,PS2,Sports,0.17,0.421667,-0.251667
11160,MLB SlugFest Loaded,XB,Sports,0.09,0.275667,-0.185667
8831,Shadow Ops: Red Mercury,XB,Shooter,0.15,0.338,-0.188
11491,TimeSplitters: Future Perfect,GC,Shooter,0.08,0.256667,-0.176667
2115,Kingdom Hearts Re:coded,DS,Role-Playing,0.98,0.479333,0.500667
7121,NBA ShootOut 2003,PS2,Sports,0.23,0.421667,-0.191667


In [78]:
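# one conclusion that we can draw is that this model has very low accuracy.
# In the rows shown above the predicted value is usually higher than the actual value
# (negative difference); only the few best-selling titles are strongly under-predicted.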


In [86]:
# Check whether transforming the encoded features changes the KNN result.
# NOTE: adding the same constant to every value only shifts all points together; the
# distances between rows stay the same, so the scores are expected to match the
# previous sweep exactly. Rescaling each column differently (e.g. standardising)
# would be needed to change which neighbours are found.
scaledX = X + 20
# Same 70/30 split and seed as before, on the shifted features.
X_train, X_test, y_train, y_test = train_test_split(
    scaledX,
    y,
    test_size=0.3,
    random_state=21,
)
# Repeat the n_neighbors sweep and report the test-set score of each model.
for k in range(10, 200, 10):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    print("Score of knn regression at {} is: {}".format(k, test_score))

Score of knn regression at 10 is: -0.057523695784102324
Score of knn regression at 20 is: 0.0415612301450774
Score of knn regression at 30 is: 0.04506284603113486
Score of knn regression at 40 is: 0.0325489360606207
Score of knn regression at 50 is: 0.032035034310105015
Score of knn regression at 60 is: 0.036639507322938525
Score of knn regression at 70 is: 0.036862359593643035
Score of knn regression at 80 is: 0.0385725361213034
Score of knn regression at 90 is: 0.034895039851696175
Score of knn regression at 100 is: 0.03651840587507371
Score of knn regression at 110 is: 0.035425023506861386
Score of knn regression at 120 is: 0.03849262636684858
Score of knn regression at 130 is: 0.03631760487107593
Score of knn regression at 140 is: 0.03374953493765975
Score of knn regression at 150 is: 0.030073583141058124
Score of knn regression at 160 is: 0.030919373769158542
Score of knn regression at 170 is: 0.029539325174868125
Score of knn regression at 180 is: 0.029453045003150602
Score of kn

Unnamed: 0,Platform,Genre
0,27,11
1,12,5
2,27,7
3,27,11
4,6,8
5,6,6
6,5,5
7,27,4
8,27,5
9,12,9
