In [1]:
import pandas as pd
import numpy as np

import sys; sys.path.append('../..') ; sys.path.append('../Dataset/'); 
from dataset import steamGamesDataset


In [32]:
# Feature matrix (model inputs).
lhs_df = pd.read_csv('../Data/lhsDataset.csv')

# One-hot vectors are stored as a CSV file on disk, so they must be read with
# read_csv. read_clipboard would interpret the path as the `sep` argument and
# parse whatever happens to be on the OS clipboard instead.
onehot = pd.read_csv('../Data/OneHotVectors.csv')

# Regression target: keep only the 'Median playtime forever' column, as a
# single-column DataFrame.
rhs_df = pd.read_csv('../Data/rhs.csv')
rhs_df = rhs_df[['Median playtime forever']]


Let's normalize the input features (min-max scaling per column).

In [33]:
# Min-max scale every column: (x - min) / (max - min), computed column-wise.
# The cast to the nullable 'Float64' dtype is done once and reused.
lhs_df = lhs_df.astype('Float64').pipe(
    lambda features: (features - features.min()) / (features.max() - features.min())
)


Now let us train some elementary baseline models.

In [46]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


In [35]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    lhs_df,
    rhs_df.values.ravel(),  # flatten the single-column target frame to 1-D
    test_size=0.3,
    random_state=0,
)
# Display the shape of the training feature matrix.
x_train.shape


(50201, 36)

In [36]:
# Baseline: AdaBoost regressor, seeded so repeated runs give the same model.
regr = AdaBoostRegressor(random_state=0).fit(x_train, y_train)

# Predict on both splits up front; the error metrics below compare these
# predictions against the true targets.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# score() reports the coefficient of determination (R^2) on each split.
print(f'AdaBoost Training score {regr.score(x_train, y_train)}')
print(f'AdaBoost Testing score {regr.score(x_test, y_test)}')

# Mean absolute error and mean absolute percentage error per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


AdaBoost Training score -1.5703510424291576
AdaBoost Testing score -2.17398814838985


In [39]:
# Baseline: random forest regressor with a fixed seed.
regr = RandomForestRegressor(random_state=0).fit(x_train, y_train)

# Predictions for both splits, used by the error metrics printed below.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# R^2 on the training and held-out data; a large gap between the two
# indicates overfitting.
print(f'Random Forest Training score {regr.score(x_train, y_train)}')
print(f'Random Forest Testing score {regr.score(x_test, y_test)}')

# Absolute and percentage errors per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


Random Forest Training score 0.8387597142654579
Random Forest Testing score -0.10900047470936425


In [44]:
# Baseline: ordinary least-squares linear regression (deterministic, no seed needed).
regr = LinearRegression().fit(x_train, y_train)

# Predictions for both splits, used by the error metrics printed below.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# R^2 on the training and held-out data.
print(f'Linear Regression Training score {regr.score(x_train, y_train)}')
print(f'Linear Regression Testing score {regr.score(x_test, y_test)}')

# NOTE(review): the recorded MAPE values are on the order of 1e17, which
# suggests the target contains zeros (MAPE divides by |y_true|) — confirm, and
# treat MAE as the meaningful error figure here.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


Linear Regression Training score 0.00858175477690959
Linear Regression Testing score 0.006673533896030648
Train Errors:   MAE: 162.13580060027533 MAPE: 3.2253057122177363e+17
Test Errors:   MAE: 165.69396233903507 MAPE: 3.267996635833555e+17


In [47]:
# Baseline: linear model fitted with stochastic gradient descent.
# SGD shuffles the training data between epochs, so without a seed every run
# yields different coefficients and scores. Seed it like the other stochastic
# estimators in this notebook so the results are reproducible.
regr = SGDRegressor(random_state=0)
regr.fit(x_train, y_train)

# score() reports the coefficient of determination (R^2) on each split.
print(f'SGDRegressor Training score {regr.score(x_train, y_train)}')
print(f'SGDRegressor Testing score {regr.score(x_test, y_test)}')

# Predictions for both splits, used by the error metrics below.
train_pred = regr.predict(x_train)
test_pred = regr.predict(x_test)

# Mean absolute error and mean absolute percentage error per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


SGDRegressor Training score 0.005809544378502118
SGDRegressor Testing score 0.005125093815810167
Train Errors:   MAE: 173.26210007668183 MAPE: 3.60099085679644e+17
Test Errors:   MAE: 176.01975626043006 MAPE: 3.6305815024751494e+17


In [48]:
# Baseline: a single decision tree with default settings and a fixed seed.
regr = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

# Predictions for both splits, used by the error metrics printed below.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# R^2 on the training and held-out data; compare the two to gauge overfitting.
print(f'Decision Tree Training score {regr.score(x_train, y_train)}')
print(f'Decision Tree Testing score {regr.score(x_test, y_test)}')

# Absolute and percentage errors per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


Decision Tree Training score 1.0
Decision Tree Testing score -2.7100383749357007
Train Errors:   MAE: 0.0 MAPE: 0.0
Test Errors:   MAE: 185.2991401347897 MAPE: 1.0268479271228138e+17


In [50]:
# Baseline: support vector regression with scikit-learn's default settings.
regr = SVR().fit(x_train, y_train)

# Predictions for both splits, used by the error metrics printed below.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# R^2 on the training and held-out data.
print(f'SVR Training score {regr.score(x_train, y_train)}')
print(f'SVR Testing score {regr.score(x_test, y_test)}')

# Absolute and percentage errors per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


SVR Training score -0.0023569011949566843
SVR Testing score -0.0023484245674787463
Train Errors:   MAE: 102.99025872747306 MAPE: 7320648448979645.0
Test Errors:   MAE: 105.5983656285847 MAPE: 7714082546637280.0


In [52]:
# Baseline: linear support vector regression.
# LinearSVR shuffles the data using a pseudo-random generator, so it is seeded
# here (like the other stochastic estimators in this notebook) to make the
# reported scores reproducible across runs.
regr = LinearSVR(random_state=0)
regr.fit(x_train, y_train)

# score() reports the coefficient of determination (R^2) on each split.
print(f'LinearSVR Training score {regr.score(x_train, y_train)}')
print(f'LinearSVR Testing score {regr.score(x_test, y_test)}')

# Predictions for both splits, used by the error metrics below.
train_pred = regr.predict(x_train)
test_pred = regr.predict(x_test)

# Mean absolute error and mean absolute percentage error per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


LinearSVR Training score -0.002046669384440092
LinearSVR Testing score -0.0020381205213861886
Train Errors:   MAE: 102.73357167169749 MAPE: 7361121507902095.0
Test Errors:   MAE: 105.23378678194864 MAPE: 7847110492302163.0


In [53]:
# Baseline: k-nearest-neighbours regression with scikit-learn's default settings.
regr = KNeighborsRegressor().fit(x_train, y_train)

# Predictions for both splits, used by the error metrics printed below.
train_pred, test_pred = regr.predict(x_train), regr.predict(x_test)

# R^2 on the training and held-out data.
print(f'KNN Training score {regr.score(x_train, y_train)}')
print(f'KNN Testing score {regr.score(x_test, y_test)}')

# Absolute and percentage errors per split.
print(f'Train Errors:   MAE: {mean_absolute_error(y_train, train_pred)} MAPE: {mean_absolute_percentage_error(y_train, train_pred)}')
print(f'Test Errors:   MAE: {mean_absolute_error(y_test, test_pred)} MAPE: {mean_absolute_percentage_error(y_test, test_pred)}')


KNN Training score 0.2214014636671079
KNN Testing score -0.1286966755356549
Train Errors:   MAE: 131.00431067110216 MAPE: 1.8011133016236106e+17
Test Errors:   MAE: 158.936332791076 MAPE: 2.349743633682665e+17
