In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('../data/data_cleaned.csv')

View the first 5 rows of the dataset

In [3]:
df.head()

Unnamed: 0,ID,CONSOLE,YEAR,CATEGORY,PUBLISHER,RATING,CRITICS_POINTS,USER_POINTS,SalesInMillions
0,2860,ds,2008,role-playing,Nintendo,E,2.833333,0.303704,1.779257
1,495,pc,2019,shooter,Activision,M,4.5625,0.00641,0.534402
2,2641,ps2,2002,sports,Electronic Arts,E,4.181818,0.326923,1.383964
3,811,ps3,2013,action,Activision,M,2.259259,0.032579,0.082671
4,1470,psp,2008,strategy,Idea Factory,T,17.5,0.464706,0.791305


In [4]:
print("Categories in 'CONSOLE' variable:   ")
print(df["CONSOLE"].unique())

print("Categories in 'CATEGORY' variable:    ")
print(df["CATEGORY"].unique())

print("Categories in 'PUBLISHER' variable:(FIRST 10)    ")
print(df["PUBLISHER"].unique()[:10])

print("Categories in 'RATING' variable:    ")
print(df["RATING"].unique())

Categories in 'CONSOLE' variable:   
['ds' 'pc' 'ps2' 'ps3' 'psp' 'wii' 'psv' 'gba' 'x360' 'gc' 'ps' 'x' 'wiiu'
 'xone' 'ps4' '3ds' 'dc']
Categories in 'CATEGORY' variable:    
['role-playing' 'shooter' 'sports' 'action' 'strategy' 'racing'
 'simulation' 'misc' 'fighting' 'platform' 'adventure' 'puzzle']
Categories in 'PUBLISHER' variable:(FIRST 10)    
['Nintendo' 'Activision' 'Electronic Arts' 'Idea Factory' 'Atari'
 'Sony Computer Entertainment' 'Paradox Interactive' 'Lucasarts'
 'Redoctane' 'Konami Digital Entertainment']
Categories in 'RATING' variable:    
['E' 'M' 'T' 'E10+' 'K-A' 'RP']


#### Preparing X and y variables

In [5]:
X = df.drop("SalesInMillions", axis=1)
y = df["SalesInMillions"]

len(X), len(y)

(3104, 3104)

#### Data Transformation

In [6]:
# create column transformers with different transformers for numeric and categorical variables
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", cat_transformer, cat_features),
        ("StandardScaler", num_transformer, num_features)
    ]
)

#### Split the data into training and testing sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(y_train), len(X_test), len(y_test)

(2483, 2483, 621, 621)

**Fitting the transformer on the dataset**

In [8]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [9]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [10]:
params={
                "Decision Tree": {
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    'splitter':['best','random'],
                    'max_features':['sqrt','log2'],
                },
                "Random Forest":{
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                 
                    'max_features':['sqrt','log2',None],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Gradient Boosting":{
                    'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    'criterion':['squared_error', 'friedman_mse'],
                    'max_features':['auto','sqrt','log2'],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Linear Regression":{},
                "XGBRegressor":{
                    'learning_rate':[.1,.01,.05,.001],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "CatBoosting Regressor":{
                    'depth': [6,8,10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor":{
                    'learning_rate':[.1,.01,0.5,.001],
                    'loss':['linear','square','exponential'],
                    'n_estimators': [8,16,32,64,128,256]
                }
                
            }

#### Model Training and Hyperparameter Tuning

In [11]:
models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }

model_list = []
rmse_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    para=params[list(models.keys())[i]]
    # model.fit(X_train, y_train) # Train model

    gs = GridSearchCV(model, para, cv=3, n_jobs=-1)
    gs.fit(X_train, y_train)
    
    model.set_params(**gs.best_params_)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    rmse_list.append(model_test_rmse)
    
    print('='*35)
    print('\n')

Random Forest
Model performance for Training set
- Root Mean Squared Error: 0.6407
- Mean Absolute Error: 0.4163
- R2 Score: 0.9092
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.8650
- Mean Absolute Error: 1.1918
- R2 Score: 0.3245


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2.6599
- Mean Absolute Error: 1.6287
- R2 Score: -0.3740




3456 fits failed out of a total of 10368.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2204 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "d:\Documents\Pro

Gradient Boosting
Model performance for Training set
- Root Mean Squared Error: 1.4502
- Mean Absolute Error: 1.0164
- R2 Score: 0.5349
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.9442
- Mean Absolute Error: 1.2133
- R2 Score: 0.2659


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1.9755
- Mean Absolute Error: 1.1906
- R2 Score: 0.1369
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 324093376573.7820
- Mean Absolute Error: 63892616298.7334
- R2 Score: -20399208448348601188352.0000


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 0.9883
- Mean Absolute Error: 0.8032
- R2 Score: 0.7840
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.9037
- Mean Absolute Error: 1.1918
- R2 Score: 0.2962




11 fits failed out of a total of 81.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\catboost\core.py", line 5734, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "d:\Documents\Projects\sales_prediction_using_machine_learning\venv\lib\site-packages\catboost\core.py", l

CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 1.1229
- Mean Absolute Error: 0.8831
- R2 Score: 0.7211
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.8164
- Mean Absolute Error: 1.1681
- R2 Score: 0.3593


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 1.5913
- Mean Absolute Error: 1.1397
- R2 Score: 0.4400
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2.2299
- Mean Absolute Error: 1.3116
- R2 Score: 0.0343




#### Results

In [12]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [15]:
results = pd.DataFrame(zip(model_list, rmse_list), columns=["Model Name", "RMSE Score"]).sort_values(by="RMSE Score", ascending=True)
results

Unnamed: 0,Model Name,RMSE Score
5,CatBoosting Regressor,1.81636
0,Random Forest,1.86499
4,XGBRegressor,1.90368
2,Gradient Boosting,1.94418
6,AdaBoost Regressor,2.22991
1,Decision Tree,2.65986
3,Linear Regression,324093376573.78204


**Best Performing model is CatBoost**

In [16]:
cat_boost = CatBoostRegressor(verbose=False)
catboost = cat_boost.fit(X_train, y_train)
y_pred = cat_boost.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, y_pred))
print(" Root mean square error of the model is %.2f" %score)

 Root mean square error of the model is 1.80


**Difference between Actual and Predicted Values**

In [17]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
1653,3.20171,2.20244,0.99927
2188,2.09816,2.82107,-0.72291
309,2.07012,1.28051,0.78961
746,0.28021,2.92650,-2.64629
1097,0.32568,2.07046,-1.74478
...,...,...,...
2398,0.54414,1.69066,-1.14652
1973,0.36102,2.14825,-1.78723
2683,0.59655,1.18896,-0.59240
1637,2.62548,3.08476,-0.45928
