### Importing Libraries 

In [1]:
#Data Preprocessing 
import pandas as pd
import numpy as np
import matplotlib as mpt
import matplotlib.pyplot as plt
import seaborn as sns 
import re 
#Model Development
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
import sklearn.model_selection as ms
#Decision Tree
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import plot_tree
from sklearn import metrics
#Gradient Boosting 
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

## Data Cleaning and Preprocessing

In [2]:
decision_vg = pd.read_csv("video_games_sales.csv")

decision_vg = decision_vg.rename(columns = {"rank": "Sales Rank", "na_sales": "North American Sales (in millions)", 
                                            "eu_sales": "European Sales (in millions)", "jp_sales": "Japan Sales (in millions)", 
                                            "other_sales": "Other Sales (in millions)", "global_sales": "Global Sales (in millions)"})

print(f"Distribution of the Variable and Data Types in the Dataset:\n{decision_vg.info()}\n\n")
print(f"Columns Names in the Dataframe:\n{decision_vg.columns.tolist()}\n\n")
print(f"Quantity of of null/missing data entries:\n{decision_vg.isnull().sum()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Sales Rank                          16598 non-null  int64  
 1   name                                16598 non-null  object 
 2   platform                            16598 non-null  object 
 3   year                                16327 non-null  float64
 4   genre                               16598 non-null  object 
 5   publisher                           16540 non-null  object 
 6   North American Sales (in millions)  16598 non-null  float64
 7   European Sales (in millions)        16598 non-null  float64
 8   Japan Sales (in millions)           16598 non-null  float64
 9   Other Sales (in millions)           16598 non-null  float64
 10  Global Sales (in millions)          16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory

In [3]:
decision_vg.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sales Rank,16598.0,8300.605254,4791.853933,1.0,4151.25,8300.5,12449.75,16600.0
year,16327.0,2006.406443,5.828981,1980.0,2003.0,2007.0,2010.0,2020.0
North American Sales (in millions),16598.0,0.264667,0.816683,0.0,0.0,0.08,0.24,41.49
European Sales (in millions),16598.0,0.146652,0.505351,0.0,0.0,0.02,0.11,29.02
Japan Sales (in millions),16598.0,0.077782,0.309291,0.0,0.0,0.0,0.04,10.22
Other Sales (in millions),16598.0,0.048063,0.188588,0.0,0.0,0.01,0.04,10.57
Global Sales (in millions),16598.0,0.537441,1.555028,0.01,0.06,0.17,0.47,82.74


In [4]:
decision_vg.head(10)

Unnamed: 0,Sales Rank,name,platform,year,genre,publisher,North American Sales (in millions),European Sales (in millions),Japan Sales (in millions),Other Sales (in millions),Global Sales (in millions)
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


### Encoding Categorical Variables 

In [5]:
predictors = decision_vg[["platform", "year", "genre", "publisher", "European Sales (in millions)", "Japan Sales (in millions)"]] #Experiment with "Sales Rank"
response = decision_vg["North American Sales (in millions)"]

In [6]:
predictors = pd.get_dummies(predictors, columns=['platform', 'genre'], drop_first=True) #One-hot encode low-cardinality features 
                                                                        #(creates a column per category, values become True/False)
#get the mean of North American sales for each publisher
publisher_target_mean = predictors.join(response).groupby("publisher")["North American Sales (in millions)"].mean() 
global_mean = response.mean()
predictors["Mean North American Sales per Publisher"] = predictors["publisher"].map(publisher_target_mean).fillna(global_mean)

In [7]:
label_encoders = {}
for col in predictors.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    predictors[col] = le.fit_transform(predictors[col])
    label_encoders[col] = le

y_encoder = LabelEncoder()
response = y_encoder.fit_transform(response)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(predictors, response, test_size = 0.2, random_state = 42)

In [9]:
na_decision = DecisionTreeRegressor(random_state = 42) #DecisionTreeRegressor is made for continuous variables 
na_decision.fit(X_train, y_train)
y_pred = na_decision.predict(X_test)

mse = mean_squared_error(y_test, y_pred)  #Mean Squared Error is a common metric for regression
print(f"Mean Squared Error: {mse:.5f}\n")

r2 = r2_score(y_test, y_pred) #Evaluates the model accuracy
print(f"R² Score: {r2:.5f}")

Mean Squared Error: 1030.59691

R² Score: 0.57616


In [10]:
cross_val_r2 = cross_val_score(na_decision, predictors, response, cv=5, scoring='r2')
print(f"Cross-validated R² Scores: {cross_val_r2}\n")
print(f"Mean Cross-validated R²: {cross_val_r2.mean():.5f}\n")
print(f"Standard Deviation of R² Scores: {cross_val_r2.std():.5f}\n")

Cross-validated R² Scores: [  -0.7815804   -15.31288601  -25.06249417  -45.72307349 -240.4861996 ]

Mean Cross-validated R²: -65.47325

Standard Deviation of R² Scores: 88.71720



### Pruning the Decision Tree for Higher Accuracy 

In [None]:
depth_val = np.arange(2,16)
leaf_val = np.arange(1,21, step=10)

grid_s = [{'max_depth': depth_val,'min_samples_leaf': leaf_val}]
model = tree.DecisionTreeRegressor(criterion='friedman_mse')

cv_tree = GridSearchCV(estimator=model,param_grid=grid_s,cv=ms.KFold(n_splits=10))
cv_tree.fit(X_train, y_train)

best_depth = cv_tree.best_params_['max_depth']

best_min_samples = cv_tree.best_params_['min_samples_leaf']

print(best_depth, best_min_samples)

final_model = tree.DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=best_depth,
    min_samples_leaf=best_min_samples,
    random_state=123
)
final_model.fit(X_train, y_train)

In [None]:

plt.figure(figsize=(20,17))
tree.plot_tree(
    final_model,
    feature_names=predictors.columns,
    filled=True,
    rounded=True,
    fontsize=12
)
#plt.savefig("Feature Importance in Predicting North American Video Game Sales.png", bbox_inches = "tight", dpi = 300)
plt.show()

In [None]:
plt.figure(figsize=(30,27))
tree.plot_tree(final_model, max_depth=3, feature_names=predictors.columns, filled=True, rounded=True)
plt.title("Decision Tree (Top 3 Levels)")
plt.show()

In [None]:
# Predictions
Y_pred = final_model.predict(X_test)

# Metrics
r2 = r2_score(y_test, Y_pred)
mse = mean_squared_error(y_test, Y_pred)
rmse = np.sqrt(mse)

print(f"R² Score: {r2:.5f}")
print(f"MSE: {mse:.5f}")
print(f"RMSE: {rmse:.5f}")