In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Steam revenue CSV from the Kaggle input mount and preview its first rows
STEAM_CSV_PATH = "/kaggle/input/top-1500-games-on-steam-by-revenue-09-09-2024/Steam_2024_bestRevenue_1500.csv"
df = pd.read_csv(STEAM_CSV_PATH)
df.head()

In [None]:
# Count the missing values in each column, then add them up for the whole frame
nulls_per_column = df.isnull().sum()
nulls_per_column.sum()

##### Since only a very small number of values are missing, we can simply drop the affected rows from our analysis

In [None]:
# Keep only the rows in which every column is populated, then summarise what is left
df = df.dropna(axis=0, how="any")
df.info()

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Two side-by-side views of the price distribution: a histogram and a box plot
price_fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: how many games fall in each price bin
sns.histplot(data=df, x='price', stat='count', bins='auto', ax=hist_ax)
hist_ax.set_title('Histogram of Prices')

# Right panel: quartiles and outliers of the same column
sns.boxplot(data=df, y='price', ax=box_ax)
box_ax.set_title('Box Plot of Prices')

price_fig.tight_layout()
plt.show()

**We see that the average price of a game is close to 20 dollars**

In [None]:
# Scatter price against each of the other numeric columns.
# sns.pairplot builds its own figure, so the panel size is set through `height` and
# `aspect`. A preceding plt.figure(figsize=...) call does not affect it and only
# leaves an extra, empty figure in the cell output.
sns.pairplot(data=df,
             x_vars=['avgPlaytime', 'copiesSold','reviewScore', 'revenue'],
             y_vars=['price'],
             kind="scatter",
             height=4,
             aspect=1)
plt.show()

**We observe no visible correlation between price and any of avgPlaytime, reviewScore, copiesSold, or revenue**

In [None]:
# Number of games per publisher, most frequent first
publisher_counts = df['publishers'].value_counts()
publisher_counts

In [None]:
# Number of games per publisher class, most frequent first
publisher_class_counts = df['publisherClass'].value_counts()
publisher_class_counts


In [None]:
# Number of games per developer, most frequent first
developer_counts = df['developers'].value_counts()
developer_counts


# Feature Engineering

In [None]:
# We can remove the steamId and the name as they won't be useful in our analysis.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=['name', 'steamId'], errors='ignore')

# Preview only the first rows rather than dumping the whole frame
df.head()



In [None]:
# Parse releaseDate (day-month-year strings) into datetimes before the column is removed:
# the number of days since release is derived from it next, on the assumption that older
# games tend to be priced lower.
df = df.assign(releaseDate=pd.to_datetime(df['releaseDate'], format='%d-%m-%Y'))
df.info()

In [None]:
# Now, we can add a column that shows how many days have passed since the game's release date
from datetime import datetime

# Measure each game's age against a fixed reference date instead of datetime.now().
# With the wall clock, this feature (and every model score computed from it) changes
# each day the notebook is re-run.
# NOTE(review): 2024-09-09 is taken from the date embedded in the dataset's path
# ("...-09-09-2024") — confirm that it is the date the data was collected.
SNAPSHOT_DATE = pd.Timestamp("2024-09-09")

df = (
    df.assign(days_since_release=(SNAPSHOT_DATE - df['releaseDate']).dt.days)
      .drop(columns=['releaseDate'])
      .reset_index(drop=True)  # fresh 0..n-1 index after the earlier dropna
)
df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns are the ones pandas stores with the object dtype
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# Dense output so the encoded matrix can be wrapped in a DataFrame directly
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Give the encoded frame the same index as df. pd.concat(axis=1) aligns rows by index
# label, so a default 0..n-1 index here would only line up if df had been reset
# beforehand; otherwise rows would be mismatched and padded with NaN.
# get_feature_names_out() supplies the column names for the encoded data.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the original categorical columns with their one-hot indicator columns
df_encoded = pd.concat([df.drop(columns=categorical_columns), one_hot_df], axis=1)

# Preview only the first rows; the encoded frame can be very wide
df_encoded.head()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_columns = ['copiesSold', 'revenue','avgPlaytime','reviewScore','days_since_release', 'price']
corr = df[corr_columns].corr()
heatmap_cmap = sns.color_palette("rocket_r", as_cmap=True)
sns.heatmap(corr, annot=True, fmt=".4f", cmap=heatmap_cmap)

We observe that the highest correlation between the variables is the one between **revenue** and **copiesSold**, which is **0.6277**; this is not very high, whereas the rest of the correlations vary between **0.01** and **0.1**. This indicates that as the number of copies sold increases, the revenue tends to increase as well, which is of course logical. 

As for the rest of the variables, whose correlations vary between **0.01** and **0.1**, these values are very small, which suggests that there is little to no linear relationship between those variables. This could indicate that the rest of the variables in this dataset are rather independent of each other, or simply that whatever relationship exists between them is non-linear or determined by other factors not captured by the dataset.

Overall, apart from **revenue** and **copiesSold**, this indicates that the variables show little linear correlation with one another, so we can go ahead and use them in the analysis

# Model Building

## Basic Regression Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Every column except price is a predictor; price itself is the target.
# NOTE(review): the predictors include revenue and copiesSold, which may together
# largely determine price — confirm that using them as features is intended.
features = df_encoded.drop(columns='price')
target = df_encoded[['price']]  # double brackets keep the target as a one-column DataFrame

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

In [None]:
# Fit an ordinary least-squares baseline on the training split and display the estimator
reg = LinearRegression().fit(X_train, y_train)
reg

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict prices for the held-out rows with the fitted baseline
y_pred = reg.predict(X_test)

# Score the predictions with both metrics in one pass: MSE first, then R^2
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

This is a relatively high MSE; let's now try the other regression models that we could use 

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR


def evaluate_data(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a fixed set of regressors and print each one's test-set MSE and R^2.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting and for scoring.
    y_train, y_test : matching targets. A single-column 2-D target (such as a
        one-column DataFrame) is flattened to 1-D before use.
    random_state : seed handed to the tree-based models so that repeated runs
        print the same scores. Pass None for unseeded behaviour.

    Returns
    -------
    None. One line per model is written to stdout.
    """

    def _flatten_single_column(y):
        # Several of the estimators below expect a 1-D target and warn when given
        # an (n, 1) column vector, so collapse that case and leave anything else as is.
        y_arr = np.asarray(y)
        if y_arr.ndim == 2 and y_arr.shape[1] == 1:
            return y_arr.ravel()
        return y

    y_train = _flatten_single_column(y_train)
    y_test = _flatten_single_column(y_test)

    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "ElasticNet Regression": ElasticNet(),
        # The three tree-based models are randomized; seeding them keeps the
        # printed comparison stable from one run to the next.
        "Decision Tree Regressor": DecisionTreeRegressor(random_state=random_state),
        "Random Forest Regressor": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=random_state),
        "Support Vector Regressor": SVR()
    }

    # Train and test each model
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"{name} - Mean Squared Error: {mse:.4f} || R^2 Score: {r2:.4f}")

# Compare all candidate models on the unscaled train/test split
evaluate_data(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

It appears that the **Gradient Boosting Regressor** algorithm is the best-performing model. Judging from the $R^2$ score, this model can explain almost **70%** of the variance in price. But let's see if we can improve the model further by applying standardization to the dataset

## Standardize the Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

numeric_features = ['copiesSold', 'revenue','avgPlaytime','reviewScore','days_since_release']
scaler = StandardScaler()

# Learn the scaling statistics from the training rows only, so that nothing about the
# test rows leaks into the features. train_test_split shuffles by row position, so
# calling it with the same test_size and random_state as the split applied to
# normalized_df in the next cell picks out exactly the rows that become X_train there.
# Keep these two arguments in sync with that cell.
train_rows, _test_rows = train_test_split(
    np.arange(len(df_encoded)), test_size=0.2, random_state=42
)
scaler.fit(df_encoded.iloc[train_rows][numeric_features])

# Apply the train-fitted scaling to every row, keeping df_encoded's index so the
# column-wise concat below aligns row for row
normalized_df = pd.DataFrame(
    scaler.transform(df_encoded[numeric_features]),
    columns=numeric_features,
    index=df_encoded.index,
)

# Reassemble: scaled numeric columns, the untouched target, then the one-hot columns
normalized_df = pd.concat([normalized_df, df_encoded['price'], one_hot_df], axis=1)
normalized_df.head()

In [None]:
# Split the standardized frame with the same settings as the earlier split:
# every column except price is a predictor, price is the target
features = normalized_df.drop(columns='price')
target = normalized_df[['price']]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# Score every model again, this time on the standardized features
evaluate_data(X_train, y_train, X_test, y_test)

## Min-Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

numeric_features = ['copiesSold', 'revenue','avgPlaytime','reviewScore','days_since_release']
scaler = MinMaxScaler()

# Learn each column's min and max from the training rows only, so that nothing about
# the test rows leaks into the features. train_test_split shuffles by row position, so
# calling it with the same test_size and random_state as the split applied to
# min_max_df in the next cell picks out exactly the rows that become X_train there.
# Keep these two arguments in sync with that cell.
train_rows, _test_rows = train_test_split(
    np.arange(len(df_encoded)), test_size=0.2, random_state=42
)
scaler.fit(df_encoded.iloc[train_rows][numeric_features])

# Apply the train-fitted scaling to every row (test values may fall slightly outside
# [0, 1]), keeping df_encoded's index so the column-wise concat below aligns row for row
min_max_df = pd.DataFrame(
    scaler.transform(df_encoded[numeric_features]),
    columns=numeric_features,
    index=df_encoded.index,
)

# Reassemble: scaled numeric columns, the untouched target, then the one-hot columns
min_max_df = pd.concat([min_max_df, df_encoded['price'], one_hot_df], axis=1)
min_max_df.head()

In [None]:
# Split the min-max scaled frame with the same settings as the earlier splits:
# every column except price is a predictor, price is the target
features = min_max_df.drop(columns='price')
target = min_max_df[['price']]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# Score every model again, this time on the min-max scaled features
evaluate_data(X_train, y_train, X_test, y_test)

**From what we see above, it appears that standardization doesn't make a difference in the model performance**