In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Step 1: Define the problem and set your objective

**Goal**: predict the popularity of a video game from its features, such as genre, platform, publisher, and release year.

### Step 2: Acquire and explore the dataset

In [None]:
# Read the games CSV from the Kaggle input directory and preview the first two rows.
games_csv_path = '/kaggle/input/popular-video-games-1980-2023/games.csv'
df = pd.read_csv(games_csv_path)
df.head(2)

**Remove 'Unnamed' feature**:

In [None]:
# Drop the 'Unnamed: 0' column (presumably a leftover row index from the CSV export).
# errors='ignore' keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(2)

#### Data Cleaning
following a guide generated by ChatGPT (GPT-4), described [here](https://a113ssa.github.io/writeup/writeup-3/)

1. Data Exploration:

In [None]:
# Build the four exploration sections first, then emit them in one call.
# A blank line separates consecutive sections, exactly as with alternating print() calls.
exploration_sections = (
    f'General info:\n{df.describe()}',
    f'Data types:\n{df.dtypes}',
    f'NAN values:\n{df.isna().sum()}',
    f'Columns info:\n{df.columns.to_list()}',
)
print(*exploration_sections, sep='\n\n')

2. Handling Missing Values

In [None]:
# Fill missing Rating values with scikit-learn's KNNImputer (5 neighbours).
# NOTE(review): only the Rating column itself is passed to the imputer, so there are no
# other features to measure neighbour distance on — confirm whether this is intended,
# as it likely degenerates to a simple column-level fill.
from sklearn.impute import KNNImputer
rating_imputer = KNNImputer(n_neighbors=5)
rating_imputed = rating_imputer.fit_transform(df[['Rating']])
df['Rating'] = rating_imputed

In [None]:
# List the games (title and genres) whose Team value is missing.
teamless_rows = df.loc[df['Team'].isna(), ['Title', 'Genres']]
# Serialise the rows as header-less CSV text and split it into one entry per game.
teamless_csv = teamless_rows.to_csv(header=None, index=False)
games_without_team = teamless_csv.strip('\n').split('\n')
print(f'Game with no data about Team: {games_without_team}')

In [None]:
# Replace missing Team values with an empty string, then re-check the per-column NaN counts.
df['Team'] = df['Team'].fillna('')
df.isna().sum()

3. Data Transformation

In [None]:
# Convert abbreviated counts such as '3.9K' into plain numbers (3900).
def convert_to_thousand(x):
    """Convert a count written with an optional 'K' (thousands) suffix to a number.

    Parameters
    ----------
    x : str or any
        Raw cell value, e.g. '3.9K', '17K', 'K' or '800'.

    Returns
    -------
    int, float or the input unchanged
        * 'K'-suffixed strings -> int number of units ('3.9K' -> 3900, 'K' -> 1000).
        * Plain numeric strings -> int ('800' -> 800), or float if not a whole number.
        * Non-string values (already-converted numbers, NaN, None) and strings that
          are not numeric are returned unchanged, so applying the function a second
          time to an already-converted column does not raise.
    """
    # Already numeric / missing: nothing to parse (also makes re-running the cell safe).
    if not isinstance(x, str):
        return x

    text = x.strip()
    if 'K' in text:
        digits = text.replace('K', '')
        if not digits:
            # A bare 'K' is treated as exactly one thousand.
            return 1000
        # round() instead of plain int(): float multiplication can land just below the
        # intended integer (1.005 * 1000 == 1004.9999999999999) and int() would truncate.
        return int(round(float(digits) * 1000))

    # No suffix: still return a number so the column does not end up mixing str and int.
    try:
        number = float(text)
    except ValueError:
        return x
    return int(number) if number.is_integer() else number
    
# Count columns that are passed through convert_to_thousand.
target_features = ['Times Listed', 'Number of Reviews', 'Plays', 'Playing', 'Backlogs', 'Wishlist']

# Convert every value of each listed column, one column at a time.
for column_name in target_features:
    df[column_name] = df[column_name].apply(convert_to_thousand)

df.head(4)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the text columns; each source column gets its own fresh LabelEncoder
# and the result is stored in a new '<name>_Encoded' column.
columns_to_encode = {
    'Team_Encoded': 'Team',
    'Genres_Encoded': 'Genres',
    'Date_Encoded': 'Release Date',
}
for encoded_column, source_column in columns_to_encode.items():
    df[encoded_column] = LabelEncoder().fit_transform(df[source_column])
df.head(3)

In [None]:
# Parse the release dates; values that cannot be parsed are coerced to missing (NaT)
# rather than raising. Only the calendar-date part of each timestamp is kept.
release_timestamps = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Date'] = release_timestamps.dt.date
df.head(3)

4. Handling Outliers <br>
with the help of a guide generated by ChatGPT (GPT-4), available [here](https://a113ssa.github.io/learn-with-gpt4/handling-outliers/)

In [None]:
# Box plot of the DataFrame (wide-form input) to eyeball outliers per column.
sns.set(style='whitegrid', palette='muted')
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, ax=ax)
# Tilt the column names on the x axis so they do not overlap.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
plt.show()

In [None]:
def detect_outliers_iqr(data, threshold=1.5):
    """Flag values lying outside the IQR fences of ``data``.

    Parameters
    ----------
    data : object exposing ``quantile`` and element-wise comparisons (used here with
        a pandas Series)
    threshold : float, default 1.5
        Multiple of the inter-quartile range added above the third quartile and
        subtracted below the first quartile to form the fences.

    Returns
    -------
    Boolean mask, True where a value is strictly below the lower fence or strictly
    above the upper fence.
    """
    first_quartile, third_quartile = data.quantile(0.25), data.quantile(0.75)
    fence_margin = threshold * (third_quartile - first_quartile)
    too_low = data < first_quartile - fence_margin
    too_high = data > third_quartile + fence_margin
    return too_low | too_high

# Boolean mask of Rating outliers; summing the mask counts the flagged rows.
outliers = detect_outliers_iqr(df['Rating'])
outlier_count = outliers.sum()
print(f"Number of outliers: {outlier_count}")

Validate approach by comparing the performance of a model trained on the original data and the cleaned data:


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Columns left out of the feature matrix: the target (Rating) and the raw text/date columns.
not_used_features = ['Rating', 'Title', 'Team', 'Genres', 'Release Date', 'Reviews', 'Summary']


def _fit_and_score(features, target):
    """Fit a LinearRegression on an 80/20 split and score it on the held-out 20%.

    Returns a tuple ``(split, model, predictions, mse)`` where ``split`` is the
    ``[X_train, X_test, y_train, y_test]`` list produced by ``train_test_split``.
    """
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    train_features, test_features, train_target, test_target = split
    model = LinearRegression()
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    return split, model, predictions, mean_squared_error(test_target, predictions)


# Model trained on original data
X = df.drop(not_used_features, axis=1)
y = df['Rating']
(X_train, X_test, y_train, y_test), lr, y_pred, mse = _fit_and_score(X, y)
print(f"Mean squared error (original data): {mse}")

# Model trained on cleaned data (rows flagged in the `outliers` mask removed)
data_no_outliers = df[~outliers]
X_no_outliers = data_no_outliers.drop(not_used_features, axis=1)
y_no_outliers = data_no_outliers['Rating']
(X_train_no, X_test_no, y_train_no, y_test_no), lr_no_outliers, y_pred_no, mse_no_outliers = _fit_and_score(
    X_no_outliers, y_no_outliers
)
print(f"Mean squared error (cleaned data): {mse_no_outliers}")


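Lower MSE values indicate better model performance, as they imply smaller differences between the predicted and actual values. In this specific example, the model trained on the original data performs slightly better (lower MSE) than the model trained on the cleaned data. Note that the two models are evaluated on different test splits (one drawn from the full data, the other from the outlier-free data), so the two MSE values are not measured on the same games and the comparison is only indicative.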