# Importing

In [81]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# Feature Engineering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE

# Models
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import xgboost as xgb

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

# Utilities
from sklearn.compose import ColumnTransformer
from collections import Counter
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm
import joblib
from sklearn.metrics import classification_report

# Combining Datasets

In [None]:
def merge_datasets(
    csv_path,
    dataset_name,
    raw_dir="/kaggle/input/d/alyhany04/steam-games-analysis/datasets/raw",
    output_dir="/kaggle/working",
):
    """Merge the demos, DLCs, Gamalytic and base-game info tables into one frame.

    Parameters
    ----------
    csv_path : str
        Path of the Gamalytic CSV to merge.
    dataset_name : str
        Prefix of the output file name. It is concatenated as-is, so a
        trailing space ends up in the file name.
    raw_dir : str, optional
        Directory holding ``demos.csv``, ``dlcs.csv`` and
        ``info_base_games.csv``.
    output_dir : str, optional
        Directory the combined CSV is written to.

    Returns
    -------
    pandas.DataFrame
        The merged table without the id/helper columns and with
        ``reviewScore`` (when present) moved to the last position.
    """
    key = "Full_game_appid"

    demos = pd.read_csv(os.path.join(raw_dir, "demos.csv"))
    dlcs = pd.read_csv(os.path.join(raw_dir, "dlcs.csv"))
    gamalytic = pd.read_csv(csv_path)
    info = pd.read_csv(os.path.join(raw_dir, "info_base_games.csv"), low_memory=False)

    # Each table names the game id differently; normalise header whitespace,
    # rename the id to one shared key and compare ids as strings.
    id_columns = (
        (demos, "full_game_appid"),
        (dlcs, "base_appid"),
        (gamalytic, "steamId"),
        (info, "appid"),
    )
    for frame, id_column in id_columns:
        frame.columns = frame.columns.str.strip()
        frame.rename(columns={id_column: key}, inplace=True)
        frame[key] = frame[key].astype(str)

    # The right join keeps every Gamalytic row, even for games that have no
    # demo/DLC pair; the final inner join keeps only games present in `info`.
    merged_df = (
        demos.merge(dlcs, on=key, how="inner")
        .merge(gamalytic, on=key, how="right")
        .merge(info, on=key, how="inner")
    )

    # Helper/id columns are not features; tolerate inputs that lack some of them.
    merged_df = merged_df.drop(
        columns=["Unnamed: 0", "demo_appid", "dlc_appid", key],
        errors="ignore",
    )

    # The CSV is written before the target is moved, so the file keeps the
    # merge column order.
    output_path = os.path.join(output_dir, f"{dataset_name}combined_games.csv")
    merged_df.to_csv(output_path, index=False)

    print(merged_df.columns)
    print("Demos:", demos.shape)
    print("DLCs:", dlcs.shape)
    print("Gamalytic:", gamalytic.shape)
    print("Merged shape:", merged_df.shape)

    # Keep the target as the last column of the returned frame.
    if "reviewScore" in merged_df.columns:
        review_score = merged_df.pop("reviewScore")
        merged_df["reviewScore"] = review_score
    return merged_df
# Build one merged table per task; only the Gamalytic source file differs.
regression_df = merge_datasets(
    csv_path="/kaggle/input/d/alyhany04/steam-games-analysis/datasets/raw/gamalytic_steam_games.csv",
    dataset_name="Regression ",
)
classification_df = merge_datasets(
    csv_path="/kaggle/input/d/alyhany04/steam-games-analysis/datasets/raw/ms2_gamalytic_steam_games.csv",
    dataset_name="Classification ",
)

In [None]:
# Preview the first rows of the merged regression dataset.
regression_df.head()

In [None]:
  # Preview the first rows of the merged classification dataset.
  # NOTE(review): this cell is indented; presumably the notebook tolerates it,
  # but it would be an IndentationError in a plain script — confirm.
  classification_df.head()

# Data Inspection

In [None]:
# Report the size of the classification dataset, then its per-column summary.
print("Combined shape:", classification_df.shape)
classification_df.info()

In [None]:
# Print the per-column summary of the regression dataset.
regression_df.info()

In [None]:
# Descriptive statistics of the regression dataset.
regression_df.describe()

In [None]:
# Print the per-column summary of the classification dataset.
classification_df.info()

In [None]:
# Print the per-column summary of the classification dataset.
classification_df.info()

In [None]:
# Print the per-column summary of the regression dataset.
regression_df.info()

In [None]:
# Print the per-column summary of the regression dataset.
regression_df.info()

In [None]:
# Print the per-column summary of the classification dataset.
classification_df.info()

In [None]:
# Count missing values per column in the classification dataset.
classification_df.isnull().sum()

In [None]:
# Count missing values per column in the regression dataset.
regression_df.isnull().sum()

In [None]:
# Count fully duplicated rows in the classification dataset.
classification_df.duplicated().sum()

In [None]:
# Count fully duplicated rows in the regression dataset.
regression_df.duplicated().sum()

In [None]:
# Work on the regression dataset. This is an alias, not a copy, so the type
# coercions below also modify `regression_df`.
df = regression_df


def _print_missing(column):
    """Print the count and percentage of missing values of ``df[column]``."""
    missing = df[column].isna().sum()
    print("Missing values:", missing)
    print("Percentage missing:", (missing / len(df)) * 100)


has_review_score = 'reviewScore' in df.columns

print("\n=== Detailed Column Analysis ===")

# --- 1. metacritic: missingness is reported before the numeric coercion ---
print("\n1. Metacritic Analysis:")
_print_missing('metacritic')

df['metacritic'] = pd.to_numeric(df['metacritic'], errors='coerce')
if has_review_score:
    df['reviewScore'] = pd.to_numeric(df['reviewScore'], errors='coerce')

    print("\nCorrelation with reviewScore:", df['metacritic'].corr(df['reviewScore']))

print("\nMetacritic statistics:")
print(df['metacritic'].describe())

# --- 2. achievements_total: coerced first, so unparsable values count as missing ---
print("\n2. Achievements Analysis:")

df['achievements_total'] = pd.to_numeric(df['achievements_total'], errors='coerce')

_print_missing('achievements_total')

print("\nGames with achievements:")
has_achievements = df['achievements_total'] > 0
print(df.loc[has_achievements, 'achievements_total'].describe())

if has_review_score:
    print("\nCorrelation with reviewScore:", df['achievements_total'].corr(df['reviewScore']))

# --- 3. genres ---
print("\n3. Genres Analysis:")
_print_missing('genres')
print("\nTop 10 most common genres:")
print(df['genres'].value_counts().head(10))

# --- 4. release_date ---
print("\n4. Release Date Analysis:")
_print_missing('release_date')
print("\nRelease date distribution:")
print(df['release_date'].value_counts().head())

# --- 5. columns expected to be mostly empty ---
print("\n5. High Missing Value Columns Analysis:")
columns_to_check = ['aiContent','name_x', 'name_y']
for col in columns_to_check:
    missing_in_col = df[col].isna().sum()
    print(f"\n{col} missing values:", missing_in_col)
    print("Percentage missing:", (missing_in_col / len(df)) * 100)

# --- 6. duplicates ---
print("\n6. Duplicate Analysis:")
print("Number of duplicate rows:", df.duplicated().sum())
print("\nSample of duplicate rows:")
df[df.duplicated()].head()

## Detailed Column Analysis for Preprocessing Decisions

# EDA

### Univariate Analysis

In [None]:
# Line plot of the raw price column against the row index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['price'], label='Price', color='blue')
ax.set_title('Price Plot')
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Pie chart of how many games fall into each publisher class.
plt.style.use("ggplot")
publisher_class_counts = df['publisherClass'].value_counts()
# Pull only the first (most frequent) wedge out of the pie.
explode = [0.1 if position == 0 else 0 for position, _ in enumerate(publisher_class_counts)]

fig, ax = plt.subplots(figsize=(6, 6))
patches, texts, autotexts = ax.pie(
    publisher_class_counts,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    explode=explode,
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)

# Class names go into a legend beside the pie instead of onto the wedges.
ax.legend(
    patches,
    publisher_class_counts.index,
    loc='upper right',
    bbox_to_anchor=(1.3, 1),
    title="Publisher Class",
)
ax.set_title('Publisher Class Distribution', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Frequency of each review score in the working (regression) frame.
df['reviewScore'].value_counts()

In [None]:
# Frequency of each review score value in the classification dataset.
classification_df['reviewScore'].value_counts()

In [None]:
# Frequency of each distinct price.
df['price'].value_counts()

In [None]:
# Pie chart of the steam_achievements flag.
plt.style.use("ggplot")
steam_achievements_counts = df['steam_achievements'].value_counts()
# Pull only the first (most frequent) wedge out of the pie.
explode = [0.1 if position == 0 else 0 for position, _ in enumerate(steam_achievements_counts)]

fig, ax = plt.subplots(figsize=(6, 6))
patches, texts, autotexts = ax.pie(
    steam_achievements_counts,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    explode=explode,
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)

ax.legend(
    patches,
    steam_achievements_counts.index,
    loc='upper right',
    bbox_to_anchor=(1.3, 1),
    title="Steam Achievements",
)
ax.set_title('Steam Achievements Distribution', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of the steam_trading_cards flag.
plt.style.use("ggplot")
steam_trading_cards_counts = df['steam_trading_cards'].value_counts()
# Pull only the first (most frequent) wedge out of the pie.
explode = [0.1 if position == 0 else 0 for position, _ in enumerate(steam_trading_cards_counts)]

fig, ax = plt.subplots(figsize=(6, 6))
patches, texts, autotexts = ax.pie(
    steam_trading_cards_counts,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    explode=explode,
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)

ax.legend(
    patches,
    steam_trading_cards_counts.index,
    loc='upper right',
    bbox_to_anchor=(1.3, 1),
    title="Steam Trading Cards",
)
ax.set_title('Steam Trading Cards Distribution', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of the workshop_support flag.
plt.style.use("ggplot")
workshop_support_counts = df['workshop_support'].value_counts()
# Pull only the first (most frequent) wedge out of the pie.
explode = [0.1 if position == 0 else 0 for position, _ in enumerate(workshop_support_counts)]

fig, ax = plt.subplots(figsize=(6, 6))
patches, texts, autotexts = ax.pie(
    workshop_support_counts,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    explode=explode,
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)

ax.legend(
    patches,
    workshop_support_counts.index,
    loc='upper right',
    bbox_to_anchor=(1.3, 1),
    title="Workshop Support",
)
ax.set_title('Workshop Support Distribution', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# Histogram of game prices, rebuilt from the sorted value counts.
price_counts = df['price'].value_counts().sort_index()
# Repeat every distinct price as often as it occurs.
expanded_prices = np.repeat(price_counts.index, price_counts.values)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(expanded_prices, bins=50, color='skyblue', edgecolor='black')
ax.set(title='Price Distribution Histogram', xlabel='Price', ylabel='Number of Games')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of review scores in the working (regression) frame.
review_score_counts = df['reviewScore'].value_counts().sort_index()
# Repeat every distinct score as often as it occurs.
expanded_review_scores = np.repeat(review_score_counts.index, review_score_counts.values)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(expanded_review_scores, bins=50, color='skyblue', edgecolor='black')
ax.set(title='Review Score Distribution', xlabel='Review Score', ylabel='Number of Games')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of the review score values in the classification dataset.
review_score_counts = classification_df['reviewScore'].value_counts().sort_index()
# Repeat every distinct value as often as it occurs.
expanded_review_scores = np.repeat(review_score_counts.index, review_score_counts.values)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(expanded_review_scores, bins=50, color='skyblue', edgecolor='black')
ax.set(title='Review Score Distribution', xlabel='Review Score', ylabel='Number of Games')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of the total number of achievements per game.
achievements_counts = df['achievements_total'].value_counts().sort_index()
expanded_achievements = np.repeat(achievements_counts.index, achievements_counts.values)

fig, ax = plt.subplots(figsize=(10, 6))
n, bins, patches = ax.hist(expanded_achievements, bins=50, color='lightcoral', edgecolor='black')
ax.set(title='Achievements Total Distribution', xlabel='Achievements Total', ylabel='Number of Games')
# Leave 10% headroom above the tallest bar.
tallest_bar = max(n)
ax.set_ylim(0, tallest_bar + (tallest_bar * 0.1))
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Count how often each platform token occurs across all games.
# Non-string entries (e.g. missing values) are skipped.
platform_list = [
    token.strip()
    for entry in df['supported_platforms']
    if isinstance(entry, str)
    for token in entry.lower().split(',')
]
platform_counts = Counter(platform_list)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(platform_counts.keys(), platform_counts.values(), color=['skyblue', 'lightgreen', 'salmon'])
ax.set(
    title='Supported Platforms Distribution (Histogram)',
    xlabel='Platform',
    ylabel='Number of Games',
)

# Leave 10% headroom above the tallest bar.
max_count = max(platform_counts.values())
ax.set_ylim(0, max_count + (max_count * 0.1))

ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Count how often each genre token occurs across all games.
# Non-string entries (e.g. missing values) are skipped.
genres_list = [
    token.strip()
    for entry in df['genres']
    if isinstance(entry, str)
    for token in entry.lower().split(',')
]
genres_counts = Counter(genres_list)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(genres_counts.keys(), genres_counts.values(), color=['skyblue', 'lightgreen', 'salmon'])
ax.set(title='Genres Distribution (Histogram)', xlabel='Genre', ylabel='Number of Games')

# Leave 10% headroom above the tallest bar.
max_count = max(genres_counts.values())
ax.set_ylim(0, max_count + (max_count * 0.1))

plt.xticks(rotation=45, ha="right")
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

### Bivariate Analysis

In [None]:
# Scatter of price against copies sold, restricted to rows at or below the
# 99th percentile of both columns.
price_threshold = df['price'].quantile(0.99)
copiesSold_threshold = df['copiesSold'].quantile(0.99)

within_price = df['price'] <= price_threshold
within_sales = df['copiesSold'] <= copiesSold_threshold
filtered_df = df[within_price & within_sales]

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='price', y='copiesSold', data=filtered_df, color='blue', ax=ax)
ax.set_title('Price vs Copies Sold (Filtered Data)')
ax.set_xlabel('Price')
ax.set_ylabel('Copies Sold')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap for a hand-picked set of columns.
# `numeric_columns` is read again by the scaling cell further down.
numeric_columns = ['price', 'copiesSold', 'metacritic', 'steam_achievements', 'steam_trading_cards']
correlation_matrix = df.loc[:, numeric_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Numeric Columns')
fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: games per publisher class, split by workshop support.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='publisherClass', hue='workshop_support', data=df, palette='Set2', ax=ax)
ax.set_title('Publisher Class vs Workshop Support')
ax.set_xlabel('Publisher Class')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### Additional Visualizations for Preprocessing Decisions

In [None]:
# Distribution of the available (non-missing) Metacritic scores with a KDE overlay.
metacritic_scores = df['metacritic'].dropna()
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(metacritic_scores, kde=True, ax=ax)
ax.set_title('Distribution of Metacritic Scores')
ax.set_xlabel('Metacritic Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Correlation heatmap over every float64/int64 column.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
feature_correlations = df[numerical_features].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

# Preprocessing

## Data Cleaning

## Outliers

In [None]:
# Replace IQR outliers (outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]) in every
# float64/int64 column with that column's mean.
# NOTE(review): the loop covers all numeric columns, which includes
# `reviewScore` once it has been coerced to numeric — confirm that the
# target is meant to be modified as well.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
    if not outliers.any():
        continue

    # The mean is taken over the whole column, outliers included.
    column_mean = df[col].mean()
    # The mean is a float: cast integer columns explicitly instead of relying
    # on an implicit upcast when the float is written into them.
    if df[col].dtype.kind == 'i':
        df[col] = df[col].astype('float64')
    df.loc[outliers, col] = column_mean
df.head()

## Name

In [None]:
# Fill the gaps that can be imputed: no achievements -> 0, missing genres ->
# the most frequent genre string.
most_common_genres = df['genres'].mode()[0]
df['achievements_total'] = df['achievements_total'].fillna(0)
df['genres'] = df['genres'].fillna(most_common_genres)
print(df.columns)

# Rows without a release date are removed, `metacritic` is dropped entirely,
# and exact duplicate rows are removed last.
columns_to_drop = ['metacritic']
df = (
    df.dropna(subset=['release_date'])
    .drop(columns=columns_to_drop)
    .drop_duplicates()
)

print("Shape after Previous preprocessing:", df.shape)
print("\nMissing values after preprocessing:")
print(df.isna().sum())

In [None]:
# Normalise game names: fill gaps, strip everything except letters, digits
# and whitespace, lower-case, and trim the ends.
df['name'] = (
    df['name']
    .fillna('Unknown Game')
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.lower()
    .str.strip()
)
# Derived features: whitespace-separated tokens and character count.
df['name_tokens'] = df['name'].str.split()
df['name_length'] = df['name'].map(len)
df[['name', 'name_tokens', 'name_length']].head()

## Date

In [None]:
# Parse the release date; unparsable values become NaT.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Split the date into year / month / day and fill unparsed dates with the
# most frequent value of each part.
for part in ('year', 'month', 'day'):
    part_values = getattr(df['release_date'].dt, part)
    df[part] = part_values.fillna(part_values.mode()[0])

print(f"Missing values in 'year': {df['year'].isnull().sum()}")
print(f"Missing values in 'day': {df['day'].isnull().sum()}")

# The raw date is no longer needed once its parts are extracted.
df = df.drop('release_date', axis=1)

# One histogram per date part: (column, bins, colour, title, x-axis label).
date_histograms = [
    ('year', 20, 'skyblue', 'Distribution of Release Years', 'Year'),
    ('month', 12, 'lightgreen', 'Distribution of Release Months', 'Month'),
    ('day', 31, 'salmon', 'Distribution of Release Days', 'Day of the Month'),
]
for column, bin_count, bar_color, chart_title, x_label in date_histograms:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], kde=False, bins=bin_count, color=bar_color)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Number of Games')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## Encoding

### Classification Target

In [None]:
# Encode the classification target as integer class ids; `le` keeps the
# fitted mapping so the ids can be decoded again.
le = LabelEncoder()
le.fit(classification_df['reviewScore'])
classification_df['reviewScore'] = le.transform(classification_df['reviewScore'])

### Boolean Features

In [None]:
 # Cast the boolean feature flags to integers in one assignment.
 bool_features = ['steam_achievements', 'steam_trading_cards', 'workshop_support']
 df[bool_features] = df[bool_features].astype(int)

### Publisher Class

In [None]:
# Encode the publisher class as integer ids and drop the original column.
label_encoder = LabelEncoder()
df['publisher_class_encoded'] = label_encoder.fit_transform(df['publisherClass'])

# Show the original/encoded pairs before the source column is removed; as a
# bare expression in the middle of the cell the preview was never displayed.
print(df[['publisherClass', 'publisher_class_encoded']].head())

df.drop(columns='publisherClass', inplace=True)
print(df.columns)

### Genres

In [None]:
# One-hot encode the comma-separated `genres` column: one 0/1 column per
# distinct genre token.
genres = df['genres'].str.split(',', expand=True)
genre_columns = []

# `expand=True` pads shorter rows with missing values; they must not become a
# genre of their own, otherwise a bogus column inflates every genre count.
unique_genres = pd.unique(genres.values.ravel())
unique_genres = unique_genres[pd.notna(unique_genres)]

# NOTE: tokens are deliberately not stripped, so a genre that follows ", "
# keeps its leading space (later cells reference ' Indie' by that exact name).
for genre in unique_genres:
    # A game has the genre if any of its split positions equals the token.
    df[genre] = genres.eq(genre).any(axis=1).astype('int64')
    genre_columns.append(genre)

df = df.drop('genres', axis=1)

df.head()

### Supported Platforms

In [None]:
# Flag each platform by searching the whole `supported_platforms` string.
# Looking only at a fixed position after splitting misses e.g. "windows,mac"
# (mac in the second slot) and fails when no row lists three platforms.
platform_text = df['supported_platforms'].str.lower()
for platform_name in ('windows', 'linux', 'mac'):
    # Missing platform strings count as "not supported".
    df[platform_name] = (
        platform_text.str.contains(platform_name, regex=False, na=False).astype(int)
    )

df = df.drop('supported_platforms', axis=1)


df.head()

## Scaling

In [None]:
# Standardise the hand-picked numeric columns; the genre indicators and the
# target are carried along unscaled.
# `metacritic` was dropped during cleaning, so it is excluded here.
columns_to_scale = [
    col for col in numeric_columns
    if col not in genre_columns and col != 'metacritic'
]

X_numeric = df[columns_to_scale]

# NOTE(review): the scaler is fitted on the whole dataset, i.e. before any
# train/test split — confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)
y = df['reviewScore']

# Keep df's index: rows were removed earlier, so a fresh 0..n-1 index would
# misalign the scaled values when they are concatenated and written back.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_numeric.columns, index=df.index)

X_final = pd.concat([X_scaled_df, df[genre_columns]], axis=1)

df_scaled = pd.concat([X_final, y], axis=1)

# Write the scaled columns back into the working frame.
df[df_scaled.columns] = df_scaled

In [None]:
# Descriptive statistics of the scaled feature frame.
df_scaled.describe()

In [None]:
# Preview the working frame.
df.head()

# Feature Engineering and Selection

## Total genres Feature

In [None]:
# Total_Genres = row-wise sum over the genre indicator columns.
df['Total_Genres'] = df[genre_columns].sum(axis=1)
df.head()

In [None]:
# Number of games whose Total_Genres is zero.
zero_genres = (df['Total_Genres']==0).sum()
print(zero_genres)

## Number of Platforms Feature

In [None]:
# Total_Platforms = row-wise sum over the three platform flags.
platforms_columns = ['windows','linux','mac']
df['Total_Platforms'] = df[platforms_columns].sum(axis=1)

In [None]:
# Remove games for which no platform flag is set; print the count of such
# rows before and after the filter.
zero_platforms = (df['Total_Platforms'] == 0).sum()
print(zero_platforms)

# Take an explicit copy: the filtered frame is modified in place by later
# cells, which on a plain slice can trigger pandas' SettingWithCopyWarning.
df = df[df['Total_Platforms'] != 0].copy()

zero_platforms_after = (df['Total_Platforms'] == 0).sum()
print(zero_platforms_after)

## Columns With Too Many Nulls

In [None]:
# Remove the columns listed under this heading as having too many nulls.
sparse_columns = ['name_x', 'name_y', 'aiContent']
df = df.drop(columns=sparse_columns)
print(df.columns)

## Useless features

In [None]:
# Remove the name column and the two features derived from it.
name_columns = ['name', 'name_tokens', 'name_length']
df = df.drop(columns=name_columns)
print(df.columns)

In [None]:
# Share of games with / without the `windows` flag, then drop the column.
ones_count = df['windows'].sum()
zeros_count = len(df) - ones_count

sizes = [ones_count, zeros_count]
labels = ['Ones', 'Zeros']
explode = (0.1, 0)

pie_options = dict(
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)
patches, texts, autotexts = plt.pie(sizes, explode=explode, **pie_options)

# The wedge labels go into a legend instead of onto the pie.
plt.legend(patches, labels, loc="best", fontsize='large', frameon=True)
plt.title('Distribution of Windows Column', fontweight='bold')
plt.axis('equal')
plt.show()

df.drop(columns='windows', inplace=True)

In [None]:
# Share of games with / without the `linux` flag.
ones_count = df['linux'].sum()
zeros_count = len(df) - ones_count

sizes = [ones_count, zeros_count]
labels = ['Ones', 'Zeros']
explode = (0.1, 0)

pie_options = dict(
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors,
)
patches, texts, autotexts = plt.pie(sizes, explode=explode, **pie_options)

# The wedge labels go into a legend instead of onto the pie.
plt.legend(patches, labels, loc="best", fontsize='large', frameon=True)
plt.title('Distribution of Linux Column', fontweight='bold')
plt.axis('equal')

In [None]:
# Share of games with / without the `mac` flag, then drop the column.
ones_count = df['mac'].sum()
zeros_count = len(df) - ones_count


sizes = [ones_count, zeros_count]
labels = ['Ones', 'Zeros']


explode = (0.1, 0)
patches, texts, autotexts = plt.pie(
    sizes,
    labels=None,
    autopct='%1.1f%%',
    startangle=90,
    textprops={"fontweight": "bold"},
    explode=explode,
    shadow=True,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    colors=plt.cm.Paired.colors
)

# Add a legend instead of labels
plt.legend(patches, labels, loc="best", fontsize='large', frameon=True)
# This chart shows the `mac` column; the title previously said "Windows".
plt.title('Distribution of Mac Column', fontweight='bold')
plt.axis('equal')  # Make it a circle
df.drop(columns='mac',inplace=True)

## Correlation

In [None]:
# Correlation report: (1) every numeric feature against the target,
# (2) feature pairs whose |r| reaches the threshold, (3) keep/drop suggestions.
numeric_columns = df.select_dtypes(include=['number']).columns
print(df.columns)
df_numeric = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
corr_matrix = df_numeric.corr()
threshold = 0.5
target = 'reviewScore'
target_in_matrix = target in corr_matrix.columns

if target_in_matrix:
    # Strongest absolute correlation with the target first.
    target_correlations = corr_matrix[target].drop(target).sort_values(key=abs, ascending=False)

    target_report = pd.DataFrame({
        'Feature': target_correlations.index,
        'Correlation': target_correlations.values,
        'Absolute_Correlation': target_correlations.abs().values,
        'Significant': target_correlations.abs() >= threshold
    })

    print(f"TARGET-ORIENTED FEATURE ANALYSIS (for '{target}')\n")
    print("="*70)
    print(target_report.to_string(index=False))

    significant_target_features = target_correlations[target_correlations.abs() >= threshold].index.tolist()
    print(f"Significant features for prediction (|r| ≥ {threshold}):")
    print(significant_target_features)
    print("\n\n")

print("CROSS-FEATURE CORRELATION ANALYSIS")
print("="*70)

# Visit each unordered column pair once (upper triangle, diagonal excluded).
column_names = corr_matrix.columns
corr_values = corr_matrix.to_numpy()
significant_corrs = []
for i, j in zip(*np.triu_indices(len(column_names), k=1)):
    val = corr_values[i, j]
    if abs(val) >= threshold:
        significant_corrs.append({
            'Feature 1': column_names[i],
            'Feature 2': column_names[j],
            'Correlation': val
        })

# Naming the columns keeps the frame usable when no pair reaches the
# threshold; without them an empty frame has no 'Correlation' column.
results_df = pd.DataFrame(significant_corrs, columns=['Feature 1', 'Feature 2', 'Correlation'])
if not results_df.empty:
    results_df = results_df.sort_values('Correlation', key=abs, ascending=False)

print(f"Significant Correlations Between Features (|r| ≥ {threshold}):\n")
print("="*70)
print(results_df.to_string(index=False))

if target_in_matrix:
    print("FEATURE SELECTION RECOMMENDATIONS\n\n")
    print("="*70)
    print("HIGH-VALUE FEATURES (keep these):\n")
    high_value = target_report[target_report['Significant']].sort_values('Absolute_Correlation', ascending=False)
    print(high_value[['Feature', 'Correlation']].to_string(index=False))

    print("LOW-VALUE FEATURES (consider removing):\n")
    low_value = target_report[~target_report['Significant']].sort_values('Absolute_Correlation', ascending=False)
    print(low_value[['Feature', 'Correlation']].to_string(index=False))

    # Two high-value features that are also strongly correlated with each
    # other are flagged; the second member of the pair is the one reported.
    high_value_names = set(high_value['Feature'].values)
    redundant_features = set()
    for _, row in results_df.iterrows():
        if target in (row['Feature 1'], row['Feature 2']):
            continue
        if row['Feature 1'] in high_value_names and row['Feature 2'] in high_value_names:
            redundant_features.add(row['Feature 2'])

    if redundant_features:
        print("POTENTIALLY REDUNDANT FEATURES (consider removing one of each pair):\n")
        print(list(redundant_features))

print("SUMMARY STATISTICS\n")
print("="*70)
print(f"- Total features analyzed: {len(corr_matrix.columns)}")
if target_in_matrix:
    print(f"- Features strongly correlated with target ({threshold}+): {len(high_value)}")
    print(f"- Features weakly correlated with target: {len(low_value)}")
print(f"- Significant cross-feature correlations found: {len(results_df)}")

## Redundant

In [None]:
# Remove the features listed under this heading as redundant.
redundant_columns = ['Violent', 'Nudity', 'Gore', 'workshop_support', 'Adventure', 'linux']
df = df.drop(columns=redundant_columns)
print(df.columns)

In [None]:
# Rebuild both task datasets as: own target column + the shared engineered
# features from `df`.
# NOTE(review): the features come from the regression-derived frame and are
# matched to the classification targets purely by row index — confirm both
# merged datasets list the same games in the same order.
df_features = df.drop(columns=['reviewScore'], errors='ignore')

# join='inner' keeps only rows that survived preprocessing. The default outer
# join would bring the removed rows back with all-NaN features.
classification_df = pd.concat(
    [classification_df[['reviewScore']], df_features], axis=1, join='inner'
)
regression_df = pd.concat(
    [regression_df[['reviewScore']], df_features], axis=1, join='inner'
)

In [None]:
# Print the per-column summary of the rebuilt regression dataset.
regression_df.info()

In [None]:
# Print the per-column summary of the rebuilt classification dataset.
classification_df.info()

## PCA

In [None]:
# Impute remaining gaps, optionally fit a 3-component PCA on the numeric
# features, and rank features by their summed absolute correlations.
apply_pca = True
print("\nChecking for missing values in the selected features:")
print(df.isna().sum())

# Rows without a target cannot be used for supervised learning.
if 'reviewScore' in df.columns:
    df = df.dropna(subset=['reviewScore'])

print("\nFilling missing values for categorical columns...")
for col in tqdm(df.columns[df.dtypes == 'object'], desc="Categorical columns", leave=False):
    modes = df[col].mode()
    # A column with no values at all has no mode; leave it untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes[0])

print("\nFilling missing values for numeric columns...")
for col in tqdm(df.select_dtypes(include=['number']).columns, desc="Numeric columns", leave=False):
    df[col] = df[col].fillna(df[col].mean())

df_selected = df.select_dtypes(include=['number'])

print("\nTotal NaN after filling missing values:")
print(df_selected.isnull().sum().sum())
# Column labels are cast to strings before fitting.
df_selected.columns = df_selected.columns.astype(str)

if apply_pca:
    print("\nApplying PCA...")
    pca = PCA(n_components=3)

    with tqdm(total=1, desc="PCA fitting", leave=False) as pbar:
        df_pca = pca.fit_transform(df_selected)
        pbar.update(1)

    df_pca_df = pd.DataFrame(df_pca, columns=[f'PCA_{i+1}' for i in range(df_pca.shape[1])])

    print("\nVariance Ratio per PCA Component:")
    for i, var_ratio in enumerate(pca.explained_variance_ratio_):
        print(f"PCA_{i+1}: {var_ratio:.4f}")

    # Loadings: one row per original feature, one column per component.
    feature_loadings = pd.DataFrame(
        pca.components_.T,
        columns=[f'PCA_{i+1}' for i in range(pca.n_components_)],
        index=df_selected.columns
    )

# NOTE(review): despite its name and the message printed below, this ranking
# uses absolute pairwise correlations, not the PCA loadings — confirm intent.
# The numeric-only frame is used so non-numeric columns cannot break corr().
abs_loadings = df_selected.corr().abs()

# Rank by summed absolute correlation, exclude the target, and take the best
# three, so the next-ranked feature fills in if the target was in the top 3.
feature_scores = abs_loadings.sum(axis=1).drop(labels='reviewScore', errors='ignore')
pca_top_3_features = feature_scores.nlargest(3).index.tolist()

print("Top 3 features PCA selected: ", pca_top_3_features)

### Recursive Feature Elimination (RFE)

In [None]:
# X = df.select_dtypes(include=['number']) 
# y = df['reviewScore']

# X.columns = X.columns.astype(str)  

# imputer = SimpleImputer(strategy='median')
# X_imputed = imputer.fit_transform(X)

# model = RandomForestRegressor(random_state=42)
# selector = RFE(estimator=model, n_features_to_select=5)


# with tqdm(total=1, desc="RFE Feature Selection Progress", unit="step") as pbar:
#     selector.fit(X_imputed, y)  
#     pbar.update(1)  


# rfe_selected_features = X.columns[selector.support_]
# print(f"Top 5 features selected by RFE: {rfe_selected_features}")

# Oversampling (random oversampling, currently disabled)

In [None]:
# min_samples = max(2000, int(0.2 * 26925))  # Adjust 26925 to your actual dataset size if needed

# # Step 2: Define custom sampling strategy
# sampling_strategy = {
#     'Mostly Negative': min(min_samples, 2894 * 3),  
#     'Overwhelmingly Positive': min_samples,
#     'Very Negative': min_samples,
#     'Overwhelmingly Negative': min_samples
# }

# # Step 3: Prepare data
# X = classification_df.drop(columns=['reviewScore'])
# y = classification_df['reviewScore']

# # Step 4: Apply Random Oversampling
# ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
# X_resampled, y_resampled = ros.fit_resample(X, y)

# # Step 5: Combine back into a DataFrame
# df_resampled = pd.concat([
#     pd.DataFrame(X_resampled, columns=X.columns),
#     pd.Series(y_resampled, name='reviewScore')
# ], axis=1)

# # Step 6: Display class distribution after sampling
# print("After custom oversampling:\n", df_resampled['reviewScore'].value_counts())

In [None]:
# X_group = df_resampled_group.drop(columns=['reviewGroup'])
# y_group = df_resampled_group['reviewGroup']

# X_resampled = df_resampled.drop(columns=['reviewScore'])
# y_resampled = df_resampled['reviewScore']

# # Split the data into training and testing sets (80-20 split)
# X_group_train, X_group_test, y_group_train, y_group_test = train_test_split(X_group, y_group, test_size=0.2, random_state=42, stratify=y_group)
# X_resampled_train, X_resampled_test, y_resampled_train, y_resampled_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# # Initialize Random Forest classifier
# rf = RandomForestClassifier(random_state=42)

# # Train on group mapped data (df_resampled_group)
# rf.fit(X_group_train, y_group_train)
# y_group_pred = rf.predict(X_group_test)
# accuracy_group = accuracy_score(y_group_test, y_group_pred)
# mse_group = mean_squared_error(y_group_test, y_group_pred)

# # Train on resampled data (df_resampled)
# rf.fit(X_resampled_train, y_resampled_train)
# y_resampled_pred = rf.predict(X_resampled_test)
# accuracy_resampled = accuracy_score(y_resampled_test, y_resampled_pred)
# mse_resampled = mean_squared_error(y_resampled_test, y_resampled_pred)

# # Print results
# print(f"Accuracy on group-mapped data (df_resampled_group): {accuracy_group:.4f}")
# print(f"Mean Squared Error on group-mapped data (df_resampled_group): {mse_group:.4f}")
# print(f"Accuracy on custom-resampled data (df_resampled): {accuracy_resampled:.4f}")
# print(f"Mean Squared Error on custom-resampled data (df_resampled): {mse_resampled:.4f}")

In [None]:
# Export the same feature subset (plus target) for both tasks.
# ' Indie' keeps its leading space because that is the exact column name.
export_columns = ['Total_Genres', 'Action',' Indie','copiesSold', 'year','Sports','reviewScore']
export_targets = (
    (regression_df, '/kaggle/working/regression_preprocessed.csv'),
    (classification_df, '/kaggle/working/classification_preprocessed.csv'),
)
for source_frame, destination in export_targets:
    final_df = source_frame[export_columns]
    final_df.to_csv(destination, index=False)
print('Saved\n')

# Data Splitting

In [None]:
def data_splitting(csv):
    """Load a preprocessed CSV and return an 80/20 train/test split.

    Parameters
    ----------
    csv : str or path-like
        Path to a CSV file containing the feature columns selected below
        and a ``reviewScore`` column used as the target.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and target series produced by ``train_test_split``
        with ``test_size=0.2`` and ``random_state=42``. Missing feature
        values are filled with the per-column medians of the training
        partition.
    """
    # NOTE: ' Indie' (leading space) is the exact key used when the CSV is written.
    feature_columns = ['Total_Genres', 'Action', ' Indie', 'copiesSold', 'year', 'Sports']

    df = pd.read_csv(csv)
    X = df[feature_columns]
    y = df['reviewScore']

    # Split before imputing: row assignment depends only on the number of rows
    # and the seed, so the partitions are the same as when imputing first.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Learn the medians from the training rows only and apply them to both
    # partitions, so no statistic derived from test rows reaches the training data.
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    return X_train, X_test, y_train, y_test
# Build the train/test split from the regression export and bind it to the
# notebook-level X_train / X_test / y_train / y_test variables.
X_train, X_test, y_train, y_test = data_splitting('/kaggle/working/regression_preprocessed.csv')
# Summary statistics of the training features (last expression of the cell).
X_train.describe()

In [None]:
# Build the train/test split from the classification export.
# NOTE(review): this rebinds the same X_train / X_test / y_train / y_test names
# assigned from the regression CSV in the previous cell, so any later cell sees
# the classification split when the notebook is run top to bottom — confirm
# that is intended for the regression models below.
X_train, X_test, y_train, y_test = data_splitting('/kaggle/working/classification_preprocessed.csv')
# Summary statistics of the training features (last expression of the cell).
X_train.describe()

# ML Models

## Regression

### XGBoost

In [None]:
# xgb_model = xgb.XGBRegressor()
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)

# mse_xgb = mean_squared_error(y_test, y_pred_xgb)
# mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
# r2_xgb = r2_score(y_test, y_pred_xgb)

# print(f"XGBoost MSE: {mse_xgb}")
# print(f"XGBoost MAE: {mae_xgb}")
# print(f"XGBoost R²: {r2_xgb}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_xgb, color='darkorange', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='navy', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('XGBoost: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Random Forest

In [None]:
# rf_model = RandomForestRegressor(
#     n_estimators=100,
#     random_state=42,
#     n_jobs=-1
# )
# rf_model.fit(X_train, y_train)

# y_pred_rf = rf_model.predict(X_test)

# mse_rf = mean_squared_error(y_test, y_pred_rf)
# mae_rf = mean_absolute_error(y_test, y_pred_rf)
# r2_rf = r2_score(y_test, y_pred_rf)

# print(f"Random Forest MSE: {mse_rf:.4f}")
# print(f"Random Forest MAE: {mae_rf:.4f}")
# print(f"Random Forest R²: {r2_rf:.4f}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_rf, color='darkorange', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='navy', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Random Forest: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Multilayer Perceptron (MLP) Regressor

In [None]:
# mlp_model = MLPRegressor(random_state=42)
# mlp_model.fit(X_train, y_train)
# y_pred_mlp = mlp_model.predict(X_test)

# mse_mlp = mean_squared_error(y_test, y_pred_mlp)
# mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
# r2_mlp = r2_score(y_test, y_pred_mlp)

# print(f"MLP MSE: {mse_mlp:.4f}")
# print(f"MLP MAE: {mae_mlp:.4f}")
# print(f"MLP R²: {r2_mlp:.4f}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_mlp, color='mediumslateblue', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='darkred', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('MLP Regressor: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### KNN Regressor

In [None]:
# knn_model = KNeighborsRegressor()
# knn_model.fit(X_train, y_train)
# y_pred_knn = knn_model.predict(X_test)
# mse_knn = mean_squared_error(y_test, y_pred_knn)
# mae_knn = mean_absolute_error(y_test, y_pred_knn)
# r2 = r2_score(y_test, y_pred_knn)
# print(f"KNN MSE: {mse_knn}")
# print(f"KNN MAE: {mae_knn}")
# print(f"R² KNN: {r2}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_knn, color='seagreen', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='black', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('KNN Regressor: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Polynomial Regression

In [None]:
# poly = PolynomialFeatures(degree=5)
# X_train_poly = poly.fit_transform(X_train)
# poly_model = LinearRegression()
# poly_model.fit(X_train_poly, y_train)
# X_test_poly = poly.transform(X_test)
# y_pred_poly = poly_model.predict(X_test_poly)
# mse_poly = mean_squared_error(y_test, y_pred_poly)
# mae_poly = mean_absolute_error(y_test, y_pred_poly)
# r2 = r2_score(y_test, y_pred_poly)
# print(f"Polynomial Regression MSE: {mse_poly}")
# print(f"Polynomial Regression MAE: {mae_poly}")
# print(f"R²: {r2}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_poly, color='goldenrod', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='darkblue', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Polynomial Regression (Degree=5): Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Lasso Regression

In [None]:
# lasso_model = Lasso(random_state=42)
# lasso_model.fit(X_train, y_train)
# y_pred_lasso = lasso_model.predict(X_test)
# mse_lasso = mean_squared_error(y_test, y_pred_lasso)
# mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
# r2 = r2_score(y_test, y_pred_lasso)
# print(f"Lasso MSE: {mse_lasso}")
# print(f"Lasso MAE: {mae_lasso}")
# print(f"R²: {r2}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_lasso, color='orangered', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='forestgreen', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Lasso Regression: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Ridge

In [None]:
# ridge_model = Ridge(random_state=42)
# ridge_model.fit(X_train, y_train)
# y_pred_ridge = ridge_model.predict(X_test)
# mse_ridge = mean_squared_error(y_test, y_pred_ridge)
# mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
# r2 = r2_score(y_test, y_pred_ridge)
# print(f"Ridge MSE: {mse_ridge}")
# print(f"Ridge MAE: {mae_ridge}")
# print(f"R²: {r2}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_ridge, color='slateblue', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='darkviolet', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Ridge Regression: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Decision Trees

In [None]:
# tree_model = DecisionTreeRegressor(random_state=42)
# tree_model.fit(X_train, y_train)
# y_pred_tree = tree_model.predict(X_test)
# mse_tree = mean_squared_error(y_test, y_pred_tree)
# mae_tree = mean_absolute_error(y_test, y_pred_tree)

# r2_tree = r2_score(y_test, y_pred_tree)
# print(f"Decision Tree MSE: {mse_tree}")
# print(f"Decision Tree MAE: {mae_tree}")
# print(f"R²: {r2_tree}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_tree, color='darkcyan', edgecolor='k', alpha=0.7)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='darkorange', linestyle='--', linewidth=2)
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Decision Tree Regressor: Actual vs Predicted')
# plt.grid(True)
# plt.show()

### Linear

In [None]:
# linear_model = LinearRegression(fit_intercept=True)  
# linear_model.fit(X_train, y_train)

# y_pred_linear = linear_model.predict(X_test)

# mse_linear = mean_squared_error(y_test, y_pred_linear)
# mae_linear = mean_absolute_error(y_test, y_pred_linear)
# r2_linear = r2_score(y_test, y_pred_linear)

# print(f"Linear Regression MSE: {mse_linear}")
# print(f"Linear Regression MAE: {mae_linear}")
# print(f"Linear Regression R²: {r2_linear}")

# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred_linear, color='green', edgecolor='k', alpha=0.7)
# plt.axhline(y=y_pred_linear.mean(), color='darkviolet', linestyle='--', linewidth=2)  
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Linear Regression: Actual vs Predicted (Horizontal Line with Slope 0)')
# plt.grid(True)
# plt.show()

## Classification