In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, KMeansSMOTE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
file_path = '../data/processed/Juegorawg_limpio.csv'
df = pd.read_csv(file_path)
df.copy()

Unnamed: 0,slug,name,playtime,released,rating,rating_top,ratings_count,reviews_text_count,added,metacritic,suggestions_count,updated,reviews_count,release_year,main_genre,metacritic_category
0,hellpoint,Hellpoint,3,2020-07-30,2.76,3,51,2,2182,61.0,494,2024-11-26 14:58:37,54,2020,Indie,Media
1,anomaly-2,Anomaly 2,2,2013-05-14,2.90,4,62,0,1898,77.0,610,2024-10-07 11:59:32,62,2013,Strategy,Media
2,reverse-4,Resident Evil Re:Verse,1,2022-10-28,1.53,1,63,0,1662,78.0,456,2024-11-14 19:49:03,64,2022,Shooter,Media
3,x-morph-defense,X-Morph: Defense,3,2017-08-30,3.08,3,47,1,1657,75.0,702,2024-11-08 09:09:44,48,2017,Indie,Media
4,west-of-dead,West of Dead,1,2020-06-08,3.08,3,64,2,1634,68.0,456,2024-03-05 13:43:19,66,2020,Indie,Media
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3763,the-otterman-empire,The Otterman Empire,0,2020-07-02,0.00,0,0,0,0,76.5,333,2020-12-16 15:29:54,0,2020,Casual,Media
3764,jacks-or-better-video-poker,Jacks or Better - Video Poker,0,2020-06-12,0.00,0,0,0,0,76.5,0,2022-04-29 05:58:06,0,2020,Casual,Media
3765,jumanji-the-curse-returns,Jumanji: The Curse Returns,0,2021-09-01,0.00,0,0,0,0,76.0,251,2022-09-19 08:16:26,0,2021,Adventure,Media
3766,avocuddle,AvoCuddle,0,2019-07-12,0.00,0,0,0,0,73.0,400,2020-12-16 14:38:36,0,2019,Indie,Media


# Feature numericas derivadas
1. Aplicar logaritmo a playtime para reducir la influencia de valores extremos.


In [8]:
df['playtime_log'] = df['playtime'].apply(lambda x: np.log1p(x))

# Codificamos las variables categoricas

In [5]:
df = pd.get_dummies(df, columns=['main_genre', 'metacritic_category'], drop_first=True)

### Reviso que no haya datos numericos en X

In [6]:
non_numeric_columns = df.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print("Eliminando columnas no numéricas:", non_numeric_columns.tolist())
    df.drop(columns=non_numeric_columns, inplace=True)

Eliminando columnas no numéricas: ['slug', 'name', 'released', 'updated']


# Escalado de varianles numericas

In [9]:
scaler = StandardScaler()
num_vars = ['rating', 'metacritic', 'playtime_log']
df[num_vars] = scaler.fit_transform(df[num_vars])


# Seperacion en el conjunto de entrenamiento con respecto a nuestra target ' rating'

In [10]:
X = df.drop(columns=['rating']) 
y = df['rating']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Balanceo de datos

In [11]:
quantile_low = y_train.quantile(0.05)  
quantile_high = y_train.quantile(0.95)  
mask = (y_train >= quantile_low) & (y_train <= quantile_high)
X_train_balanced = X_train[mask]
y_train_balanced = y_train[mask]

In [12]:
assert X_train_balanced.select_dtypes(include=['object']).shape[1] == 0, "Aún hay columnas categóricas en X_train_balanced"

# Entrenamiento del modelo Random Forest Regressor

In [13]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

'''Predicciones'''
y_pred = rf_model.predict(X_test)

# Evaluación del modelo

In [14]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"R² Score: {r2}")

MSE: 0.02423602821277413
R² Score: 0.9756826085827873


## Primer resumen

In [15]:
print(X_train_balanced.describe())
print(y_train_balanced.describe())

          playtime   rating_top  ratings_count  reviews_text_count  \
count  2866.000000  2866.000000    2866.000000         2866.000000   
mean      1.996162     1.570482      11.434403            0.174110   
std       1.834416     1.796473      15.840987            0.517788   
min       0.000000     0.000000       0.000000            0.000000   
25%       0.000000     0.000000       1.000000            0.000000   
50%       2.000000     0.000000       4.000000            0.000000   
75%       3.000000     4.000000      16.000000            0.000000   
max       8.000000     5.000000      67.000000            8.000000   

             added   metacritic  suggestions_count  reviews_count  \
count  2866.000000  2866.000000        2866.000000    2866.000000   
mean    182.214236    -0.026166         309.238660      11.699581   
std     272.443220     1.032261         155.313715      16.145719   
min       0.000000    -9.743525           0.000000       0.000000   
25%      14.000000    -0