# 03 - Modeling

Proyecto: Predicción de user_score

Modelos: LinearRegression, RandomForest, GradientBoosting

In [20]:
import pandas as pd
import numpy as np
import ast
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [21]:
from pathlib import Path
import pandas as pd
# Load the raw dataset from <parent of the working directory>/data/data.csv.
project_dir = Path('..')
df = pd.read_csv(project_dir / 'data' / 'data.csv')

# Name of the column the models will predict.
TARGET = 'user_score'

# Parse date: values that cannot be parsed become NaT (errors='coerce'),
# so the derived year/month columns are missing for those rows.
parsed = pd.to_datetime(df['date'], errors='coerce')
df['date_year'], df['date_month'] = parsed.dt.year, parsed.dt.month
# Parse genres
def parse_genres(val):
    """Normalize a raw ``genres`` cell into a list of genre names.

    Parameters
    ----------
    val : list, tuple, str or missing value
        Either an already-parsed sequence, a string holding a Python-style
        list/tuple literal (e.g. ``"['Action', 'RPG']"``), a loosely
        formatted comma-separated string, or a missing value (None/NaN).

    Returns
    -------
    list
        The genre names. A list input is returned unchanged, a tuple input
        is converted to a list, and a missing value yields ``[]``.
    """
    # Sequences must be handled BEFORE pd.isna: on a list, pd.isna returns an
    # element-wise array whose truth value is ambiguous and raises ValueError.
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        return list(val)
    if pd.isna(val):
        return []

    s = str(val).strip()

    # First attempt: the cell is a well-formed Python literal.
    try:
        literal = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid literal (e.g. unquoted names); use the lenient parser below.
        literal = None
    if isinstance(literal, (list, tuple)):
        return [str(x).strip() for x in literal]

    # Fallback: drop surrounding brackets, split on commas, and remove any
    # quotes left around the individual tokens.
    s = s.strip('[]')
    parts = [p.strip().strip('\"').strip("'") for p in s.split(',') if p.strip()]
    return parts


# Turn every raw genres cell into a Python list of genre names.
df['genres_list'] = df['genres'].apply(parse_genres)

# Controlled vocabulary: keep only the most frequent genres so the number of
# indicator columns stays bounded.
GENRE_TOP_N = 20
all_genres = pd.Series(
    [genre for genre_list in df['genres_list'] for genre in genre_list]
)
counts = all_genres.value_counts()
vocab = counts.index[:GENRE_TOP_N].tolist()

# One 0/1 indicator column per vocabulary genre.
for g in vocab:
    df[f'genre_{g}'] = df['genres_list'].apply(
        lambda lst, genre=g: int(genre in lst)
    )

# Feature groups; each group gets its own treatment in the preprocessor.
num_features = ['meta_score', 'date_year', 'date_month']
cat_features = ['platform', 'esrb_rating']
genre_features = ['genre_{}'.format(g) for g in vocab]
feature_cols = [*num_features, *cat_features, *genre_features]

# Rows without a target value cannot be used for supervised training.
_df = df.loc[df[TARGET].notna()].copy()
X = _df[feature_cols]
y = _df[TARGET]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Genre indicator columns are already 0/1 and are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
        ('genre', 'passthrough', genre_features),
    ]
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

# The transformer output may be a sparse matrix; materialize dense arrays
# when a `toarray` method is available, otherwise reuse the output as is.
X_train_dense = X_train_p
if hasattr(X_train_p, 'toarray'):
    X_train_dense = X_train_p.toarray()

X_test_dense = X_test_p
if hasattr(X_test_p, 'toarray'):
    X_test_dense = X_test_p.toarray()

In [22]:
# Train and evaluate every candidate model on the same train/test split.
#
# This cell depends on objects built by the data-preparation cell. The former
# "out of order" fallback rebuilt `X` and `preprocessor` on the fly, but it
# never created the train/test splits used below and silently swapped in a
# different preprocessing recipe. Fail fast with an actionable message instead.
_required_names = (
    'X_train', 'X_test', 'y_train', 'y_test',
    'X_train_dense', 'X_test_dense', 'preprocessor',
)
_missing_names = [n for n in _required_names if n not in globals()]
if _missing_names:
    raise RuntimeError(
        'Run the data preparation cell before this one; missing: '
        + ', '.join(_missing_names)
    )

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42)
}

for name, model in models.items():
    if name == 'GradientBoosting':
        # Fitted directly on the already-transformed dense matrices; its
        # feature_importances_ are later paired with the preprocessor's
        # output feature names.
        model.fit(X_train_dense, y_train)
        preds = model.predict(X_test_dense)
    else:
        # Pipeline does not clone its steps, so `model` (and `preprocessor`)
        # are fitted in place by this call.
        pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    r2 = r2_score(y_test, preds)
    print(name, 'MAE', round(mae, 3), 'RMSE', round(rmse, 3), 'R2', round(r2, 3))

LinearRegression MAE 0.568 RMSE 0.809 R2 0.451
RandomForest MAE 0.577 RMSE 0.808 R2 0.451
GradientBoosting MAE 0.568 RMSE 0.811 R2 0.447


In [23]:
# Feature importance (GradientBoosting)
#
# Needs the fitted `preprocessor` and the trained `models` dict from the
# previous cells. The former fallback built an unfitted preprocessor (which
# cannot report feature names) and never recreated `models`, so it could not
# succeed; raise a clear error instead.
_required_names = ('preprocessor', 'models')
_missing_names = [n for n in _required_names if n not in globals()]
if _missing_names:
    raise RuntimeError(
        'Run the data preparation and training cells before this one; missing: '
        + ', '.join(_missing_names)
    )

# Output column names of the fitted preprocessor, in transform order; the
# GradientBoosting model was trained on exactly that transformed matrix.
feature_names = [str(f) for f in preprocessor.get_feature_names_out()]
importances = models['GradientBoosting'].feature_importances_
fi = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
fi.head(10)

Unnamed: 0,feature,importance
0,num__meta_score,0.516402
1,num__date_year,0.197692
2,num__date_month,0.045646
11,cat__platform_iOS,0.040826
18,genre__genre_Miscellaneous,0.034386
34,genre__genre_Edutainment,0.029046
12,cat__esrb_rating_E,0.020214
26,genre__genre_Console-style RPG,0.018323
16,genre__genre_Action,0.012776
4,cat__platform_DS,0.012251
