# 02 - Feature Engineering

Proyecto: Predicción de user_score

Objetivo: implementar una codificación multi-hot con vocabulario controlado (los N géneros más frecuentes) para `genres` y features temporales (año y mes) derivadas de `date`.

In [1]:
import pandas as pd
import ast
from pathlib import Path

In [2]:
# Load the raw dataset; `project_dir` points one level above the current
# working directory and the CSV lives under its `data/` folder.
project_dir = Path('..')
df = pd.read_csv(project_dir.joinpath('data', 'data.csv'))

# Temporal features: values in `date` that cannot be parsed are coerced to NaT,
# so the derived year/month are missing for those rows.
parsed = pd.to_datetime(df['date'], errors='coerce')
for part in ('year', 'month'):
    df[f'date_{part}'] = getattr(parsed.dt, part)

# Parse genres into list
def parse_genres(val):
    """Normalize a raw ``genres`` cell into a list of genre names.

    Handled inputs, in order:
      * a ``list`` is returned unchanged;
      * a missing value (anything ``pd.isna`` reports as missing, e.g. ``None``
        or ``NaN``) yields ``[]``;
      * a string holding a Python list literal such as ``"['Action', 'RPG']"``
        is evaluated with ``ast.literal_eval`` and every element is converted
        to a stripped string;
      * anything else is converted to ``str`` and split on commas, dropping
        surrounding brackets, whitespace and quote characters.

    Returns:
        list: the genre names; empty when the value is missing or blank.
    """
    # Lists must be recognized BEFORE the missing-value test: pd.isna() on a
    # list returns an element-wise array, and evaluating that array in an `if`
    # raises ValueError as soon as it holds more than one element.
    if isinstance(val, list):
        return val
    if pd.isna(val):
        return []

    text = str(val).strip()
    try:
        literal = ast.literal_eval(text)
    except Exception:
        # Deliberate best-effort: anything that is not a valid Python literal
        # falls through to the comma-splitting fallback below.
        literal = None
    if isinstance(literal, list):
        return [str(item).strip() for item in literal]

    # Fallback for loosely formatted values such as `[Action, RPG]` or
    # `Action, "RPG"`: drop the outer brackets, split on commas, trim quotes.
    inner = text.strip('[]')
    return [
        piece.strip().strip('\"').strip("'")
        for piece in inner.split(',')
        if piece.strip()
    ]


# Turn every raw `genres` cell into a Python list of genre names.
df['genres_list'] = df['genres'].apply(parse_genres)

# Controlled vocabulary: flatten the per-row lists, count how often each genre
# occurs and keep only the GENRE_TOP_N most frequent ones.
flat_genres = []
for row_genres in df['genres_list']:
    flat_genres.extend(row_genres)
all_genres = pd.Series(flat_genres)
counts = all_genres.value_counts()
GENRE_TOP_N = 20
vocab = counts.index[:GENRE_TOP_N].tolist()

# Multi-hot encoding: one 0/1 indicator column per vocabulary genre.
for genre in vocab:
    df[f'genre_{genre}'] = df['genres_list'].apply(lambda names: int(genre in names))

print('Vocab size:', len(vocab))
print('Top genres:', vocab[:10])

Vocab size: 20
Top genres: ['Action', 'General', 'Miscellaneous', 'Puzzle', 'Platformer', '2D', 'Fantasy', 'Role-Playing', 'Strategy', '3D']


In [3]:
# Sparsity validation
# Rebuild the vocabulary when it is not defined yet (cells run out of order).
if 'vocab' not in globals():
    all_genres = pd.Series([g for lst in df['genres_list'] for g in lst])
    counts = all_genres.value_counts()
    GENRE_TOP_N = 20
    vocab = counts.head(GENRE_TOP_N).index.tolist()

genre_cols = [f'genre_{name}' for name in vocab]

# Sparsity = share of zero entries in the rows x genre-columns indicator matrix.
n_cells = len(df) * len(genre_cols)
n_active = df[genre_cols].sum().sum()
sparsity = 1 - (n_active / n_cells)
print('Sparsity:', round(sparsity, 3))

# Simple correlation of each genre indicator with user_score, highest first.
score_corr = df[[*genre_cols, 'user_score']].corr()['user_score']
corrs = score_corr.drop('user_score').sort_values(ascending=False)
print(corrs.head(10))


Sparsity: 0.894
genre_Fantasy              0.163606
genre_Action Adventure     0.151563
genre_Console-style RPG    0.136317
genre_Strategy             0.111427
genre_Platformer           0.088164
genre_Turn-Based           0.085656
genre_3D                   0.081673
genre_Shooter              0.046808
genre_Tactics              0.044783
genre_Racing               0.036040
Name: user_score, dtype: float64
