# Preprocesado

## Importar librerías

In [20]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Leer Dataset

In [21]:
# Load the processed gaming-behaviour CSV and preview its first rows.
dataset_path = 'data/processed_online_gaming_behavior_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium_Low
1,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium_Low
2,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium_High
4,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium_High


## conversión de características categóricas (CC)

In [22]:
# Nominal categorical feature columns that get expanded into one-hot indicators.
categorical_columns = ['GameGenre', 'Gender', 'Location', 'GameDifficulty']

# Replace the target labels with integer codes (overwrites the column in `data`).
label_encoder = LabelEncoder()
data['EngagementLevel'] = label_encoder.fit_transform(data['EngagementLevel'])

# Fit the one-hot encoder and wrap its dense output in a DataFrame whose
# columns carry the generated feature names.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(categorical_columns),
).reset_index(drop=True)

# Put the remaining (non-categorical) columns and the indicators side by side;
# both frames get a fresh positional index so the concat aligns row by row.
data_numerico = data.drop(categorical_columns, axis=1).reset_index(drop=True)
data_procesada = pd.concat([data_numerico, encoded_df], axis=1)

data_procesada.head()

Unnamed: 0,Age,PlayTimeHours,InGamePurchases,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,GameGenre_Action,GameGenre_RPG,...,GameGenre_Strategy,Gender_Female,Gender_Male,Location_Asia,Location_Europe,Location_Other,Location_USA,GameDifficulty_Easy,GameDifficulty_Hard,GameDifficulty_Medium
0,43,16.271119,0,6,108,79,25,3,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,29,5.525961,0,5,144,11,10,3,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,22,8.223755,0,16,142,35,41,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,35,5.265351,1,9,85,57,47,2,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,33,15.531945,0,2,131,95,37,2,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [23]:
# Class distribution of the encoded target.
# Label codes: 0 = High, 1 = Low, 2 = Medium_High, 3 = Medium_Low
data_procesada['EngagementLevel'].value_counts(sort=True, ascending=False)

EngagementLevel
0    10336
1    10324
2     9719
3     9655
Name: count, dtype: int64

## definir X e Y (tratar outliers)

In [24]:
# Target vector first, then the feature matrix without the target column.
y = data_procesada['EngagementLevel']
X = data_procesada.drop('EngagementLevel', axis=1)


# Row-wise outlier removal based on the interquartile range (IQR)
def remove_outliers_iqr(df, percentage=None):
    """
    Drop rows flagged as outliers by the 1.5 * IQR rule.

    Only numeric columns with more than two distinct values are inspected.
    A row is an outlier when at least one of those columns falls below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.

    Parameters:
    - df: input DataFrame
    - percentage: None removes every outlier row; a fraction (e.g. 0.05)
      keeps that fraction of the outlier rows, sampled with random_state=42

    Returns:
    - DataFrame without the outlier rows (or with the kept sample appended
      after the non-outlier rows)
    - List with the names of the inspected columns
    """
    # Collect the columns the IQR rule is applied to
    continuous_columns = []
    for name in df.columns:
        column = df[name]
        if column.nunique() > 2 and pd.api.types.is_numeric_dtype(column):
            continuous_columns.append(name)
    print(f"Columnas continuas seleccionadas: {continuous_columns}")

    # Per-column quartiles and fences
    inspected = df[continuous_columns]
    first_quartile = inspected.quantile(0.25)
    third_quartile = inspected.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    below = inspected.lt(first_quartile - whisker, axis='columns')
    above = inspected.gt(third_quartile + whisker, axis='columns')

    # A row is flagged as soon as any inspected column is outside its fences
    is_outlier_row = (below | above).any(axis=1)
    inliers = df[~is_outlier_row]

    if percentage is None:
        return inliers, continuous_columns

    # Keep only the requested fraction of the flagged rows
    kept_outliers = df[is_outlier_row].sample(frac=percentage, random_state=42)
    return pd.concat([inliers, kept_outliers]), continuous_columns

# (i) drop every row flagged as an outlier
X_no_outliers, columnas = remove_outliers_iqr(df=X)
y_no_outliers = y.loc[X_no_outliers.index]

# (ii) keep a 5% sample of the flagged rows
X_5pct_outliers, columnas = remove_outliers_iqr(df=X, percentage=0.05)
y_5pct_outliers = y.loc[X_5pct_outliers.index]

# Compare the size of the original and the two filtered feature matrices
tuple(frame.shape for frame in (X, X_no_outliers, X_5pct_outliers))

Columnas continuas seleccionadas: ['Age', 'PlayTimeHours', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']
Columnas continuas seleccionadas: ['Age', 'PlayTimeHours', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']


((40034, 21), (40034, 21), (40034, 21))

In [25]:
# The IQR filter found no outliers in the original data, so a contaminated
# copy of the dataset is generated artificially.
def add_artificial_outliers(df, columns, fraction=0.05, factor=3, random_state=42):
    """
    Inject artificial outliers into the selected numeric columns.

    A random subset of rows is chosen once; for every column in `columns`
    the value of each chosen row is replaced by either Q1 - factor * IQR or
    Q3 + factor * IQR (picked at random per row), with the quartiles taken
    from the original `df`.

    Parameters:
    - df: DataFrame with the original data (it is not modified)
    - columns: list of numeric columns that receive outliers
    - fraction: fraction of rows (0 to 1) that receive outliers
    - factor: how many IQRs beyond the quartiles the injected values lie
    - random_state: seed for the random generator. Defaults to 42 so that
      repeated runs produce the same contaminated dataset; pass None for a
      different draw on every call.

    Returns:
    - Copy of `df` with the outliers injected. The selected columns are
      returned as float, because the injected values are generally
      fractional.

    Raises:
    - ValueError: if `fraction` is outside [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction}")

    rng = np.random.default_rng(random_state)
    df_outliers = df.copy()

    # Pick the rows that will be contaminated (the same rows for every column)
    n_outliers = int(len(df) * fraction)
    if n_outliers == 0:
        return df_outliers
    outlier_rows = rng.choice(df.index.to_numpy(), size=n_outliers, replace=False)

    for col in columns:
        # Quartiles come from the original, uncontaminated column
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_outliers = Q1 - factor * IQR
        upper_outliers = Q3 + factor * IQR

        # Integer columns cannot hold the fractional outlier values; cast
        # explicitly instead of relying on pandas' deprecated silent upcast.
        df_outliers[col] = df_outliers[col].astype(float)
        df_outliers.loc[outlier_rows, col] = rng.choice(
            [lower_outliers, upper_outliers], size=n_outliers
        )

    return df_outliers

# Contaminate 5% of the rows of the processed dataset with artificial outliers.
data_5pct_outliers = add_artificial_outliers(
    df=data_procesada,
    columns=columnas,
    fraction=0.05,
    factor=2,
)


In [26]:
# Rebuild features and target from the artificially contaminated dataset.
y = data_5pct_outliers['EngagementLevel']
X = data_5pct_outliers.drop('EngagementLevel', axis=1)

# Variant that keeps a 5% sample of the rows flagged as outliers
X_5pct_outliers, columnas = remove_outliers_iqr(df=X, percentage=0.05)
y_5pct_outliers = y.loc[X_5pct_outliers.index]

# Variant with every flagged row removed
X_no_outliers, columnas = remove_outliers_iqr(df=X)
y_no_outliers = y.loc[X_no_outliers.index]

tuple(frame.shape for frame in (X, X_no_outliers, X_5pct_outliers))

Columnas continuas seleccionadas: ['Age', 'PlayTimeHours', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']
Columnas continuas seleccionadas: ['Age', 'PlayTimeHours', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']


((40034, 21), (38033, 21), (38133, 21))

## Separar datos

In [27]:
# Split of the variant that keeps some outliers:
# categorical encoding applied; not balanced, not scaled.
X_train, X_test, y_train, y_test = train_test_split(
    X_5pct_outliers,
    y_5pct_outliers,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Split of the variant without outliers:
# categorical encoding applied; not balanced, not scaled.
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_no_outliers,
    y_no_outliers,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Shapes of the four feature matrices produced by the two splits.
tuple(part.shape for part in (X_test, X_train, X_test_clean, X_train_clean))

((7627, 21), (30506, 21), (7607, 21), (30426, 21))

## Balanceo de Clases

### Check desbalanceo

In [30]:
# Class distribution of the training split that keeps some outliers.
# Label codes: 0 = High, 1 = Low, 2 = Medium_High, 3 = Medium_Low
y_train.value_counts(sort=True, ascending=False)

EngagementLevel
1    7892
0    7863
2    7379
3    7372
Name: count, dtype: int64

In [31]:
# Class distribution of the training split without outliers.
# Label codes: 0 = High, 1 = Low, 2 = Medium_High, 3 = Medium_Low
y_train_clean.value_counts(sort=True, ascending=False)

EngagementLevel
1    7843
0    7794
2    7414
3    7375
Name: count, dtype: int64

### Balancear

In [32]:
def submuestreo_balanceado(X_train, y_train):
    """
    Balance the training set by random undersampling.

    Every class is reduced to the size of the least frequent class, sampling
    with random_state=42.

    Parameters:
    - X_train: DataFrame with the features.
    - y_train: Series or array with the target variable.
    Returns:
    - X_train_under: DataFrame with the balanced features.
    - y_train_under: Series with the balanced labels.
    """
    target = 'EngagementLevel'

    # Attach the target to a copy of the features so both are sampled together
    merged = X_train.assign(**{target: y_train})

    # Size of the rarest class = number of rows drawn from every class
    rows_per_class = merged[target].value_counts().min()
    sampled = merged.groupby(target).sample(n=rows_per_class, random_state=42)

    # Split back into features and labels
    return sampled.drop(columns=[target]), sampled[target]

In [33]:
# No outliers: categorical encoding, balanced, not scaled
X_train_clean_balanced, y_train_clean_balanced = submuestreo_balanceado(
    X_train=X_train_clean,
    y_train=y_train_clean,
)
# With outliers: categorical encoding, balanced, not scaled
X_train_balanced, y_train_balanced = submuestreo_balanceado(
    X_train=X_train,
    y_train=y_train,
)

### Check de balanceo

In [34]:
# Class distribution after balancing the training split without outliers.
# Label codes: 0 = High, 1 = Low, 2 = Medium_High, 3 = Medium_Low
y_train_clean_balanced.value_counts(sort=True, ascending=False)

EngagementLevel
0    7375
1    7375
2    7375
3    7375
Name: count, dtype: int64

In [35]:
# Class distribution after balancing the training split that keeps some outliers.
# Label codes: 0 = High, 1 = Low, 2 = Medium_High, 3 = Medium_Low
y_train_balanced.value_counts(sort=True, ascending=False)

EngagementLevel
0    7372
1    7372
2    7372
3    7372
Name: count, dtype: int64

## Escalado de datos

In [36]:
# Standardisation with the default settings (centre on the mean, scale to unit variance).
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [37]:
# Scale every training variant. The single `scaler` object is re-fitted by each
# fit_transform call, so every transform() uses the statistics of the
# fit_transform immediately above it -- the statement order matters.

# With outliers, not balanced: fit on X_train, apply to X_test.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Without outliers, not balanced: fit on X_train_clean, apply to X_test_clean.
X_train_clean_scaled = scaler.fit_transform(X_train_clean)
X_test_clean_scaled = scaler.transform(X_test_clean)

# With outliers, balanced: only the training set is scaled here.
# NOTE(review): X_test_scaled keeps the statistics of the unbalanced X_train,
# so pairing it with X_train_balanced_scaled mixes two different scalings.
# Re-enabling the line below would overwrite the X_test_scaled computed above.
X_train_balanced_scaled = scaler.fit_transform(X_train_balanced)
#X_test_scaled = scaler.transform(X_test)

# Without outliers, balanced: only the training set is scaled here.
# NOTE(review): same caveat as above for X_test_clean_scaled.
# After this cell `scaler` holds the fit on X_train_clean_balanced only.
X_train_clean_balanced_scaled = scaler.fit_transform(X_train_clean_balanced)
#X_test_clean_scaled = scaler.transform(X_test_clean)

## v3 = X_train, X_test, y_train, y_test - CC y SI outliers y NO balanceados y NO ED
## v1 = X_train_clean, X_test_clean, y_train_clean, y_test_clean - CC y NO outliers y NO balanceados y NO ED
## v4 = X_train_balanced, X_test, y_train_balanced, y_test - CC y SI outliers y balanceados y NO ED
## v2 = X_train_clean_balanced, X_test_clean, y_train_clean_balanced, y_test_clean - CC y NO outliers y balanceados y NO ED

## v7 = X_train_scaled, X_test_scaled, y_train, y_test - CC y SI outliers y NO balanceados y ED
## v5 = X_train_clean_scaled, X_test_clean_scaled, y_train_clean, y_test_clean - CC y NO outliers y NO balanceados y ED
## v8 = X_train_balanced_scaled, X_test_scaled, y_train_balanced, y_test - CC y SI outliers y balanceados y ED
## v6 = X_train_clean_balanced_scaled, X_test_clean_scaled, y_train_clean_balanced, y_test_clean - CC y NO outliers y balanceados y ED

In [38]:
"""datasets = {
    "v3": {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test},
    "v1": {"X_train": X_train_clean, "X_test": X_test_clean, "y_train": y_train_clean, "y_test": y_test_clean},
    "v4": {"X_train": X_train_balanced, "X_test": X_test, "y_train": y_train, "y_test": y_test}, -----malo
    "v2": {"X_train": X_train_clean_balanced, "X_test": X_test_clean, "y_train": y_train_clean, "y_test": y_test_clean}, -----malo
    "v7": {"X_train": X_train_scaled, "X_test": X_test_scaled, "y_train": y_train, "y_test": y_test},
    "v5": {"X_train": X_train_clean_scaled, "X_test": X_test_clean_scaled, "y_train": y_train_clean, "y_test": y_test_clean},
    "v8": {"X_train": X_train_balanced_scaled, "X_test": X_test_scaled, "y_train": y_train, "y_test": y_test}, -----malo
    "v6": {"X_train": X_train_clean_balanced_scaled, "X_test": X_test_clean_scaled, "y_train": y_train_clean, "y_test": y_test_clean}, -----malo
}

for version, data in datasets.items():
    with open(f"pickle/dataset_{version}.pkl", "wb") as f:
        pickle.dump(data, f)
print("Todos los conjuntos de datos fueron exportados correctamente.")"""


datasets = {
    "v2": {"X_train": X_train_clean_balanced, "X_test": X_test_clean, "y_train": y_train_clean_balanced, "y_test": y_test_clean},
    "v4": {"X_train": X_train_balanced, "X_test": X_test, "y_train": y_train_balanced, "y_test": y_test},
    "v8": {"X_train": X_train_balanced_scaled, "X_test": X_test_scaled, "y_train": y_train_balanced, "y_test": y_test},
    "v6": {"X_train": X_train_clean_balanced_scaled, "X_test": X_test_clean_scaled, "y_train": y_train_clean_balanced, "y_test": y_test_clean},
}

for version, data in datasets.items():
    with open(f"pickle/dataset_{version}.pkl", "wb") as f:
        pickle.dump(data, f)
print("Todos los conjuntos de datos fueron exportados correctamente.")


Todos los conjuntos de datos fueron exportados correctamente.
