In [None]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Location of the input CSV.
DATA_PATH = "/content/online_gaming_insights.csv"

# Directory that receives the exported split files; created here if missing.
OUT_DIR = "/content/data"
os.makedirs(OUT_DIR, exist_ok=True)

# Columns removed from the frame right after loading.
DROP_COLS = ["PlayerID", "Location"]

# Load the raw table and discard the columns listed in DROP_COLS (if present).
df = pd.read_csv(DATA_PATH)
df = df.drop(columns=DROP_COLS, errors="ignore")

# Accept either the CamelCase or the lower-case spelling of the target columns.
col_eng = "EngagementLevel" if "EngagementLevel" in df.columns else "engagementlevel"
col_buy = "InGamePurchases" if "InGamePurchases" in df.columns else "ingamepurchases"

# Encode the engagement label by its first letter (l -> 1, m -> 2, h -> 3).
# Labels are stripped first so leading whitespace does not turn a valid label
# into NaN; anything that does not start with l/m/h stays NaN.
df["engagementlevel_num"] = (
    df[col_eng].str.strip().str.lower().str[:1].map({"l": 1, "m": 2, "h": 3})
)

# Purchases: coerce to numeric (unparseable -> NaN) and derive a 0/1 flag.
# NaN compares False, so rows with a missing purchase value get igp_bin == 0;
# downstream code is expected to mask on df[col_buy].notna().
df[col_buy] = pd.to_numeric(df[col_buy], errors="coerce")
df["igp_bin"] = (df[col_buy] > 0).astype(int)

# Targets must never be imputed or clipped:
#   * median-filling a target invents labels and makes the later notna()
#     masks on the targets select every row;
#   * IQR-clipping a 0/1 target whose quartiles coincide (e.g. >= 75% zeros)
#     collapses the fences to a single value and erases one class entirely.
target_cols = [col_eng, col_buy, "engagementlevel_num", "igp_bin"]
num_features = [
    c for c in df.select_dtypes(include="number").columns if c not in target_cols
]

# Median-impute numeric features only (fillna with a Series fills just the
# columns named in its index), then drop exact duplicate rows.
df = df.fillna(df[num_features].median()).drop_duplicates()

# Clip numeric features to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for c in num_features:
    q1, q3 = df[c].quantile([0.25, 0.75])
    iqr = q3 - q1
    if not iqr > 0:
        # Zero (or NaN) IQR: the fences would degenerate and flatten the
        # whole column to a constant, so leave the column as it is.
        continue
    df[c] = df[c].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

# Feature matrix: everything except the targets, one-hot encoded, scaled
# (variance only; PCA centers the data itself) and reduced with PCA keeping
# 95% of the explained variance. The original row index is preserved so the
# result can be aligned with df via .loc.
# NOTE(review): the scaler and PCA are fitted on all rows before the
# train/validation/test split, so test rows influence the transformation.
# Consider fitting them on the training partition only.
X = pd.get_dummies(df.drop(columns=target_cols), drop_first=False)
X_scaled = StandardScaler(with_mean=False).fit_transform(X)
X_proc = pd.DataFrame(PCA(0.95, random_state=42).fit_transform(X_scaled), index=X.index)

def sv(Xm, y, prefix, strat=None):
    """Split ``(Xm, y)`` into train/validation/test parts and save them as CSV.

    A 20% test partition is held out first; 20% of what remains then becomes
    the validation partition. Both splits use ``random_state=42``.

    Args:
        Xm: Feature table to split.
        y: Target values aligned with ``Xm``.
        prefix: Suffix used in every output file name.
        strat: Optional labels to stratify the first split on. When given,
            the second split is stratified on the remaining training labels.

    Writes ``TrainX``, ``TrainY``, ``ValidationX``, ``ValidationY``, ``TestX``
    and ``TestY`` as ``{OUT_DIR}/<Part>_<prefix>.csv`` without the index.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        Xm, y, test_size=0.2, random_state=42, stratify=strat
    )

    # Only stratify the inner split when the outer one was stratified.
    inner_strat = None if strat is None else y_rest
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.2, random_state=42, stratify=inner_strat
    )

    parts = {
        "TrainX": X_train,
        "TrainY": y_train,
        "ValidationX": X_val,
        "ValidationY": y_val,
        "TestX": X_test,
        "TestY": y_test,
    }
    for part_name, part in parts.items():
        pd.DataFrame(part).to_csv(f"{OUT_DIR}/{part_name}_{prefix}.csv", index=False)

# Engagement target: keep rows whose encoded label is present, then export a
# stratified classification split and an unstratified regression split.
m = df["engagementlevel_num"].notna()
eng_X = X_proc.loc[m]
eng_y = df.loc[m, "engagementlevel_num"]
sv(eng_X, eng_y.astype(int), "eng_cls", strat=eng_y)
sv(eng_X, eng_y.astype(float), "eng_reg")

# Purchase target: keep rows whose purchase value is present, then export the
# binary flag (stratified) and the numeric value (unstratified).
m = df[col_buy].notna()
igp_X = X_proc.loc[m]
igp_flag = df.loc[m, "igp_bin"]
sv(igp_X, igp_flag, "igp_cls", strat=igp_flag)
sv(igp_X, df.loc[m, col_buy].astype(float), "igp_reg")

print(" Listo. Archivos guardados en:", OUT_DIR)