In [2]:
req = """numpy==2.0.0
pandas==2.2.2
scikit-learn==1.6.1
seaborn==0.13.2
"""
with open("requirements.txt", "w") as f:
    f.write(req)

!pip -q install -r requirements.txt

# Verificar
import numpy, pandas, sklearn, seaborn
print("numpy:", numpy.__version__)
print("pandas:", pandas.__version__)
print("scikit-learn:", sklearn.__version__)
print("seaborn:", seaborn.__version__)


numpy: 2.0.0
pandas: 2.2.2
scikit-learn: 1.6.1
seaborn: 0.13.2


# **Revisión de Dataset**

In [30]:
import pandas as pd

In [31]:
PATH = "matchData.csv"
df = pd.read_csv(PATH, low_memory=False)

In [32]:
print("Shape (filas, columnas):", df.shape)
print("\nPrimeras 5 filas (head):")
print(df.head(5))

Shape (filas, columnas): (101843, 1770)

Primeras 5 filas (head):
   dataVersion         matchId endOfGameResult   gameCreation  gameDuration  \
0            2  NA1_5348438296    GameComplete  1755372956560          1682   
1            2  NA1_5348419072    GameComplete  1755370753217          1759   
2            2  NA1_5345908214    GameComplete  1755055472135          2491   
3            2  NA1_5341292117    GameComplete  1754532281603          1973   
4            2  NA1_5341241370    GameComplete  1754529263025          2376   

   gameEndTimestamp      gameId gameMode                      gameName  \
0     1755374669877  5348438296  CLASSIC  teambuilder-match-5348438296   
1     1755372544375  5348419072  CLASSIC  teambuilder-match-5348419072   
2     1755057984148  5345908214  CLASSIC  teambuilder-match-5345908214   
3     1754534283418  5341292117  CLASSIC  teambuilder-match-5341292117   
4     1754531661796  5341241370  CLASSIC  teambuilder-match-5341241370   

       gameTyp

In [33]:
print("\nColumnas (primeras 30):")
print(list(df.columns[:30]))


Columnas (primeras 30):
['dataVersion', 'matchId', 'endOfGameResult', 'gameCreation', 'gameDuration', 'gameEndTimestamp', 'gameId', 'gameMode', 'gameName', 'gameType', 'gameVersion', 'mapId', 'participant0PlayerScore0', 'participant0PlayerScore1', 'participant0PlayerScore10', 'participant0PlayerScore11', 'participant0PlayerScore2', 'participant0PlayerScore3', 'participant0PlayerScore4', 'participant0PlayerScore5', 'participant0PlayerScore6', 'participant0PlayerScore7', 'participant0PlayerScore8', 'participant0PlayerScore9', 'participant0AllInPings', 'participant0AssistMePings', 'participant0Assists', 'participant0BaronKills', 'participant0BasicPings', 'participant0ChampExperience']


In [34]:
print("\nTipos de datos (primeras 30):")
print(df.dtypes.head(30))


Tipos de datos (primeras 30):
dataVersion                     int64
matchId                        object
endOfGameResult                object
gameCreation                    int64
gameDuration                    int64
gameEndTimestamp                int64
gameId                          int64
gameMode                       object
gameName                       object
gameType                       object
gameVersion                    object
mapId                           int64
participant0PlayerScore0        int64
participant0PlayerScore1        int64
participant0PlayerScore10       int64
participant0PlayerScore11       int64
participant0PlayerScore2        int64
participant0PlayerScore3        int64
participant0PlayerScore4        int64
participant0PlayerScore5        int64
participant0PlayerScore6        int64
participant0PlayerScore7        int64
participant0PlayerScore8        int64
participant0PlayerScore9        int64
participant0AllInPings          int64
participant0AssistM

In [35]:
dup_rows = df.duplicated().sum()
print(f"\nFilas duplicadas exactas: {dup_rows}")


Filas duplicadas exactas: 0


# **Limpieza**

In [36]:
import numpy as np
import pandas as pd

In [37]:
def normalize_col(c):
    c = str(c).strip()
    c = c.replace(" ", "_").replace("-", "_")
    c = "".join(ch if ch.isalnum() or ch=="_" else "" for ch in c)
    return c.lower()

original_cols = df.columns.tolist()
df.columns = [normalize_col(c) for c in df.columns]

In [38]:
n_rows = len(df)
empty_cols = [c for c in df.columns if df[c].isna().all()]
constant_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
drop_cols = list(dict.fromkeys(empty_cols + constant_cols))  # sin duplicados
df.drop(columns=drop_cols, inplace=True)

In [39]:
before = len(df)
df.drop_duplicates(inplace=True, ignore_index=True)
removed_dups = before - len(df)

In [40]:
for int_col in ["game_duration", "map_id", "data_version"]:
    if int_col in df.columns:
        df[int_col] = pd.to_numeric(df[int_col], errors="coerce").astype("Int64")

In [41]:
def ts_to_datetime(s):
    s = pd.to_numeric(s, errors="coerce")
    if s.dropna().empty:
        return pd.Series(pd.NaT, index=s.index)
    # heurística: si la mediana es demasiado grande, probablemente esté en ms
    med = s.dropna().median()
    if med > 1e12:  # ms
        return pd.to_datetime(s, unit="ms", errors="coerce")
    else:           # s
        return pd.to_datetime(s, unit="s", errors="coerce")

In [42]:
for ts_col in ["game_creation", "game_end_timestamp"]:
    if ts_col in df.columns:
        df[ts_col + "_dt"] = ts_to_datetime(df[ts_col])

In [43]:
obj_cols = df.select_dtypes(include="object").columns
# para no tardar en columnas numéricas enormes, solo aplicamos strip si hay strings
for c in obj_cols:
    # sample para decidir si merece strip
    sample = df[c].dropna().astype(str).head(100)
    if not sample.empty and any((s != s.strip()) for s in sample):
        df[c] = df[c].astype(str).str.strip()

In [44]:
report = {
    "filas_finales": len(df),
    "columnas_finales": df.shape[1],
    "duplicados_eliminados": int(removed_dups),
    "columnas_eliminadas_total": len(drop_cols),
    "columnas_eliminadas_lista": drop_cols[:20],  # recorte para el print
    "renombres_aplicados_ejemplo": dict(zip(original_cols[:10], df.columns[:10])),
}
print("Resumen de la simulación de limpieza:")
for k,v in report.items():
    print(f"- {k}: {v}")

Resumen de la simulación de limpieza:
- filas_finales: 101843
- columnas_finales: 1436
- duplicados_eliminados: 0
- columnas_eliminadas_total: 334
- columnas_eliminadas_lista: ['participant0summonername', 'participant1summonername', 'participant2summonername', 'participant3summonername', 'participant4summonername', 'participant5summonername', 'participant6summonername', 'participant7summonername', 'participant8summonername', 'participant9summonername', 'dataversion', 'gamemode', 'gametype', 'mapid', 'participant0playerscore0', 'participant0playerscore1', 'participant0playerscore10', 'participant0playerscore11', 'participant0playerscore2', 'participant0playerscore3']
- renombres_aplicados_ejemplo: {'dataVersion': 'matchid', 'matchId': 'endofgameresult', 'endOfGameResult': 'gamecreation', 'gameCreation': 'gameduration', 'gameDuration': 'gameendtimestamp', 'gameEndTimestamp': 'gameid', 'gameId': 'gamename', 'gameMode': 'gameversion', 'gameName': 'participant0allinpings', 'gameType': 'part

In [45]:
OUT = "matchData_clean.csv"
df.to_csv(OUT, index=False)
print(f"\nArchivo guardado: {OUT}")


Archivo guardado: matchData_clean.csv


In [46]:
print("\nHead del archivo limpio (5 filas):")
print(df.head(5))


Head del archivo limpio (5 filas):
          matchid endofgameresult   gamecreation  gameduration  \
0  NA1_5348438296    GameComplete  1755372956560          1682   
1  NA1_5348419072    GameComplete  1755370753217          1759   
2  NA1_5345908214    GameComplete  1755055472135          2491   
3  NA1_5341292117    GameComplete  1754532281603          1973   
4  NA1_5341241370    GameComplete  1754529263025          2376   

   gameendtimestamp      gameid                      gamename     gameversion  \
0     1755374669877  5348438296  teambuilder-match-5348438296  15.16.704.6097   
1     1755372544375  5348419072  teambuilder-match-5348419072  15.16.704.6097   
2     1755057984148  5345908214  teambuilder-match-5345908214  15.15.701.6241   
3     1754534283418  5341292117  teambuilder-match-5341292117  15.15.701.6241   
4     1754531661796  5341241370  teambuilder-match-5341241370  15.15.701.6241   

   participant0allinpings  participant0assistmepings  ...  team1dragonfirst  \
0

In [47]:
print("\nTipos de datos (primeras 30) del archivo limpio:")
print(df.dtypes.head(30))


Tipos de datos (primeras 30) del archivo limpio:
matchid                                object
endofgameresult                        object
gamecreation                            int64
gameduration                            int64
gameendtimestamp                        int64
gameid                                  int64
gamename                               object
gameversion                            object
participant0allinpings                  int64
participant0assistmepings               int64
participant0assists                     int64
participant0baronkills                  int64
participant0champexperience             int64
participant0champlevel                  int64
participant0championid                  int64
participant0championname               object
participant0championtransform           int64
participant0commandpings                int64
participant0consumablespurchased        int64
participant0damagedealttobuildings      int64
participant0damagedealttoobjec