In [13]:
# %% 1) Ubicar data/raw + helper de lectura
from pathlib import Path
import pandas as pd
import numpy as np

RAW = (Path.cwd().parent / "data" / "raw")
if not RAW.exists():
    base = Path.cwd()
    for c in [base] + list(base.parents):
        d = c / "data" / "raw"
        if d.exists():
            RAW = d
            break

def lee(p):
    try:
        return pd.read_csv(p, encoding="utf-8", low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(p, encoding="latin1", low_memory=False)

twitch_game       = lee(RAW / "Twitch_game_data.csv")
twitch_global     = lee(RAW / "Twitch_global_data.csv")
general_esport    = lee(RAW / "GeneralEsportData.csv")
historical_esport = lee(RAW / "HistoricalEsportData.csv")

print("CSV cargados desde:", RAW.resolve())


CSV cargados desde: C:\Users\cubea\OneDrive\Escritorio\project_esports_occidente\data\raw


In [14]:
# %% 2) Función para estandarizar nombres: minúsculas + snake_case
def limpiar_columnas(df):
    df = df.copy()
    df.columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_", regex=False)
          .str.replace("-", "_", regex=False)
    )
    return df

twitch_game_clean       = limpiar_columnas(twitch_game)
twitch_global_clean     = limpiar_columnas(twitch_global)
general_esport_clean    = limpiar_columnas(general_esport)
historical_esport_clean = limpiar_columnas(historical_esport)


In [15]:
# %% 3) Unificar columnas clave (una sola vez)
twitch_game_clean = twitch_game_clean.rename(columns={"game": "game", "year": "year"})
twitch_global_clean = twitch_global_clean.rename(columns={"year": "year"})

general_esport_clean = general_esport_clean.rename(columns={
    "releasedate": "year",
    "totalearnings": "earnings_total",
    "offlineearnings": "earnings_offline",
    "percentoffline": "percent_offline",
    "totalplayers": "players_total",
    "totaltournaments": "tournaments_total"
})

historical_esport_clean = historical_esport_clean.rename(columns={
    "date": "year",
    "earnings": "earnings_total",
    "players": "players_total",
    "tournaments": "tournaments_total"
})


In [16]:
# %% 4) FIX-01: corregir 'year' en Twitch desde el CSV original (evitar 1970)
twitch_game_clean["year"]   = pd.to_numeric(twitch_game["Year"], errors="coerce").astype("Int64")
twitch_global_clean["year"] = pd.to_numeric(twitch_global["year"], errors="coerce").astype("Int64")

print("Años únicos Twitch (game):", sorted(twitch_game_clean["year"].dropna().unique()))


Años únicos Twitch (game): [np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]


In [17]:
# %% 5) FIX-02 + Tipos numéricos en Esports (general e histórico)
# Arreglar 'year' en histórico: extraer año real desde la columna original 'Date' si hace falta
# (esto soluciona el problema de años vacíos en earnings_yearly)

# Si 'year' quedó vacío o no-numérico, usa la columna original del CSV
if historical_esport_clean["year"].isna().all():
    historical_esport_clean["year"] = pd.to_datetime(
        historical_esport["Date"], errors="coerce"
    ).dt.year.astype("Int64")
else:
    # Mezcla robusta: intentar fecha y/o numérico
    y_dt  = pd.to_datetime(historical_esport_clean["year"], errors="coerce").dt.year
    y_num = pd.to_numeric(historical_esport_clean["year"], errors="coerce")
    historical_esport_clean["year"] = y_dt.fillna(y_num).astype("Int64")

# General (tipos numéricos)
general_esport_clean["year"] = pd.to_numeric(general_esport_clean["year"], errors="coerce").astype("Int64")
for col in ["earnings_total", "earnings_offline", "players_total", "tournaments_total"]:
    general_esport_clean[col] = pd.to_numeric(general_esport_clean[col], errors="coerce")

# Histórico (tipos numéricos)
for col in ["earnings_total", "players_total", "tournaments_total"]:
    historical_esport_clean[col] = pd.to_numeric(historical_esport_clean[col], errors="coerce")



In [18]:
# %% 6) POP: agregación mensual → anual por juego (popularidad)
# Asegurar numéricos
for c in ["hours_watched","hours_streamed","avg_viewers","peak_viewers","avg_channels","streamers"]:
    if c in twitch_game_clean.columns:
        twitch_game_clean[c] = pd.to_numeric(twitch_game_clean[c], errors="coerce")

# Limitar a 2016–2024
twitch_game_clean = twitch_game_clean[
    (twitch_game_clean["year"] >= 2016) & (twitch_game_clean["year"] <= 2024)
].copy()

pop = (
    twitch_game_clean
    .groupby(["game","year"], as_index=False)
    .agg({
        "hours_watched":  "sum",
        "hours_streamed": "sum",
        "avg_viewers":    "mean",
        "peak_viewers":   "max"
    })
)
print("pop ->", pop.shape)

pop -> (4718, 6)


In [19]:
# %% 7) Premios anuales por juego (earnings_yearly) y metadatos por juego (meta_game)
earnings_yearly = historical_esport_clean.loc[
    :, ["game","year","earnings_total","players_total","tournaments_total"]
].copy()

meta_game = (
    general_esport_clean.loc[
        :, ["game","genre","year","earnings_total","earnings_offline",
            "percent_offline","players_total","tournaments_total"]
    ]
    .rename(columns={
        "year": "release_year",
        "earnings_total": "earnings_total_lifetime",
        "earnings_offline": "earnings_offline_lifetime",
        "players_total": "players_total_lifetime",
        "tournaments_total": "tournaments_total_lifetime"
    })
    .copy()
)
print("earnings_yearly ->", earnings_yearly.shape, " | meta_game ->", meta_game.shape)


earnings_yearly -> (10239, 5)  | meta_game -> (669, 8)


In [20]:
# %% FIX-01: normalizar nombres de juego y reintentar merge

def normaliza_game(s):
    s = s.astype(str).str.lower().str.strip()
    s = s.str.replace(r"[®™]", "", regex=True)
    s = s.str.replace(r"[\.:,_\-\(\)\[\]\{\}!?'\"/\\]+", " ", regex=True)  # quita puntuación común
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()                 # colapsa espacios
    return s

pop_norm  = pop.copy()
earn_norm = earnings_yearly.copy()

pop_norm["game_norm"]  = normaliza_game(pop_norm["game"])
earn_norm["game_norm"] = normaliza_game(earn_norm["game"])

# (Opcional) pequeño mapeo manual de alias frecuentes → puedes ampliar si hace falta
alias = {
    "cs go": "counter strike global offensive",
    "csgo": "counter strike global offensive",
    "counter strike: global offensive": "counter strike global offensive",
    "counter strike 2": "counter strike global offensive",  # si prefieres unir CS2 con CS:GO
    "rainbow six siege": "tom clancy s rainbow six siege",
}
pop_norm["game_norm"]  = pop_norm["game_norm"].replace(alias)
earn_norm["game_norm"] = earn_norm["game_norm"].replace(alias)

# Re-merge usando game_norm + year
panel = pop_norm.merge(
    earn_norm,
    on=["game_norm", "year"],
    how="inner",
    suffixes=("_pop", "_comp")
)

# Conservamos un nombre legible de juego (el de Twitch, por ejemplo)
if "game_pop" in panel.columns:
    panel["game"] = panel["game_pop"]
else:
    panel["game"] = panel["game_norm"]

# Selecciona columnas finales útiles
cols_finales = [
    "game", "year",
    "hours_watched", "hours_streamed", "avg_viewers", "peak_viewers",
    "earnings_total", "players_total", "tournaments_total"
]
panel = panel[[c for c in cols_finales if c in panel.columns]].copy()

print("panel (tras normalizar) ->", panel.shape)
panel.head(10)


panel (tras normalizar) -> (3543, 9)


Unnamed: 0,game,year,hours_watched,hours_streamed,avg_viewers,peak_viewers,earnings_total,players_total,tournaments_total
0,ARMS,2017,468819,9200,652.0,40901,1195.0,17,5
1,ARMS,2017,468819,9200,652.0,40901,91.0,5,2
2,ARMS,2017,468819,9200,652.0,40901,50.0,4,3
3,ARMS,2017,468819,9200,652.0,40901,261.0,3,2
4,ARMS,2017,468819,9200,652.0,40901,695.0,9,4
5,ARMS,2017,468819,9200,652.0,40901,1100.1,9,3
6,Age of Empires,2016,248884,232,334.0,107455,5148.0,23,4
7,Age of Empires,2016,248884,232,334.0,107455,2684.0,24,3
8,Age of Empires,2016,248884,232,334.0,107455,32409.0,43,8
9,Age of Empires,2016,248884,232,334.0,107455,817.0,3,1


In [21]:
# %% 8) Normalizar nombres de juego para mejorar el emparejamiento
import re

def normaliza_game(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.lower().str.strip()
    s = s.str.replace(r"[®™]", "", regex=True)
    s = s.str.replace(r"[\.:,_\-\(\)\[\]\{\}!?'\"/\\]+", " ", regex=True)  # quitar puntuación
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s

pop["game_norm"]  = normaliza_game(pop["game"])
earnings_yearly["game_norm"] = normaliza_game(earnings_yearly["game"])

# Alias opcionales (añade más si lo ves en tu diagnóstico)
alias = {
    "cs go": "counter strike global offensive",
    "csgo": "counter strike global offensive",
    "counter strike: global offensive": "counter strike global offensive",
    "counter strike 2": "counter strike global offensive",
    "rainbow six siege": "tom clancy s rainbow six siege",
}
pop["game_norm"]  = pop["game_norm"].replace(alias)
earnings_yearly["game_norm"] = earnings_yearly["game_norm"].replace(alias)


In [22]:
# %% 9) MERGE: unir popularidad (Twitch) con premios (Esports) por juego y año (usando game_norm)
panel = pop.merge(
    earnings_yearly,
    on=["game_norm","year"],
    how="inner",
    suffixes=("_pop","_comp")
)

# Conservar nombre legible de juego (del lado Twitch)
panel["game"] = panel["game_pop"] if "game_pop" in panel.columns else panel["game_norm"]

# Seleccionar columnas finales
cols_finales = [
    "game", "year",
    "hours_watched", "hours_streamed", "avg_viewers", "peak_viewers",
    "earnings_total", "players_total", "tournaments_total"
]
panel = panel[[c for c in cols_finales if c in panel.columns]].copy()

print("panel (pop + earnings) ->", panel.shape)
panel.head(10)


panel (pop + earnings) -> (3543, 9)


Unnamed: 0,game,year,hours_watched,hours_streamed,avg_viewers,peak_viewers,earnings_total,players_total,tournaments_total
0,ARMS,2017,468819,9200,652.0,40901,1195.0,17,5
1,ARMS,2017,468819,9200,652.0,40901,91.0,5,2
2,ARMS,2017,468819,9200,652.0,40901,50.0,4,3
3,ARMS,2017,468819,9200,652.0,40901,261.0,3,2
4,ARMS,2017,468819,9200,652.0,40901,695.0,9,4
5,ARMS,2017,468819,9200,652.0,40901,1100.1,9,3
6,Age of Empires,2016,248884,232,334.0,107455,5148.0,23,4
7,Age of Empires,2016,248884,232,334.0,107455,2684.0,24,3
8,Age of Empires,2016,248884,232,334.0,107455,32409.0,43,8
9,Age of Empires,2016,248884,232,334.0,107455,817.0,3,1


In [23]:
# %% 10) Top 10 juegos más vistos y con mayores premios (una sola vez)
ranking = (
    panel.groupby("game", as_index=False)
    .agg({"hours_watched": "sum", "earnings_total": "sum"})
)

print("🎮 Top 10 por horas vistas:")
display(ranking.sort_values("hours_watched", ascending=False).head(10))

print("\n🏆 Top 10 por premios totales:")
display(ranking.sort_values("earnings_total", ascending=False).head(10))


🎮 Top 10 por horas vistas:


Unnamed: 0,game,hours_watched,earnings_total
97,League of Legends,131105243041,86676190.0
71,Fortnite,70014424381,191012700.0
49,Dota 2,53198939276,304658000.0
42,Counter-Strike: Global Offensive,53101311544,158704000.0
186,VALORANT,49546008476,32018220.0
115,Minecraft,29427402799,3008059.0
7,Apex Legends,28343745368,26153240.0
90,Hearthstone,28100356534,27211530.0
35,Call of Duty: Warzone,25836170744,12904810.0
191,World of Warcraft,16584771587,6232200.0



🏆 Top 10 por premios totales:


Unnamed: 0,game,hours_watched,earnings_total
49,Dota 2,53198939276,304658000.0
71,Fortnite,70014424381,191012700.0
42,Counter-Strike: Global Offensive,53101311544,158704000.0
97,League of Legends,131105243041,86676190.0
178,Tom Clancy's Rainbow Six Siege,11129580448,48130530.0
143,Rocket League,9658901666,42406000.0
132,Overwatch,15765447961,35255910.0
186,VALORANT,49546008476,32018220.0
90,Hearthstone,28100356534,27211530.0
7,Apex Legends,28343745368,26153240.0
