In [1]:
# 1) Ubicar data/raw + helper
from pathlib import Path
import pandas as pd
import numpy as np

RAW = (Path.cwd().parent / "data" / "raw")
if not RAW.exists():
    base = Path.cwd()
    for c in [base] + list(base.parents):
        d = c / "data" / "raw"
        if d.exists():
            RAW = d
            break


# 2) Cargar y mostrar columnas 
def lee(p):
    try:
        return pd.read_csv(p, encoding="utf-8", low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(p, encoding="latin1", low_memory=False)

twitch_game       = lee(RAW / "Twitch_game_data.csv")
twitch_global     = lee(RAW / "Twitch_global_data.csv")
general_esport    = lee(RAW / "GeneralEsportData.csv")
historical_esport = lee(RAW / "HistoricalEsportData.csv")

for nombre, df in {
    "Twitch_game_data": twitch_game,
    "Twitch_global_data": twitch_global,
    "GeneralEsportData": general_esport,
    "HistoricalEsportData": historical_esport,
}.items():
    print(f"\n{nombre} ({df.shape[1]} columnas):")
    print(list(df.columns))




Twitch_game_data (12 columnas):
['Rank', 'Game', 'Month', 'Year', 'Hours_watched', 'Hours_streamed', 'Peak_viewers', 'Peak_channels', 'Streamers', 'Avg_viewers', 'Avg_channels', 'Avg_viewer_ratio']

Twitch_global_data (9 columnas):
['year', 'Month', 'Hours_watched', 'Avg_viewers', 'Peak_viewers', 'Streams', 'Avg_channels', 'Games_streamed', 'Viewer_ratio']

GeneralEsportData (8 columnas):
['Game', 'ReleaseDate', 'Genre', 'TotalEarnings', 'OfflineEarnings', 'PercentOffline', 'TotalPlayers', 'TotalTournaments']

HistoricalEsportData (5 columnas):
['Date', 'Game', 'Earnings', 'Players', 'Tournaments']


In [2]:
# 1) Función para estandarizar nombres: minúsculas + snake_case
def limpiar_columnas(df):
    df = df.copy()
    df.columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_")
          .str.replace("-", "_")
    )
    return df


In [3]:
twitch_game_clean       = limpiar_columnas(twitch_game)
twitch_global_clean     = limpiar_columnas(twitch_global)
general_esport_clean    = limpiar_columnas(general_esport)
historical_esport_clean = limpiar_columnas(historical_esport)

datasets_limpios = {
    "twitch_game_clean": twitch_game_clean,
    "twitch_global_clean": twitch_global_clean,
    "general_esport_clean": general_esport_clean,
    "historical_esport_clean": historical_esport_clean,
}

In [4]:
for nombre, df in datasets_limpios.items():
	print(f"\n{nombre} ({df.shape[1]} columnas):")
	print(list(df.columns))


twitch_game_clean (12 columnas):
['rank', 'game', 'month', 'year', 'hours_watched', 'hours_streamed', 'peak_viewers', 'peak_channels', 'streamers', 'avg_viewers', 'avg_channels', 'avg_viewer_ratio']

twitch_global_clean (9 columnas):
['year', 'month', 'hours_watched', 'avg_viewers', 'peak_viewers', 'streams', 'avg_channels', 'games_streamed', 'viewer_ratio']

general_esport_clean (8 columnas):
['game', 'releasedate', 'genre', 'totalearnings', 'offlineearnings', 'percentoffline', 'totalplayers', 'totaltournaments']

historical_esport_clean (5 columnas):
['date', 'game', 'earnings', 'players', 'tournaments']


In [5]:
# %% Unificar columnas clave (una sola vez)
twitch_game_clean = twitch_game_clean.rename(columns={"game": "game", "year": "year"})
twitch_global_clean = twitch_global_clean.rename(columns={"year": "year"})

general_esport_clean = general_esport_clean.rename(columns={
    "releasedate": "year",
    "totalearnings": "earnings_total",
    "offlineearnings": "earnings_offline",
    "percentoffline": "percent_offline",
    "totalplayers": "players_total",
    "totaltournaments": "tournaments_total"
})

historical_esport_clean = historical_esport_clean.rename(columns={
    "date": "year",
    "earnings": "earnings_total",
    "players": "players_total",
    "tournaments": "tournaments_total"
})


In [6]:
# %% FIX-01: corregir 'year' en datasets de Twitch desde los dataframes originales
twitch_game_clean["year"]   = pd.to_numeric(twitch_game["Year"], errors="coerce").astype("Int64")
twitch_global_clean["year"] = pd.to_numeric(twitch_global["year"], errors="coerce").astype("Int64")

print("Años únicos Twitch (game):", twitch_game_clean["year"].unique()[:10])
print("Años únicos Twitch (global):", twitch_global_clean["year"].unique()[:10])


Años únicos Twitch (game): <IntegerArray>
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Length: 9, dtype: Int64
Años únicos Twitch (global): <IntegerArray>
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Length: 9, dtype: Int64


In [7]:
# %% Tipos numéricos en Esports
general_esport_clean["year"] = pd.to_numeric(general_esport_clean["year"], errors="coerce").astype("Int64")
for col in ["earnings_total", "earnings_offline", "players_total", "tournaments_total"]:
    general_esport_clean[col] = pd.to_numeric(general_esport_clean[col], errors="coerce")

historical_esport_clean["year"] = pd.to_numeric(historical_esport_clean["year"], errors="coerce").astype("Int64")
for col in ["earnings_total", "players_total", "tournaments_total"]:
    historical_esport_clean[col] = pd.to_numeric(historical_esport_clean[col], errors="coerce")


In [8]:
# %% POP: agregación mensual → anual por juego
# asegurar numéricos básicos por si viene algo como texto
for c in ["hours_watched","hours_streamed","avg_viewers","peak_viewers","avg_channels","streamers"]:
    if c in twitch_game_clean.columns:
        twitch_game_clean[c] = pd.to_numeric(twitch_game_clean[c], errors="coerce")

# limitar a rango razonable de años (opcional)
twitch_game_clean = twitch_game_clean[
    (twitch_game_clean["year"] >= 2016) & (twitch_game_clean["year"] <= 2024)
].copy()

pop = (
    twitch_game_clean
    .groupby(["game","year"], as_index=False)
    .agg({
        "hours_watched":  "sum",
        "hours_streamed": "sum",
        "avg_viewers":    "mean",
        "peak_viewers":   "max"
    })
)

print("pop ->", pop.shape)
pop.head(10)

pop -> (4718, 6)


Unnamed: 0,game,year,hours_watched,hours_streamed,avg_viewers,peak_viewers
0,.hack//G.U. Last Recode,2017,145350,35258,202.0,1222
1,20 Minutes Till Dawn,2022,911356,12253,1267.0,29743
2,2XKO,2024,2137305,31942,2876.0,60961
3,60 Parsecs!,2018,529688,1867,736.0,31960
4,60 Seconds!,2016,1151013,3165,517.666667,56904
5,60 Seconds!,2017,623520,1932,282.0,31311
6,60 Seconds!,2018,222209,871,299.0,18245
7,60 Seconds!,2019,837363,3240,563.0,41049
8,60 Seconds! Reatomized,2023,756797,2576,1018.0,57965
9,60 Seconds! Reatomized,2024,4975406,16843,1689.5,142643


In [9]:
# %% Premios anuales por juego y metadatos por juego
earnings_yearly = historical_esport_clean.loc[
    :, ["game","year","earnings_total","players_total","tournaments_total"]
].copy()

meta_game = (
    general_esport_clean.loc[
        :, ["game","genre","year","earnings_total","earnings_offline",
            "percent_offline","players_total","tournaments_total"]
    ]
    .rename(columns={
        "year": "release_year",
        "earnings_total": "earnings_total_lifetime",
        "earnings_offline": "earnings_offline_lifetime",
        "players_total": "players_total_lifetime",
        "tournaments_total": "tournaments_total_lifetime"
    })
    .copy()
)

for name, df in {"pop": pop, "earnings_yearly": earnings_yearly, "meta_game": meta_game}.items():
    print(f"{name}: {df.shape}")


pop: (4718, 6)
earnings_yearly: (10239, 5)
meta_game: (669, 8)


In [10]:
# %% MERGE: unir popularidad (Twitch) con premios (Esports) por juego y año
panel = pop.merge(earnings_yearly, on=["game","year"], how="inner")
print("panel (pop + earnings):", panel.shape)
panel.head(10)


panel (pop + earnings): (0, 9)


Unnamed: 0,game,year,hours_watched,hours_streamed,avg_viewers,peak_viewers,earnings_total,players_total,tournaments_total


In [11]:
# %% QA: comprobar si hay valores nulos o duplicados
print("Nulos por columna en panel:")
print(panel.isna().sum().sort_values(ascending=False))
print("\nDuplicados game-year:", panel.duplicated(subset=["game","year"]).sum())


Nulos por columna en panel:
game                 0
year                 0
hours_watched        0
hours_streamed       0
avg_viewers          0
peak_viewers         0
earnings_total       0
players_total        0
tournaments_total    0
dtype: int64

Duplicados game-year: 0


In [12]:
# %% Top 10 juegos más vistos y con mayores premios
ranking = (
    panel.groupby("game", as_index=False)
    .agg({"hours_watched": "sum", "earnings_total": "sum"})
)

print("🎮 Top 10 por horas vistas:")
display(ranking.sort_values("hours_watched", ascending=False).head(10))

print("\n🏆 Top 10 por premios totales:")
display(ranking.sort_values("earnings_total", ascending=False).head(10))


🎮 Top 10 por horas vistas:


Unnamed: 0,game,hours_watched,earnings_total



🏆 Top 10 por premios totales:


Unnamed: 0,game,hours_watched,earnings_total
