In [1]:
from pathlib import Path
import pandas as pd


# 1) Encontrar carpeta data/raw 

In [2]:
RAW = (Path.cwd().parent / "data" / "raw")
if not RAW.exists():
    base = Path.cwd()
    for c in [base] + list(base.parents):
        d = c / "data" / "raw"
        if d.exists():
            RAW = d
            break

print(RAW.resolve())       

C:\Users\cubea\OneDrive\Escritorio\project_esports_occidente\data\raw


# 2) Lector

In [3]:
def lee(p):
    try:
        return pd.read_csv(p, encoding="utf-8", low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(p, encoding="latin-1", low_memory=False)

# 3) Carga de ficheros

In [4]:
twitch_game       = lee(RAW / "Twitch_game_data.csv")
twitch_global     = lee(RAW / "Twitch_global_data.csv")
general_esport    = lee(RAW / "GeneralEsportData.csv")
historical_esport = lee(RAW / "HistoricalEsportData.csv")


In [32]:
def vista_previa(df, nombre, n=3):
    print("\n",nombre)
    display(df.head(n))  

vista_previa(twitch_game, "twitch_game")
vista_previa(twitch_global, "twitch_global")
vista_previa(general_esport, "general_esport")
vista_previa(historical_esport, "historical_esport")


 twitch_game


Unnamed: 0,Rank,Game,Month,Year,Hours_watched,Hours_streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewers,Avg_channels,Avg_viewer_ratio
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,1833,69.29
1,2,Counter-Strike: Global Offensive,1,2016,47832863,830105,372654,2197,120849,64378,1117,57.62
2,3,Dota 2,1,2016,45185893,433397,315083,1100,44074,60815,583,104.26



 twitch_global


Unnamed: 0,year,Month,Hours_watched,Avg_viewers,Peak_viewers,Streams,Avg_channels,Games_streamed,Viewer_ratio
0,2016,1,480241904,646355,1275257,7701675,20076,12149,29.08
1,2016,2,441859897,635769,1308032,7038520,20427,12134,28.98
2,2016,3,490669308,660389,1591551,7390957,20271,12234,28.92



 general_esport


Unnamed: 0,Game,ReleaseDate,Genre,TotalEarnings,OfflineEarnings,PercentOffline,TotalPlayers,TotalTournaments
0,Age of Empires,1997,Strategy,736284.75,522378.17,0.709478,624,341
1,Age of Empires II,1999,Strategy,3898508.73,1361409.22,0.349213,2256,1939
2,Age of Empires III,2005,Strategy,122256.72,44472.6,0.363764,172,179



 historical_esport


Unnamed: 0,Date,Game,Earnings,Players,Tournaments
0,1998-01-01,Command & Conquer: Red Alert,15000.0,8,1
1,1998-01-01,QuakeWorld,15000.0,8,1
2,1998-05-01,Quake II,15000.0,4,1


In [6]:
tg = twitch_game.rename(columns={
    "Game":"game","Year":"year","Month":"month",
    "Hours_watched":"hours_watched","Hours_streamed":"hours_streamed",
    "Peak_viewers":"peak_viewers","Avg_viewers":"avg_viewers",
    "Peak_channels":"peak_channels","Avg_channels":"avg_channels",
    "Streamers":"streamers","Avg_viewer_ratio":"avg_viewer_ratio",
    "Rank":"rank"
}).copy()

tg["game"] = tg["game"].astype(str).str.strip()
tg["game_key"] = tg["game"].str.lower().str.strip()
tg["year"] = tg["year"].astype(int)
tg["month"] = tg["month"].astype(int)
tg["year_month"] = pd.to_datetime(tg["year"].astype(str)+"-"+tg["month"].astype(str)+"-01")


print( tg.shape)
display(tg.head(3))



(21000, 14)


Unnamed: 0,rank,game,month,year,hours_watched,hours_streamed,peak_viewers,peak_channels,streamers,avg_viewers,avg_channels,avg_viewer_ratio,game_key,year_month
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,1833,69.29,league of legends,2016-01-01
1,2,Counter-Strike: Global Offensive,1,2016,47832863,830105,372654,2197,120849,64378,1117,57.62,counter-strike: global offensive,2016-01-01
2,3,Dota 2,1,2016,45185893,433397,315083,1100,44074,60815,583,104.26,dota 2,2016-01-01


In [7]:
he = historical_esport.rename(columns={
    "Game":"game","Date":"date",
    "Earnings":"year_prize_usd","Players":"year_players","Tournaments":"year_tournaments"
}).copy()

he["game"] = he["game"].astype(str).str.strip()
he["game_key"] = he["game"].str.lower().str.strip()


he["year"] = pd.to_datetime(he["date"], errors="coerce").dt.year
he = he.dropna(subset=["year"])
he["year"] = he["year"].astype(int)

he["year_prize_usd"]   = pd.to_numeric(he["year_prize_usd"].astype(str).str.replace(",", ""), errors="coerce")
he["year_players"]     = pd.to_numeric(he["year_players"], errors="coerce")
he["year_tournaments"] = pd.to_numeric(he["year_tournaments"], errors="coerce")

print(he.shape)
display(he.head(3))

(10239, 7)


Unnamed: 0,date,game,year_prize_usd,year_players,year_tournaments,game_key,year
0,1998-01-01,Command & Conquer: Red Alert,15000.0,8,1,command & conquer: red alert,1998
1,1998-01-01,QuakeWorld,15000.0,8,1,quakeworld,1998
2,1998-05-01,Quake II,15000.0,4,1,quake ii,1998


In [8]:
ge = general_esport.rename(columns={
    "Game":"game","ReleaseDate":"release_date","Genre":"genre",
    "TotalEarnings":"total_prize_usd","OfflineEarnings":"offline_prize_usd",
    "PercentOffline":"pct_offline_prize","TotalPlayers":"total_players",
    "TotalTournaments":"total_tournaments"
}).copy()

ge["game"] = ge["game"].astype(str).str.strip()
ge["game_key"] = ge["game"].str.lower().str.strip()

for c in ["total_prize_usd","offline_prize_usd","pct_offline_prize","total_players","total_tournaments"]:
    if c in ge.columns:
        ge[c] = pd.to_numeric(ge[c], errors="coerce")

print( ge.shape)
display(ge.head(3))

(669, 9)


Unnamed: 0,game,release_date,genre,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments,game_key
0,Age of Empires,1997,Strategy,736284.75,522378.17,0.709478,624,341,age of empires
1,Age of Empires II,1999,Strategy,3898508.73,1361409.22,0.349213,2256,1939,age of empires ii
2,Age of Empires III,2005,Strategy,122256.72,44472.6,0.363764,172,179,age of empires iii


In [9]:
# 1) Seleccionar solo las columnas necesarias de cada DF para el merge
he_sel = he[["game_key","year","year_prize_usd","year_players","year_tournaments"]].copy()
ge_sel = ge[["game_key","release_date","genre","total_prize_usd","offline_prize_usd",
             "pct_offline_prize","total_players","total_tournaments"]].copy()

# 2) Merge base (mensual por juego) + histórico anual (por juego y año)
master = tg.merge(
    he_sel,
    on=["game_key","year"],
    how="left"
)

# 3) Añadir totales/metadata del juego (por game_key)
master = master.merge(
    ge_sel,
    on="game_key",
    how="left"
)

print("MASTER (sin filtrar):", master.shape)
display(master.head())



MASTER (sin filtrar): (48377, 24)


Unnamed: 0,rank,game,month,year,hours_watched,hours_streamed,peak_viewers,peak_channels,streamers,avg_viewers,...,year_prize_usd,year_players,year_tournaments,release_date,genre,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,...,50025.02,40.0,5.0,2009.0,Multiplayer Online Battle Arena,113596000.0,98546688.28,0.867519,9588.0,3000.0
1,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,...,239015.61,174.0,9.0,2009.0,Multiplayer Online Battle Arena,113596000.0,98546688.28,0.867519,9588.0,3000.0
2,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,...,299620.13,395.0,23.0,2009.0,Multiplayer Online Battle Arena,113596000.0,98546688.28,0.867519,9588.0,3000.0
3,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,...,1459231.04,561.0,24.0,2009.0,Multiplayer Online Battle Arena,113596000.0,98546688.28,0.867519,9588.0,3000.0
4,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,...,517405.11,145.0,11.0,2009.0,Multiplayer Online Battle Arena,113596000.0,98546688.28,0.867519,9588.0,3000.0


In [10]:
master[master['total_prize_usd'].isnull()]

Unnamed: 0,rank,game,month,year,hours_watched,hours_streamed,peak_viewers,peak_channels,streamers,avg_viewers,...,year_prize_usd,year_players,year_tournaments,release_date,genre,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments
63,8,Z1: Battle Royale,1,2016,7894571,205569,41588,460,21396,10625,...,,,,,,,,,,
64,9,Talk Shows & Podcasts,1,2016,7688369,53235,84051,148,10779,10347,...,,,,,,,,,,
85,12,Diablo III,1,2016,6235668,218490,57383,2185,25458,8392,...,,,,,,,,,,
86,13,Destiny,1,2016,5892573,551615,18783,1741,60340,7930,...,,,,,,,,,,
93,16,Tom Clancy's Rainbow Six Siege,1,2016,4866039,242134,25742,694,42244,6549,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48372,196,PokÃ©mon HeartGold/SoulSilver,9,2024,683902,10093,10600,42,1443,951,...,,,,,,,,,,
48373,197,Grounded,9,2024,683226,37779,39473,116,7076,950,...,,,,,,,,,,
48374,198,Ragnarok Online,9,2024,679965,28491,2539,75,1066,945,...,,,,,,,,,,
48375,199,Politics,9,2024,673799,16675,19746,210,957,937,...,,,,,,,,,,


In [11]:
import re
import unicodedata

def normaliza_nombre(col):
    """Convierte nombres de columna a snake_case sin acentos ni espacios raros."""
    col = str(col)
    col = unicodedata.normalize("NFKD", col).encode("ascii", "ignore").decode("ascii")  # elimina acentos
    col = re.sub(r"[^\w\s]", "", col)  # quita símbolos
    col = re.sub(r"\s+", "_", col.strip().lower())  # snake_case
    return col

master.columns = [normaliza_nombre(c) for c in master.columns]

print("✅ Columnas normalizadas:")
print(master.columns.tolist())


✅ Columnas normalizadas:
['rank', 'game', 'month', 'year', 'hours_watched', 'hours_streamed', 'peak_viewers', 'peak_channels', 'streamers', 'avg_viewers', 'avg_channels', 'avg_viewer_ratio', 'game_key', 'year_month', 'year_prize_usd', 'year_players', 'year_tournaments', 'release_date', 'genre', 'total_prize_usd', 'offline_prize_usd', 'pct_offline_prize', 'total_players', 'total_tournaments']


In [12]:
print("\n🔍 Columnas tipo object con ejemplos:")
for col in master.columns:
    if master[col].dtype == "object":
        muestra = master[col].dropna().astype(str).head(5).tolist()
        print(f"- {col}: ejemplo → {muestra}")


🔍 Columnas tipo object con ejemplos:
- game: ejemplo → ['League of Legends', 'League of Legends', 'League of Legends', 'League of Legends', 'League of Legends']
- game_key: ejemplo → ['league of legends', 'league of legends', 'league of legends', 'league of legends', 'league of legends']
- genre: ejemplo → ['Multiplayer Online Battle Arena', 'Multiplayer Online Battle Arena', 'Multiplayer Online Battle Arena', 'Multiplayer Online Battle Arena', 'Multiplayer Online Battle Arena']


In [13]:
for col in master.columns:
    if master[col].dtype == "object":
        try:
            master[col] = pd.to_numeric(master[col]
                                        .astype(str)
                                        .str.replace(",", "")
                                        .str.replace("%", ""),
                                        errors="ignore")
        except Exception as e:
            pass

  master[col] = pd.to_numeric(master[col]
  master[col] = pd.to_numeric(master[col]
  master[col] = pd.to_numeric(master[col]


In [14]:
cols_keep = [
    "game", "game_key", "year", "month", "year_month",
    "hours_watched", "avg_viewers", "peak_viewers", "streamers",
    "year_prize_usd", "year_players", "year_tournaments",
    "genre", "total_prize_usd", "total_players", "total_tournaments"
]

final = master[cols_keep].copy()
print("✅ Dataset reducido a columnas esenciales:", final.shape)
display(final.head())

✅ Dataset reducido a columnas esenciales: (48377, 16)


Unnamed: 0,game,game_key,year,month,year_month,hours_watched,avg_viewers,peak_viewers,streamers,year_prize_usd,year_players,year_tournaments,genre,total_prize_usd,total_players,total_tournaments
0,League of Legends,league of legends,2016,1,2016-01-01,94377226,127021,530270,129172,50025.02,40.0,5.0,Multiplayer Online Battle Arena,113596000.0,9588.0,3000.0
1,League of Legends,league of legends,2016,1,2016-01-01,94377226,127021,530270,129172,239015.61,174.0,9.0,Multiplayer Online Battle Arena,113596000.0,9588.0,3000.0
2,League of Legends,league of legends,2016,1,2016-01-01,94377226,127021,530270,129172,299620.13,395.0,23.0,Multiplayer Online Battle Arena,113596000.0,9588.0,3000.0
3,League of Legends,league of legends,2016,1,2016-01-01,94377226,127021,530270,129172,1459231.04,561.0,24.0,Multiplayer Online Battle Arena,113596000.0,9588.0,3000.0
4,League of Legends,league of legends,2016,1,2016-01-01,94377226,127021,530270,129172,517405.11,145.0,11.0,Multiplayer Online Battle Arena,113596000.0,9588.0,3000.0


In [15]:
# A) Duplicados EXACTOS (todas las columnas idénticas)
print("Duplicados exactos en master:", master.duplicated().sum())

# B) Duplicados por CLAVE (game_key, year, month) en cada DF
print("tg dups por clave:", tg.duplicated(["game_key","year","month"]).sum())
print("he dups por clave:", he.duplicated(["game_key","year"]).sum())
print("ge dups por clave:", ge.duplicated(["game_key"]).sum())
print("master dups por clave:", master.duplicated(["game_key","year","month"]).sum())

# C) Si quieres ver cuáles son (top 10)
dups_master = (master[master.duplicated(["game_key","year","month"], keep=False)]
               .sort_values(["game_key","year","month"]))
display(dups_master.head(20))


Duplicados exactos en master: 467
tg dups por clave: 11
he dups por clave: 8024
ge dups por clave: 1
master dups por clave: 27388


Unnamed: 0,rank,game,month,year,hours_watched,hours_streamed,peak_viewers,peak_channels,streamers,avg_viewers,...,year_prize_usd,year_players,year_tournaments,release_date,genre,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments
345,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,5148.0,23.0,4.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
346,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,2684.0,24.0,3.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
347,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,32409.0,43.0,8.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
348,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,817.0,3.0,1.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
349,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,16425.0,60.0,4.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
350,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,4144.8,12.0,1.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
351,142,Age of Empires,1,2016,248884,232,107455,18,40,334,...,2420.0,16.0,1.0,1997.0,Strategy,736284.75,522378.17,0.709478,624.0,341.0
791,167,Age of Empires II,2,2016,129101,3169,1980,14,215,185,...,320.0,9.0,2.0,1999.0,Strategy,3898508.73,1361409.22,0.349213,2256.0,1939.0
792,167,Age of Empires II,2,2016,129101,3169,1980,14,215,185,...,3535.0,16.0,3.0,1999.0,Strategy,3898508.73,1361409.22,0.349213,2256.0,1939.0
793,167,Age of Empires II,2,2016,129101,3169,1980,14,215,185,...,620.0,6.0,2.0,1999.0,Strategy,3898508.73,1361409.22,0.349213,2256.0,1939.0


In [16]:
he_agg = (he.groupby(["game_key","year"], as_index=False)
            .agg({"year_prize_usd":"sum",
                  "year_players":"sum",
                  "year_tournaments":"sum"}))
print("he_agg shape:", he_agg.shape)


he_agg shape: (2215, 5)


In [17]:
ge_agg = (ge.sort_values("total_prize_usd", ascending=False)
            .drop_duplicates(subset=["game_key"], keep="first"))
print("ge_agg shape:", ge_agg.shape)


ge_agg shape: (668, 9)


In [18]:
he_sel = he_agg[["game_key","year","year_prize_usd","year_players","year_tournaments"]]
ge_sel = ge_agg[["game_key","release_date","genre","total_prize_usd","offline_prize_usd",
                 "pct_offline_prize","total_players","total_tournaments"]]

master = (tg.merge(he_sel, on=["game_key","year"], how="left")
            .merge(ge_sel, on="game_key", how="left"))

print("MASTER (tras sanar fuentes):", master.shape)
print("Duplicados por clave en master:", master.duplicated(["game_key","year","month"]).sum())


MASTER (tras sanar fuentes): (21000, 24)
Duplicados por clave en master: 11


In [19]:
master = (master
          .sort_values(["game_key","year","month"])
          .drop_duplicates(["game_key","year","month"], keep="first")
          .reset_index(drop=True))

print("Duplicados por clave en master (final):", master.duplicated(["game_key","year","month"]).sum())


Duplicados por clave en master (final): 0


In [20]:
final = master[(master["year"] >= 2016) & (master["year"] <= 2024)].copy()
print("FINAL 2016–2024:", final.shape)


FINAL 2016–2024: (20989, 24)


In [21]:
print(final.columns.tolist())


['rank', 'game', 'month', 'year', 'hours_watched', 'hours_streamed', 'peak_viewers', 'peak_channels', 'streamers', 'avg_viewers', 'avg_channels', 'avg_viewer_ratio', 'game_key', 'year_month', 'year_prize_usd', 'year_players', 'year_tournaments', 'release_date', 'genre', 'total_prize_usd', 'offline_prize_usd', 'pct_offline_prize', 'total_players', 'total_tournaments']


In [22]:
# Orden lógico recomendado (respetando tus 23 columnas)
prefer = [
    "game", "game_key", "genre", "release_date",   # identidad / metadatos
    "year", "month", "year_month",                 # tiempo
    # Popularidad (Twitch, mensual)
    "hours_watched", "avg_viewers", "peak_viewers", "streamers",
    "hours_streamed", "avg_channels", "peak_channels", "avg_viewer_ratio",
    # Competitividad (Esports)
    "year_prize_usd", "year_players", "year_tournaments",
    # Totales históricos del juego
    "total_prize_usd", "offline_prize_usd", "pct_offline_prize",
    "total_players", "total_tournaments",
    # Extras
    "rank"
]

left = [c for c in prefer if c in final.columns]
right = [c for c in final.columns if c not in left]
final = final[left + right].copy()

print(final.columns.tolist()[:12], "...")
display(final.head(5))

['game', 'game_key', 'genre', 'release_date', 'year', 'month', 'year_month', 'hours_watched', 'avg_viewers', 'peak_viewers', 'streamers', 'hours_streamed'] ...


Unnamed: 0,game,game_key,genre,release_date,year,month,year_month,hours_watched,avg_viewers,peak_viewers,...,avg_viewer_ratio,year_prize_usd,year_players,year_tournaments,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments,rank
0,.hack//G.U. Last Recode,.hack//g.u. last recode,,,2017,11,2017-11-01,145350,202,1222,...,4.12,,,,,,,,,199
1,20 Minutes Till Dawn,20 minutes till dawn,,,2022,6,2022-06-01,911356,1267,29743,...,74.38,,,,,,,,,159
2,2XKO,2xko,,,2024,8,2024-08-01,2137305,2876,60961,...,66.91,,,,,,,,,94
3,60 Parsecs!,60 parsecs!,,,2018,9,2018-09-01,529688,736,31960,...,283.71,,,,,,,,,109
4,60 Seconds!,60 seconds!,,,2016,7,2016-07-01,268754,361,32505,...,450.17,,,,,,,,,126


In [23]:
# Chequeo de duplicados por clave
dups = final.duplicated(["game_key", "year", "month"]).sum()
print("Duplicados por clave (game_key, year, month):", dups)

# Si hubiera, nos quedamos con la primera ocurrencia (conservador)
if dups > 0:
    final = (final
             .sort_values(["game_key","year","month"])
             .drop_duplicates(["game_key","year","month"], keep="first")
             .reset_index(drop=True))
    print("Duplicados eliminados. Nuevo shape:", final.shape)

Duplicados por clave (game_key, year, month): 0


In [24]:
def filtra(df, juego=None, genero=None, desde=None, hasta=None, con_premios=None):
    x = df.copy()
    if juego:
        x = x[x["game"].str.lower() == juego.lower()]
    if genero:
        x = x[x["genre"].str.lower() == genero.lower()]
    if desde:
        x = x[x["year"] >= int(desde)]
    if hasta:
        x = x[x["year"] <= int(hasta)]
    if con_premios is True:
        x = x[x["year_prize_usd"].notna()]
    if con_premios is False:
        x = x[x["year_prize_usd"].isna()]
    return x

# ejemplos:
display(filtra(final, juego="League of Legends").head(3))
display(filtra(final, genero="MOBA", desde=2020, con_premios=True).head(3))

Unnamed: 0,game,game_key,genre,release_date,year,month,year_month,hours_watched,avg_viewers,peak_viewers,...,avg_viewer_ratio,year_prize_usd,year_players,year_tournaments,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments,rank
9223,League of Legends,league of legends,Multiplayer Online Battle Arena,2009.0,2016,1,2016-01-01,94377226,127021,530270,...,69.29,10519032.01,3155.0,182.0,113596000.0,98546688.28,0.867519,9588.0,3000.0,1
9224,League of Legends,league of legends,Multiplayer Online Battle Arena,2009.0,2016,2,2016-02-01,93154772,134035,475784,...,73.54,10519032.01,3155.0,182.0,113596000.0,98546688.28,0.867519,9588.0,3000.0,1
9225,League of Legends,league of legends,Multiplayer Online Battle Arena,2009.0,2016,3,2016-03-01,94514511,127206,599114,...,74.77,10519032.01,3155.0,182.0,113596000.0,98546688.28,0.867519,9588.0,3000.0,1


Unnamed: 0,game,game_key,genre,release_date,year,month,year_month,hours_watched,avg_viewers,peak_viewers,...,avg_viewer_ratio,year_prize_usd,year_players,year_tournaments,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments,rank


In [25]:
from pathlib import Path

# Ruta destino: carpeta /data/processed (una arriba del notebook)
OUT_DIR = Path.cwd().parent / "data" / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)  # crea la carpeta si no existe

# Nombre claro para la versión final
out_path = OUT_DIR / "esports_occidente_master_2016_2024_FINAL.csv"

# Guardamos el CSV
final.to_csv(out_path, index=False, encoding="utf-8")

print(f"✅ Dataset guardado correctamente en:\n{out_path}")
print(f"Shape: {final.shape[0]:,} filas × {final.shape[1]} columnas")


✅ Dataset guardado correctamente en:
c:\Users\cubea\OneDrive\Escritorio\project_esports_occidente\notebooks\data\processed\esports_occidente_master_2016_2024_FINAL.csv
Shape: 20,989 filas × 24 columnas


In [26]:
final_esports = final[final["year_prize_usd"].notna()].copy()
print(final_esports.shape)


(4201, 24)


In [27]:
juegos_esports = final_esports["game_key"].nunique()
print(f"🎮 Juegos eSports únicos: {juegos_esports}")

🎮 Juegos eSports únicos: 191


In [28]:
print("Shape:", final_esports.shape)
final_esports.info()


Shape: (4201, 24)
<class 'pandas.core.frame.DataFrame'>
Index: 4201 entries, 154 to 20879
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   game               4201 non-null   object        
 1   game_key           4201 non-null   object        
 2   genre              4201 non-null   object        
 3   release_date       4201 non-null   float64       
 4   year               4201 non-null   int64         
 5   month              4201 non-null   int64         
 6   year_month         4201 non-null   datetime64[ns]
 7   hours_watched      4201 non-null   int64         
 8   avg_viewers        4201 non-null   int64         
 9   peak_viewers       4201 non-null   int64         
 10  streamers          4201 non-null   int64         
 11  hours_streamed     4201 non-null   int64         
 12  avg_channels       4201 non-null   int64         
 13  peak_channels      4201 non-null   int64       

In [29]:
pd.set_option("display.max_columns", None)  # para ver todas las columnas
display(final_esports.head(10))


Unnamed: 0,game,game_key,genre,release_date,year,month,year_month,hours_watched,avg_viewers,peak_viewers,streamers,hours_streamed,avg_channels,peak_channels,avg_viewer_ratio,year_prize_usd,year_players,year_tournaments,total_prize_usd,offline_prize_usd,pct_offline_prize,total_players,total_tournaments,rank
154,Age of Empires,age of empires,Strategy,1997.0,2016,1,2016-01-01,248884,334,107455,40,232,0,18,1072.78,64047.8,181.0,22.0,736284.75,522378.17,0.709478,624.0,341.0,142
155,Age of Empires II,age of empires ii,Strategy,1999.0,2016,2,2016-02-01,129101,185,1980,215,3169,4,14,40.74,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,167
156,Age of Empires II,age of empires ii,Strategy,1999.0,2016,3,2016-03-01,98903,133,967,204,2928,3,13,33.78,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,193
157,Age of Empires II,age of empires ii,Strategy,1999.0,2016,4,2016-04-01,116800,162,1412,208,2923,4,13,39.96,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,184
158,Age of Empires II,age of empires ii,Strategy,1999.0,2016,5,2016-05-01,93519,125,750,250,3204,4,13,29.19,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,199
159,Age of Empires II,age of empires ii,Strategy,1999.0,2016,6,2016-06-01,107624,149,7631,236,2896,4,14,37.16,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,178
160,Age of Empires II,age of empires ii,Strategy,1999.0,2016,8,2016-08-01,117560,158,1284,224,3446,4,14,34.11,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,192
161,Age of Empires II,age of empires ii,Strategy,1999.0,2016,9,2016-09-01,158921,221,4629,237,3356,4,15,47.35,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,133
162,Age of Empires II,age of empires ii,Strategy,1999.0,2016,10,2016-10-01,166279,223,2369,274,3883,5,17,42.82,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,163
163,Age of Empires II,age of empires ii,Strategy,1999.0,2016,11,2016-11-01,201024,279,2414,254,4477,6,18,44.9,56391.14,230.0,41.0,3898508.73,1361409.22,0.349213,2256.0,1939.0,132


In [30]:
cols_keep = [
    "game", "genre", "year", "month", "year_month",
    "hours_watched", "avg_viewers", "peak_viewers",
    "year_prize_usd", "year_players", "year_tournaments",
    "total_prize_usd", "total_players", "total_tournaments"
]

final_esports_clean = final_esports[cols_keep].copy()

print("✅ Dataset de eSports reducido a columnas esenciales:", final_esports_clean.shape)
display(final_esports_clean.head(5))


✅ Dataset de eSports reducido a columnas esenciales: (4201, 14)


Unnamed: 0,game,genre,year,month,year_month,hours_watched,avg_viewers,peak_viewers,year_prize_usd,year_players,year_tournaments,total_prize_usd,total_players,total_tournaments
154,Age of Empires,Strategy,2016,1,2016-01-01,248884,334,107455,64047.8,181.0,22.0,736284.75,624.0,341.0
155,Age of Empires II,Strategy,2016,2,2016-02-01,129101,185,1980,56391.14,230.0,41.0,3898508.73,2256.0,1939.0
156,Age of Empires II,Strategy,2016,3,2016-03-01,98903,133,967,56391.14,230.0,41.0,3898508.73,2256.0,1939.0
157,Age of Empires II,Strategy,2016,4,2016-04-01,116800,162,1412,56391.14,230.0,41.0,3898508.73,2256.0,1939.0
158,Age of Empires II,Strategy,2016,5,2016-05-01,93519,125,750,56391.14,230.0,41.0,3898508.73,2256.0,1939.0


In [31]:
final_esports_clean.info

<bound method DataFrame.info of                        game                  genre  year  month year_month  \
154          Age of Empires               Strategy  2016      1 2016-01-01   
155       Age of Empires II               Strategy  2016      2 2016-02-01   
156       Age of Empires II               Strategy  2016      3 2016-03-01   
157       Age of Empires II               Strategy  2016      4 2016-04-01   
158       Age of Empires II               Strategy  2016      5 2016-05-01   
...                     ...                    ...   ...    ...        ...   
20875  Yu-Gi-Oh! Duel Links  Collectible Card Game  2019      8 2019-08-01   
20876  Yu-Gi-Oh! Duel Links  Collectible Card Game  2019      9 2019-09-01   
20877  Yu-Gi-Oh! Duel Links  Collectible Card Game  2019     10 2019-10-01   
20878  Yu-Gi-Oh! Duel Links  Collectible Card Game  2019     11 2019-11-01   
20879  Yu-Gi-Oh! Duel Links  Collectible Card Game  2019     12 2019-12-01   

       hours_watched  avg_viewe