In [18]:
from pathlib import Path
import pandas as pd

# 1) Encontrar carpeta data/raw 


In [27]:
# Locate the project's data/raw folder. Candidates are probed in priority
# order: under the working directory's parent first, then under the working
# directory itself, then under every ancestor of the working directory.
cwd = Path.cwd()
candidates = [root / "data" / "raw" for root in (cwd.parent, cwd, *cwd.parents)]
RAW = next((folder for folder in candidates if folder.exists()), None)
if RAW is None:
    # Fail here, listing the probed locations, instead of letting the CSV
    # loading further down fail on a path that never existed.
    raise FileNotFoundError(
        "No se encontró la carpeta data/raw; rutas probadas: "
        + ", ".join(str(folder) for folder in candidates)
    )

print(RAW.resolve())

C:\Users\cubea\OneDrive\Escritorio\project_esports_occidente\data\raw


In [28]:
# Twitch file: decoded as cp1252; sep=None with the python engine asks pandas
# to detect the delimiter instead of assuming a comma.
twitch = pd.read_csv(
    RAW / "Twitch_game_data.csv",
    encoding="cp1252",
    sep=None,
    engine="python",
)

# The two esports files are read with pandas' default settings.
esports, general = (
    pd.read_csv(RAW / filename)
    for filename in ("HistoricalEsportData.csv", "GeneralEsportData.csv")
)

# Quick size check of the three raw frames.
print("twitch:", twitch.shape, "| esports:", esports.shape, "| general:", general.shape)




twitch: (21000, 12) | esports: (10239, 5) | general: (669, 8)


# Dataset 1: twitch_keep

- Fuente: datos de Twitch por juego, mes y año.
- Tema: mide la popularidad de los juegos.

In [30]:
# Columns kept from the Twitch frame: the game/period key followed by the
# audience and broadcasting metrics.
twitch_keep = ["Game", "Year", "Month"] + [
    "Hours_watched",
    "Hours_streamed",
    "Peak_viewers",
    "Peak_channels",
    "Streamers",
    "Avg_viewers",
    "Avg_channels",
    "Avg_viewer_ratio",
]

# Independent copy, so later edits do not touch the raw `twitch` frame.
twitch_simple = twitch.loc[:, twitch_keep].copy()

# Dataset 2: esports_keep
- Fuente: Esports Earnings o datasets competitivos.
- Tema: mide la competitividad económica (premios y torneos).

In [31]:
# Columns kept from the historical esports frame.
esports_keep = [
    "Game",
    "Date",
    "Earnings",
    "Players",
    "Tournaments",
]

# Independent copy, so later edits do not touch the raw `esports` frame.
esports_simple = esports.loc[:, esports_keep].copy()

# Dataset 3: general_keep

- Fuente: General Esports Data (por juego, sin temporalidad).
- Tema: características estáticas del juego

In [33]:
# Columns kept from the general (per-game) esports frame.
general_keep = [
    "Game",
    "ReleaseDate",
    "Genre",
    "TotalEarnings",
    "OfflineEarnings",
    "PercentOffline",
    "TotalPlayers",
    "TotalTournaments",
]

# Independent copy, so later edits do not touch the raw `general` frame.
general_simple = general.loc[:, general_keep].copy()

In [37]:
# Show the selected column list of each dataset, one line per dataset.
print("COLUMNAS CLAVE POR DATASET")
for label, columns in (
    ("twitch_keep :", twitch_keep),
    ("esports_keep:", esports_keep),
    ("general_keep:", general_keep),
):
    print(label, columns)

COLUMNAS CLAVE POR DATASET
twitch_keep : ['Game', 'Year', 'Month', 'Hours_watched', 'Hours_streamed', 'Peak_viewers', 'Peak_channels', 'Streamers', 'Avg_viewers', 'Avg_channels', 'Avg_viewer_ratio']
esports_keep: ['Game', 'Date', 'Earnings', 'Players', 'Tournaments']
general_keep: ['Game', 'ReleaseDate', 'Genre', 'TotalEarnings', 'OfflineEarnings', 'PercentOffline', 'TotalPlayers', 'TotalTournaments']


In [35]:
def basic_counts(df, name):
    """Print a short size summary of a DataFrame.

    Prints a header with ``name``, the frame's shape, and the number of
    distinct values of each of the columns "Game", "Year", "Month", "Date"
    and "Genre" that the frame actually contains.

    Args:
        df: DataFrame to summarise.
        name: Label printed in the header line.

    Returns:
        None. The summary is written to standard output.
    """
    key_columns = ("Game", "Year", "Month", "Date", "Genre")

    print(f"\n>>> {name}")
    print("  filas, columnas :", df.shape)

    # Only report the key columns this particular frame has.
    present = [col for col in key_columns if col in df.columns]
    for col in present:
        print(f"  uniques[{col}]:", df[col].nunique())

# Summarise the three reduced frames in the same order they were built.
for frame, label in (
    (twitch_simple, "twitch_simple"),
    (esports_simple, "esports_simple"),
    (general_simple, "general_simple"),
):
    basic_counts(frame, label)


>>> twitch_simple
  filas, columnas : (21000, 11)
  uniques[Game]: 2359
  uniques[Year]: 9
  uniques[Month]: 12

>>> esports_simple
  filas, columnas : (10239, 5)
  uniques[Game]: 621
  uniques[Date]: 309

>>> general_simple
  filas, columnas : (669, 8)
  uniques[Game]: 669
  uniques[Genre]: 12


# Columnas clave entre sets, GAME - YEAR (Posible MONTH)

Twitch (Game-Year) + Esports (Game-Year) + General (Game) → Master dataset

___________________________________________________________________________

# Revisión de tipos de datos en los datasets

In [46]:
# Print the dtype table of each reduced frame; the leading "\n" on the second
# and third headers separates the tables with a blank line.
for header, frame in (
    ("TYPES — twitch_simple", twitch_simple),
    ("\nTYPES — esports_simple", esports_simple),
    ("\nTYPES — general_simple", general_simple),
):
    print(header)
    print(frame.dtypes)


TYPES — twitch_simple
Game                 object
Year                  int64
Month                 int64
Hours_watched         int64
Hours_streamed        int64
Peak_viewers          int64
Peak_channels         int64
Streamers             int64
Avg_viewers           int64
Avg_channels          int64
Avg_viewer_ratio    float64
dtype: object

TYPES — esports_simple
Game            object
Date            object
Earnings       float64
Players          int64
Tournaments      int64
dtype: object

TYPES — general_simple
Game                 object
ReleaseDate           int64
Genre                object
TotalEarnings        object
OfflineEarnings      object
PercentOffline      float64
TotalPlayers          int64
TotalTournaments      int64
dtype: object


___________________________________________________________

Convertimos *"Year"* y *"Month"* a numérico entero.

Si encontramos algún valor que no se puede convertir, `errors="coerce"` lo sustituye por *NaN*.

In [None]:
# Force the Twitch period columns to numeric; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
for col in ("Year", "Month"):
    twitch_simple[col] = pd.to_numeric(twitch_simple[col], errors="coerce")

# The esports Date -> Year derivation is done once, in its own cell further
# down, so it is not repeated in this Twitch-only cell.

In [52]:
# Confirm the dtypes of the period columns after the conversion.
period_columns = twitch_simple[["Year", "Month"]]
print(period_columns.dtypes)

# List the distinct non-null months in ascending order.
unique_months = twitch_simple["Month"].dropna().unique().tolist()
unique_months.sort()
print("Meses únicos:", unique_months)


Year     int64
Month    int64
dtype: object
Meses únicos: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Convertimos las métricas numéricas (premios, jugadores, torneos) a tipo numérico real para poder agrupar más tarde por juego/año y calcular totales o medias.

In [None]:
# Force the esports metrics to numeric; values that cannot be parsed become
# NaN (errors="coerce") instead of raising.
for metric in ("Earnings", "Players", "Tournaments"):
    esports_simple[metric] = pd.to_numeric(esports_simple[metric], errors="coerce")

Crear Year a partir de Date.
Solo extraemos el año; lo necesitaremos para construir la clave entre datasets "Game-Year".

In [57]:
# Parse Date (unparseable values become NaT) and keep only the calendar year.
esports_simple["Year"] = (
    esports_simple["Date"]
    .pipe(pd.to_datetime, errors="coerce")
    .dt.year
)

In [56]:
# Show the first rows of Date next to the derived Year, without the index.
preview = esports_simple[["Date", "Year"]].head(10)
print(preview.to_string(index=False))

# Count missing years and report the covered year range.
years = esports_simple["Year"]
print("Años NaN: ", years.isna().sum())
print("Rango de años:", int(years.min()), "→", int(years.max()))


      Date  Year
1998-01-01  1998
1998-01-01  1998
1998-05-01  1998
1998-05-01  1998
1998-05-01  1998
1998-07-01  1998
1998-11-01  1998
1998-11-01  1998
1998-11-01  1998
1998-12-01  1998
Años NaN:  0
Rango de años: 1998 → 2024


La idea es quedarnos con las fechas de 2016-2024, ya que no tenemos datos anteriores de streaming para hacer comparativas. Cabe la posibilidad de que utilicemos los datos anteriores a 2016 para un estudio adicional.