# ANALISIS DE LA NBA

In [8]:
from pathlib import Path                  # Manejo seguro de rutas
import sqlite3                            # Conexión a SQLite (estándar en Python)
import pandas as pd                       
import numpy as np                        

In [9]:
DB_PATH = Path("nba.sqlite")   

In [10]:
# Conectar de nuevo a la base (puede ser la copia filtrada o la original)
conn = sqlite3.connect(DB_PATH)
print("✅ Conectado a la base:", DB_PATH)

✅ Conectado a la base: nba.sqlite


In [11]:
# --- Función para obtener nombres de tablas ---
def get_tables(connection):
    """
    Devuelve una lista con los nombres de las tablas en la base SQLite.
    """
    q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
    return pd.read_sql_query(q, connection)["name"].tolist()

# --- Función para ver columnas de una tabla específica ---
def get_columns(connection, table_name):
    """
    Devuelve un DataFrame con la metadata de columnas de la tabla 'table_name'.
    Incluye: cid (índice), name (nombre), type (tipo), notnull, dflt_value, pk (si es clave primaria).
    """
    q = f"PRAGMA table_info({table_name});"  
    return pd.read_sql_query(q, connection)

# ---  Listar todas las tablas ---
tables = get_tables(conn)           
print(f"Tablas encontradas ({len(tables)}):")
for t in tables:
    print(" -", t)


Tablas encontradas (16):
 - common_player_info
 - draft_combine_stats
 - draft_history
 - game
 - game_info
 - game_summary
 - inactive_players
 - line_score
 - officials
 - other_stats
 - play_by_play
 - player
 - team
 - team_details
 - team_history
 - team_info_common


In [12]:
# --- Resumen: filas y columnas de cada tabla ---
resumen = []

for t in get_tables(conn):
    try:
        n_filas = pd.read_sql_query(f"SELECT COUNT(*) AS n FROM {t};", conn).loc[0, "n"]
        n_columnas = len(get_columns(conn, t))
        resumen.append((t, n_filas, n_columnas))
    except Exception as e:
        resumen.append((t, f"Error: {e}", None))

# Pasamos a DataFrame para verlo ordenado
df_resumen = pd.DataFrame(resumen, columns=["tabla", "filas", "columnas"])
display(df_resumen)


Unnamed: 0,tabla,filas,columnas
0,common_player_info,3632,33
1,draft_combine_stats,1633,47
2,draft_history,8257,14
3,game,65698,55
4,game_info,58053,4
5,game_summary,58110,14
6,inactive_players,110191,9
7,line_score,58053,43
8,officials,70971,5
9,other_stats,28271,26


In [13]:
import shutil

# Ruta a tu archivo original
DB_PATH = Path("nba.sqlite")   # asegúrate que este es tu archivo base

# Ruta para la copia (mismo nombre con sufijo _work)
DB_WORK_PATH = Path(DB_PATH.parent / (DB_PATH.stem + "_work" + DB_PATH.suffix))

# Crear copia
shutil.copy2(DB_PATH, DB_WORK_PATH)

print(f"✅ Copia creada en: {DB_WORK_PATH}")

✅ Copia creada en: nba_work.sqlite


In [14]:
# Cerramos conexión previa (si existía) y nos conectamos a la copia
try:
    conn.close()
except:
    pass

conn = sqlite3.connect(DB_WORK_PATH)
print("Conectado a la copia de trabajo ✅")


Conectado a la copia de trabajo ✅


# TABLA GAME 

In [15]:
def reduce_table(conn, table_name, keep_cols):
    """
    Crea una nueva tabla con solo las columnas indicadas y
    reemplaza la original en la base SQLite.
    """
    # 1) Validar que las columnas existen en la tabla
    cols_existentes = get_columns(conn, table_name)["name"].tolist()
    existentes = [c for c in keep_cols if c in cols_existentes]
    if not existentes:
        raise ValueError(f"Ninguna de las columnas pedidas existe en {table_name}")

    cols_sql = ", ".join(existentes)

    # 2) Crear tabla temporal con solo esas columnas
    tmp_table = f"{table_name}_tmp"
    conn.execute(f"DROP TABLE IF EXISTS {tmp_table};")
    conn.execute(f"CREATE TABLE {tmp_table} AS SELECT {cols_sql} FROM {table_name};")

    # 3) Borrar tabla original y renombrar la temporal
    conn.execute(f"DROP TABLE {table_name};")
    conn.execute(f"ALTER TABLE {tmp_table} RENAME TO {table_name};")
    conn.commit()

    print(f"✅ Tabla '{table_name}' reducida a {len(existentes)} columnas.")
    return existentes


In [16]:
game_cols = [
    # Identificadores
    "game_id","season_id","season_type","game_date",
    "team_id_home","team_id_away","wl_home",
    # Tiro
    "fgm_home","fga_home","fg3m_home","fg3a_home","ftm_home","fta_home",
    "fgm_away","fga_away","fg3m_away","fg3a_away","ftm_away","fta_away",
    # Rebotes
    "oreb_home","dreb_home","reb_home",
    "oreb_away","dreb_away","reb_away",
    # Otros
    "ast_home","stl_home","blk_home","tov_home","pf_home","pts_home",
    "ast_away","stl_away","blk_away","tov_away","pf_away","pts_away"
]


In [17]:
reduce_table(conn, "game", game_cols)

# Revisar resultado
get_columns(conn, "game")


✅ Tabla 'game' reducida a 37 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,game_id,TEXT,0,,0
1,1,season_id,TEXT,0,,0
2,2,season_type,TEXT,0,,0
3,3,game_date,NUM,0,,0
4,4,team_id_home,TEXT,0,,0
5,5,team_id_away,TEXT,0,,0
6,6,wl_home,TEXT,0,,0
7,7,fgm_home,REAL,0,,0
8,8,fga_home,REAL,0,,0
9,9,fg3m_home,REAL,0,,0


# TABLA other_stats

In [18]:
# --- Reducir 'other_stats' a las columnas acordadas y mostrar resultado + tamaño ---
import os

other_stats_cols = [
    "game_id",
    "team_id_home","team_id_away",
    "pts_paint_home","pts_paint_away",
    "pts_2nd_chance_home","pts_2nd_chance_away",
    "pts_fb_home","pts_fb_away",
    "lead_changes","times_tied",
    "team_turnovers_home","team_turnovers_away",
    "total_turnovers_home","total_turnovers_away",
    "team_rebounds_home","team_rebounds_away",
    "pts_off_to_home","pts_off_to_away"
]

reduce_table(conn, "other_stats", other_stats_cols)

# Ver columnas resultantes en 'other_stats'
display(get_columns(conn, "other_stats"))


✅ Tabla 'other_stats' reducida a 19 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,game_id,TEXT,0,,0
1,1,team_id_home,TEXT,0,,0
2,2,team_id_away,TEXT,0,,0
3,3,pts_paint_home,INT,0,,0
4,4,pts_paint_away,INT,0,,0
5,5,pts_2nd_chance_home,INT,0,,0
6,6,pts_2nd_chance_away,INT,0,,0
7,7,pts_fb_home,INT,0,,0
8,8,pts_fb_away,INT,0,,0
9,9,lead_changes,INT,0,,0


# line_score

In [19]:
line_score_cols = [
    "game_id",
    "team_id_home","team_id_away",
    # Puntos por cuartos (home)
    "pts_qtr1_home","pts_qtr2_home","pts_qtr3_home","pts_qtr4_home",
    # Puntos OT home (pueden existir varias: pts_ot1_home, pts_ot2_home, etc.)
    "pts_ot1_home","pts_ot2_home","pts_ot3_home","pts_ot4_home","pts_ot5_home",
    "pts_home",
    # Puntos por cuartos (away)
    "pts_qtr1_away","pts_qtr2_away","pts_qtr3_away","pts_qtr4_away",
    # Puntos OT away
    "pts_ot1_away","pts_ot2_away","pts_ot3_away","pts_ot4_away","pts_ot5_away",
    "pts_away"
]

# Reducir la tabla
reduce_table(conn, "line_score", line_score_cols)

# Revisar columnas resultantes
display(get_columns(conn, "line_score"))


✅ Tabla 'line_score' reducida a 23 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,game_id,TEXT,0,,0
1,1,team_id_home,TEXT,0,,0
2,2,team_id_away,TEXT,0,,0
3,3,pts_qtr1_home,TEXT,0,,0
4,4,pts_qtr2_home,TEXT,0,,0
5,5,pts_qtr3_home,TEXT,0,,0
6,6,pts_qtr4_home,TEXT,0,,0
7,7,pts_ot1_home,INT,0,,0
8,8,pts_ot2_home,INT,0,,0
9,9,pts_ot3_home,INT,0,,0


# game_info

In [20]:
game_info_cols = ["game_id", "game_date", "attendance"]

reduce_table(conn, "game_info", game_info_cols)

# Ver columnas resultantes
display(get_columns(conn, "game_info"))


✅ Tabla 'game_info' reducida a 3 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,game_id,TEXT,0,,0
1,1,game_date,NUM,0,,0
2,2,attendance,INT,0,,0


# team

In [21]:
team_cols = ["id", "full_name", "abbreviation", "city", "state"]

reduce_table(conn, "team", team_cols)

# Ver columnas resultantes
display(get_columns(conn, "team"))

✅ Tabla 'team' reducida a 5 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,TEXT,0,,0
1,1,full_name,TEXT,0,,0
2,2,abbreviation,TEXT,0,,0
3,3,city,TEXT,0,,0
4,4,state,TEXT,0,,0


# player

In [22]:
# 1) Sinónimos posibles en tu base
syn = {
    "id": ["id", "player_id", "person_id"],
    "full_name": ["full_name", "display_first_last", "player_name", "full_name_en"]
}

# 2) Detectar cuáles existen realmente en 'player'
cols_player = get_columns(conn, "player")["name"].tolist()

def pick_existing(candidates, existing):
    for c in candidates:
        if c in existing:
            return c
    return None

id_col = pick_existing(syn["id"], cols_player)
name_col = pick_existing(syn["full_name"], cols_player)

keep = [c for c in [id_col, name_col] if c is not None]
print("Columnas detectadas para conservar en 'player':", keep)

# 3) Reducir la tabla con las columnas detectadas
reduce_table(conn, "player", keep)


Columnas detectadas para conservar en 'player': ['id', 'full_name']
✅ Tabla 'player' reducida a 2 columnas.


['id', 'full_name']

# play_by_play

In [23]:
# === Reducir 'play_by_play' usando DB_WORK_PATH (sin 'conn') ===
from pathlib import Path
import sqlite3, pandas as pd, os

# 0) Confirmar ruta a la copia
try:
    DB_WORK_PATH
except NameError:
    DB_WORK_PATH = Path("nba_work.sqlite") 

DB_WORK_PATH = Path(DB_WORK_PATH)
if not DB_WORK_PATH.exists():
    raise FileNotFoundError(f"No encuentro la copia: {DB_WORK_PATH.resolve()}")

# Helpers que abren/cerran la conexión internamente
def run_query(q: str) -> pd.DataFrame:
    with sqlite3.connect(DB_WORK_PATH) as cxn:
        return pd.read_sql_query(q, cxn)

def exec_sql(cmds):
    with sqlite3.connect(DB_WORK_PATH) as cxn:
        cur = cxn.cursor()
        for c in cmds:
            cur.execute(c)
        cxn.commit()

def get_columns(table_name: str) -> pd.DataFrame:
    return run_query(f"PRAGMA table_info({table_name});")

def reduce_table(table_name: str, keep_cols):
    existentes = get_columns(table_name)["name"].tolist()
    keep = [c for c in keep_cols if c in existentes]
    if not keep:
        raise ValueError(f"Ninguna columna de {keep_cols} existe en '{table_name}'. "
                         f"Disponibles: {existentes}")
    tmp = f"{table_name}_tmp"
    exec_sql([
        f"DROP TABLE IF EXISTS {tmp};",
        f"CREATE TABLE {tmp} AS SELECT {', '.join(keep)} FROM {table_name};",
        f"DROP TABLE {table_name};",
        f"ALTER TABLE {tmp} RENAME TO {table_name};"
    ])
    print(f"✅ '{table_name}' reducida a {len(keep)} columnas.")

# 1) Sinónimos por campo
syn = {
    "game_id":              ["game_id","gid"],
    "eventnum":             ["eventnum","event_num","event_number"],
    "eventmsgtype":         ["eventmsgtype","event_msg_type","msg_type","event_type"],
    "eventmsgactiontype":   ["eventmsgactiontype","event_msg_action_type","action_type"],
    "periodo":              ["periodo","period","quarter","prd"],
    "pctimestring":         ["pctimestring","pc_time_string","time_remaining","clock","time"],
    "puntuacion":           ["puntuación","puntuacion","score"],
    "margen_puntuacion":    ["margen_de_puntuación","margen_de_puntuacion","scoremargin","score_margin","margin"],
    "player1_id":           ["player1_id","player_1_id","p1_id"],
    "player1_team_id":      ["player1_team_id","player_1_team_id","p1_team_id","team_id1"],
    "player2_id":           ["player2_id","player_2_id","p2_id"],
    "player2_team_id":      ["player2_team_id","player_2_team_id","p2_team_id","team_id2"],
    "player3_id":           ["player3_id","player_3_id","p3_id"],
    "player3_team_id":      ["player3_team_id","player_3_team_id","p3_team_id","team_id3"],
}

# 2) Detectar columnas presentes y construir keep_cols
cols_exist = get_columns("play_by_play")["name"].tolist()

def pick_existing(candidates, existing):
    for c in candidates:
        if c in existing:
            return c
    return None

orden = [
    "game_id","eventnum","eventmsgtype","eventmsgactiontype",
    "periodo","pctimestring","puntuacion","margen_puntuacion",
    "player1_id","player1_team_id","player2_id","player2_team_id","player3_id","player3_team_id"
]

keep_cols = []
for k in orden:
    col = pick_existing(syn[k], cols_exist)
    if col:
        keep_cols.append(col)


# 3) Reducir tabla y mostrar resultado + tamaño de archivo
reduce_table("play_by_play", keep_cols)
display(get_columns("play_by_play"))



✅ 'play_by_play' reducida a 14 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,game_id,TEXT,0,,0
1,1,eventnum,INT,0,,0
2,2,eventmsgtype,INT,0,,0
3,3,eventmsgactiontype,INT,0,,0
4,4,period,INT,0,,0
5,5,pctimestring,TEXT,0,,0
6,6,score,TEXT,0,,0
7,7,scoremargin,TEXT,0,,0
8,8,player1_id,TEXT,0,,0
9,9,player1_team_id,TEXT,0,,0


# team_info_common

In [24]:
def get_columns(table_name: str) -> pd.DataFrame:
    return run_query(f"PRAGMA table_info({table_name});")

def reduce_table(table_name: str, keep_cols):
    existentes = get_columns(table_name)["name"].tolist()
    keep = [c for c in keep_cols if c in existentes]
    if not keep:
        raise ValueError(f"Ninguna columna de {keep_cols} existe en '{table_name}'. "
                         f"Disponibles: {existentes}")
    tmp = f"{table_name}_tmp"
    exec_sql([
        f"DROP TABLE IF EXISTS {tmp};",
        f"CREATE TABLE {tmp} AS SELECT {', '.join(keep)} FROM {table_name};",
        f"DROP TABLE {table_name};",
        f"ALTER TABLE {tmp} RENAME TO {table_name};"
    ])
    print(f"✅ '{table_name}' reducida a {len(keep)} columnas.")

# 1) Columnas requeridas + opcionales (se incluyen solo si existen)
keep_candidates = [
    # requeridas
    "team_id", "season_year", "season_id", "team_conference", "team_division", "w", "l", "pct",
    # opcionales
    "conf_rank", "div_rank", "pts_pg", "reb_pg"
]

# 2) Reducir y mostrar resultado
reduce_table("team_info_common", keep_candidates)
display(get_columns("team_info_common"))


✅ 'team_info_common' reducida a 12 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,team_id,TEXT,0,,0
1,1,season_year,TEXT,0,,0
2,2,season_id,TEXT,0,,0
3,3,team_conference,TEXT,0,,0
4,4,team_division,TEXT,0,,0
5,5,w,INT,0,,0
6,6,l,INT,0,,0
7,7,pct,REAL,0,,0
8,8,conf_rank,INT,0,,0
9,9,div_rank,INT,0,,0


# common_player_info

In [25]:
def get_columns(table_name: str) -> pd.DataFrame:
    return run_query(f"PRAGMA table_info({table_name});")

def reduce_table(table_name: str, keep_cols):
    existentes = get_columns(table_name)["name"].tolist()
    keep = [c for c in keep_cols if c in existentes]
    if not keep:
        raise ValueError(f"Ninguna columna de {keep_cols} existe en '{table_name}'. "
                         f"Disponibles: {existentes}")
    tmp = f"{table_name}_tmp"
    exec_sql([
        f"DROP TABLE IF EXISTS {tmp};",
        f"CREATE TABLE {tmp} AS SELECT {', '.join(keep)} FROM {table_name};",
        f"DROP TABLE {table_name};",
        f"ALTER TABLE {tmp} RENAME TO {table_name};"
    ])
    print(f"✅ '{table_name}' reducida a {len(keep)} columnas.")

# 1) Candidatas (obligatorias + opcionales)
keep_candidates = [
    # obligatorias
    "person_id", "display_first_last", "position", "season_exp",
    # opcionales
    "height", "weight", "from_year", "to_year"
]

# 2) Reducir y mostrar resultado
reduce_table("common_player_info", keep_candidates)
display(get_columns("common_player_info"))




✅ 'common_player_info' reducida a 8 columnas.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,person_id,TEXT,0,,0
1,1,display_first_last,TEXT,0,,0
2,2,position,TEXT,0,,0
3,3,season_exp,REAL,0,,0
4,4,height,TEXT,0,,0
5,5,weight,TEXT,0,,0
6,6,from_year,REAL,0,,0
7,7,to_year,REAL,0,,0


In [26]:
# === Eliminar tablas que no se van a usar ===
to_drop = [
    "draft_combine_stats",
    "draft_history",
    "game_summary",
    "inactive_players",
    "officials",
    "team_details",
    "team_history"
]
# Comprobar tablas restantes
print("\nTablas finales en la base de trabajo:")
print(run_query("SELECT name FROM sqlite_master WHERE type='table';"))



Tablas finales en la base de trabajo:
                   name
0          game_summary
1             officials
2      inactive_players
3          team_details
4          team_history
5   draft_combine_stats
6         draft_history
7                  game
8           other_stats
9            line_score
10            game_info
11                 team
12               player
13         play_by_play
14     team_info_common
15   common_player_info


In [27]:
# === Tamaño de archivo + resumen de tablas (original vs copia) ===
from pathlib import Path
import sqlite3, pandas as pd, os

# 0) Rutas (ajusta si usas otros nombres)
try:
    DB_PATH
except NameError:
    DB_PATH = Path("nba.sqlite")
try:
    DB_WORK_PATH
except NameError:
    DB_WORK_PATH = Path("nba_work.sqlite")

def list_tables(db_path: Path):
    with sqlite3.connect(db_path) as cxn:
        q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
        return pd.read_sql_query(q, cxn)["name"].tolist()

def table_columns(db_path: Path, table: str) -> int:
    with sqlite3.connect(db_path) as cxn:
        return len(pd.read_sql_query(f"PRAGMA table_info({table});", cxn))

def table_rows(db_path: Path, table: str) -> int:
    with sqlite3.connect(db_path) as cxn:
        return int(pd.read_sql_query(f"SELECT COUNT(*) AS n FROM {table};", cxn).loc[0, "n"])

def db_size_mb(db_path: Path) -> float:
    return os.path.getsize(db_path) / (1024*1024)

# 1) Calcular tamaños (con chequeo por si falta algún archivo)
size_orig = db_size_mb(DB_PATH) if DB_PATH.exists() else None
size_work = db_size_mb(DB_WORK_PATH) if DB_WORK_PATH.exists() else None

print("\n💾 Tamaño de archivo")
if size_orig is not None:
    print(f"Original : {size_orig:.2f} MB  ({DB_PATH})")
else:
    print("Original : (no encontrado)")
if size_work is not None and size_orig is not None:
    ahorro = size_orig - size_work
    redu = 100*(1 - size_work/size_orig) if size_orig > 0 else 0
    print(f"Trabajada: {size_work:.2f} MB  ({DB_WORK_PATH})  "
          f"(ahorro: {ahorro:.2f} MB | {redu:.1f}% reducción)")
elif size_work is not None:
    print(f"Trabajada: {size_work:.2f} MB  ({DB_WORK_PATH})")

    display(df_work)



💾 Tamaño de archivo
Original : 2240.74 MB  (nba.sqlite)
Trabajada: 3082.05 MB  (nba_work.sqlite)  (ahorro: -841.31 MB | -37.5% reducción)


In [28]:
# === Resumen final: tablas, filas y columnas de la base reducida ===
def get_tables(db_path):
    with sqlite3.connect(db_path) as cxn:
        q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
        return pd.read_sql_query(q, cxn)["name"].tolist()

def get_table_info(db_path, table):
    with sqlite3.connect(db_path) as cxn:
        n_filas = pd.read_sql_query(f"SELECT COUNT(*) AS n FROM {table};", cxn).loc[0,"n"]
        n_cols  = len(pd.read_sql_query(f"PRAGMA table_info({table});", cxn))
    return n_filas, n_cols

# Recorrer todas las tablas de la base reducida
resumen = []
for t in get_tables(DB_WORK_PATH):
    filas, cols = get_table_info(DB_WORK_PATH, t)
    resumen.append([t, filas, cols])

df_final = pd.DataFrame(resumen, columns=["tabla","filas","columnas"])
display(df_final)

print("\nTotales globales:")
print(f"Filas en total: {df_final['filas'].sum():,}")
print(f"Columnas en total (sumadas por tabla): {df_final['columnas'].sum():,}")


Unnamed: 0,tabla,filas,columnas
0,common_player_info,3632,8
1,draft_combine_stats,1633,47
2,draft_history,8257,14
3,game,65698,37
4,game_info,58053,3
5,game_summary,58110,14
6,inactive_players,110191,9
7,line_score,58053,23
8,officials,70971,5
9,other_stats,28271,19



Totales globales:
Filas en total: 14,060,690
Columnas en total (sumadas por tabla): 231


In [29]:
# === Exportar TODAS las tablas de la base filtrada a CSV (con chunks) ===
from pathlib import Path
import sqlite3, pandas as pd, os, math, time

# 0) Ruta de la base filtrada (copia de trabajo)
try:
    DB_WORK_PATH
except NameError:
    DB_WORK_PATH = Path("nba_work.sqlite")
DB_WORK_PATH = Path(DB_WORK_PATH)

# 1) Carpeta de salida
OUT_DIR = Path("csv_export")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 2) Utilidades
def list_tables(db_path: Path):
    with sqlite3.connect(db_path) as cxn:
        q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
        return pd.read_sql_query(q, cxn)["name"].tolist()

def count_rows(db_path: Path, table: str) -> int:
    with sqlite3.connect(db_path) as cxn:
        return int(pd.read_sql_query(f"SELECT COUNT(*) AS n FROM {table};", cxn).loc[0, "n"])

def export_table_to_csv(db_path: Path, table: str, out_dir: Path, chunksize: int = 200_000):
    """
    Exporta una tabla a CSV en chunks (para tablas grandes). 
    - Escribe encabezado en el primer chunk y luego agrega sin encabezado.
    """
    t0 = time.time()
    total = count_rows(db_path, table)
    out_file = out_dir / f"{table}.csv"
    if out_file.exists(): out_file.unlink()  # limpiar si ya existía

    if total == 0:
        # crear CSV vacío con solo encabezados
        with sqlite3.connect(db_path) as cxn:
            cols = pd.read_sql_query(f"PRAGMA table_info({table});", cxn)["name"].tolist()
        pd.DataFrame(columns=cols).to_csv(out_file, index=False, encoding="utf-8")
        print(f"✔ {table}: 0 filas → {out_file.name}")
        return

    with sqlite3.connect(db_path) as cxn:
        sql = f"SELECT * FROM {table};"
        first = True
        done = 0
        for chunk in pd.read_sql_query(sql, cxn, chunksize=chunksize):
            chunk.to_csv(out_file, index=False, mode="w" if first else "a",
                         header=first, encoding="utf-8")
            first = False
            done += len(chunk)
            pct = (done / total) * 100
            print(f"  {table}: {done:,}/{total:,} filas ({pct:5.1f}%)", end="\r")
    dt = time.time() - t0
    size_mb = os.path.getsize(out_file) / (1024*1024)
    print(f"\n✔ {table}: {total:,} filas → {out_file.name} ({size_mb:.2f} MB) en {dt:.1f}s")

# 3) Ejecutar exportación para todas las tablas
tables = list_tables(DB_WORK_PATH)
print("Tablas a exportar:", tables, "\n")

for t in tables:
    export_table_to_csv(DB_WORK_PATH, t, OUT_DIR, chunksize=200_000)

print("\n✅ Exportación completa. Archivos en:", OUT_DIR.resolve())


Tablas a exportar: ['common_player_info', 'draft_combine_stats', 'draft_history', 'game', 'game_info', 'game_summary', 'inactive_players', 'line_score', 'officials', 'other_stats', 'play_by_play', 'player', 'team', 'team_details', 'team_history', 'team_info_common'] 

  common_player_info: 3,632/3,632 filas (100.0%)
✔ common_player_info: 3,632 filas → common_player_info.csv (0.19 MB) en 0.1s
  draft_combine_stats: 1,633/1,633 filas (100.0%)
✔ draft_combine_stats: 1,633 filas → draft_combine_stats.csv (0.26 MB) en 0.2s
  draft_history: 8,257/8,257 filas (100.0%)
✔ draft_history: 8,257 filas → draft_history.csv (0.83 MB) en 0.2s
  game: 65,698/65,698 filas (100.0%)
✔ game: 65,698 filas → game.csv (12.61 MB) en 5.1s
  game_info: 58,053/58,053 filas (100.0%)
✔ game_info: 58,053 filas → game_info.csv (2.17 MB) en 0.4s
  game_summary: 58,110/58,110 filas (100.0%)
✔ game_summary: 58,110 filas → game_summary.csv (5.42 MB) en 0.9s
  inactive_players: 110,191/110,191 filas (100.0%)
✔ inactive_pl

# TABLA RANKING 

In [32]:
import pandas as pd

# Cargar el dataset de ranking
ranking_df = pd.read_csv("ranking.csv")

# Ver las primeras filas
print(ranking_df.head())

# Ver información general de columnas
print(ranking_df.info())

      TEAM_ID  LEAGUE_ID  SEASON_ID STANDINGSDATE CONFERENCE         TEAM   G  \
0  1610612743          0      22022    2022-12-22       West       Denver  30   
1  1610612763          0      22022    2022-12-22       West      Memphis  30   
2  1610612740          0      22022    2022-12-22       West  New Orleans  31   
3  1610612756          0      22022    2022-12-22       West      Phoenix  32   
4  1610612746          0      22022    2022-12-22       West  LA Clippers  33   

    W   L  W_PCT HOME_RECORD ROAD_RECORD  RETURNTOPLAY  
0  19  11  0.633        10-3         9-8           NaN  
1  19  11  0.633        13-2         6-9           NaN  
2  19  12  0.613        13-4         6-8           NaN  
3  19  13  0.594        14-4         5-9           NaN  
4  19  14  0.576        11-7         8-7           NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210342 entries, 0 to 210341
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------    

In [33]:
# Definir columnas que queremos conservar
cols_keep = ["TEAM_ID", "SEASON_ID", "LEAGUE_ID", "TEAM", 
             "G", "W", "L", "W_PCT", "HOME_RECORD", "ROAD_RECORD"]

# Crear nuevo DataFrame solo con esas columnas
ranking_filtered = ranking_df[cols_keep]

# Verificar el cambio
print("Columnas originales:", ranking_df.shape[1])
print("Columnas después de filtrar:", ranking_filtered.shape[1])
print("Se eliminaron:", ranking_df.shape[1] - ranking_filtered.shape[1], "columnas")


Columnas originales: 13
Columnas después de filtrar: 10
Se eliminaron: 3 columnas


# NULOS common_player_info.csv

In [39]:
import pandas as pd

# Cargar la tabla desde la carpeta csv_export
common_player_info_df = pd.read_csv("csv_export/common_player_info.csv", low_memory=False)

print("🔹 Tabla: common_player_info")
print("Shape:", common_player_info_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", common_player_info_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(common_player_info_df.isna().sum()[common_player_info_df.isna().sum() > 0])


🔹 Tabla: common_player_info
Shape: (3632, 8)

Total nulos en la tabla: 196

Nulos por columna:
position    45
height      74
weight      77
dtype: int64


In [41]:
print(common_player_info_df.dtypes)

person_id               int64
display_first_last     object
position               object
season_exp            float64
height                 object
weight                float64
from_year             float64
to_year               float64
dtype: object


# Paso 1: Conversión de height a pulgadas

In [42]:
def parse_height(h):
    try:
        feet, inches = h.split("-")
        return int(feet) * 12 + int(inches)   # convertir a pulgadas
    except:
        return None  # si falla, lo dejamos como NaN

common_player_info_df["height_inches"] = common_player_info_df["height"].apply(parse_height)

# Paso 2: Rellenar nulos en height_inches con la media

In [43]:
common_player_info_df["height_inches"] = common_player_info_df["height_inches"].fillna(
    common_player_info_df["height_inches"].mean()
)

# Paso 3: Rellenar nulos en weight con la media

In [44]:
common_player_info_df["weight"] = common_player_info_df["weight"].fillna(
    common_player_info_df["weight"].mean()
)

# Paso 4: Rellenar position con "Unknown"

In [45]:
common_player_info_df["position"] = common_player_info_df["position"].fillna("Unknown")


In [46]:
print("Nulos después de la limpieza:")
print(common_player_info_df.isna().sum())

print("\nEjemplo de datos transformados:")
print(common_player_info_df[["display_first_last", "position", "height", "height_inches", "weight"]].head(10))


Nulos después de la limpieza:
person_id              0
display_first_last     0
position               0
season_exp             0
height                74
weight                 0
from_year              0
to_year                0
height_inches          0
dtype: int64

Ejemplo de datos transformados:
    display_first_last position height  height_inches  weight
0       Alaa Abdelnaby  Forward   6-10           82.0   240.0
1      Zaid Abdul-Aziz   Center    6-9           81.0   235.0
2  Kareem Abdul-Jabbar   Center    7-2           86.0   225.0
3  Shareef Abdur-Rahim  Forward    6-9           81.0   245.0
4          Forest Able    Guard    6-3           75.0   180.0
5       John Abramovic  Forward    6-3           75.0   195.0
6         Alex Abrines    Guard    6-6           78.0   200.0
7     Precious Achiuwa  Forward    6-8           80.0   225.0
8           Quincy Acy  Forward    6-7           79.0   240.0
9          Alvan Adams   Center    6-9           81.0   210.0


# NULOS game_info.csv

In [48]:
import pandas as pd

# Cargar la tabla desde la carpeta csv_export
game_info_df = pd.read_csv("csv_export/game_info.csv", low_memory=False)

print("🔹 Tabla: game_info")
print("Shape:", game_info_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", game_info_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(game_info_df.isna().sum()[game_info_df.isna().sum() > 0])


🔹 Tabla: game_info
Shape: (58053, 3)

Total nulos en la tabla: 5380

Nulos por columna:
attendance    5380
dtype: int64


In [49]:
# Rellenar attendance con 0 en los nulos
game_info_df["attendance"] = game_info_df["attendance"].fillna(0)

# Verificación
print("Nulos después de limpiar:")
print(game_info_df.isna().sum())

Nulos después de limpiar:
game_id       0
game_date     0
attendance    0
dtype: int64


# NULOS game.csv

In [52]:
# Cargar la tabla game.csv desde csv_export
game_df = pd.read_csv("csv_export/game.csv", low_memory=False)

print("🔹 Tabla: game")
print("Shape:", game_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", game_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(game_df.isna().sum()[game_df.isna().sum() > 0])


🔹 Tabla: game
Shape: (65698, 37)

Total nulos en la tabla: 357715

Nulos por columna:
wl_home          2
fgm_home        13
fga_home     15447
fg3m_home    13218
fg3a_home    18683
ftm_home        16
fta_home      3004
fgm_away        13
fga_away     15447
fg3m_away    13218
fg3a_away    18683
ftm_away        13
fta_away      3004
oreb_home    18936
dreb_home    18999
reb_home     15729
oreb_away    18936
dreb_away    18998
reb_away     15725
ast_home     15805
stl_home     18849
blk_home     18626
tov_home     18684
pf_home       2856
ast_away     15801
stl_away     18849
blk_away     18625
tov_away     18685
pf_away       2851
dtype: int64


In [53]:
# 1. Verificación de IDs críticos
id_cols = ["game_id", "team_id_home", "team_id_away", "season_id", "game_date"]
print("Nulos en IDs críticos:")
print(game_df[id_cols].isna().sum())

# Eliminar filas con nulos en IDs
game_df = game_df.dropna(subset=id_cols)

# 2. Rellenar wl_home con cálculo a partir de puntos
mask_wl = game_df["wl_home"].isna()
game_df.loc[mask_wl, "wl_home"] = game_df.apply(
    lambda row: "W" if row["pts_home"] > row["pts_away"] else "L", axis=1
)

# 3. Rellenar estadísticas numéricas con 0
num_cols = game_df.select_dtypes(include=["int64", "float64"]).columns
# Excluir puntos y porcentajes si prefieres recalcularlos luego
exclude_cols = ["pts_home", "pts_away"]
fill_cols = [c for c in num_cols if c not in exclude_cols]

game_df[fill_cols] = game_df[fill_cols].fillna(0)

# 4. Verificación final
print("\nNulos después de limpiar:")
print(game_df.isna().sum().sum())


Nulos en IDs críticos:
game_id         0
team_id_home    0
team_id_away    0
season_id       0
game_date       0
dtype: int64

Nulos después de limpiar:
0


In [55]:
# Total de nulos
print("\nTotal nulos en la tabla:", game_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(game_df.isna().sum()[game_df.isna().sum() > 0])



Total nulos en la tabla: 0

Nulos por columna:
Series([], dtype: int64)


# NULOS line_score.csv

In [56]:
# Cargar la tabla line_score.csv desde csv_export
line_score_df = pd.read_csv("csv_export/line_score.csv", low_memory=False)

print("🔹 Tabla: line_score")
print("Shape:", line_score_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", line_score_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(line_score_df.isna().sum()[line_score_df.isna().sum() > 0])


🔹 Tabla: line_score
Shape: (58053, 23)

Total nulos en la tabla: 314021

Nulos por columna:
pts_qtr1_home     1004
pts_qtr2_home     1013
pts_qtr3_home     1045
pts_qtr4_home     1044
pts_ot1_home     25759
pts_ot2_home     27051
pts_ot3_home     27243
pts_ot4_home     27270
pts_ot5_home     45577
pts_qtr1_away     1010
pts_qtr2_away     1013
pts_qtr3_away     1046
pts_qtr4_away     1046
pts_ot1_away     25759
pts_ot2_away     27051
pts_ot3_away     27243
pts_ot4_away     27270
pts_ot5_away     45577
dtype: int64


In [57]:
# 1. Verificación de IDs
id_cols = ["game_id", "team_id_home", "team_id_away"]
print("Nulos en IDs críticos:")
print(line_score_df[id_cols].isna().sum())

# Eliminar filas con nulos en IDs
line_score_df = line_score_df.dropna(subset=id_cols)

# 2. Rellenar parciales y OT con 0
score_cols = [c for c in line_score_df.columns if "pts_qtr" in c or "pts_ot" in c]
line_score_df[score_cols] = line_score_df[score_cols].fillna(0)

# 3. Recalcular totales si existen y tienen nulos
if "pts_home" in line_score_df.columns and "pts_away" in line_score_df.columns:
    line_score_df["pts_home"] = line_score_df[[c for c in line_score_df.columns if "_home" in c and ("qtr" in c or "ot" in c)]].sum(axis=1)
    line_score_df["pts_away"] = line_score_df[[c for c in line_score_df.columns if "_away" in c and ("qtr" in c or "ot" in c)]].sum(axis=1)

# 4. Verificación final
print("\nNulos después de limpiar:")
print(line_score_df.isna().sum().sum())

Nulos en IDs críticos:
game_id         0
team_id_home    0
team_id_away    0
dtype: int64

Nulos después de limpiar:
0


# Nulos de other_stats.csv

In [58]:


# Cargar la tabla other_stats.csv desde csv_export
other_stats_df = pd.read_csv("csv_export/other_stats.csv", low_memory=False)

print("🔹 Tabla: other_stats")
print("Shape:", other_stats_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", other_stats_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(other_stats_df.isna().sum()[other_stats_df.isna().sum() > 0])


🔹 Tabla: other_stats
Shape: (28271, 19)

Total nulos en la tabla: 8878

Nulos por columna:
team_turnovers_home        2
team_turnovers_away        2
total_turnovers_home     316
total_turnovers_away     316
team_rebounds_home      1998
team_rebounds_away      1998
pts_off_to_home         2123
pts_off_to_away         2123
dtype: int64


In [59]:
# Verificar IDs críticos
id_cols = ["game_id", "team_id_home", "team_id_away"]
print("Nulos en IDs críticos:")
print(other_stats_df[id_cols].isna().sum())

# Rellenar métricas numéricas con 0
num_cols = other_stats_df.select_dtypes(include=["int64", "float64"]).columns
other_stats_df[num_cols] = other_stats_df[num_cols].fillna(0)

# Verificación
print("\nNulos después de limpiar:")
print(other_stats_df.isna().sum().sum())


Nulos en IDs críticos:
game_id         0
team_id_home    0
team_id_away    0
dtype: int64

Nulos después de limpiar:
0


# NULOS PLAY_BY_PLAY

In [61]:
# Cargar la tabla play_by_play.csv desde csv_export
play_df = pd.read_csv("csv_export/play_by_play.csv", low_memory=False)

print("🔹 Tabla: play_by_play")
print("Shape:", play_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", play_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(play_df.isna().sum()[play_df.isna().sum() > 0])

🔹 Tabla: play_by_play
Shape: (13592899, 14)

Total nulos en la tabla: 44180115

Nulos por columna:
score              10028436
scoremargin        10028436
player1_team_id     1215858
player2_team_id     9660454
player3_team_id    13246931
dtype: int64


In [62]:
# 1. Verificar IDs críticos
id_cols = ["game_id", "eventnum", "period"]
print("Nulos en IDs críticos:")
print(play_df[id_cols].isna().sum())

# 2. Eliminar filas con nulos en IDs (si existen)
play_df = play_df.dropna(subset=id_cols)

# 3. Mantener NaN en score, scoremargin, player2_id, player2_team_id, player3_id, player3_team_id
# (son nulos válidos)

# Verificación
print("\nNulos después de la limpieza (solo IDs eliminados):")
print(play_df.isna().sum())


Nulos en IDs críticos:
game_id     0
eventnum    0
period      0
dtype: int64

Nulos después de la limpieza (solo IDs eliminados):
game_id                      0
eventnum                     0
eventmsgtype                 0
eventmsgactiontype           0
period                       0
pctimestring                 0
score                 10028436
scoremargin           10028436
player1_id                   0
player1_team_id        1215858
player2_id                   0
player2_team_id        9660454
player3_id                   0
player3_team_id       13246931
dtype: int64


# NULOS  PLAYER.CSV

In [63]:
# Cargar la tabla player.csv desde csv_export
player_df = pd.read_csv("csv_export/player.csv", low_memory=False)

print("🔹 Tabla: player")
print("Shape:", player_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", player_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(player_df.isna().sum()[player_df.isna().sum() > 0])


🔹 Tabla: player
Shape: (4815, 2)

Total nulos en la tabla: 0

Nulos por columna:
Series([], dtype: int64)


# NULOS TEAM_INFO_COMMON

In [64]:
# Cargar la tabla team_info_common.csv desde csv_export
team_info_common_df = pd.read_csv("csv_export/team_info_common.csv", low_memory=False)

print("🔹 Tabla: team_info_common")
print("Shape:", team_info_common_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", team_info_common_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(team_info_common_df.isna().sum()[team_info_common_df.isna().sum() > 0])


🔹 Tabla: team_info_common
Shape: (0, 12)

Total nulos en la tabla: 0

Nulos por columna:
Series([], dtype: int64)


# NULOS TEAM

In [65]:
# Cargar la tabla team.csv desde csv_export
team_df = pd.read_csv("csv_export/team.csv", low_memory=False)

print("🔹 Tabla: team")
print("Shape:", team_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", team_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(team_df.isna().sum()[team_df.isna().sum() > 0])


🔹 Tabla: team
Shape: (30, 5)

Total nulos en la tabla: 0

Nulos por columna:
Series([], dtype: int64)


# NULOS RANKING

In [66]:
# Cargar ranking.csv desde la raíz
ranking_df = pd.read_csv("ranking.csv", low_memory=False)

print("🔹 Tabla: ranking")
print("Shape:", ranking_df.shape)

# Total de nulos
print("\nTotal nulos en la tabla:", ranking_df.isna().sum().sum())

# Nulos por columna
print("\nNulos por columna:")
print(ranking_df.isna().sum()[ranking_df.isna().sum() > 0])


🔹 Tabla: ranking
Shape: (210342, 13)

Total nulos en la tabla: 206352

Nulos por columna:
RETURNTOPLAY    206352
dtype: int64


In [67]:
# Eliminar la columna RETURNTOPLAY porque no es relevante y tiene demasiados nulos
if "RETURNTOPLAY" in ranking_df.columns:
    ranking_df = ranking_df.drop(columns=["RETURNTOPLAY"])

# Verificación
print("Shape después de limpieza:", ranking_df.shape)
print("Nulos después de limpieza:", ranking_df.isna().sum().sum())

Shape después de limpieza: (210342, 12)
Nulos después de limpieza: 0


In [68]:
import os


os.makedirs("csv_clean", exist_ok=True)

# Guardar cada DataFrame limpio
game_df.to_csv("csv_clean/game.csv", index=False)
other_stats_df.to_csv("csv_clean/other_stats.csv", index=False)
line_score_df.to_csv("csv_clean/line_score.csv", index=False)
game_info_df.to_csv("csv_clean/game_info.csv", index=False)
team_df.to_csv("csv_clean/team.csv", index=False)
player_df.to_csv("csv_clean/player.csv", index=False)
play_df.to_csv("csv_clean/play_by_play.csv", index=False)
team_info_common_df.to_csv("csv_clean/team_info_common.csv", index=False)
common_player_info_df.to_csv("csv_clean/common_player_info.csv", index=False)
ranking_df.to_csv("csv_clean/ranking.csv", index=False)

print("✅ Todos los CSV limpios fueron exportados a la carpeta 'csv_clean'")


✅ Todos los CSV limpios fueron exportados a la carpeta 'csv_clean'
