In [5]:
# --- Importación de librerías ---
import pandas as pd
import numpy as np

# --- Configuración opcional ---
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

# --- Cargar Datasets ---
races = pd.read_csv("../Data/races.csv")
drivers = pd.read_csv("../Data/drivers.csv")
results = pd.read_csv("../Data/results.csv")
constructors = pd.read_csv("../Data/constructors.csv")
lap_times = pd.read_csv("../Data/lap_times.csv")

print("✅ Archivos cargados correctamente.")

# --- Inspección inicial ---
print("\n--- Estructura de cada Dataset ---")
print("races:", races.shape)
print("drivers:", drivers.shape)
print("results:", results.shape)
print("constructors:", constructors.shape)
print("lap_times:", lap_times.shape)

# Mostrar primeras filas de cada uno
print("\nEjemplo de 'drivers':")
display(drivers.head())

# ============================================
# 2. LIMPIEZA BÁSICA DE CADA DataFRAME
# ============================================

# --- Limpieza: eliminar duplicados ---
races.drop_duplicates(inplace=True)
drivers.drop_duplicates(inplace=True)
results.drop_duplicates(inplace=True)
constructors.drop_duplicates(inplace=True)
lap_times.drop_duplicates(inplace=True)

# --- Revisar valores nulos ---
print("\n--- Valores nulos ---")
print("races:", races.isna().sum().sum())
print("drivers:", drivers.isna().sum().sum())
print("results:", results.isna().sum().sum())
print("constructors:", constructors.isna().sum().sum())
print("lap_times:", lap_times.isna().sum().sum())

# --- Rellenar o eliminar valores nulos relevantes ---
# (Ejemplo: si un valor es crítico se elimina, si no, se reemplaza por 'Desconocido')
drivers['code'] = drivers['code'].fillna("UNK")
races['name'] = races['name'].fillna("Unknown Race")

# ============================================
# 3. UNIÓN DE TABLAS PRINCIPALES
# ============================================

# --- Unir results con drivers ---
merged = results.merge(drivers, on='driverId', how='left')

# --- Unir con races ---
merged = merged.merge(races[['raceId', 'year', 'name', 'round', 'circuitId', 'date']], on='raceId', how='left')

# --- Unir con constructors ---
merged = merged.merge(constructors[['constructorId', 'name']], on='constructorId', how='left', suffixes=('', '_team'))

# Renombrar columnas para mayor claridad
merged.rename(columns={
    'name': 'race_name',
    'name_team': 'team_name',
    'forename': 'driver_firstname',
    'surname': 'driver_lastname'
}, inplace=True)

# Crear una columna con el nombre completo del piloto
merged['driver_fullname'] = merged['driver_firstname'] + " " + merged['driver_lastname']

# ============================================
# 4. LIMPIEZA FINAL Y VALIDACIÓN
# ============================================

# --- Eliminar columnas que no aportan al análisis ---
cols_to_drop = ['number', 'positionText', 'positionOrder', 'milliseconds', 'statusId']
merged.drop(columns=[c for c in cols_to_drop if c in merged.columns], inplace=True, errors='ignore')

# --- Verificar duplicados finales ---
merged.drop_duplicates(inplace=True)

# --- Revisar tipos de datos ---
print("\n--- Tipos de datos ---")
print(merged.dtypes)

# --- Estadísticas básicas ---
print("\n--- Estadísticas ---")
display(merged.describe())

# --- Vista previa del Dataset limpio ---
print("\nEjemplo del Dataset limpio:")
display(merged.head(10))

# ============================================
# 5. GUARDAR EL DataSET LIMPIO
# ============================================

merged.to_csv("../Data/f1_clean_Dataset.csv", index=False)
print("\n💾 Dataset limpio guardado como: Data/f1_clean_Dataset.csv")

# ============================================
# 6. RESUMEN FINAL
# ============================================

print(f"""
✅ Limpieza completada:
- Total de filas finales: {len(merged)}
- Columnas: {len(merged.columns)}
- Pilotos únicos: {merged['driver_fullname'].nunique()}
- Equipos únicos: {merged['team_name'].nunique()}
- Años cubiertos: {merged['year'].min()} - {merged['year'].max()}
""")

✅ Archivos cargados correctamente.

--- Estructura de cada Dataset ---
races: (1125, 18)
drivers: (861, 9)
results: (26759, 18)
constructors: (212, 5)
lap_times: (589081, 6)

Ejemplo de 'drivers':


Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen



--- Valores nulos ---
races: 0
drivers: 0
results: 0
constructors: 0
lap_times: 0

--- Tipos de datos ---
resultId              int64
raceId                int64
driverId              int64
constructorId         int64
number_x             object
grid                  int64
position             object
points              float64
laps                  int64
time                 object
fastestLap           object
rank                 object
fastestLapTime       object
fastestLapSpeed      object
driverRef            object
number_y             object
code                 object
driver_firstname     object
driver_lastname      object
dob                  object
nationality          object
url                  object
year                  int64
race_name            object
round                 int64
circuitId             int64
date                 object
team_name            object
driver_fullname      object
dtype: object

--- Estadísticas ---


Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,year,round,circuitId
count,26759.0,26759.0,26759.0,26759.0,26759.0,26759.0,26759.0,26759.0,26759.0,26759.0
mean,13380.977391,551.687283,278.67353,50.180537,11.134796,1.987632,46.301768,1991.394372,8.511192,23.820808
std,7726.134642,313.265036,282.703039,61.551498,7.20286,4.351209,29.496557,19.952885,5.070231,19.112002
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1950.0,1.0,1.0
25%,6690.5,300.0,57.0,6.0,5.0,0.0,23.0,1977.0,4.0,9.0
50%,13380.0,531.0,172.0,25.0,11.0,0.0,53.0,1991.0,8.0,18.0
75%,20069.5,811.0,399.5,63.0,17.0,2.0,66.0,2009.0,12.0,34.0
max,26764.0,1144.0,862.0,215.0,34.0,50.0,200.0,2024.0,24.0,80.0



Ejemplo del Dataset limpio:


Unnamed: 0,resultId,raceId,driverId,constructorId,number_x,grid,position,points,laps,time,fastestLap,rank,fastestLapTime,fastestLapSpeed,driverRef,number_y,code,driver_firstname,driver_lastname,dob,nationality,url,year,race_name,round,circuitId,date,team_name,driver_fullname
0,1,18,1,1,22,1,1,10.0,58,1:34:50.616,39,2,1:27.452,218.3,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,2008,Australian Grand Prix,1,1,2008-03-16,McLaren,Lewis Hamilton
1,2,18,2,2,3,5,2,8.0,58,+5.478,41,3,1:27.739,217.586,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,2008,Australian Grand Prix,1,1,2008-03-16,BMW Sauber,Nick Heidfeld
2,3,18,3,3,7,7,3,6.0,58,+8.163,41,5,1:28.090,216.719,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,2008,Australian Grand Prix,1,1,2008-03-16,Williams,Nico Rosberg
3,4,18,4,4,5,11,4,5.0,58,+17.181,58,7,1:28.603,215.464,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,2008,Australian Grand Prix,1,1,2008-03-16,Renault,Fernando Alonso
4,5,18,5,1,23,3,5,4.0,58,+18.014,43,1,1:27.418,218.385,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,2008,Australian Grand Prix,1,1,2008-03-16,McLaren,Heikki Kovalainen
5,6,18,6,3,8,13,6,3.0,57,\N,50,14,1:29.639,212.974,nakajima,\N,NAK,Kazuki,Nakajima,1985-01-11,Japanese,http://en.wikipedia.org/wiki/Kazuki_Nakajima,2008,Australian Grand Prix,1,1,2008-03-16,Williams,Kazuki Nakajima
6,7,18,7,5,14,17,7,2.0,55,\N,54,8,1:29.534,213.224,bourdais,\N,BOU,Sébastien,Bourdais,1979-02-28,French,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bo...,2008,Australian Grand Prix,1,1,2008-03-16,Toro Rosso,Sébastien Bourdais
7,8,18,8,6,1,15,8,1.0,53,\N,20,4,1:27.903,217.18,raikkonen,7,RAI,Kimi,Räikkönen,1979-10-17,Finnish,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C...,2008,Australian Grand Prix,1,1,2008-03-16,Ferrari,Kimi Räikkönen
8,9,18,9,2,4,2,\N,0.0,47,\N,15,9,1:28.753,215.1,kubica,88,KUB,Robert,Kubica,1984-12-07,Polish,http://en.wikipedia.org/wiki/Robert_Kubica,2008,Australian Grand Prix,1,1,2008-03-16,BMW Sauber,Robert Kubica
9,10,18,10,7,12,18,\N,0.0,43,\N,23,13,1:29.558,213.166,glock,\N,GLO,Timo,Glock,1982-03-18,German,http://en.wikipedia.org/wiki/Timo_Glock,2008,Australian Grand Prix,1,1,2008-03-16,Toyota,Timo Glock



💾 Dataset limpio guardado como: Data/f1_clean_Dataset.csv

✅ Limpieza completada:
- Total de filas finales: 26759
- Columnas: 29
- Pilotos únicos: 861
- Equipos únicos: 211
- Años cubiertos: 1950 - 2024

