In [6]:
import pandas as pd

# Path of the raw qualifying-matches CSV. Double check it before running:
# the read below fails if the file is not there.
file_path = '/content/drive/MyDrive/TFM/qualifying_1960-2024.csv'
data = pd.read_csv(file_path)

# Basic dataset overview (columns, non-null counts, dtypes).
data.info()

# Null-value analysis: count the missing entries of every column and print
# only the columns that actually have some.
null_values = data.isna().sum()
print("Valores Nulos por Columna:")
print(null_values.loc[null_values.gt(0)])

# Out-of-range analysis for specific columns.
# Maps each column to its valid (minimum, maximum) range, both ends
# inclusive; float('inf') means "no upper limit".
columns_to_check = dict(
    home_score=(0, float('inf')),
    away_score=(0, float('inf')),
    stadium_latitude=(-90, 90),
    stadium_longitude=(-180, 180),
    match_attendance=(0, float('inf')),
    condition_temperature=(-50, 50),
    condition_humidity=(0, 100),
    condition_wind_speed=(0, 200),
)

# Helper that identifies out-of-range values.
def find_anomalies(df, col, valid_range):
    """Return the rows of *df* whose *col* value lies outside *valid_range*.

    Args:
        df: DataFrame to inspect.
        col: Name of the column to check.
        valid_range: Sequence whose first two items are the inclusive
            lower and upper bounds of the valid range.

    Returns:
        The subset of *df* (original index kept) where the value is
        strictly below the lower bound or strictly above the upper bound.
        Missing values (NaN) are never reported, because both comparisons
        evaluate to False for them.
    """
    lower = valid_range[0]
    upper = valid_range[1]
    values = df[col]
    out_of_range = values.lt(lower) | values.gt(upper)
    return df.loc[out_of_range]

# Anomaly analysis: for every checked column, collect the rows whose value
# falls outside that column's valid range.
anomalies = {
    col: find_anomalies(data, col, valid_range)
    for col, valid_range in columns_to_check.items()
}

# Summary of the anomalies found: number of out-of-range rows per column.
anomalies_summary = {col: len(rows) for col, rows in anomalies.items()}
print("\nAnomalías Encontradas:")
print(anomalies_summary)

# Show the offending rows for every column that has any.
for col, df in anomalies.items():
    if not df.empty:
        print(f"\nAnomalías en {col}:")
        print(df)

# Example imputation of null values (adjust as needed).
# Only genuinely missing entries are filled with the column median; zeros
# are real values and are left untouched.
for col in ['home_score', 'away_score', 'home_score_total', 'away_score_total']:
    if data[col].isnull().any():
        median_value = data[col].median()
        # Assign the result back to the frame. Calling
        # fillna(..., inplace=True) on the `data[col]` selection acts on an
        # intermediate object: pandas 2.x warns about it and with
        # Copy-on-Write enabled it leaves `data` unchanged.
        data[col] = data[col].fillna(median_value)

# Fix out-of-range values in 'condition_temperature'.
# We assume they can be replaced by a value inside the allowed range, so
# they are overwritten with the column median. The bounds come from
# columns_to_check so the check and the fix cannot drift apart.
temperature_min, temperature_max = columns_to_check['condition_temperature']
temperature_out_of_range = (
    (data['condition_temperature'] < temperature_min)
    | (data['condition_temperature'] > temperature_max)
)
data.loc[temperature_out_of_range, 'condition_temperature'] = (
    data['condition_temperature'].median()
)

# Save the cleaned dataset.
data.to_csv('/content/drive/MyDrive/TFM/qualifying_1960-2024_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2845 entries, 0 to 2844
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id_match               2845 non-null   int64  
 1   home_team              2845 non-null   object 
 2   away_team              2845 non-null   object 
 3   home_team_code         2845 non-null   object 
 4   away_team_code         2845 non-null   object 
 5   home_score             2842 non-null   float64
 6   away_score             2842 non-null   float64
 7   home_penalty           7 non-null      float64
 8   away_penalty           7 non-null      float64
 9   home_score_total       2842 non-null   float64
 10  away_score_total       2842 non-null   float64
 11  winner                 2291 non-null   object 
 12  winner_reason          2842 non-null   object 
 13  year                   2845 non-null   int64  
 14  date                   2845 non-null   object 
 15  date