In [1]:
pip install pandas sqlalchemy psycopg2-binary pymongo textblob

Note: you may need to restart the kernel to use updated packages.




In [2]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# --- 1. Connection settings ---
db_user = "postgres"
# The password is a secret: read it from the PGPASSWORD environment variable,
# or prompt for it, instead of committing it to the notebook.
db_pass = os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: ")
db_host = "localhost"
db_port = "5432"
db_name = "proyecto_futbol"

# Build the connection URL. User and password are URL-escaped so characters
# such as '@' or '/' in the credentials cannot corrupt the URL.
db_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_pass)}"
    f"@{db_host}:{db_port}/{db_name}"
)

# --- 2. Create the engine and pull the raw events ---
query = "SELECT * FROM eventos_crudos;"

try:
    engine = create_engine(db_string)

    # create_engine() is lazy and does not connect by itself; open a real
    # connection first so the success message is only printed when it is true.
    with engine.connect():
        print("Conexión a PostgreSQL local exitosa")

    df = pd.read_sql_query(query, engine)

    print(f"Se cargaron {len(df)} registros desde PostgreSQL.")

    # Peek at the first rows
    print(df.head())

except Exception as e:
    print(f"Error al conectar a PostgreSQL: {e}")
    # Re-raise: without `df` every later cell would fail with a confusing
    # NameError, so stop the run here with the real cause.
    raise

Conexión a PostgreSQL local exitosa
Se cargaron 941009 registros desde PostgreSQL.
     id_odsp   id_event  sort_order  time  \
0  UFot0hit/  UFot0hit1           1     2   
1  UFot0hit/  UFot0hit2           2     4   
2  UFot0hit/  UFot0hit3           3     4   
3  UFot0hit/  UFot0hit4           4     7   
4  UFot0hit/  UFot0hit5           5     7   

                                                text  event_type  event_type2  \
0  Attempt missed. Mladen Petric (Hamburg) left f...           1         12.0   
1  Corner,  Borussia Dortmund. Conceded by Dennis...           2          NaN   
2  Corner,  Borussia Dortmund. Conceded by Heiko ...           2          NaN   
3           Foul by Sven Bender (Borussia Dortmund).           3          NaN   
4  Gokhan Tore (Hamburg) wins a free kick in the ...           8          NaN   

   side         event_team           opponent  ... player_in player_out  \
0     2         Hamburg SV  Borussia Dortmund  ...      None       None   
1     1  

In [3]:
# General overview of the dataset (columns, non-null counts, dtypes).
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941009 entries, 0 to 941008
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_odsp        941009 non-null  object 
 1   id_event       941009 non-null  object 
 2   sort_order     941009 non-null  int64  
 3   time           941009 non-null  int64  
 4   text           941009 non-null  object 
 5   event_type     941009 non-null  int64  
 6   event_type2    214293 non-null  float64
 7   side           941009 non-null  int64  
 8   event_team     941009 non-null  object 
 9   opponent       941009 non-null  object 
 10  player         880273 non-null  object 
 11  player2        291313 non-null  object 
 12  player_in      51715 non-null   object 
 13  player_out     51738 non-null   object 
 14  shot_place     227459 non-null  float64
 15  shot_outcome   228498 non-null  float64
 16  is_goal        941009 non-null  int64  
 17  location       467067 non-nul

In [5]:
# Number of missing values in each column of the raw frame
print(df.isna().sum())

id_odsp               0
id_event              0
sort_order            0
time                  0
text                  0
event_type            0
event_type2      726716
side                  0
event_team            0
opponent              0
player            60736
player2          649696
player_in        889294
player_out       889271
shot_place       713550
shot_outcome     712511
is_goal               0
location         473942
bodypart         711824
assist_method         0
situation        711872
fast_break            0
dtype: int64


In [8]:
# Count rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate)
duplicados = df.duplicated(subset=None, keep='first').sum()
print(f"Número de filas duplicadas: {duplicados}")

Número de filas duplicadas: 0


In [9]:
# Keep only the rows whose 'shot_place' is populated.
# .copy() yields an independent frame, avoiding pandas warnings if it is
# modified later on.
tiene_shot_place = df['shot_place'].notna()
df_filtrado = df.loc[tiene_shot_place].copy()

# Report how many rows survive the filter
print(f"Número de filas después de filtrar por eventos de disparo: {len(df_filtrado)}")

Número de filas después de filtrar por eventos de disparo: 227459


In [11]:
# Number of missing values per column in the filtered frame
print(df_filtrado.isna().sum())

id_odsp               0
id_event              0
sort_order            0
time                  0
text                  0
event_type            0
event_type2       60266
side                  0
event_team            0
opponent              0
player                1
player2           60370
player_in        227459
player_out       227459
shot_place            0
shot_outcome          7
is_goal               0
location              7
bodypart              0
assist_method         0
situation             7
fast_break            0
dtype: int64


In [12]:
tamano_objetivo = 160000

# Down-sample only when the filtered frame exceeds the target size;
# random_state is fixed so the same rows are drawn on every run.
if len(df_filtrado) <= tamano_objetivo:
    df_final = df_filtrado
    print("El DataFrame filtrado ya tiene el tamaño adecuado o es menor.")
else:
    df_final = df_filtrado.sample(n=tamano_objetivo, random_state=42)
    print(f"Tamaño del DataFrame final ajustado: {len(df_final)}")

print("\nPrimeras 5 filas del dataset final:")
print(df_final.head())

Tamaño del DataFrame final ajustado: 160000

Primeras 5 filas del dataset final:
          id_odsp     id_event  sort_order  time  \
763513  l4NYrav2/  l4NYrav2107         107    79   
143052  CSIVz9Rt/   CSIVz9Rt33          33    44   
901248  AT8BU3Vt/   AT8BU3Vt71          71    74   
127083  0YfXdzoH/  0YfXdzoH100         100    90   
659214  ptpp6zve/    ptpp6zve8           8     9   

                                                     text  event_type  \
763513  Attempt saved. Caiuby (FC Augsburg) header fro...           1   
143052  Attempt saved. Javier Pinola (FC Nurnberg) lef...           1   
901248  Jimmy Briand (Guingamp) hits the bar with a ri...           1   
127083  Goal!  Villarreal 2, MA¡laga 1. HernA¡n PA©rez...           1   
659214  Goal!  Bordeaux 1, Montpellier 0. Diego Rolan ...           1   

        event_type2  side   event_team          opponent  ... player_in  \
763513         12.0     2  FC Augsburg     Hertha Berlin  ...      None   
143052         12

In [14]:
# Display df_filtrado (the rows with a non-null 'shot_place') as the cell output.
df_filtrado

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
11,UFot0hit/,UFot0hit12,12,14,Attempt missed. Shinji Kagawa (Borussia Dortmu...,1,12.0,1,Borussia Dortmund,Hamburg SV,...,,,13.0,2.0,0,15.0,1.0,1,1.0,0
13,UFot0hit/,UFot0hit14,14,17,"Goal! Borussia Dortmund 1, Hamburg 0. Kevin G...",1,12.0,1,Borussia Dortmund,Hamburg SV,...,,,4.0,1.0,1,9.0,2.0,1,1.0,0
14,UFot0hit/,UFot0hit15,15,19,Attempt blocked. Mats Hummels (Borussia Dortmu...,1,,1,Borussia Dortmund,Hamburg SV,...,,,2.0,3.0,0,15.0,1.0,0,1.0,0
17,UFot0hit/,UFot0hit18,18,20,Attempt blocked. Tomas Rincon (Hamburg) right ...,1,,2,Hamburg SV,Borussia Dortmund,...,,,2.0,3.0,0,15.0,1.0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940983,z5L2OT5E/,z5L2OT5E102,102,81,Attempt blocked. Remo Freuler (Atalanta) right...,1,,1,Atalanta,Sampdoria,...,,,2.0,3.0,0,15.0,1.0,0,1.0,0
940991,z5L2OT5E/,z5L2OT5E110,110,84,Attempt missed. Alberto Grassi (Atalanta) righ...,1,12.0,1,Atalanta,Sampdoria,...,,,10.0,2.0,0,15.0,1.0,1,1.0,0
940992,z5L2OT5E/,z5L2OT5E111,111,86,Attempt saved. Alejandro Gomez (Atalanta) righ...,1,12.0,1,Atalanta,Sampdoria,...,,,5.0,1.0,0,9.0,1.0,1,1.0,0
940993,z5L2OT5E/,z5L2OT5E112,112,87,Attempt saved. Fabio Quagliarella (Sampdoria) ...,1,12.0,2,Sampdoria,Atalanta,...,,,5.0,1.0,0,15.0,1.0,1,1.0,0


In [13]:
# Display df_final (the frame produced by the sampling step) as the cell output.
df_final

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
763513,l4NYrav2/,l4NYrav2107,107,79,Attempt saved. Caiuby (FC Augsburg) header fro...,1,12.0,2,FC Augsburg,Hertha Berlin,...,,,12.0,1.0,0,3.0,3.0,2,1.0,0
143052,CSIVz9Rt/,CSIVz9Rt33,33,44,Attempt saved. Javier Pinola (FC Nurnberg) lef...,1,12.0,1,Nurnberg,Bayer Leverkusen,...,,,3.0,1.0,0,9.0,2.0,1,1.0,0
901248,AT8BU3Vt/,AT8BU3Vt71,71,74,Jimmy Briand (Guingamp) hits the bar with a ri...,1,,1,Guingamp,Bordeaux,...,,,7.0,4.0,0,3.0,1.0,0,1.0,0
127083,0YfXdzoH/,0YfXdzoH100,100,90,"Goal! Villarreal 2, MA¡laga 1. HernA¡n PA©rez...",1,,1,Villarreal,Malaga,...,,,4.0,1.0,1,15.0,2.0,0,3.0,0
659214,ptpp6zve/,ptpp6zve8,8,9,"Goal! Bordeaux 1, Montpellier 0. Diego Rolan ...",1,12.0,1,Bordeaux,Montpellier,...,,,3.0,1.0,1,11.0,1.0,4,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41022,tpCQIFZr/,tpCQIFZr75,75,63,Attempt saved. Landry N'GuA©mo (Bordeaux) righ...,1,,2,Bordeaux,AC Ajaccio,...,,,13.0,1.0,0,15.0,1.0,0,1.0,0
507189,0hbyNSHF/,0hbyNSHF50,50,50,Attempt saved. Diogo Figueiras (FC Sevilla) le...,1,,2,Sevilla,Atletico Madrid,...,,,11.0,1.0,0,15.0,2.0,0,1.0,0
918877,lWF1kiFf/,lWF1kiFf80,80,85,Attempt missed. Marc Pugh (Bournemouth) right ...,1,12.0,1,Bournemouth,Leicester City,...,,,6.0,2.0,0,15.0,1.0,1,1.0,0
518282,fsKhx9is/,fsKhx9is93,93,90,Attempt saved. Riccardo Meggiorini (Chievo) ri...,1,12.0,2,Chievo Verona,AS Roma,...,,,5.0,1.0,0,15.0,1.0,1,1.0,0


In [15]:
# Export the final frame to disk.
# With orient='records' and lines=True pandas writes JSON Lines: one JSON
# object per row, one row per line (not a single JSON array).
df_final.to_json(
    path_or_buf='eventos_deportivos_limpios.json',
    orient='records',
    lines=True,
)

print("DataFrame guardado como 'eventos_deportivos_limpios.json'")

DataFrame guardado como 'eventos_deportivos_limpios.json'


In [18]:
import getpass
import os

import pymongo
import pandas as pd

# --- 1. Connection settings ---
# The connection string embeds a username and password, so it is read from the
# MONGO_URI environment variable (or prompted for) rather than hardcoded.
# NOTE(review): the credentials previously committed in this cell should be
# rotated, since they remain in the notebook's history.
MONGO_URI = os.environ.get("MONGO_URI") or getpass.getpass("MongoDB URI: ")

# Defined before the try block so the finally clause can always test it.
client = None

try:
    # --- 2. Connect to MongoDB ---
    client = pymongo.MongoClient(MONGO_URI)
    # Ping the server so connection problems surface here.
    client.admin.command('ping')
    print("¡Conexión a MongoDB Atlas exitosa!")

    # --- 3. Select database and collection ---
    db = client['proyecto_futbol']
    collection = db['eventos_procesados']

    # --- 4. Prepare the data: one dict per DataFrame row ---
    datos_para_mongo = df_final.to_dict(orient='records')

    # --- 5. Remove every existing document from the collection ---
    print("Limpiando colección existente...")
    collection.delete_many({})

    # --- 6. Insert the documents in batches ---
    batch_size = 10000  # documents per insert_many call
    total_docs = len(datos_para_mongo)
    print(f"Iniciando inserción de {total_docs} documentos en lotes de {batch_size}...")

    for i in range(0, total_docs, batch_size):
        # Slice out the current batch
        batch = datos_para_mongo[i:i + batch_size]

        # Insert the batch
        collection.insert_many(batch)

        print(f"  -> Lote {i//batch_size + 1} insertado ({i + len(batch)}/{total_docs} documentos)")

    print("¡Carga de datos a MongoDB Atlas completada!")

except Exception as e:
    print(f"Error durante el proceso: {e}")
    # Re-raise: a failure after delete_many leaves the collection empty or
    # partially loaded, so the notebook must not continue as if it succeeded.
    raise

finally:
    # Close the client whether or not the load succeeded.
    if client is not None:
        client.close()

¡Conexión a MongoDB Atlas exitosa!
Limpiando colección existente...
Iniciando inserción de 160000 documentos en lotes de 10000...
  -> Lote 1 insertado (10000/160000 documentos)
  -> Lote 2 insertado (20000/160000 documentos)
  -> Lote 3 insertado (30000/160000 documentos)
  -> Lote 4 insertado (40000/160000 documentos)
  -> Lote 5 insertado (50000/160000 documentos)
  -> Lote 6 insertado (60000/160000 documentos)
  -> Lote 7 insertado (70000/160000 documentos)
  -> Lote 8 insertado (80000/160000 documentos)
  -> Lote 9 insertado (90000/160000 documentos)
  -> Lote 10 insertado (100000/160000 documentos)
  -> Lote 11 insertado (110000/160000 documentos)
  -> Lote 12 insertado (120000/160000 documentos)
  -> Lote 13 insertado (130000/160000 documentos)
  -> Lote 14 insertado (140000/160000 documentos)
  -> Lote 15 insertado (150000/160000 documentos)
  -> Lote 16 insertado (160000/160000 documentos)
✅ ¡Carga de datos a MongoDB Atlas completada!
