Influencia Geográfica

In [None]:
# 1) Instala todo lo necesario
!pip install --quiet kaggle kagglehub[pandas-datasets]

# 2) Sube tu kaggle.json desde tu máquina local
from google.colab import files
files.upload()  # selecciona tu kaggle.json

# 3) Configura la API key
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 4) (Opcional pero recomendado) Define explícitamente la variable de entorno
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/root/.kaggle"

# 5) Descarga y descomprime el dataset con el CLI
!kaggle datasets download \
    stefanoleone992/fifa-22-complete-player-dataset \
    -p ./FIFA22 --unzip

# 6) Comprueba que existen los archivos
import os, logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
download_path = "./FIFA22"
files = os.listdir(download_path)
logging.info(f"Archivos descargados: {files}")

# 7) Lee el CSV sin la coma extra
import pandas as pd
file_name = "players_22.csv"
file_path = os.path.join(download_path, file_name)
df = pd.read_csv(file_path)
print(df.head())


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/stefanoleone992/fifa-22-complete-player-dataset
License(s): CC0-1.0
   sofifa_id                                         player_url  \
0     158023  https://sofifa.com/player/158023/lionel-messi/...   
1     188545  https://sofifa.com/player/188545/robert-lewand...   
2      20801  https://sofifa.com/player/20801/c-ronaldo-dos-...   
3     190871  https://sofifa.com/player/190871/neymar-da-sil...   
4     192985  https://sofifa.com/player/192985/kevin-de-bruy...   

          short_name                            long_name player_positions  \
0           L. Messi       Lionel Andrés Messi Cuccittini       RW, ST, CF   
1     R. Lewandowski                   Robert Lewandowski               ST   
2  Cristiano Ronaldo  Cristiano Ronaldo dos Santos Aveiro           ST, LW   
3          Neymar Jr        Neymar da Silva Santos Júnior          LW, CAM   
4       K. De Bruyne                      Kevin De Bruyne   

  df = pd.read_csv(file_path)


In [None]:
# Lookup table: league name (as spelled in the dataset's `league_name` column)
# -> country in which that league is played. Built from ordered pairs so the
# insertion order matches the listing below.
league_to_country = dict([
    ("French Ligue 1", "France"),
    ("German 1. Bundesliga", "Germany"),
    ("English Premier League", "England"),
    ("Spain Primera Division", "Spain"),
    ("Italian Serie A", "Italy"),
    ("Holland Eredivisie", "Netherlands"),
    ("USA Major League Soccer", "United States"),
    ("Saudi Abdul L. Jameel League", "Saudi Arabia"),
    ("Portuguese Liga ZON SAGRES", "Portugal"),
    ("Campeonato Brasileiro Série A", "Brazil"),
    ("Turkish Süper Lig", "Turkey"),
    ("Chinese Super League", "China"),
    ("Russian Premier League", "Russia"),
    ("Croatian Prva HNL", "Croatia"),
    ("Mexican Liga MX", "Mexico"),
    ("Ukrainian Premier League", "Ukraine"),
    ("Spanish Segunda División", "Spain"),
    ("Greek Super League", "Greece"),
    ("Italian Serie B", "Italy"),
    ("Belgian Jupiler Pro League", "Belgium"),
    ("Argentina Primera División", "Argentina"),
    ("German 2. Bundesliga", "Germany"),
    ("Japanese J. League Division 1", "Japan"),
    ("Swiss Super League", "Switzerland"),
    ("Czech Republic Gambrinus Liga", "Czech Republic"),
    ("Scottish Premiership", "Scotland"),
    ("English League Championship", "England"),
    ("French Ligue 2", "France"),
    ("Australian Hyundai A-League", "Australia"),
    ("Danish Superliga", "Denmark"),
    ("Chilian Campeonato Nacional", "Chile"),
    ("Austrian Football Bundesliga", "Austria"),
    ("Paraguayan Primera División", "Paraguay"),
    ("Ecuadorian Serie A", "Ecuador"),
    ("Uruguayan Primera División", "Uruguay"),
    ("Norwegian Eliteserien", "Norway"),
    ("Swedish Allsvenskan", "Sweden"),
    ("Korean K League 1", "South Korea"),
    ("Colombian Liga Postobón", "Colombia"),
    ("Hungarian Nemzeti Bajnokság I", "Hungary"),
    ("Liga de Fútbol Profesional Boliviano", "Bolivia"),
    ("South African Premier Division", "South Africa"),
    ("UAE Arabian Gulf League", "United Arab Emirates"),
    ("Polish T-Mobile Ekstraklasa", "Poland"),
    ("English League One", "England"),
    ("Romanian Liga I", "Romania"),
    ("Venezuelan Primera División", "Venezuela"),
    ("Peruvian Primera División", "Peru"),
    ("Indian Super League", "India"),
    ("Cypriot First Division", "Cyprus"),
    ("German 3. Bundesliga", "Germany"),
    ("Rep. Ireland Airtricity League", "Republic of Ireland"),
    ("Finnish Veikkausliiga", "Finland"),
    ("English League Two", "England"),
    ("English National League", "England"),
])


In [None]:
import pandas as pd
import numpy as np

# Split the columns by dtype: object (string) columns and numeric columns.
string_cols = df.select_dtypes(include="object").columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Rows with missing values in string columns are kept (no dropna on string_cols).

# Impute missing numeric values with each column's median.
# fillna with a Series fills every column by its matching label and leaves the
# other columns alone. This replaces the chained `df[col].fillna(..., inplace=True)`,
# which pandas warns will stop modifying `df` in pandas 3.0.
df = df.fillna(df[numeric_cols].median())

# Map each player's league to the country where that league is played.
df["club_country"] = df["league_name"].map(league_to_country)

# Report league names that are missing from league_to_country. Rows whose
# league_name is itself missing are excluded so they are not reported as an
# unmapped league.
unmapped_leagues = df.loc[
    df["club_country"].isna() & df["league_name"].notna(), "league_name"
].unique()
print("Ligas sin país asignado:", unmapped_leagues)

# Statistics per player nationality (regardless of the club they play for).
# Note: `media_ataque` is the mean of `attacking_short_passing`.
nationality_stats = df.groupby("nationality_name").agg(
    total_jugadores=("sofifa_id", "count"),
    media_overall=("overall", "mean"),
    media_potencial=("potential", "mean"),
    media_edad=("age", "mean"),
    valor_total_eur=("value_eur", "sum"),
    salario_total_eur=("wage_eur", "sum"),
    media_ataque=("attacking_short_passing", "mean"),
    media_fisico=("physic", "mean")
).reset_index().rename(columns={"nationality_name": "país"})

# Statistics per club, keyed by (country of the league, club).
# groupby drops rows whose key is NaN by default, so players without a mapped
# club_country are not part of these aggregates.
club_detail_stats = df.groupby(["club_country", "club_name"]).agg(
    total_jugadores=("sofifa_id", "count"),
    media_overall=("overall", "mean"),
    media_potencial=("potential", "mean"),
    media_edad=("age", "mean"),
    valor_total_eur=("value_eur", "sum"),
    salario_total_eur=("wage_eur", "sum"),
    media_pase=("passing", "mean"),
    media_defensa=("defending", "mean")
).reset_index().rename(columns={
    "club_country": "país",
    "club_name": "club"
})


Ligas sin país asignado: [nan]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mediana, inplace=True)


In [None]:
# Work on a copy: the Mapper cell further down adds a 'cluster' column through
# this name, and a plain alias would also modify nationality_stats.
nationality_df = nationality_stats.copy()
nationality_df.head()

Unnamed: 0,país,total_jugadores,media_overall,media_potencial,media_edad,valor_total_eur,salario_total_eur,media_ataque,media_fisico
0,Afghanistan,1,64.0,69.0,23.0,875000.0,2000.0,58.0,43.0
1,Albania,46,66.934783,71.586957,25.413043,125805000.0,466150.0,60.304348,65.847826
2,Algeria,51,70.647059,73.72549,26.784314,317095000.0,1048200.0,64.235294,66.72549
3,Andorra,1,64.0,64.0,31.0,400000.0,1000.0,56.0,74.0
4,Angola,17,67.352941,73.411765,24.588235,42875000.0,157600.0,63.470588,62.705882


In [None]:
# Work on a copy: the Mapper cell further down adds a 'cluster' column through
# this name, and a plain alias would also modify club_detail_stats.
clubes_df = club_detail_stats.copy()
clubes_df.head()

Unnamed: 0,país,club,total_jugadores,media_overall,media_potencial,media_edad,valor_total_eur,salario_total_eur,media_pase,media_defensa
0,Argentina,Argentinos Juniors,28,68.071429,71.928571,26.75,47175000.0,189000.0,59.321429,53.821429
1,Argentina,Arsenal de Sarandí,28,65.714286,68.964286,27.428571,29190000.0,100600.0,57.714286,53.035714
2,Argentina,Atlético Tucumán,28,66.821429,71.571429,26.464286,36300000.0,133000.0,58.357143,54.642857
3,Argentina,Boca Juniors,28,71.857143,77.535714,25.428571,154425000.0,306500.0,62.035714,55.071429
4,Argentina,Club Atlético Aldosivi,28,66.857143,69.857143,27.785714,31120000.0,114900.0,59.392857,53.392857


In [None]:
!pip install kmapper

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import kmapper as km

# 1. Data: per-club statistics. A copy is used so the 'cluster' column added
#    below does not modify clubes_df.
#    NOTE: this rebinds the notebook-wide name `df`, which earlier cells use for
#    the player-level frame; those cells cannot be re-run after this one without
#    reloading the CSV first.
df = clubes_df.copy()

# 2. Feature selection
features = [
    'media_overall', 'media_potencial', 'media_edad',
    'valor_total_eur', 'salario_total_eur',
    'media_pase', 'media_defensa'
]
X = df[features].values

# 3. Scaling: standardize every feature before PCA and DBSCAN.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. PCA: the 2-D projection is the Mapper lens and is also reported as text.
pca = PCA(n_components=2)
lens = pca.fit_transform(X_scaled)

# -- PCA text report --
print("=== PCA ===")
print("Varianza explicada por PC:", pca.explained_variance_ratio_)
print("Componentes principales (vectores):\n", pca.components_)

# 5. KeplerMapper setup
mapper = km.KeplerMapper(verbose=1)

# 6. Clustering with DBSCAN on the full scaled data to get one label per club.
#    The same (re-fittable) clusterer object is handed to mapper.map below.
clusterer = DBSCAN(eps=0.5, min_samples=5)
labels = clusterer.fit_predict(X_scaled)

# -- Clustering text report --
print("\n=== DBSCAN ===")
print("Número de clusters (incluye ruido = -1):", len(set(labels)))
print("Tamaño de cada cluster:\n", pd.Series(labels).value_counts().sort_index())

# Attach the cluster label to the working frame.
df['cluster'] = labels

# -- Per-cluster statistics --
print("\n=== Estadísticas de agrupación por cluster ===")
agg = df.groupby('cluster')[features].agg(['count', 'mean'])
print(agg)

# 7. Build the Mapper graph
graph = mapper.map(
    lens,
    X_scaled,
    clusterer=clusterer,
    cover=km.Cover(n_cubes=10, perc_overlap=0.3)
)

# 8. Visualization: color nodes by the integer code of each club's country.
color_values = pd.Categorical(df['país']).codes

# The trailing semicolon stops the notebook from echoing the returned HTML
# document as cell output; the page is still written to path_html.
mapper.visualize(
    graph,
    path_html="clubes_mapper.html",
    title="Clusters de Estilo de Juego por Región",
    color_values=color_values,
    color_function_name=["País"],
    custom_tooltips=df['club'].values
);


=== PCA ===
Varianza explicada por PC: [0.68930463 0.15999935]
Componentes principales (vectores):
 [[ 0.4339491   0.41907593  0.01664739  0.38667601  0.37647232  0.42723424
   0.40250163]
 [ 0.12095231 -0.2183348   0.92715449 -0.14043458 -0.11926911  0.13883295
   0.1576814 ]]
KeplerMapper(verbose=1)

=== DBSCAN ===
Número de clusters (incluye ruido = -1): 5
Tamaño de cada cluster:
 -1    320
 0    366
 1      5
 2      5
 3      5
Name: count, dtype: int64

=== Estadísticas de agrupación por cluster ===
        media_overall            media_potencial            media_edad  \
                count       mean           count       mean      count   
cluster                                                                  
-1                320  67.626753             320  72.727686        320   
 0                366  63.969095             366  69.396934        366   
 1                  5  59.885755               5  66.258549          5   
 2                  5  69.502699             

'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Clusters de Estilo de Juego por Región | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import kmapper as km

# 1. Data: per-nationality statistics. A copy is used so the 'cluster' column
#    added below does not modify nationality_df.
#    NOTE: this rebinds the notebook-wide name `df`.
df = nationality_df.copy()

# 2. Feature selection
features = [
    'media_overall', 'media_potencial', 'media_edad',
    'valor_total_eur', 'salario_total_eur',
    'media_ataque', 'media_fisico'
]
X = df[features].values

# 3. Scaling: standardize every feature before PCA and DBSCAN.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -- PCA text report --
pca = PCA(n_components=2)
lens_pca = pca.fit_transform(X_scaled)

print("=== PCA ===")
print("Varianza explicada por cada componente:", pca.explained_variance_ratio_)
print("Vectores de componentes principales:\n", pca.components_)

# 4. KeplerMapper setup
mapper = km.KeplerMapper(verbose=1)

# 5. Reuse the PCA projection as the Mapper lens.
lens = lens_pca

# 6. Clustering with DBSCAN on the full scaled data to get one label per
#    nationality. The same clusterer object is handed to mapper.map below.
clusterer = DBSCAN(eps=0.5, min_samples=3)
labels = clusterer.fit_predict(X_scaled)

print("\n=== DBSCAN ===")
print("Número de clusters (incluye ruido = -1):", len(set(labels)))
print("Tamaño de cada cluster:\n", pd.Series(labels).value_counts().sort_index())

# Attach the cluster label to the working frame.
df['cluster'] = labels

# -- Per-cluster statistics --
print("\n=== Estadísticas de agrupación por cluster ===")
stats = df.groupby('cluster')[features].agg(['count', 'mean'])
print(stats)

# 7. Build the Mapper graph
graph = mapper.map(
    lens,
    X_scaled,
    clusterer=clusterer,
    cover=km.Cover(n_cubes=10, perc_overlap=0.3)
)

# 8. Visualization: color nodes by the integer code of each nationality.
color_values = pd.Categorical(df['país']).codes
# One tooltip per row, "<country> (<n> jugadores)", built with vectorized string
# concatenation instead of a row-wise apply.
custom_tooltips = (
    df['país'] + " (" + df['total_jugadores'].astype(int).astype(str) + " jugadores)"
).values

# The trailing semicolon stops the notebook from echoing the returned HTML
# document as cell output; the page is still written to path_html.
mapper.visualize(
    graph,
    path_html="nacionalidades_mapper.html",
    title="Clusters de Estilo de Juego por Nacionalidad",
    color_values=color_values,
    color_function_name=["País"],
    custom_tooltips=custom_tooltips
);


=== PCA ===
Varianza explicada por cada componente: [0.40954327 0.28960425]
Vectores de componentes principales:
 [[ 0.54603467  0.4139227   0.22246279  0.25713628  0.24381416  0.44600033
   0.39565608]
 [-0.14925702  0.13869792 -0.36988357  0.59617225  0.60004464 -0.21979755
  -0.24059404]]
KeplerMapper(verbose=1)

=== DBSCAN ===
Número de clusters (incluye ruido = -1): 7
Tamaño de cada cluster:
 -1    131
 0     13
 1      3
 2      3
 3      3
 4      4
 5      6
Name: count, dtype: int64

=== Estadísticas de agrupación por cluster ===
        media_overall            media_potencial            media_edad  \
                count       mean           count       mean      count   
cluster                                                                  
-1                131  65.531228             131  69.804200        131   
 0                 13  66.479950              13  70.897346         13   
 1                  3  64.968254               3  68.674603          3   
 2         

'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Clusters de Estilo de Juego por Nacionalidad | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n 