In [2]:
# Step-by-step EDA for: ulrikthygepedersen / fifa-players
# Paste into a Kaggle notebook and run.

# %% Imports and styling
import os, re, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

sns.set(style="whitegrid")
plt.rcParams.update({'figure.max_open_warning': 0})

# %% Helper functions

def find_csv_in_kaggle_dataset(dataset_slug=None):
    """
    Looks for CSV files under /kaggle/input.
    If dataset_slug is provided (like 'ulrikthygepedersen-fifa-players'), it will look inside that folder first.
    Returns the path to the first CSV found.
    """
    base = '/kaggle/input'
    if dataset_slug:
        cand = os.path.join(base, dataset_slug)
        if os.path.exists(cand):
            files = glob.glob(os.path.join(cand, '**', '*.csv'), recursive=True)
            if files:
                return files[0]
    # fallback: search all csv under /kaggle/input
    files = glob.glob(os.path.join(base, '**', '*.csv'), recursive=True)
    if files:
        return files[0]
    raise FileNotFoundError("No CSV file found under /kaggle/input. Make sure the dataset is attached.")

def money_str_to_float(s):
    """
    Parse values like '€110.5M', '€200K', '110.5M', '200K', '€0' into float euros.
    Returns np.nan if can't parse.
    """
    if pd.isna(s):
        return np.nan
    if isinstance(s, (int, float)):
        return float(s)
    s = str(s).strip().replace(',', '').replace('€','').replace('$','')
    if s in ['', '-', '0', '0.0']:
        return 0.0
    m = re.match(r'^([0-9]*\.?[0-9]+)\s*([MmKk]?)', s)
    if not m:
        # try to extract first number
        m2 = re.search(r'([0-9]*\.?[0-9]+)', s)
        if m2:
            return float(m2.group(1))
        return np.nan
    val = float(m.group(1))
    suf = m.group(2).upper()
    if suf == 'M':
        return val * 1_000_000
    if suf == 'K':
        return val * 1_000
    return val

def cm_to_meters(v):
    try:
        if pd.isna(v):
            return np.nan
        return float(v) / 100.0
    except:
        return np.nan

def euro_formatter(x, pos=None):
    if x >= 1_000_000:
        return f'€{x/1_000_000:.1f}M'
    if x >= 1_000:
        return f'€{x/1_000:.0f}K'
    return f'€{x:.0f}'

# %% Locate the CSV
try:
    csv_path = find_csv_in_kaggle_dataset('ulrikthygepedersen-fifa-players')
except FileNotFoundError:
    # fallback: try to find any csv
    csv_path = find_csv_in_kaggle_dataset()
print("Using CSV:", csv_path)

# %% Quick load (small preview to inspect columns quickly)
df_preview = pd.read_csv(csv_path, nrows=5)
print("Preview (first 5 rows):")
display(df_preview)
print("\nColumns found:", df_preview.columns.tolist())

# %% Full load (use low_memory=False for mixed types)
df = pd.read_csv(csv_path, low_memory=False)
print("Loaded full dataframe shape:", df.shape)

# %% Inspect columns and sample values to determine names to use
cols = df.columns.tolist()
print("Columns (%d):" % len(cols))
for c in cols:
    print("-", c)

# Optionally show top few rows again
display(df.head())

# %% Common column mapping
# Many datasets have slightly different column names. We'll create a mapping to canonical names we use later.
column_map = {}

# Typical names and fallbacks:
candidates = {
    'name': ['name','short_name','full_name','player_name'],
    'age': ['age','Age'],
    'overall': ['overall','Overall'],
    'potential': ['potential','Potential'],
    'value': ['value','Value','Value_eur'],
    'wage': ['wage','Wage','wage_eur'],
    'release_clause': ['release_clause','release clause','Release Clause'],
    'club': ['club','Club'],
    'nationality': ['nationality','Nationality','nationality_name'],
    'height': ['height','height_cm','Height'],
    'weight': ['weight','weight_kg','Weight'],
    'position': ['player_positions','position','positions','preferred_positions']
}

for canonical, names in candidates.items():
    found = None
    for n in names:
        if n in df.columns:
            found = n
            break
    if found:
        column_map[canonical] = found

print("\nColumn mapping detected (canonical -> dataset):")
for k,v in column_map.items():
    print(f"  {k} -> {v}")

# %% Rename mapped columns to canonical names in a working copy (not required but convenient)
df = df.rename(columns={v: k for k,v in column_map.items()})
print("Columns after renaming (showing canonical ones if available):")
for c in ['name','age','overall','potential','value','wage','club','nationality','height','weight','position']:
    if c in df.columns:
        print(f" - {c} (present)")
    else:
        print(f" - {c} (missing)")

# %% Clean monetary columns (value, wage, release_clause)
for col in ['value','wage','release_clause']:
    if col in df.columns:
        newcol = col + '_eur'
        df[newcol] = df[col].apply(money_str_to_float)
        print(f"Converted {col} -> {newcol} (non-null: {df[newcol].notna().sum()})")

# %% Clean height and weight
# Height may be '180 cm' or '6\'2"' style; weight may be '80kg'
if 'height' in df.columns:
    # attempt to extract the first number (assume cm)
    def parse_height(x):
        if pd.isna(x):
            return np.nan
        s = str(x)
        m = re.search(r'([0-9]{2,3})', s)  # look for 2-3 digit number
        if m:
            return float(m.group(1))
        return np.nan
    df['height_cm'] = df['height'].apply(parse_height)
    df['height_m'] = df['height_cm'].apply(cm_to_meters)
    print("Parsed height_cm & height_m")

if 'weight' in df.columns:
    def parse_weight(x):
        if pd.isna(x):
            return np.nan
        s = str(x).lower().replace('kg','').strip()
        m = re.search(r'([0-9]{2,3})', s)
        if m:
            return float(m.group(1))
        try:
            return float(s)
        except:
            return np.nan
    df['weight_kg'] = df['weight'].apply(parse_weight)
    print("Parsed weight_kg")

# %% Basic summary stats
print("\nBasic dataset info:")
print(df.info())

print("\nNumeric summary (selected cols):")
num_sample = [c for c in ['age','overall','potential','value_eur','wage_eur','height_cm','weight_kg'] if c in df.columns]
if num_sample:
    display(df[num_sample].describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]).T)

# %% Top-level plots (will save to /kaggle/working/figures)
os.makedirs('/kaggle/working/figures', exist_ok=True)

# Age distribution
if 'age' in df.columns:
    plt.figure(figsize=(8,4))
    sns.histplot(df['age'].dropna(), bins=25)
    plt.title('Age distribution')
    plt.xlabel('Age')
    plt.ylabel('Count')
    path = '/kaggle/working/figures/age_distribution.png'
    plt.savefig(path, bbox_inches='tight')
    print("Saved", path)
    plt.close()

# Overall vs Potential
if set(['overall','potential']).issubset(df.columns):
    g = sns.jointplot(x='overall', y='potential', data=df, kind='hex', height=7)
    g.fig.suptitle('Overall vs Potential (hex)'); g.fig.tight_layout()
    path = '/kaggle/working/figures/overall_vs_potential.png'
    g.fig.savefig(path, bbox_inches='tight')
    print("Saved", path)
    plt.close()

# Top clubs by count
if 'club' in df.columns:
    top_clubs = df['club'].value_counts().nlargest(30)
    plt.figure(figsize=(10,6))
    sns.barplot(x=top_clubs.values, y=top_clubs.index)
    plt.title('Top 30 clubs by number of players (in dataset)')
    plt.xlabel('Player count')
    plt.ylabel('Club')
    path = '/kaggle/working/figures/top_clubs_count.png'
    plt.savefig(path, bbox_inches='tight')
    print("Saved", path)
    plt.close()

# Value & Wage distributions
for col in ['value_eur','wage_eur']:
    if col in df.columns:
        clean = df[col].dropna()
        q99 = clean.quantile(0.99)
        plt.figure(figsize=(8,4))
        sns.histplot(clean.clip(upper=q99), bins=50)
        plt.title(f'{col} distribution (clipped at 99th percentile)')
        plt.xlabel('Euros')
        plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, p: euro_formatter(x)))
        path = f'/kaggle/working/figures/{col}_dist.png'
        plt.savefig(path, bbox_inches='tight')
        print("Saved", path)
        plt.close()

# %% Correlation heatmap for available attribute columns
corr_candidates = ['age','overall','potential',
                   'pace','shooting','passing','dribbling','defending','physic','physical']
corr_candidates = [c for c in corr_candidates if c in df.columns]
if len(corr_candidates) >= 2:
    plt.figure(figsize=(10,8))
    cc = df[corr_candidates].corr()
    sns.heatmap(cc, annot=True, fmt='.2f', cmap='coolwarm', square=True)
    plt.title('Correlation between key attributes')
    path = '/kaggle/working/figures/attributes_correlation.png'
    plt.savefig(path, bbox_inches='tight')
    print("Saved", path)
    plt.close()

# %% Position analysis (if position column exists)
pos_col = None
for p in ['position','player_positions','positions','preferred_positions']:
    if p in df.columns:
        pos_col = p
        break

if pos_col:
    pos_series = df[pos_col].dropna().astype(str).str.split(',')
    pos_exploded = pos_series.explode().str.strip()
    pos_counts = pos_exploded.value_counts().nlargest(30)
    plt.figure(figsize=(10,6))
    sns.barplot(x=pos_counts.values, y=pos_counts.index)
    plt.title('Top Positions (exploded)')
    plt.xlabel('Count')
    path = '/kaggle/working/figures/top_positions.png'
    plt.savefig(path, bbox_inches='tight')
    print("Saved", path)
    plt.close()

    # Average overall by primary position
    if 'overall' in df.columns:
        pos_df = df[[pos_col,'overall']].dropna()
        pos_df = pos_df.assign(primary=pos_df[pos_col].astype(str).str.split(',').str[0].str.strip())
        avg_by_pos = pos_df.groupby('primary')['overall'].mean().sort_values(ascending=False).head(20)
        plt.figure(figsize=(10,6))
        sns.barplot(x=avg_by_pos.values, y=avg_by_pos.index)
        plt.xlabel('Average Overall')
        plt.title('Average Overall by Primary Position (top 20)')
        path = '/kaggle/working/figures/avg_overall_by_position.png'
        plt.savefig(path, bbox_inches='tight')
        print("Saved", path)
        plt.close()

# %% Top players table and save
name_col = 'name' if 'name' in df.columns else (df.columns[0] if len(df.columns)>0 else None)
if 'overall' in df.columns and name_col:
    rank_cols = [c for c in ['overall','potential','value_eur','wage_eur'] if c in df.columns]
    display_cols = [name_col] + [c for c in ['age','club','nationality'] if c in df.columns] + rank_cols
    top_players = df.sort_values('overall', ascending=False).loc[:, display_cols].head(50)
    print("Top 15 players by overall:")
    display(top_players.head(15))
    top_players.to_csv('/kaggle/working/top_players_overall.csv', index=False)
    print("Saved /kaggle/working/top_players_overall.csv")

# %% Feature idea: value_per_overall
if 'value_eur' in df.columns and 'overall' in df.columns:
    df['value_per_overall'] = df['value_eur'] / df['overall'].replace({0:np.nan})
    print("value_per_overall summary:")
    display(df['value_per_overall'].describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]))

# %% Save a sample parquet for fast reuse
sample_size = min(20000, len(df))
df.sample(sample_size, random_state=42).to_parquet('/kaggle/working/fifa_sample.parquet', index=False)
print("Saved sample parquet to /kaggle/working/fifa_sample.parquet")

# %% End: quick notes
print("""
Done. Next ideas you can add:
- Clustering player archetypes (KMeans on attribute subset)
- Small regression model to predict value / wage and show feature importances
- Choropleth of player nationality counts (requires country ISO map)
- Time series if you have multiple yearly files
If you want, I can:
- Convert this into a polished Kaggle notebook with markdown commentary and final insights
- Add a model cell (value/wage prediction) + SHAP/feature importances
""")


Using CSV: /kaggle/input/fifa-players/fifa_players.csv
Preview (first 5 rows):


Unnamed: 0,wage_eur,age,height_cm,weight_kg,nationality_name,overall,potential,attacking_crossing,attacking_finishing,attacking_heading_accuracy,...,movement_agility,movement_reactions,movement_balance,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,320000.0,34.0,170.0,72.0,b'Argentina',93.0,93.0,85.0,95.0,70.0,...,91.0,94.0,95.0,35.0,24.0,6.0,11.0,15.0,14.0,8.0
1,270000.0,32.0,185.0,81.0,b'Poland',92.0,92.0,71.0,95.0,90.0,...,77.0,93.0,82.0,42.0,19.0,15.0,6.0,12.0,8.0,10.0
2,270000.0,36.0,187.0,83.0,b'Portugal',91.0,91.0,87.0,95.0,90.0,...,86.0,94.0,74.0,32.0,24.0,7.0,11.0,15.0,14.0,11.0
3,270000.0,29.0,175.0,68.0,b'Brazil',91.0,91.0,85.0,83.0,63.0,...,96.0,89.0,84.0,32.0,29.0,9.0,9.0,15.0,15.0,11.0
4,350000.0,30.0,181.0,70.0,b'Belgium',91.0,91.0,94.0,82.0,55.0,...,79.0,91.0,78.0,65.0,53.0,15.0,13.0,5.0,10.0,13.0



Columns found: ['wage_eur', 'age', 'height_cm', 'weight_kg', 'nationality_name', 'overall', 'potential', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes']
Loaded full dataframe shape: (19178, 29)
Columns (29):
- wage_eur
- age
- height_cm
- weight_kg
- nationality_name
- overall
- potential
- attacking_crossing
- attacking_finishing
- attacking_heading_accuracy
- attacking_short_passing
- attacking_volleys
- skill_dribbling
- skill_curve
- skill_fk_accuracy
- skill_long_passing
- skill_ball_control
- movement_acceleration
- movement

Unnamed: 0,wage_eur,age,height_cm,weight_kg,nationality_name,overall,potential,attacking_crossing,attacking_finishing,attacking_heading_accuracy,...,movement_agility,movement_reactions,movement_balance,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,320000.0,34.0,170.0,72.0,b'Argentina',93.0,93.0,85.0,95.0,70.0,...,91.0,94.0,95.0,35.0,24.0,6.0,11.0,15.0,14.0,8.0
1,270000.0,32.0,185.0,81.0,b'Poland',92.0,92.0,71.0,95.0,90.0,...,77.0,93.0,82.0,42.0,19.0,15.0,6.0,12.0,8.0,10.0
2,270000.0,36.0,187.0,83.0,b'Portugal',91.0,91.0,87.0,95.0,90.0,...,86.0,94.0,74.0,32.0,24.0,7.0,11.0,15.0,14.0,11.0
3,270000.0,29.0,175.0,68.0,b'Brazil',91.0,91.0,85.0,83.0,63.0,...,96.0,89.0,84.0,32.0,29.0,9.0,9.0,15.0,15.0,11.0
4,350000.0,30.0,181.0,70.0,b'Belgium',91.0,91.0,94.0,82.0,55.0,...,79.0,91.0,78.0,65.0,53.0,15.0,13.0,5.0,10.0,13.0



Column mapping detected (canonical -> dataset):
  age -> age
  overall -> overall
  potential -> potential
  wage -> wage_eur
  nationality -> nationality_name
  height -> height_cm
  weight -> weight_kg
Columns after renaming (showing canonical ones if available):
 - name (missing)
 - age (present)
 - overall (present)
 - potential (present)
 - value (missing)
 - wage (present)
 - club (missing)
 - nationality (present)
 - height (present)
 - weight (present)
 - position (missing)
Converted wage -> wage_eur (non-null: 19178)
Parsed height_cm & height_m
Parsed weight_kg

Basic dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19178 entries, 0 to 19177
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   wage                        19178 non-null  float64
 1   age                         19178 non-null  float64
 2   height                      19178 non-null  float64
 3   we

Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
age,19178.0,25.201168,4.746183,16.0,17.0,18.0,21.0,25.0,29.0,34.0,37.0,54.0
overall,19178.0,65.760246,6.882432,47.0,50.0,54.0,61.0,66.0,70.0,77.0,83.0,93.0
potential,19178.0,71.076442,6.089398,49.0,57.0,62.0,67.0,71.0,75.0,82.0,86.0,95.0
wage_eur,19178.0,9017.989363,19470.176724,500.0,500.0,500.0,1000.0,3000.0,8000.0,37150.0,90000.0,350000.0
height_cm,19178.0,181.288143,6.86215,155.0,166.0,170.0,176.0,181.0,186.0,193.0,196.0,206.0
weight_kg,19178.0,74.930493,7.065573,49.0,60.0,64.0,70.0,75.0,80.0,87.0,92.0,110.0


  with pd.option_context('mode.use_inf_as_na', True):


Saved /kaggle/working/figures/age_distribution.png


  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  g.fig.suptitle('Overall vs Potential (hex)'); g.fig.tight_layout()


Saved /kaggle/working/figures/overall_vs_potential.png


  with pd.option_context('mode.use_inf_as_na', True):


Saved /kaggle/working/figures/wage_eur_dist.png
Saved /kaggle/working/figures/attributes_correlation.png
Top 15 players by overall:


Unnamed: 0,wage,age,nationality,overall,potential,wage_eur
0,320000.0,34.0,b'Argentina',93.0,93.0,320000.0
1,270000.0,32.0,b'Poland',92.0,92.0,270000.0
2,270000.0,36.0,b'Portugal',91.0,91.0,270000.0
3,270000.0,29.0,b'Brazil',91.0,91.0,270000.0
4,350000.0,30.0,b'Belgium',91.0,91.0,350000.0
5,130000.0,28.0,b'Slovenia',91.0,93.0,130000.0
6,230000.0,22.0,b'France',91.0,95.0,230000.0
7,86000.0,35.0,b'Germany',90.0,90.0,86000.0
8,250000.0,29.0,b'Germany',90.0,92.0,250000.0
9,240000.0,27.0,b'England',90.0,90.0,240000.0


Saved /kaggle/working/top_players_overall.csv
Saved sample parquet to /kaggle/working/fifa_sample.parquet

Done. Next ideas you can add:
- Clustering player archetypes (KMeans on attribute subset)
- Small regression model to predict value / wage and show feature importances
- Choropleth of player nationality counts (requires country ISO map)
- Time series if you have multiple yearly files
If you want, I can:
- Convert this into a polished Kaggle notebook with markdown commentary and final insights
- Add a model cell (value/wage prediction) + SHAP/feature importances

