# Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the cleaned dataset into `df`; `pd` must be imported for read_csv to resolve.
df = pd.read_csv('../data/processed/cleaned_data.csv')

# Display the top 15 xG overperformers in Ligue 2 or below with a 'low' salary

In [None]:
# Scatter plot of the top xG overperformers playing in the lower French
# divisions, restricted to players earning at most MAX_WAGE.
# X axis: goals minus xG, Y axis: wage, marker size: larger for younger players.

# Wage ceiling used to keep only 'low' salary players
MAX_WAGE = 50000

# Retrieve players in French leagues.
# .copy() gives an independent frame so the column added below is not written
# onto a slice of `df` (which would raise SettingWithCopyWarning).
french_leagues = ["Ligue 2 BKT", "Championnat National", "Ligues régionales"]
df_france = df[df['Division'].isin(french_leagues)].copy()

# Calculate players' xG overperformance
df_france['Overperformance_Gls_xG'] = df_france['Goals'] - df_france['xG']

# Filter out the high salaries BEFORE ranking: filtering after nlargest(15)
# would leave fewer than 15 players and skip low-wage players ranked just
# below a high earner.
low_wage_france = df_france[df_france['Wage'] <= MAX_WAGE]

# Select the 15 top overperformers among the low-wage players
top_overperformers_france = low_wage_france.nlargest(15, 'Overperformance_Gls_xG')

# Create an overperformance x salary scatterplot
plt.figure(figsize=(14, 10))

# Marker size grows as age decreases; the +10 offset keeps the oldest
# player's marker from collapsing to size zero.
age_factor = (top_overperformers_france['Age'].max() - top_overperformers_france['Age']) + 10
plt.scatter(
    top_overperformers_france['Overperformance_Gls_xG'],
    top_overperformers_france['Wage'],
    s=age_factor * 10,
    color='blue',
    alpha=0.6,
)

# Add player info: age centred inside the marker, name and league just above it
for player_name, league_name, age, x, y in zip(
    top_overperformers_france['Name'],
    top_overperformers_france['Division'],
    top_overperformers_france['Age'],
    top_overperformers_france['Overperformance_Gls_xG'],
    top_overperformers_france['Wage'],
):
    plt.annotate(f"{age}", (x, y),
                 textcoords="offset points", xytext=(0, 0), ha='center', va='center',
                 fontsize=9, color='white', weight='bold')

    plt.annotate(f"{player_name} ({league_name})", (x, y),
                 textcoords="offset points", xytext=(0, 10), ha='center', fontsize=8)

plt.xlabel('Overperformance (Gls - xG)')
plt.ylabel('Salary (Wage)')
plt.title('Top 15 Overperforming Players in France: Overperformance vs Salary (Sized by Age)')
plt.grid(True)
plt.show()


# French players that overperformed their xG

In [None]:
# Rank French-nationality players by goals minus xG, print the top 15 and
# plot their overperformance against wage, coloured by position category.
# NOTE(review): the plot title mentions "Salary < 50,000" but no wage filter is
# applied here — presumably df_filtered_xG_realistic is already wage-filtered
# upstream; confirm.

# Retrieve French players.
# .copy() gives an independent frame so the column added below is not written
# onto a slice of the source frame (which would raise SettingWithCopyWarning).
df_filtered_xG_realistic_fra = df_filtered_xG_realistic[
    df_filtered_xG_realistic['Nationality'] == 'FRA'
].copy()

# Get the overperformers
df_filtered_xG_realistic_fra['Overperf_Gls_xG'] = (
    df_filtered_xG_realistic_fra['Goals'] - df_filtered_xG_realistic_fra['xG']
)
top_overperformers_realistic_fra = df_filtered_xG_realistic_fra.nlargest(15, 'Overperf_Gls_xG')

print(top_overperformers_realistic_fra[['Name', 'Club', 'Division', 'Goals', 'Overperf_Gls_xG', 'Wage']])

# Show it with a graph
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Overperf_Gls_xG',
    y='Wage',
    hue='Position_Categorized',
    data=top_overperformers_realistic_fra,
    palette='viridis',
)

# Label each point with the player's name, slightly above the marker
for player_name, overperformance, wage in zip(
    top_overperformers_realistic_fra['Name'],
    top_overperformers_realistic_fra['Overperf_Gls_xG'],
    top_overperformers_realistic_fra['Wage'],
):
    plt.annotate(
        player_name,
        (overperformance, wage),
        textcoords="offset points",
        xytext=(0, 5),
        ha='center',
        fontsize=9,
        color='black',
    )

plt.title('Top Overperforming French Players with Salary < 50,000: Overperformance vs Salary')
plt.xlabel('Overperformance (Goals - xG)')
plt.ylabel('Salary (Wage)')
plt.legend(title='Position')
plt.grid(True)
plt.tight_layout()
plt.show()


