In [1]:
import re
import requests
import pandas as pd
import os
import soccerdata as sd
import numpy as np
import json
import ftfy

from PIL import Image
from bs4 import BeautifulSoup
from zenrows import ZenRowsClient
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from unidecode import unidecode

In [2]:
def scraping_header(url, headers=None):
    """
    Scrape content from the provided URL with optional custom headers and return the parsed HTML content of the page.
    """
    if headers is None:
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')

def compare_teams(df1, df2, column='Team', df1_name='DF1', df2_name='DF2'):
    """
    Compare specified columns between two dataframes to find unique data in each and diplay the difference from each dataframe.
    """
    if column not in df1 or column not in df2:
        raise ValueError(f"The column '{column}' must exist in both dataframes.")
    
    # Get unique values from each dataframe
    df1_unique = df1[~df1[column].isin(df2[column])][column].drop_duplicates().sort_values().reset_index(drop=True) # ascending=False
    df2_unique = df2[~df2[column].isin(df1[column])][column].drop_duplicates().sort_values().reset_index(drop=True) # ascending=False

    if df1_unique.empty and df2_unique.empty:
        print(f"All {column} entries are present and correctly named in both {df1_name} and {df2_name}.")
        return None
    
    comparison_df = pd.DataFrame({
        f'{df1_name} Unique': df1_unique,
        f'{df2_name} Unique': df2_unique
    }).fillna('')
    
    return comparison_df


def align_and_merge_teams(df1, df2, column='Team'):
    """
    Merges the two dataframes according to the team
    """
    
    if column not in df1.columns or column not in df2.columns:
        raise ValueError(f"The column '{column}' must exist in both dataframes.")
        
    df1_sorted = df1.sort_values(by=column).reset_index(drop=True)
    df2_sorted = df2.sort_values(by=column).reset_index(drop=True)
    
    mapping_dict = dict(zip(df1_sorted[column], df2_sorted[column]))
    
    df1_aligned = df1.copy()
    df1_aligned[column] = df1[column].map(mapping_dict).fillna(df1[column])
    
    missing_teams = set(df1_aligned[column]) - set(df2_sorted[column])
    if missing_teams:
        print("Missing teams in alignment:", missing_teams)
    else:
        print("All", column, "are present and correctly named.")
        
    merged_df = pd.merge(df1_aligned, df2_sorted, on=column, how='left', suffixes=('', '_drop'))
    merged_df.drop([col for col in merged_df.columns if '_drop' in col], axis=1, inplace=True)
    
    return merged_df

## Import players salary

In [3]:
def import_players_salary():
    soup = scraping_header("https://mlsplayers.org/resources/salary-guide")

    table = soup.find('table', {'id': 'salary-report'})
    rows = table.find_all('tr')

    data = []
    for row in rows[1:]:  # Skip the header row
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append(cols)  # Get rid of empty values

    df_players_salary = pd.DataFrame(data, columns=['FirstName', 'LastName', 'Team', 'Position', 'SalaryBase ($)', 'SalaryGuaranteed ($)']) 
    
    df_players_salary.isnull().sum() # Check if there is null values
    
    df_players_salary['SalaryBase ($)'] = pd.to_numeric(df_players_salary['SalaryBase ($)'].str.replace('[$,]', '', regex=True))
    df_players_salary['SalaryGuaranteed ($)'] = pd.to_numeric(df_players_salary['SalaryGuaranteed ($)'].str.replace('[$,]', '', regex=True))
    
    df_players_salary.to_csv('datasets/players_salary.csv', index=False)
    return df_players_salary

In [4]:
# Function no longer used in our case, as the site has been updated to show salaries for 2024. It's the 2023 salaries 
# we're interested in at the moment

# df_players_salary = import_players_salary() 

In [4]:
df_players_salary = pd.read_csv('datasets/players_salary.csv') # read the csv file created with 2023 data
df_players_salary.head()

Unnamed: 0,FirstName,LastName,Team,Position,SalaryBase ($),SalaryGuaranteed ($)
0,Luis,Abram,Atlanta United,D,556364.0,695977.0
1,Lalas,Abubakar,Colorado Rapids,D,665000.0,702125.0
2,Nicolás,Acevedo,New York City FC,D-M,230000.0,274800.0
3,Alonso,Aceves,Chicago Fire,D,330000.0,368900.0
4,Ifunanyachi,Achara,Houston Dynamo,F,85444.0,85444.0


## Create Team expenses table

In [5]:
salary_column = 'SalaryGuaranteed ($)'
total_salary_by_club = df_players_salary.groupby('Team')[salary_column].sum().sort_values(ascending=False).reset_index()
total_salary_by_club = total_salary_by_club[total_salary_by_club['Team'] != "Major League Soccer"]

## Add colors Team

In [6]:
soup = scraping_header("https://teamcolorcodes.com/soccer/mls-team-color-codes/")
teams = soup.find_all('a', class_='team-button')

team_colors = {}
for team in teams:
    name = team.get_text().strip()
    main_color = team['style'].split(';')[0].split(':')[-1].strip()
    secondary_color = team['style'].split(';')[1].split(':')[-1].replace('4px solid ', '').strip()
    team_colors[name] = {'Main Color': main_color, 'Secondary Color': secondary_color}

df_teams = pd.DataFrame.from_dict(team_colors, orient='index').reset_index()
df_teams.columns = ['Team', 'MainColor', 'SecondaryColor']

In [7]:
manual_teams = {
    'Nashville SC': {'Main Color': '#ECE83A', 'Secondary Color': '#1F1646'},
    'St. Louis City SC': {'Main Color': '#E0004D', 'Secondary Color': '#0C2340'},
    'Charlotte FC': {'Main Color': '#000000', 'Secondary Color': '#1A85C8'},
}
team_name_mappingSC = {
    "Vancouver Whitecaps": "Vancouver Whitecaps FC",
    "Minnesota United": "Minnesota United FC",
    "LAFC": "Los Angeles FC",
    "DC United": "D.C. United",
    "Columbus Crew": "Columbus Crew SC",
    "Inter Miami": "Inter Miami CF",
    "FC Cincinnati": "Cincinnati FC",
    "CF Montreal": "Montreal Impact",
}

In [8]:
team_colors.update(manual_teams)
df_teams = pd.DataFrame.from_dict(team_colors, orient='index').reset_index()
df_teams.columns = ['Team', 'MainColor', 'SecondaryColor']
total_salary_by_club['Team'] = total_salary_by_club['Team'].replace(team_name_mappingSC)

In [9]:
compare_teams(total_salary_by_club, df_teams, df1_name='Team Salary', df2_name='Team Colors')
df_teams = align_and_merge_teams(total_salary_by_club, df_teams)

All Team entries are present and correctly named in both Team Salary and Team Colors.
All Team are present and correctly named.


In [10]:
df_teams.to_csv('datasets/MLS_team_colors.csv', index=False)

## Final Table

In [11]:
mls_tables = pd.read_html('https://en.wikipedia.org/wiki/Template:2023_Major_League_Soccer_season_table')
mls_table = mls_tables[0]
mls_table = mls_table.rename(columns={mls_table.columns[1]: 'Team'})
mls_table = mls_table.drop(mls_table.columns[-1], axis=1)

mls_table['GD'] = mls_table['GD'].replace({'−': '-'}, regex=True)
mls_table['GD'] = pd.to_numeric(mls_table['GD'])

mls_table.head()

Unnamed: 0,Pos,Team,Pld,W,L,T,GF,GA,GD,Pts
0,1,FC Cincinnati (S),34,20,5,9,57,39,18,69
1,2,Orlando City SC,34,18,7,9,55,39,16,63
2,3,Columbus Crew (C),34,16,9,9,67,46,21,57
3,4,St. Louis City SC,34,17,12,5,62,45,17,56
4,5,Philadelphia Union,34,15,9,10,57,41,16,55


In [12]:
compare_teams(mls_table, df_teams, df1_name='MLS Table', df2_name='Team Colors')

Unnamed: 0,MLS Table Unique,Team Colors Unique
0,Vancouver Whitecaps FC (V),Vancouver Whitecaps FC
1,Inter Miami CF (L),Montreal Impact
2,Houston Dynamo FC (U),Inter Miami CF
3,FC Cincinnati (S),Houston Dynamo
4,Columbus Crew (C),Columbus Crew SC
5,Chicago Fire FC,Cincinnati FC
6,CF Montréal,Chicago Fire
7,Atlanta United FC,Atlanta United


In [13]:
team_name_mappingSC = {
    "FC Cincinnati (S)": "Cincinnati FC",
    "CF Montréal": "Montreal Impact",
}
mls_table['Team'] = mls_table['Team'].replace(team_name_mappingSC)
mls_table = align_and_merge_teams(mls_table, df_teams)

All Team are present and correctly named.


## Teams logo

In [8]:
folder_destination = 'datasets/Teamslogo/'

In [9]:
soup = scraping_header("https://www.sportslogos.net/teams/list_by_league/9/major_league_soccer/mls/logos/")

if not os.path.exists(folder_destination):
    os.makedirs(folder_destination)

# Download logos
logo_wall = soup.find('ul', class_='logoWall')
logos = logo_wall.find_all('li', style=True)
for logo in logos:
    team_name = logo.find('a')['title'].replace(' Logos', '')
    logo_path = logo.find('img')['src']
    logo_response = requests.get(logo_path, stream=True)
    extension = os.path.splitext(logo_path)[1]
    filename = f"{team_name}{extension}"
    file_path = os.path.join(folder_destination, filename)

    if logo_response.status_code == 200:
        with open(file_path, 'wb') as f:
            for chunk in logo_response.iter_content(chunk_size=128):
                f.write(chunk)

team_logos = []
for filename in os.listdir(folder_destination):
    if filename.endswith('.gif'):
        base_filename = filename[:-4].rstrip()
        new_filename = f"{base_filename}.png"
        
        original_filepath = os.path.join(folder_destination, filename)
        
        with Image.open(original_filepath) as img:
            new_filepath = os.path.join(folder_destination, new_filename)
            img.save(new_filepath)
        
        os.remove(original_filepath)
        
        team_name = base_filename.strip()
        image_path = os.path.join(folder_destination, new_filename)
        
        team_logos.append({'Team': team_name, 'Logo path': image_path})

logos_df = pd.DataFrame(team_logos)
logos_df = logos_df[logos_df['Team'] != 'San Diego FC']

In [21]:
team_name_mappingSC = {
    "CF Montreal": "Montreal Impact",
}
logos_df['Team'] = logos_df['Team'].replace(team_name_mappingSC) 
mls_table = align_and_merge_teams(mls_table, logos_df)
mls_table = align_and_merge_teams(total_salary_by_club, mls_table)

### Enhance and adjust image display

In [65]:
def remove_white_background(img_path, output_path):
    """
    Changes the background of the image from white to transparent
    """
    with Image.open(img_path) as img:
        # Convert the image to RGBA mode to access the alpha channel
        img = img.convert("RGBA")
        datas = img.getdata()
        newData = []
        
        for item in datas:
            if item[0] > 220 and item[1] > 220 and item[2] > 220:
                newData.append((255, 255, 255, 0))
            else:
                newData.append(item)

        img.putdata(newData)
        bbox = img.getbbox()
        img_cropped = img.crop(bbox)
        img_cropped.save(output_path)

In [66]:
for filename in os.listdir(folder_destination):
    if filename.endswith('.png'):
        file_path = os.path.join(folder_destination, filename)
        remove_white_background(file_path, file_path)

In [67]:
mls_table.sort_values(by='Pos', ascending=True)

Unnamed: 0,Team,SalaryGuaranteed ($),Pos,Pld,W,L,T,GF,GA,GD,Pts,MainColor,SecondaryColor,Logo path
20,Colorado Rapids,14348728.0,1,34,20,5,9,57,39,18,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png
28,Orlando City SC,9642918.0,2,34,18,7,9,55,39,16,63,#61259E,#FFE293,datasets/Teamslogo/Orlando City SC.png
13,D.C. United,15313747.0,3,34,16,9,9,67,46,21,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png
26,St Louis City SC,11186787.0,4,34,17,12,5,62,45,17,56,#E0004D,#0C2340,datasets/Teamslogo/St Louis City SC.png
24,Philadelphia Union,13419282.0,5,34,15,9,10,57,41,16,55,#002D55,#B38707,datasets/Teamslogo/Philadelphia Union.png
10,New England Revolution,17013587.0,6,34,15,9,10,58,46,12,55,#E51938,#002B5C,datasets/Teamslogo/New England Revolution.png
6,Seattle Sounders FC,19185352.0,7,34,14,9,11,41,32,9,53,#236192,#658D1B,datasets/Teamslogo/Seattle Sounders FC.png
3,LAFC,20841863.0,8,34,14,10,10,54,39,15,52,#000000,#C39e6d,datasets/Teamslogo/LAFC.png
8,Houston Dynamo,17459127.0,9,34,14,11,9,51,38,13,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png
5,Atlanta United FC,19847184.0,10,34,13,9,12,66,53,13,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png


In [68]:
mls_table.to_csv('datasets/MLS_23_table.csv', index=False)

## Scraping Fbref

In [95]:
soup = scraping_header("https://fbref.com/en/comps/22/2023/2023-Major-League-Soccer-Stats")
table = soup.find('table', {'id': 'stats_squads_standard_for'})
rows = table.find_all('tr')

data = []
for row in rows[2:]:  # Skip the headers row
    cols = row.find_all('th')
    cols = cols + row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)  # Get rid of empty values
    
df_team_FBref_for = pd.DataFrame(data, columns=['Team', '# Pl', 'Age', 'Poss', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 
                                            'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 
                                            'PrgC', 'PrgP', 'Gls/90', 'Ast/90', 'G+A/90', 'G-PK/90', 'G+A-PK/90', 'xG/90', 
                                            'xAG/90', 'xG+xAG/90', 'npxG/90', 'npxG+xAG/90'])

In [96]:
table = soup.find('table', {'id': 'stats_squads_standard_against'})
rows = table.find_all('tr')

data = []
for row in rows[2:]:  # Skip the headers row
    cols = row.find_all('th')
    cols = cols + row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)  # Get rid of empty values
    
df_team_FBref_against = pd.DataFrame(data, columns=['Team', '# Pl', 'Age vs', 'Poss vs', 'MP', 'Starts', 'Min', '90s', 'Gls vs', 'Ast vs', 
                                            'G+A vs', 'G-PK vs', 'PK vs', 'PKatt vs', 'CrdY vs', 'CrdR vs', 'xG vs', 'npxG vs', 'xAG vs', 'npxG+xAG vs', 
                                            'PrgC vs', 'PrgP vs', 'Gls/90 vs', 'Ast/90 vs', 'G+A/90 vs', 'G-PK/90 vs', 'G+A-PK/90 vs', 'xG/90 vs', 
                                            'xAG/90 vs', 'xG+xAG/90 vs', 'npxG/90 vs', 'npxG+xAG/90'])
df_team_FBref_against['Team'] = df_team_FBref_against['Team'].str.replace("vs ", "")

In [97]:
df_team_Fbref = align_and_merge_teams(df_team_FBref_for, df_team_FBref_against)

All Team are present and correctly named.


In [73]:
compare_teams(df_team_Fbref, mls_table, df1_name='Fbref Table', df2_name='Logo Table')

Unnamed: 0,Fbref Table Unique,Logo Table Unique
0,Atlanta Utd,Atlanta United FC
1,Austin,Austin FC
2,CF Montréal,Charlotte FC
3,Charlotte,Chicago Fire
4,Crew,Colorado Rapids
5,Dynamo FC,Columbus Crew SC
6,Fire,Houston Dynamo
7,Inter Miami,Inter Miami C.F.
8,Minnesota Utd,Minnesota United FC
9,NE Revolution,Montreal Impact


In [140]:
team_name_mappingFL = {
    "CF Montréal": "Montreal Impact",
    "Crew": "Columbus Crew SC",
    "Dynamo FC": "Houston Dynamo",
    "Inter Miami": "Inter Miami C.F.",
    "Rapids": "Colorado Rapids",
    "Nashville": "Nashville SC",
    "NY Red Bulls": "New York Red Bulls",
    "Atlanta Utd": "Atlanta United FC",
    "Austin": "Austin FC",
    "Charlotte": "Charlotte FC",
    "Columbus Crew": "Columbus Crew SC",
    "Los Angeles FC": "LAFC",
    "Minnesota Utd": "Minnesota United FC",
    "NYCFC": "New York City FC",
    "New England": "New England Revolution",
    "Orlando City": "Orlando City SC",
    "Philadelphia": "Philadelphia Union",
    "San Jose": "San Jose Earthquakes",
    "Seattle": "Seattle Sounders FC",
    "Sporting KC": "Sporting Kansas City",
    "St. Louis": "St Louis City SC",
    "Vancouver": "Vancouver Whitecaps FC"
}
df_team_Fbref['Team'] = df_team_Fbref['Team'].replace(team_name_mappingFL)
df_team_Fbref = align_and_merge_teams(df_team_Fbref, mls_table)
df_team_Fbref.drop(columns=["Pld", "GF", "GA", "GD"])

All Team are present and correctly named.


Unnamed: 0,Team,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,Pos,W,L,T,Pts,MainColor,SecondaryColor,Logo path,Diff_Gls,Diff_Gls vs
0,Atlanta United FC,33,25.8,55.6,34,374,3060,34.0,64,44,...,10,13,9,12,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png,15.3,6.0
1,Austin FC,26,27.9,51.2,34,374,3060,34.0,48,37,...,25,10,15,9,39,#000000,#00b140,datasets/Teamslogo/Austin FC.png,5.5,4.1
2,Montreal Impact,31,24.4,48.3,34,374,3060,34.0,33,22,...,20,12,17,5,41,#2B63AD,#373536,datasets/Teamslogo/CF Montreal.png,-4.8,-1.2
3,Charlotte FC,32,27.2,52.6,34,374,3060,34.0,43,29,...,19,10,11,13,43,#000000,#1A85C8,datasets/Teamslogo/Charlotte FC.png,6.0,0.3
4,Colorado Rapids,28,25.8,57.1,34,374,3060,34.0,63,49,...,1,20,5,9,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png,5.3,3.7
5,Columbus Crew SC,31,28.4,49.5,34,374,3060,34.0,44,27,...,28,5,17,12,27,#862633,#8BB8E8,datasets/Teamslogo/Columbus Crew SC.png,-1.8,5.4
6,Houston Dynamo,25,27.2,51.6,34,374,3060,34.0,51,36,...,9,14,11,9,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png,6.1,-4.0
7,D.C. United,27,26.3,48.8,34,374,3060,34.0,56,35,...,3,16,9,9,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png,-0.3,-3.7
8,FC Cincinnati,29,25.8,48.8,34,374,3060,34.0,41,34,...,23,10,14,10,40,#ee1a39,#231f20,datasets/Teamslogo/FC Cincinnati.png,3.2,-3.6
9,FC Dallas,26,26.5,46.1,34,374,3060,34.0,36,29,...,14,11,10,13,46,#BF0D3E,#00205B,datasets/Teamslogo/FC Dallas.png,-2.2,-2.1


In [11]:
def convert_columns_to_numeric(df, columns):
    """
    Convert specified columns in the dataframe to numeric
    """
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
    return df

def calculate_diff_columns(df, cols_to_diff):
    """
    Calculate the difference between pairs of columns and create a new column for each pair
    """
    for col1, col2 in cols_to_diff:
        diff_col_name = f"Diff_{col1}"
        df[diff_col_name] = df[col1] - df[col2]
    return df

In [142]:
# Convert specified columns to numeric
columns_to_convert = ['Gls', 'Gls vs', 'xG', 'xG vs']
df_team_Fbref = convert_columns_to_numeric(df_team_Fbref, columns_to_convert)

# Calculate differences and create new columns
columns_to_difference = [('Gls', 'xG'), ('Gls vs', 'xG vs')]
df_team_Fbref = calculate_diff_columns(df_team_Fbref, columns_to_difference)

In [143]:
df_team_Fbref

Unnamed: 0,Team,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,T,GF,GA,GD,Pts,MainColor,SecondaryColor,Logo path,Diff_Gls,Diff_Gls vs
0,Atlanta United FC,33,25.8,55.6,34,374,3060,34.0,64,44,...,12,66,53,13,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png,15.3,6.0
1,Austin FC,26,27.9,51.2,34,374,3060,34.0,48,37,...,9,49,55,-6,39,#000000,#00b140,datasets/Teamslogo/Austin FC.png,5.5,4.1
2,Montreal Impact,31,24.4,48.3,34,374,3060,34.0,33,22,...,5,36,52,-16,41,#2B63AD,#373536,datasets/Teamslogo/CF Montreal.png,-4.8,-1.2
3,Charlotte FC,32,27.2,52.6,34,374,3060,34.0,43,29,...,13,45,52,-7,43,#000000,#1A85C8,datasets/Teamslogo/Charlotte FC.png,6.0,0.3
4,Colorado Rapids,28,25.8,57.1,34,374,3060,34.0,63,49,...,9,57,39,18,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png,5.3,3.7
5,Columbus Crew SC,31,28.4,49.5,34,374,3060,34.0,44,27,...,12,26,54,-28,27,#862633,#8BB8E8,datasets/Teamslogo/Columbus Crew SC.png,-1.8,5.4
6,Houston Dynamo,25,27.2,51.6,34,374,3060,34.0,51,36,...,9,51,38,13,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png,6.1,-4.0
7,D.C. United,27,26.3,48.8,34,374,3060,34.0,56,35,...,9,67,46,21,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png,-0.3,-3.7
8,FC Cincinnati,29,25.8,48.8,34,374,3060,34.0,41,34,...,10,45,49,-4,40,#ee1a39,#231f20,datasets/Teamslogo/FC Cincinnati.png,3.2,-3.6
9,FC Dallas,26,26.5,46.1,34,374,3060,34.0,36,29,...,13,41,37,4,46,#BF0D3E,#00205B,datasets/Teamslogo/FC Dallas.png,-2.2,-2.1


In [144]:
df_team_Fbref.to_csv('datasets/squad_stats_FBref.csv', index=False)

### Player Standard Stats 

In [25]:
fbref = sd.FBref(leagues=['US-MLS'], seasons=['2324'])
player_season_stats = fbref.read_player_season_stats(stat_type="standard")
player_season_stats.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,nation,pos,age,born,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,MP,Starts,Min,90s,Gls,Ast,...,Gls,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG
league,season,team,player,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
US-MLS,2324,Atlanta Utd,Ajani Fortune,TRI,MF,20,2002,14,3,374,4.2,0,1,...,0.0,0.24,0.24,0.0,0.24,0.18,0.03,0.21,0.18,0.21
US-MLS,2324,Atlanta Utd,Amar Sejdic,USA,MF,26,1996,23,14,1061,11.8,0,0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.02,0.05,0.03,0.05
US-MLS,2324,Atlanta Utd,Andrew Gutman,USA,DF,26,1996,18,16,1385,15.4,3,2,...,0.19,0.13,0.32,0.19,0.32,0.11,0.14,0.24,0.11,0.24
US-MLS,2324,Atlanta Utd,Brad Guzan,USA,GK,38,1984,27,27,2430,27.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-MLS,2324,Atlanta Utd,Brooks Lennon,USA,DF,25,1997,33,32,2881,32.0,4,9,...,0.12,0.28,0.41,0.12,0.41,0.08,0.16,0.24,0.08,0.24


In [22]:
def clean_and_rename_columns(df):
    """
    Reset index of the dataframe, remove unnecessary columns and rename to simplify
    """
    df = df.reset_index()
    df.drop(columns=['league', 'season'], errors='ignore', inplace=True)
    
    new_columns = []
    for col in df.columns:
        if col[1] == '':
            new_columns.append(col[0])

        elif col[0] == 'Per 90 Minutes':
            new_columns.append(col[1] + '/90')
        
        else:
            new_columns.append(col[1])
    
    df.columns = new_columns
    df = df.rename(columns={"team": "Team"})
    return df

In [26]:
player_season_stats = clean_and_rename_columns(player_season_stats)
player_season_stats.columns

  df.drop(columns=['league', 'season'], errors='ignore', inplace=True)


Index(['Team', 'player', 'nation', 'pos', 'age', 'born', 'MP', 'Starts', 'Min',
       '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG',
       'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls/90', 'Ast/90',
       'G+A/90', 'G-PK/90', 'G+A-PK/90', 'xG/90', 'xAG/90', 'xG+xAG/90',
       'npxG/90', 'npxG+xAG/90'],
      dtype='object')

In [27]:
player_season_stats['Team'] = player_season_stats['Team'].replace(team_name_mappingFL)
player_season_stats = pd.merge(player_season_stats, logos_df, on='Team', how='left')

In [29]:
player_season_stats.head()

Unnamed: 0,Team,player,nation,pos,age,born,MP,Starts,Min,90s,...,Ast/90,G+A/90,G-PK/90,G+A-PK/90,xG/90,xAG/90,xG+xAG/90,npxG/90,npxG+xAG/90,Logo path
0,Atlanta United FC,Ajani Fortune,TRI,MF,20,2002,14,3,374,4.2,...,0.24,0.24,0.0,0.24,0.18,0.03,0.21,0.18,0.21,datasets/Teamslogo/Atlanta United FC.png
1,Atlanta United FC,Amar Sejdic,USA,MF,26,1996,23,14,1061,11.8,...,0.0,0.0,0.0,0.0,0.03,0.02,0.05,0.03,0.05,datasets/Teamslogo/Atlanta United FC.png
2,Atlanta United FC,Andrew Gutman,USA,DF,26,1996,18,16,1385,15.4,...,0.13,0.32,0.19,0.32,0.11,0.14,0.24,0.11,0.24,datasets/Teamslogo/Atlanta United FC.png
3,Atlanta United FC,Brad Guzan,USA,GK,38,1984,27,27,2430,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,datasets/Teamslogo/Atlanta United FC.png
4,Atlanta United FC,Brooks Lennon,USA,DF,25,1997,33,32,2881,32.0,...,0.28,0.41,0.12,0.41,0.08,0.16,0.24,0.08,0.24,datasets/Teamslogo/Atlanta United FC.png


In [30]:
player_season_stats.to_csv('datasets/player_stats_FBref.csv', index=False)

## Import WhoScored Data

In [44]:
ws = sd.WhoScored(leagues="US-MLS", seasons=2324)
mls_schedule = ws.read_schedule()



In [45]:
missing_players = ws.read_missing_players()



In [5]:
def clean_columns(df, drop_columns):
    """
    Cleans the DataFrame by resetting the index and dropping specified columns.
    """
    df = df.reset_index()
    df.drop(columns=drop_columns, errors='ignore', inplace=True)
    return df

def categorize_reason(reason):
    if reason in ['injured', 'injured doubtful', 'unfit']:
        return 'total_missed_injured'
    elif reason == 'suspended':
        return 'total_missed_suspended'
    elif reason == 'international duty':
        return 'total_missed_international_duty'
    else:
        return 'total_missed_other' # 'ineligible' categorized as 'other' awaiting further understanding

def calculate_missed_games(df, group_by_cols):
    """
    Calculates the total and unique missed games by the given column (team or player).
    """
    total_missed = df.groupby(group_by_cols)['game_id'].count().rename('total_missed_games')
    missed_by_reason = df.groupby(group_by_cols + ['reason_category'])['game_id'].count().unstack(fill_value=0)
    
    if group_by_cols == ['team']:
        unique_missed = df.groupby('team')['player'].nunique().rename('unique_players_missing')
        most_missed = df.groupby(['team', 'player'])['game_id'].count().reset_index(name='missed_games')
        most_missed = most_missed.loc[most_missed.groupby('team')['missed_games'].idxmax()]
        
        missed_summary = pd.concat([total_missed, unique_missed], axis=1).reset_index()
        missed_summary = missed_summary.merge(most_missed, on='team', how='left')
        missed_summary.rename(columns={'player': 'player_most_missed', 'missed_games': 'most_missed_games'}, inplace=True)
    else:
        missed_summary = total_missed.reset_index()
    
    missed_summary = missed_summary.merge(missed_by_reason, on=group_by_cols, how='left').fillna(0)
    missed_summary.rename(columns={'team': 'Team'}, inplace=True)
    return missed_summary

def add_opponent_info(df, schedule_df):
    """
    Adds opponent information to the DataFrame.
    """
    df = df.merge(schedule_df, on='game_id', how='left')
    df['opponent'] = df.apply(lambda row: row['away_team'] if row['team'] == row['home_team'] else row['home_team'], axis=1)
    return df

In [752]:
ws_schedule = clean_columns(mls_schedule, ['league', 'season'])
ws_missing_players = clean_columns(missing_players, ['league', 'season'])
ws_missing_players['player'] = ws_missing_players['player'].apply(ftfy.fix_text) # To correct incorrectly named names

In [306]:
ws_missing_players.to_csv('datasets/missed_players.csv', index=False)

In [754]:
ws_missing_players['reason_category'] = ws_missing_players['reason'].apply(categorize_reason)

df_missing_players = add_opponent_info(ws_missing_players, ws_schedule)

# Aggregation by club
total_missed_by_club = calculate_missed_games(df_missing_players, ['team'])
missed_by_opponent = (
    df_missing_players.groupby('opponent')['game_id']
    .count()
    .reset_index(name='total_missed_games_by_opponent')
    .rename(columns={'opponent': 'Team'})
)
total_missed_by_club = total_missed_by_club.merge(missed_by_opponent, on='Team', how='left').fillna({'total_missed_games_by_opponent': 0})

In [755]:
total_missed_by_club.sort_values(by='total_missed_games', ascending=False).head()

Unnamed: 0,Team,total_missed_games,unique_players_missing,player_most_missed,most_missed_games,total_missed_injured,total_missed_international_duty,total_missed_other,total_missed_suspended,total_missed_games_by_opponent
11,Inter Miami CF,181,23,Gregore,25,159,18,1,3,97
16,New England Revolution,175,23,Dylan Borrero,21,156,11,5,3,100
21,Portland Timbers,163,22,David Ayala,26,150,3,3,7,81
25,Sporting Kansas City,161,19,Kortne Ford,32,155,2,0,4,103
7,DC United,150,18,Martín Rodríguez,29,143,3,0,4,88


In [309]:
total_missed_by_club.to_csv('datasets/missed_games_by_club.csv', index=False)

In [756]:
# Aggregation by player
total_missed_by_player = calculate_missed_games(df_missing_players, ['player', 'team'])

In [758]:
total_missed_by_player.sort_values(by='total_missed_games', ascending=False).head()

Unnamed: 0,player,Team,total_missed_games,total_missed_injured,total_missed_international_duty,total_missed_other,total_missed_suspended
373,Nick DePuy,Nashville SC,33,33,0,0,0
295,Kortne Ford,Sporting Kansas City,32,32,0,0,0
209,Ifunanyachi Achara,Houston Dynamo FC,29,29,0,0,0
330,Martín Rodríguez,DC United,29,29,0,0,0
424,Ryen Jiba,Minnesota United,29,29,0,0,0


In [312]:
total_missed_by_player.to_csv('datasets/missed_games_by_player.csv', index=False)

### Add WhoScored injury data to Fbref stats

#### Team data

In [313]:
compare_teams(df_team_Fbref, total_missed_by_club, df1_name='Fbref Table', df2_name='Missed Table')

Unnamed: 0,Fbref Table Unique,Missed Table Unique
0,Atlanta United FC,Atlanta United
1,Chicago Fire,CF Montreal
2,Columbus Crew SC,Chicago Fire FC
3,D.C. United,Columbus Crew
4,Houston Dynamo,DC United
5,Inter Miami C.F.,Houston Dynamo FC
6,LAFC,Inter Miami CF
7,Minnesota United FC,Los Angeles FC
8,Montreal Impact,Minnesota United
9,Orlando City SC,Orlando City


In [501]:
team_name_mappingWF = {
    "Atlanta United": "Atlanta United FC",
    "CF Montreal": "Montreal Impact",
    "Chicago Fire FC": "Chicago Fire",
    "Columbus Crew": "Columbus Crew SC",
    "DC United": "D.C. United",
    "Houston Dynamo FC": "Houston Dynamo",
    "Inter Miami CF": "Inter Miami C.F.",
    "Los Angeles FC": "LAFC",
    "Minnesota United": "Minnesota United FC",
    "Orlando City": "Orlando City SC",
    "St. Louis City": "St Louis City SC",
    "Vancouver Whitecaps": "Vancouver Whitecaps FC",
}
total_missed_by_club['Team'] = total_missed_by_club['Team'].replace(team_name_mappingWF)
stats_team_mls = align_and_merge_teams(df_team_Fbref, total_missed_by_club)

All Team are present and correctly named.


In [502]:
stats_team_mls.to_csv('datasets/stats_team_mls.csv', index=False)

#### Player data

In [799]:
total_missed_by_player['Team'] = total_missed_by_player['Team'].replace(team_name_mappingWF)

In [6]:
manual_adjustments = {
    ('Carlos Gómez', 'Real Salt Lake'): 'Andrés Gómez',
    ('Machop Malual', 'Atlanta United FC'): 'Machop Chol',
    ('Bode Davis', 'Real Salt Lake'): 'Bode Hidalgo'
}
incorrect_matches = [
    ('Jonathan Mensah', 'San Jose Earthquakes')
]

def preprocess_name(name):
    return unidecode(name).lower()

def get_best_matches(df1, df2, column1, column2, team_column1, team_column2, score_cutoff=80):
    """
    Find the best matches for player names between two DataFrames based on team and name similarity.
    """
    matches = {}
    used_matches = set()

    df1['processed_player'] = df1[column1].apply(preprocess_name)
    df2['processed_player'] = df2[column2].apply(preprocess_name)

    for index, row in df1.iterrows():
        name = row['processed_player']
        team = row[team_column1]

        potential_matches = df2[(df2[team_column2] == team) & (~df2['processed_player'].isin(used_matches))]

        if not potential_matches.empty:
            result = process.extractOne(name, potential_matches['processed_player'], score_cutoff=score_cutoff)
            if result:
                match, score, idx = result
                original_match = df2.iloc[idx][column2]
                if match not in used_matches:
                    matches[(row[column1], team)] = original_match
                    used_matches.add(match)
                else:
                    matches[(row[column1], team)] = None
            else:
                matches[(row[column1], team)] = None
        else:
            matches[(row[column1], team)] = None

    df1.drop(columns=['processed_player'], inplace=True)
    df2.drop(columns=['processed_player'], inplace=True)
    
    return matches

def apply_manual_adjustments(best_matches: dict, adjustments: dict, incorrect: list):
    """
    Apply manual adjustments to the matches dictionary.
    """
    for key, value in adjustments.items():
        if key in best_matches:
            best_matches[key] = value

    for key in incorrect:
        if key in best_matches:
            best_matches[key] = None
            
def merge_player_df(best_matches, total_missed_by_player, player_season_stats):
    """
    Merge the player season stats DataFrame with the total missed games DataFrame.
    """
    # Create a reverse mapping from best_matches
    best_matches_reverse = {v: k for k, v in best_matches.items() if v is not None}
    
    total_missed_by_player['matched_player'] = total_missed_by_player['player'].apply(lambda x: best_matches_reverse.get(x, (None, None))[0])
    
    # Filter out unmatched rows
    matched_total_missed = total_missed_by_player.dropna(subset=['matched_player'])
    
    # Update with matched player names
    matched_total_missed.loc[:, 'player'] = matched_total_missed['matched_player']
    matched_total_missed = matched_total_missed.drop(columns=['matched_player'])
    
    merged_df = player_season_stats.merge(matched_total_missed, how='left', left_on=['player', 'Team'], right_on=['player', 'Team'])
    merged_df = merged_df.fillna(0)

    return merged_df

In [766]:
# best_matches

In [741]:
best_matches = get_best_matches(player_season_stats, total_missed_by_player, 'player', 'player', 'Team', 'Team')
apply_manual_adjustments(best_matches, manual_adjustments, incorrect_matches)

In [806]:
stats_player_mls = merge_player_df(best_matches, total_missed_by_player, player_season_stats)

In [815]:
stats_player_mls.head()

Unnamed: 0,Team,player,nation,pos,age,born,MP,Starts,Min,90s,...,xAG/90,xG+xAG/90,npxG/90,npxG+xAG/90,Logo path,total_missed_games,total_missed_injured,total_missed_international_duty,total_missed_other,total_missed_suspended
0,Atlanta United FC,Ajani Fortune,TRI,MF,20,2002,14,3,374,4.2,...,0.03,0.21,0.18,0.21,datasets/Teamslogo/Atlanta United FC.png,0.0,0.0,0.0,0.0,0.0
1,Atlanta United FC,Amar Sejdic,USA,MF,26,1996,23,14,1061,11.8,...,0.02,0.05,0.03,0.05,datasets/Teamslogo/Atlanta United FC.png,3.0,3.0,0.0,0.0,0.0
2,Atlanta United FC,Andrew Gutman,USA,DF,26,1996,18,16,1385,15.4,...,0.14,0.24,0.11,0.24,datasets/Teamslogo/Atlanta United FC.png,3.0,3.0,0.0,0.0,0.0
3,Atlanta United FC,Brad Guzan,USA,GK,38,1984,27,27,2430,27.0,...,0.0,0.0,0.0,0.0,datasets/Teamslogo/Atlanta United FC.png,8.0,8.0,0.0,0.0,0.0
4,Atlanta United FC,Brooks Lennon,USA,DF,25,1997,33,32,2881,32.0,...,0.16,0.24,0.08,0.24,datasets/Teamslogo/Atlanta United FC.png,0.0,0.0,0.0,0.0,0.0


In [811]:
stats_player_mls.to_csv('datasets/stats_player_mls.csv', index=False)

## Import player list

### For StatsBomb data

In [11]:
def map_nationalities(players_df, flags_iso_df):
    """
    Maps nationality_name to alpha-3 code using the provided flags_iso dataframe.
    """
    nationality_map = flags_iso_df.set_index('Country')['Alpha-3 code'].to_dict()
    players_df['alpha_3_code'] = players_df['nationality_name'].map(nationality_map)
    return players_df

In [12]:
team_name_mapping_picstat = {
    'Atlanta United': 'Atlanta United FC',
    'Austin': 'Austin FC',
    'CF Montréal': 'Montreal Impact',
    'Charlotte': 'Charlotte FC',
    'Cincinnati': 'FC Cincinnati',
    'Columbus Crew': 'Columbus Crew SC',
    'DC United': 'D.C. United',
    'Dallas': 'FC Dallas',
    'Inter Miami': 'Inter Miami C.F.',
    'Los Angeles FC': 'LAFC',
    'Minnesota United': 'Minnesota United FC',
    'New England': 'New England Revolution',
    'New York City': 'New York City FC',
    'New York RB': 'New York Red Bulls',
    'Orlando City': 'Orlando City SC',
    'Saint Louis City': 'St Louis City SC',
    'Seattle Sounders': 'Seattle Sounders FC',
    'Toronto': 'Toronto FC',
    'Vancouver Whitecaps': 'Vancouver Whitecaps FC'
}

In [13]:
flags_iso = pd.read_csv('datasets/flags_iso.csv')

columns_to_read = ['player_url', 'fifa_version', 'short_name', 'long_name', 'club_name', 'league_name', 'nationality_name']
# dataset from : https://www.kaggle.com/datasets/stefanoleone992/ea-sports-fc-24-complete-player-dataset?resource=download&select=male_players.csv
ea_players = pd.read_csv('datasets/male_players.csv', usecols=columns_to_read, low_memory=False)

# FIFA versions 23 and 24
ea2324_players = ea_players[ea_players['fifa_version'].isin([23, 24])].copy()
ea2324_players['player_url'] = 'https://sofifa.com' + ea2324_players['player_url']
ea2324_players = map_nationalities(ea2324_players, flags_iso)

# FIFA 24
ea24_players = ea2324_players[ea2324_players['fifa_version'] == 24].copy()

# MLS players
mls2223_players = ea2324_players[ea2324_players['league_name'] == 'Major League Soccer'].copy()
mls2223_players.rename(columns={'club_name': 'Team'}, inplace=True)
mls2223_players['Team'] = mls2223_players['Team'].replace(team_name_mapping_picstat)

# MLS FIFA 24
mls23_players = mls2223_players[mls2223_players['fifa_version'] == 24].copy()

In [92]:
# Customer has expired, subscription required if you wish to renew. Find a solution to scrape Sofifa
# client = ZenRowsClient("...")

### Download player image

In [43]:
statsBomb_clubs = ['Inter Miami C.F.', 'New York Red Bulls', 'LA Galaxy', 'Charlotte FC', 'Nashville SC', 'FC Cincinnati', 'Toronto FC']
statsBomb_players = mls23_players[mls23_players['club_name'].isin(statsBomb_clubs)].copy()

local_folder = 'datasets/playersImage'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

In [54]:
def download_player_image(row):
    try:
        url = row['player_url']
        response = client.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        script = soup.find('script', type='application/ld+json')
        json_data = script.string
        player_data = json.loads(json_data)

        family_name = player_data['familyName']
        image_url = player_data['image']

        player_id = url.split('/')[4]
        filename = f"{player_id}{family_name}.png"

        image_filename = os.path.normpath(os.path.join(local_folder, filename)).replace("\\", "/")
        response = requests.get(image_url)
        if response.status_code == 200:
            with open(image_filename, 'wb') as f:
                f.write(response.content)
            return image_filename
        else:
            print(f"Failed to download {family_name} image")
            return row['player_url']
    except Exception as e:
        print(f"Error processing {row['short_name']}: {e}")
        return row['player_url']

In [58]:
statsBomb_players['player_url'] = statsBomb_players.apply(download_player_image, axis=1)

Failed to download Jensen image


In [59]:
statsBomb_players.to_csv('datasets/mls23_statsBomb_players.csv', index=False)
statsBomb_players.head()

Unnamed: 0,player_url,fifa_version,short_name,long_name,club_name,league_name,nationality_name
3,datasets/playersImage/158023Messi.png,24.0,L. Messi,Lionel Andrés Messi Cuccittini,Inter Miami,Major League Soccer,Argentina
172,datasets/playersImage/189332Alba Ramos.png,24.0,Jordi Alba,Jordi Alba Ramos,Inter Miami,Major League Soccer,Spain
173,datasets/playersImage/189511Busquets Burgos.png,24.0,Sergio Busquets,Sergio Busquets i Burgos,Inter Miami,Major League Soccer,Spain
329,datasets/playersImage/198219Insigne.png,24.0,L. Insigne,Lorenzo Insigne,Toronto,Major League Soccer,Italy
464,datasets/playersImage/210021Mukhtar.png,24.0,H. Mukhtar,Hany Mukhtar,Nashville SC,Major League Soccer,Germany


### For Fbref data

#### Match Players name

In [9]:
def preprocess_name(name):
    return unidecode(name).lower()

def preprocess_ea_players(players_df):
    players_df = players_df.copy()
    
    players_df['processed_long_name'] = players_df['long_name'].apply(preprocess_name)
    players_df['processed_short_name'] = players_df['short_name'].apply(preprocess_name)
    return players_df

def preprocess_players(players_df, stats_players_df):
    """
    Preprocess player names and add processed columns for matching.
    """
    stats_players_df = stats_players_df.copy()
    
    players_df = preprocess_ea_players(players_df)
    stats_players_df.loc[:, 'processed_player'] = stats_players_df['player'].apply(preprocess_name)
    return players_df, stats_players_df

def drop_preprocess_players(players_df, stats_players_df):
    players_df.drop(columns=['processed_long_name', 'processed_short_name'], inplace=True, errors='ignore')
    stats_players_df.drop(columns=['processed_player'], inplace=True, errors='ignore')
    return players_df, stats_players_df

def find_best_match(player_name, potential_matches, score_cutoff):
    """
    Finds the best match for a player name within the potential matches using fuzzy matching.
    """
    long_name_matches = process.extract(player_name, potential_matches['processed_long_name'], scorer=fuzz.token_sort_ratio)
    short_name_matches = process.extract(player_name, potential_matches['processed_short_name'], scorer=fuzz.token_sort_ratio)
    #print(f'player_name : {player_name}, long_name_matches : {long_name_matches}, short_name_matches : {short_name_matches}')
    best_match = None
    best_score = 0
    
    for match, score, idx in long_name_matches + short_name_matches:
        if score >= score_cutoff and score > best_score:
            best_match = (match, score, idx)
            best_score = score
    return best_match

def match_players(df1, df2, use_nation=True, score_cutoff=70):
    """
    Matches players from df1 with df2 based on name, team, and optionally nation.
    """
    matches = {}
    unmatched_players = []

    for idx, row in df1.iterrows():
        player_name = row['processed_player']
        team = row['Team']
        filters = (df2['Team'] == team)
        
        if use_nation:
            nation = row['nation']
            filters &= (df2['alpha_3_code'] == nation)

        potential_matches = df2.loc[filters].reset_index()
        
        if not potential_matches.empty:
            result = find_best_match(player_name, potential_matches, score_cutoff)
            if result:
                match, score, match_idx = result
                original_match = potential_matches.iloc[match_idx]['player_url']
                matches[(row['player'], team, nation)] = original_match
            else:
                unmatched_players.append((row['player'], team, nation))
        else:
            unmatched_players.append((row['player'], team, nation))
            
    return matches, unmatched_players

def find_best_matches(df1, df2, score_cutoff=70):
    """
    Finds the best matches for players in stats_player_mls within mls23_players based on name, team, and nation.
    """
    matches, unmatched_players_1 = match_players(df1, df2, use_nation=True, score_cutoff=score_cutoff)
    unmatched_players_2 = []

    for player, team, nation in unmatched_players_1:
        player_name = preprocess_name(player)
        teams_df1 = df1[df1['player'] == player]['Team'].unique()

        for team_df1 in teams_df1:
            potential_matches = df2[df2['Team'] == team_df1].reset_index()
            if not potential_matches.empty:
                result = find_best_match(player_name, potential_matches, score_cutoff)
                if result:
                    match, score, match_idx = result
                    original_match = potential_matches.iloc[match_idx]['player_url']
                    matches[(player, team, nation)] = original_match
                    break
            else:
                matches[(player, team, nation)] = None

        if (player, team, nation) not in matches:
            unmatched_players_2.append((player, team, nation, player_name))

    return matches, unmatched_players_2

def find_best_matches_global(unmatched_players, df_global, score_cutoff=75):
    """
    Finds the best matches for unmatched players globally without considering team.
    """
    matches = {}
    final_unmatched_players = []
    
    unmatched_players_df = pd.DataFrame(unmatched_players, columns=['player', 'Team', 'nation', 'processed_player'])
    #unmatched_players_df['processed_player'] = unmatched_players_df['player'].apply(preprocess_name)
    #df_global, unmatched_players_df = preprocess_players(df_global, unmatched_players_df)

    for idx, row in unmatched_players_df.iterrows():
        player_name = row['processed_player']
        nation = row['nation']
        potential_matches = df_global[df_global['alpha_3_code'] == nation].reset_index()
        
        if not potential_matches.empty:
            result = find_best_match(player_name, potential_matches, score_cutoff)
            if result:
                match, score, match_idx = result
                original_match = potential_matches.iloc[match_idx]['player_url']
                matches[(row['player'], row['Team'], nation)] = original_match
            else:
                final_unmatched_players.append((row['player'], row['Team'], nation))
        else:
            final_unmatched_players.append((row['player'], row['Team'], nation))
    
    df_global, unmatched_players_df = drop_preprocess_players(df_global, unmatched_players_df)
    return matches, final_unmatched_players

In [14]:
mls2223_players, stats_player_mls = preprocess_players(mls2223_players, stats_player_mls)
mls22_preprocess_players = mls2223_players[mls2223_players['fifa_version'] == 23]
mls23_preprocess_players = mls2223_players[mls2223_players['fifa_version'] == 24]

matches, unmatched_players = find_best_matches(stats_player_mls, mls23_preprocess_players, 65)

# Retry unmatched players with fifa_version 23
unmatched_players_df = pd.DataFrame(unmatched_players, columns=['player', 'Team', 'nation', 'processed_player'])
matches2, unmatched_players2 = find_best_matches(unmatched_players_df, mls22_preprocess_players, 65)

#df2, df1 = drop_preprocess_players(df2, df1)

ea24_players = preprocess_ea_players(ea24_players)
matches_global, unmatched_players_global = find_best_matches_global(unmatched_players2, ea24_players)

# Combine all matches
all_matches = {**matches, **matches2, **matches_global}

In [15]:
manual_links = [
    ("Erik López", "Atlanta United FC", "PAR", "https://sofifa.com/player/12902127/erik-lopez/210064/"),
    ("Luke Brennan", "Atlanta United FC", "USA", "https://sofifa.com/player/278490/luke-brennan/240028"),
    ("Machop Malual", "Atlanta United FC", "SSD", "https://sofifa.com/player/13628076/machop-chol/240019/"),
    ("Ilias Iliadis", "Montreal Impact", "GRE", "https://sofifa.com/player/270876/ilias-iliadis/240025"),
    ("Cucho", "Columbus Crew SC", "COL", "https://sofifa.com/player/237034/juan-camilo-hernandez-suarez/240049"),
    ("Keegan Hughes", "Columbus Crew SC", "USA", "https://sofifa.com/player/273552/keegan-hughes/230021"),
    ("Kristian Fletcher", "D.C. United", "USA", "https://sofifa.com/player/271986/kristian-fletcher/240001"),
    ("Matai Akinmboni", "D.C. United", "USA", "https://sofifa.com/player/271756/matai-akinmboni/240037/"),
    ("Stiven Jimenez", "FC Cincinnati", "USA", "https://sofifa.com/player/274542/stiven-jimenez/240049"),
    ("Israel Boatwright", "Inter Miami C.F.", "DOM", "https://sofifa.com/player/276169/israel-boatwright/240028"),
    ("Lawson Sunderland", "Inter Miami C.F.", "USA", "https://sofifa.com/player/276821/lawson-sunderland/240028"),
    ("Shanyder Borgelin", "Inter Miami C.F.", "HAI", "https://sofifa.com/player/274545/shanyder-borgelin/240047"),
    ("Aaron Bibout", "LA Galaxy", "CMR", "https://sofifa.com/player/278479/aaron-bibout/240031"),
    ("Douglas Costa", "LA Galaxy", "BRA", "https://sofifa.com/player/14376565/douglas-costa-de-souza/210047/"),
    ("Mikael Marques", "Minnesota United FC", "SWE", "https://sofifa.com/player/274268/mikael-marques/230015"),
    ("Ronald Donkor", "New York Red Bulls", "MLI", "https://sofifa.com/player/275294/ronald-donkor/240021"),
    ("Gabe Segal", "New York City FC", "USA", "https://sofifa.com/player/273553/gabriel-segal/230037"),
    ("Jacob Jackson", "New England Revolution", "USA", "https://sofifa.com/player/267922/jacob-jackson/230011"),
    ("Joshua Bolma", "New England Revolution", "GHA", "https://sofifa.com/player/274666/joshua-bolma/240028"),
    ("Jozy Altidore", "New England Revolution", "USA", "https://sofifa.com/player/176237/jozy-altidore/230041"),
    ("Bode Davis", "Real Salt Lake", "USA", "https://sofifa.com/player/259039/bode-davis/220069"),
    ("Ilijah Paul", "Real Salt Lake", "USA", "https://sofifa.com/player/275014/ilijah-paul/240047/"),
    ("Zavier Gozo", "Real Salt Lake", "USA", "https://sofifa.com/player/278875/zavier-gozo/240034"),
    ("Cam Cilley", "San Jose Earthquakes", "USA", "https://sofifa.com/player/274275/cameron-cilley/230042"),
    ("Danny Flores", "Sporting Kansas City", "USA", "https://sofifa.com/player/273549/danny-flores/240004"),
    ("Adam Pearlman", "Toronto FC", "USA", "https://sofifa.com/player/278682/adam-pearlman/240030"),
    ("Hugo-Hilaire Mbongue Mbongue", "Toronto FC", "CAN", "https://sofifa.com/player/272667/hugo-mbongue/240025/"),
    ("Junior Hoilett", "Vancouver Whitecaps FC", "CAN", "https://sofifa.com/player/189462/junior-hoilett/240026")
]

# Update all_matches with the manual links
for player, team, nation, link in manual_links:
    all_matches[(player, team, nation)] = link

#### Import player logo

In [19]:
def get_image_url_from_sofifa(sofifa_url):
    try:
        response = client.get(sofifa_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        script = soup.find('script', type='application/ld+json')
        
        if script:
            json_data = json.loads(script.string)
            image_url = json_data.get('image', None)
            return image_url
        else:
            print(f"Warning: JSON-LD script not found for {sofifa_url}")
            return None

    except Exception as e:
        print(f"Error processing {sofifa_url}: {e}")
        return None

# Map players to their image URLs
def map_player_to_logo_url(row):
    key = (row['player'], row['Team'], row['nation'])
    sofifa_url = all_matches.get(key, None)
    
    if sofifa_url:
        return get_image_url_from_sofifa(sofifa_url)
    else:
        return None

In [20]:
client = ZenRowsClient("...") 

In [21]:
def is_valid_image_url(url):
    try:
        response = requests.head(url)
        if response.status_code == 200:
            return True
        else:
            return False
    except Exception:
        return False

def get_image_url_from_sofifa(sofifa_url):
    try:
        response = client.get(sofifa_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        script = soup.find('script', type='application/ld+json')
        
        if script:
            json_data = json.loads(script.string)
            image_url = json_data.get('image', None)
            
            if image_url and is_valid_image_url(image_url):
                return image_url
            else:
                print(f"Warning: Invalid or missing image URL for {sofifa_url}, using default image.")
                return "assets/footballer.png"
        else:
            print(f"Warning: JSON-LD script not found for {sofifa_url}, using default image.")
            return "assets/footballer.png"

    except Exception as e:
        print(f"Error processing {sofifa_url}: {e}, using default image.")
        return "assets/footballer.png"

def map_player_to_logo_url(row):
    key = (row['player'], row['Team'], row['nation'])
    sofifa_url = all_matches.get(key, None)
    
    if sofifa_url:
        return get_image_url_from_sofifa(sofifa_url)
    else:
        return "assets/footballer.png"

In [25]:
stats_player_mls['logo_url'] = stats_player_mls.apply(map_player_to_logo_url, axis=1)



### Download images locally

In [2]:
local_folder = 'plotly-dash/assets/playersImage'
placeholder_image = "assets/footballer.png"
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

In [11]:
def download_player_image(row):
    try:
        url = row['logo_url']
        if pd.isna(url) or not url.startswith('http'):
            print(f"Invalid or missing URL for {row['player']}, using default image.")
            return placeholder_image
            
        response = requests.get(url, stream=True)

        if response.status_code == 200:
            family_name = row['player'].replace(" ", "_")
            player_team = row['Team'].replace(" ", "_")
            filename = f"{family_name}_{player_team}.png"

            image_filename = os.path.normpath(os.path.join(local_folder, filename)).replace("\\", "/")

            with open(image_filename, 'wb') as f:
                f.write(response.content)

            return image_filename
        
        else:
            print(f"Failed to download {row['player']} image, using default image.")
            return placeholder_image

    except Exception as e:
        print(f"Error processing {row['player']}: {e}")
        return placeholder_image

In [12]:
stats_player_mls['local_image_path'] = stats_player_mls.apply(download_player_image, axis=1)

Invalid or missing URL for Erik López, using default image.
Invalid or missing URL for Luke Brennan, using default image.
Invalid or missing URL for Machop Malual, using default image.
Invalid or missing URL for Marlon Vargas, using default image.
Invalid or missing URL for Yosuke Hanya, using default image.
Invalid or missing URL for Keegan Hughes, using default image.
Invalid or missing URL for Stiven Jimenez, using default image.
Invalid or missing URL for Eugene Ansah, using default image.
Invalid or missing URL for Israel Boatwright, using default image.
Invalid or missing URL for Lawson Sunderland, using default image.
Invalid or missing URL for Aaron Bibout, using default image.
Invalid or missing URL for Douglas Costa, using default image.
Invalid or missing URL for Mikael Marques, using default image.
Invalid or missing URL for Ibrahim Kasule, using default image.
Invalid or missing URL for Julian Hall, using default image.
Invalid or missing URL for O'Vonte Mullings, using de

In [9]:
stats_player_mls.to_csv('datasets/stats_player_mls.csv', index=False)

### Optimizing Player Image Sizes

In [1]:
from PIL import Image
import os

In [2]:
local_folder = 'plotly-dash/assets/playersImage'
target_size = (50, 50)

In [6]:
for filename in os.listdir(local_folder):
    if filename.endswith(".png"):
        image_path = os.path.join(local_folder, filename)
        with Image.open(image_path) as img:
            img = img.resize(target_size, Image.Resampling.LANCZOS)
            img.save(image_path, format='PNG', optimize=True, quality=85)