In [1]:
import re
import requests
import pandas as pd
import os
import soccerdata as sd
import numpy as np
import json

from PIL import Image
from bs4 import BeautifulSoup
from zenrows import ZenRowsClient

In [2]:
def scraping_header(url, headers=None):
    """
    Scrape content from the provided URL with optional custom headers and return the parsed HTML content of the page.
    """
    if headers is None:
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')

def compare_teams(df1, df2, column='Team', df1_name='DF1', df2_name='DF2'):
    """
    Compare specified columns between two dataframes to find unique data in each and diplay the difference from each dataframe.
    """
    if column not in df1 or column not in df2:
        raise ValueError(f"The column '{column}' must exist in both dataframes.")
    
    # Get unique values from each dataframe
    df1_unique = df1[~df1[column].isin(df2[column])][column].drop_duplicates().sort_values().reset_index(drop=True) # ascending=False
    df2_unique = df2[~df2[column].isin(df1[column])][column].drop_duplicates().sort_values().reset_index(drop=True) # ascending=False

    if df1_unique.empty and df2_unique.empty:
        print(f"All {column} entries are present and correctly named in both {df1_name} and {df2_name}.")
        return None
    
    comparison_df = pd.DataFrame({
        f'{df1_name} Unique': df1_unique,
        f'{df2_name} Unique': df2_unique
    }).fillna('')
    
    return comparison_df


def align_and_merge_teams(df1, df2, column='Team'):
    """
    Merges the two dataframes according to the team
    """
    
    if column not in df1.columns or column not in df2.columns:
        raise ValueError(f"The column '{column}' must exist in both dataframes.")
        
    df1_sorted = df1.sort_values(by=column).reset_index(drop=True)
    df2_sorted = df2.sort_values(by=column).reset_index(drop=True)
    
    mapping_dict = dict(zip(df1_sorted[column], df2_sorted[column]))
    
    df1_aligned = df1.copy()
    df1_aligned[column] = df1[column].map(mapping_dict).fillna(df1[column])
    
    missing_teams = set(df1_aligned[column]) - set(df2_sorted[column])
    if missing_teams:
        print("Missing teams in alignment:", missing_teams)
    else:
        print("All", column, "are present and correctly named.")
        
    merged_df = pd.merge(df1_aligned, df2_sorted, on=column, how='left', suffixes=('', '_drop'))
    merged_df.drop([col for col in merged_df.columns if '_drop' in col], axis=1, inplace=True)
    
    return merged_df

## Import players salary

In [3]:
def import_players_salary():
    soup = scraping_header("https://mlsplayers.org/resources/salary-guide")

    table = soup.find('table', {'id': 'salary-report'})
    rows = table.find_all('tr')

    data = []
    for row in rows[1:]:  # Skip the header row
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append(cols)  # Get rid of empty values

    df_players_salary = pd.DataFrame(data, columns=['FirstName', 'LastName', 'Team', 'Position', 'SalaryBase ($)', 'SalaryGuaranteed ($)']) 
    
    df_players_salary.isnull().sum() # Check if there is null values
    
    df_players_salary['SalaryBase ($)'] = pd.to_numeric(df_players_salary['SalaryBase ($)'].str.replace('[$,]', '', regex=True))
    df_players_salary['SalaryGuaranteed ($)'] = pd.to_numeric(df_players_salary['SalaryGuaranteed ($)'].str.replace('[$,]', '', regex=True))
    
    df_players_salary.to_csv('datasets/players_salary.csv', index=False)
    return df_players_salary

In [4]:
# Function no longer used in our case, as the site has been updated to show salaries for 2024. It's the 2023 salaries 
# we're interested in at the moment

# df_players_salary = import_players_salary() 

In [4]:
df_players_salary = pd.read_csv('datasets/players_salary.csv') # read the csv file created with 2023 data
df_players_salary.head()

Unnamed: 0,FirstName,LastName,Team,Position,SalaryBase ($),SalaryGuaranteed ($)
0,Luis,Abram,Atlanta United,D,556364.0,695977.0
1,Lalas,Abubakar,Colorado Rapids,D,665000.0,702125.0
2,Nicolás,Acevedo,New York City FC,D-M,230000.0,274800.0
3,Alonso,Aceves,Chicago Fire,D,330000.0,368900.0
4,Ifunanyachi,Achara,Houston Dynamo,F,85444.0,85444.0


## Create Team expenses table

In [5]:
salary_column = 'SalaryGuaranteed ($)'
total_salary_by_club = df_players_salary.groupby('Team')[salary_column].sum().sort_values(ascending=False).reset_index()
total_salary_by_club = total_salary_by_club[total_salary_by_club['Team'] != "Major League Soccer"]

## Add colors Team

In [6]:
soup = scraping_header("https://teamcolorcodes.com/soccer/mls-team-color-codes/")
teams = soup.find_all('a', class_='team-button')

team_colors = {}
for team in teams:
    name = team.get_text().strip()
    main_color = team['style'].split(';')[0].split(':')[-1].strip()
    secondary_color = team['style'].split(';')[1].split(':')[-1].replace('4px solid ', '').strip()
    team_colors[name] = {'Main Color': main_color, 'Secondary Color': secondary_color}

df_teams = pd.DataFrame.from_dict(team_colors, orient='index').reset_index()
df_teams.columns = ['Team', 'MainColor', 'SecondaryColor']

In [7]:
manual_teams = {
    'Nashville SC': {'Main Color': '#ECE83A', 'Secondary Color': '#1F1646'},
    'St. Louis City SC': {'Main Color': '#E0004D', 'Secondary Color': '#0C2340'},
    'Charlotte FC': {'Main Color': '#000000', 'Secondary Color': '#1A85C8'},
}
team_name_mappingSC = {
    "Vancouver Whitecaps": "Vancouver Whitecaps FC",
    "Minnesota United": "Minnesota United FC",
    "LAFC": "Los Angeles FC",
    "DC United": "D.C. United",
    "Columbus Crew": "Columbus Crew SC",
    "Inter Miami": "Inter Miami CF",
    "FC Cincinnati": "Cincinnati FC",
    "CF Montreal": "Montreal Impact",
}

In [8]:
team_colors.update(manual_teams)
df_teams = pd.DataFrame.from_dict(team_colors, orient='index').reset_index()
df_teams.columns = ['Team', 'MainColor', 'SecondaryColor']
total_salary_by_club['Team'] = total_salary_by_club['Team'].replace(team_name_mappingSC)

In [9]:
compare_teams(total_salary_by_club, df_teams, df1_name='Team Salary', df2_name='Team Colors')
df_teams = align_and_merge_teams(total_salary_by_club, df_teams)

All Team entries are present and correctly named in both Team Salary and Team Colors.
All Team are present and correctly named.


In [10]:
df_teams.to_csv('datasets/MLS_team_colors.csv', index=False)

## Final Table

In [11]:
mls_tables = pd.read_html('https://en.wikipedia.org/wiki/Template:2023_Major_League_Soccer_season_table')
mls_table = mls_tables[0]
mls_table = mls_table.rename(columns={mls_table.columns[1]: 'Team'})
mls_table = mls_table.drop(mls_table.columns[-1], axis=1)

mls_table['GD'] = mls_table['GD'].replace({'−': '-'}, regex=True)
mls_table['GD'] = pd.to_numeric(mls_table['GD'])

mls_table.head()

Unnamed: 0,Pos,Team,Pld,W,L,T,GF,GA,GD,Pts
0,1,FC Cincinnati (S),34,20,5,9,57,39,18,69
1,2,Orlando City SC,34,18,7,9,55,39,16,63
2,3,Columbus Crew (C),34,16,9,9,67,46,21,57
3,4,St. Louis City SC,34,17,12,5,62,45,17,56
4,5,Philadelphia Union,34,15,9,10,57,41,16,55


In [12]:
compare_teams(mls_table, df_teams, df1_name='MLS Table', df2_name='Team Colors')

Unnamed: 0,MLS Table Unique,Team Colors Unique
0,Vancouver Whitecaps FC (V),Vancouver Whitecaps FC
1,Inter Miami CF (L),Montreal Impact
2,Houston Dynamo FC (U),Inter Miami CF
3,FC Cincinnati (S),Houston Dynamo
4,Columbus Crew (C),Columbus Crew SC
5,Chicago Fire FC,Cincinnati FC
6,CF Montréal,Chicago Fire
7,Atlanta United FC,Atlanta United


In [13]:
team_name_mappingSC = {
    "FC Cincinnati (S)": "Cincinnati FC",
    "CF Montréal": "Montreal Impact",
}
mls_table['Team'] = mls_table['Team'].replace(team_name_mappingSC)
mls_table = align_and_merge_teams(mls_table, df_teams)

All Team are present and correctly named.


## Teams logo

In [8]:
folder_destination = 'datasets/Teamslogo/'

In [9]:
soup = scraping_header("https://www.sportslogos.net/teams/list_by_league/9/major_league_soccer/mls/logos/")

if not os.path.exists(folder_destination):
    os.makedirs(folder_destination)

# Download logos
logo_wall = soup.find('ul', class_='logoWall')
logos = logo_wall.find_all('li', style=True)
for logo in logos:
    team_name = logo.find('a')['title'].replace(' Logos', '')
    logo_path = logo.find('img')['src']
    logo_response = requests.get(logo_path, stream=True)
    extension = os.path.splitext(logo_path)[1]
    filename = f"{team_name}{extension}"
    file_path = os.path.join(folder_destination, filename)

    if logo_response.status_code == 200:
        with open(file_path, 'wb') as f:
            for chunk in logo_response.iter_content(chunk_size=128):
                f.write(chunk)

team_logos = []
for filename in os.listdir(folder_destination):
    if filename.endswith('.gif'):
        base_filename = filename[:-4].rstrip()
        new_filename = f"{base_filename}.png"
        
        original_filepath = os.path.join(folder_destination, filename)
        
        with Image.open(original_filepath) as img:
            new_filepath = os.path.join(folder_destination, new_filename)
            img.save(new_filepath)
        
        os.remove(original_filepath)
        
        team_name = base_filename.strip()
        image_path = os.path.join(folder_destination, new_filename)
        
        team_logos.append({'Team': team_name, 'Logo path': image_path})

logos_df = pd.DataFrame(team_logos)
logos_df = logos_df[logos_df['Team'] != 'San Diego FC']

In [21]:
team_name_mappingSC = {
    "CF Montreal": "Montreal Impact",
}
logos_df['Team'] = logos_df['Team'].replace(team_name_mappingSC) 
mls_table = align_and_merge_teams(mls_table, logos_df)
mls_table = align_and_merge_teams(total_salary_by_club, mls_table)

### Enhance and adjust image display

In [65]:
def remove_white_background(img_path, output_path):
    """
    Changes the background of the image from white to transparent
    """
    with Image.open(img_path) as img:
        # Convert the image to RGBA mode to access the alpha channel
        img = img.convert("RGBA")
        datas = img.getdata()
        newData = []
        
        for item in datas:
            if item[0] > 220 and item[1] > 220 and item[2] > 220:
                newData.append((255, 255, 255, 0))
            else:
                newData.append(item)

        img.putdata(newData)
        bbox = img.getbbox()
        img_cropped = img.crop(bbox)
        img_cropped.save(output_path)

In [66]:
for filename in os.listdir(folder_destination):
    if filename.endswith('.png'):
        file_path = os.path.join(folder_destination, filename)
        remove_white_background(file_path, file_path)

In [67]:
mls_table.sort_values(by='Pos', ascending=True)

Unnamed: 0,Team,SalaryGuaranteed ($),Pos,Pld,W,L,T,GF,GA,GD,Pts,MainColor,SecondaryColor,Logo path
20,Colorado Rapids,14348728.0,1,34,20,5,9,57,39,18,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png
28,Orlando City SC,9642918.0,2,34,18,7,9,55,39,16,63,#61259E,#FFE293,datasets/Teamslogo/Orlando City SC.png
13,D.C. United,15313747.0,3,34,16,9,9,67,46,21,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png
26,St Louis City SC,11186787.0,4,34,17,12,5,62,45,17,56,#E0004D,#0C2340,datasets/Teamslogo/St Louis City SC.png
24,Philadelphia Union,13419282.0,5,34,15,9,10,57,41,16,55,#002D55,#B38707,datasets/Teamslogo/Philadelphia Union.png
10,New England Revolution,17013587.0,6,34,15,9,10,58,46,12,55,#E51938,#002B5C,datasets/Teamslogo/New England Revolution.png
6,Seattle Sounders FC,19185352.0,7,34,14,9,11,41,32,9,53,#236192,#658D1B,datasets/Teamslogo/Seattle Sounders FC.png
3,LAFC,20841863.0,8,34,14,10,10,54,39,15,52,#000000,#C39e6d,datasets/Teamslogo/LAFC.png
8,Houston Dynamo,17459127.0,9,34,14,11,9,51,38,13,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png
5,Atlanta United FC,19847184.0,10,34,13,9,12,66,53,13,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png


In [68]:
mls_table.to_csv('datasets/MLS_23_table.csv', index=False)

## Scraping Fbref

In [95]:
soup = scraping_header("https://fbref.com/en/comps/22/2023/2023-Major-League-Soccer-Stats")
table = soup.find('table', {'id': 'stats_squads_standard_for'})
rows = table.find_all('tr')

data = []
for row in rows[2:]:  # Skip the headers row
    cols = row.find_all('th')
    cols = cols + row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)  # Get rid of empty values
    
df_team_FBref_for = pd.DataFrame(data, columns=['Team', '# Pl', 'Age', 'Poss', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 
                                            'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 
                                            'PrgC', 'PrgP', 'Gls/90', 'Ast/90', 'G+A/90', 'G-PK/90', 'G+A-PK/90', 'xG/90', 
                                            'xAG/90', 'xG+xAG/90', 'npxG/90', 'npxG+xAG/90'])

In [96]:
table = soup.find('table', {'id': 'stats_squads_standard_against'})
rows = table.find_all('tr')

data = []
for row in rows[2:]:  # Skip the headers row
    cols = row.find_all('th')
    cols = cols + row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)  # Get rid of empty values
    
df_team_FBref_against = pd.DataFrame(data, columns=['Team', '# Pl', 'Age vs', 'Poss vs', 'MP', 'Starts', 'Min', '90s', 'Gls vs', 'Ast vs', 
                                            'G+A vs', 'G-PK vs', 'PK vs', 'PKatt vs', 'CrdY vs', 'CrdR vs', 'xG vs', 'npxG vs', 'xAG vs', 'npxG+xAG vs', 
                                            'PrgC vs', 'PrgP vs', 'Gls/90 vs', 'Ast/90 vs', 'G+A/90 vs', 'G-PK/90 vs', 'G+A-PK/90 vs', 'xG/90 vs', 
                                            'xAG/90 vs', 'xG+xAG/90 vs', 'npxG/90 vs', 'npxG+xAG/90'])
df_team_FBref_against['Team'] = df_team_FBref_against['Team'].str.replace("vs ", "")

In [97]:
df_team_Fbref = align_and_merge_teams(df_team_FBref_for, df_team_FBref_against)

All Team are present and correctly named.


In [73]:
compare_teams(df_team_Fbref, mls_table, df1_name='Fbref Table', df2_name='Logo Table')

Unnamed: 0,Fbref Table Unique,Logo Table Unique
0,Atlanta Utd,Atlanta United FC
1,Austin,Austin FC
2,CF Montréal,Charlotte FC
3,Charlotte,Chicago Fire
4,Crew,Colorado Rapids
5,Dynamo FC,Columbus Crew SC
6,Fire,Houston Dynamo
7,Inter Miami,Inter Miami C.F.
8,Minnesota Utd,Minnesota United FC
9,NE Revolution,Montreal Impact


In [140]:
team_name_mappingFL = {
    "CF Montréal": "Montreal Impact",
    "Crew": "Columbus Crew SC",
    "Dynamo FC": "Houston Dynamo",
    "Inter Miami": "Inter Miami C.F.",
    "Rapids": "Colorado Rapids",
    "Nashville": "Nashville SC",
    "NY Red Bulls": "New York Red Bulls",
    "Atlanta Utd": "Atlanta United FC",
    "Austin": "Austin FC",
    "Charlotte": "Charlotte FC",
    "Columbus Crew": "Columbus Crew SC",
    "Los Angeles FC": "LAFC",
    "Minnesota Utd": "Minnesota United FC",
    "NYCFC": "New York City FC",
    "New England": "New England Revolution",
    "Orlando City": "Orlando City SC",
    "Philadelphia": "Philadelphia Union",
    "San Jose": "San Jose Earthquakes",
    "Seattle": "Seattle Sounders FC",
    "Sporting KC": "Sporting Kansas City",
    "St. Louis": "St Louis City SC",
    "Vancouver": "Vancouver Whitecaps FC"
}
df_team_Fbref['Team'] = df_team_Fbref['Team'].replace(team_name_mappingFL)
df_team_Fbref = align_and_merge_teams(df_team_Fbref, mls_table)
df_team_Fbref.drop(columns=["Pld", "GF", "GA", "GD"])

All Team are present and correctly named.


Unnamed: 0,Team,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,Pos,W,L,T,Pts,MainColor,SecondaryColor,Logo path,Diff_Gls,Diff_Gls vs
0,Atlanta United FC,33,25.8,55.6,34,374,3060,34.0,64,44,...,10,13,9,12,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png,15.3,6.0
1,Austin FC,26,27.9,51.2,34,374,3060,34.0,48,37,...,25,10,15,9,39,#000000,#00b140,datasets/Teamslogo/Austin FC.png,5.5,4.1
2,Montreal Impact,31,24.4,48.3,34,374,3060,34.0,33,22,...,20,12,17,5,41,#2B63AD,#373536,datasets/Teamslogo/CF Montreal.png,-4.8,-1.2
3,Charlotte FC,32,27.2,52.6,34,374,3060,34.0,43,29,...,19,10,11,13,43,#000000,#1A85C8,datasets/Teamslogo/Charlotte FC.png,6.0,0.3
4,Colorado Rapids,28,25.8,57.1,34,374,3060,34.0,63,49,...,1,20,5,9,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png,5.3,3.7
5,Columbus Crew SC,31,28.4,49.5,34,374,3060,34.0,44,27,...,28,5,17,12,27,#862633,#8BB8E8,datasets/Teamslogo/Columbus Crew SC.png,-1.8,5.4
6,Houston Dynamo,25,27.2,51.6,34,374,3060,34.0,51,36,...,9,14,11,9,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png,6.1,-4.0
7,D.C. United,27,26.3,48.8,34,374,3060,34.0,56,35,...,3,16,9,9,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png,-0.3,-3.7
8,FC Cincinnati,29,25.8,48.8,34,374,3060,34.0,41,34,...,23,10,14,10,40,#ee1a39,#231f20,datasets/Teamslogo/FC Cincinnati.png,3.2,-3.6
9,FC Dallas,26,26.5,46.1,34,374,3060,34.0,36,29,...,14,11,10,13,46,#BF0D3E,#00205B,datasets/Teamslogo/FC Dallas.png,-2.2,-2.1


In [11]:
def convert_columns_to_numeric(df, columns):
    """
    Convert specified columns in the dataframe to numeric
    """
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
    return df

def calculate_diff_columns(df, cols_to_diff):
    """
    Calculate the difference between pairs of columns and create a new column for each pair
    """
    for col1, col2 in cols_to_diff:
        diff_col_name = f"Diff_{col1}"
        df[diff_col_name] = df[col1] - df[col2]
    return df

In [142]:
# Convert specified columns to numeric
columns_to_convert = ['Gls', 'Gls vs', 'xG', 'xG vs']
df_team_Fbref = convert_columns_to_numeric(df_team_Fbref, columns_to_convert)

# Calculate differences and create new columns
columns_to_difference = [('Gls', 'xG'), ('Gls vs', 'xG vs')]
df_team_Fbref = calculate_diff_columns(df_team_Fbref, columns_to_difference)

In [143]:
df_team_Fbref

Unnamed: 0,Team,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,T,GF,GA,GD,Pts,MainColor,SecondaryColor,Logo path,Diff_Gls,Diff_Gls vs
0,Atlanta United FC,33,25.8,55.6,34,374,3060,34.0,64,44,...,12,66,53,13,51,#80000A,#A19060,datasets/Teamslogo/Atlanta United FC.png,15.3,6.0
1,Austin FC,26,27.9,51.2,34,374,3060,34.0,48,37,...,9,49,55,-6,39,#000000,#00b140,datasets/Teamslogo/Austin FC.png,5.5,4.1
2,Montreal Impact,31,24.4,48.3,34,374,3060,34.0,33,22,...,5,36,52,-16,41,#2B63AD,#373536,datasets/Teamslogo/CF Montreal.png,-4.8,-1.2
3,Charlotte FC,32,27.2,52.6,34,374,3060,34.0,43,29,...,13,45,52,-7,43,#000000,#1A85C8,datasets/Teamslogo/Charlotte FC.png,6.0,0.3
4,Colorado Rapids,28,25.8,57.1,34,374,3060,34.0,63,49,...,9,57,39,18,69,#003087,#FE5000,datasets/Teamslogo/Colorado Rapids.png,5.3,3.7
5,Columbus Crew SC,31,28.4,49.5,34,374,3060,34.0,44,27,...,12,26,54,-28,27,#862633,#8BB8E8,datasets/Teamslogo/Columbus Crew SC.png,-1.8,5.4
6,Houston Dynamo,25,27.2,51.6,34,374,3060,34.0,51,36,...,9,51,38,13,51,#F68712,#8DC6ED,datasets/Teamslogo/Houston Dynamo.png,6.1,-4.0
7,D.C. United,27,26.3,48.8,34,374,3060,34.0,56,35,...,9,67,46,21,57,#231f20,#FEF200,datasets/Teamslogo/D.C. United.png,-0.3,-3.7
8,FC Cincinnati,29,25.8,48.8,34,374,3060,34.0,41,34,...,10,45,49,-4,40,#ee1a39,#231f20,datasets/Teamslogo/FC Cincinnati.png,3.2,-3.6
9,FC Dallas,26,26.5,46.1,34,374,3060,34.0,36,29,...,13,41,37,4,46,#BF0D3E,#00205B,datasets/Teamslogo/FC Dallas.png,-2.2,-2.1


In [144]:
df_team_Fbref.to_csv('datasets/squad_stats_FBref.csv', index=False)

### Player Standard Stats 

In [25]:
fbref = sd.FBref(leagues=['US-MLS'], seasons=['2324'])
player_season_stats = fbref.read_player_season_stats(stat_type="standard")
player_season_stats.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,nation,pos,age,born,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,MP,Starts,Min,90s,Gls,Ast,...,Gls,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG
league,season,team,player,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
US-MLS,2324,Atlanta Utd,Ajani Fortune,TRI,MF,20,2002,14,3,374,4.2,0,1,...,0.0,0.24,0.24,0.0,0.24,0.18,0.03,0.21,0.18,0.21
US-MLS,2324,Atlanta Utd,Amar Sejdic,USA,MF,26,1996,23,14,1061,11.8,0,0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.02,0.05,0.03,0.05
US-MLS,2324,Atlanta Utd,Andrew Gutman,USA,DF,26,1996,18,16,1385,15.4,3,2,...,0.19,0.13,0.32,0.19,0.32,0.11,0.14,0.24,0.11,0.24
US-MLS,2324,Atlanta Utd,Brad Guzan,USA,GK,38,1984,27,27,2430,27.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
US-MLS,2324,Atlanta Utd,Brooks Lennon,USA,DF,25,1997,33,32,2881,32.0,4,9,...,0.12,0.28,0.41,0.12,0.41,0.08,0.16,0.24,0.08,0.24


In [22]:
def clean_and_rename_columns(df):
    """
    Reset index of the dataframe, remove unnecessary columns and rename to simplify
    """
    df = df.reset_index()
    df.drop(columns=['league', 'season'], errors='ignore', inplace=True)
    
    new_columns = []
    for col in df.columns:
        if col[1] == '':
            new_columns.append(col[0])

        elif col[0] == 'Per 90 Minutes':
            new_columns.append(col[1] + '/90')
        
        else:
            new_columns.append(col[1])
    
    df.columns = new_columns
    df = df.rename(columns={"team": "Team"})
    return df

In [26]:
player_season_stats = clean_and_rename_columns(player_season_stats)
player_season_stats.columns

  df.drop(columns=['league', 'season'], errors='ignore', inplace=True)


Index(['Team', 'player', 'nation', 'pos', 'age', 'born', 'MP', 'Starts', 'Min',
       '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG',
       'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls/90', 'Ast/90',
       'G+A/90', 'G-PK/90', 'G+A-PK/90', 'xG/90', 'xAG/90', 'xG+xAG/90',
       'npxG/90', 'npxG+xAG/90'],
      dtype='object')

In [27]:
player_season_stats['Team'] = player_season_stats['Team'].replace(team_name_mappingFL)
player_season_stats = pd.merge(player_season_stats, logos_df, on='Team', how='left')

In [29]:
player_season_stats.head()

Unnamed: 0,Team,player,nation,pos,age,born,MP,Starts,Min,90s,...,Ast/90,G+A/90,G-PK/90,G+A-PK/90,xG/90,xAG/90,xG+xAG/90,npxG/90,npxG+xAG/90,Logo path
0,Atlanta United FC,Ajani Fortune,TRI,MF,20,2002,14,3,374,4.2,...,0.24,0.24,0.0,0.24,0.18,0.03,0.21,0.18,0.21,datasets/Teamslogo/Atlanta United FC.png
1,Atlanta United FC,Amar Sejdic,USA,MF,26,1996,23,14,1061,11.8,...,0.0,0.0,0.0,0.0,0.03,0.02,0.05,0.03,0.05,datasets/Teamslogo/Atlanta United FC.png
2,Atlanta United FC,Andrew Gutman,USA,DF,26,1996,18,16,1385,15.4,...,0.13,0.32,0.19,0.32,0.11,0.14,0.24,0.11,0.24,datasets/Teamslogo/Atlanta United FC.png
3,Atlanta United FC,Brad Guzan,USA,GK,38,1984,27,27,2430,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,datasets/Teamslogo/Atlanta United FC.png
4,Atlanta United FC,Brooks Lennon,USA,DF,25,1997,33,32,2881,32.0,...,0.28,0.41,0.12,0.41,0.08,0.16,0.24,0.08,0.24,datasets/Teamslogo/Atlanta United FC.png


In [30]:
player_season_stats.to_csv('datasets/player_stats_FBref.csv', index=False)

# TODO Create dataFrame with team logo 

## Import WhoScored Data

In [38]:
def clean_columns(df, drop_columns):
    """
    Cleans the DataFrame by resetting the index and dropping specified columns.
    """
    df = df.reset_index()
    df.drop(columns=drop_columns, errors='ignore', inplace=True)
    return df

def calculate_missed_games(df):
    """
    Calculates the total and unique missed games by club and the player who missed the most games.
    """
    total_missed = df.groupby('team')['game_id'].count().rename('total_missed_games')
    unique_missed = df.groupby('team')['player'].nunique().rename('unique_players_missing')
    most_missed = df.groupby(['team', 'player'])['game_id'].count().reset_index(name='missed_games')
    most_missed = most_missed.loc[most_missed.groupby('team')['missed_games'].idxmax()]

    missed_summary = pd.concat([total_missed, unique_missed], axis=1).reset_index()
    missed_summary = missed_summary.merge(most_missed, on='team', how='left')

    missed_summary.rename(columns={'player': 'player_most_missed', 'missed_games': 'most_missed_games'}, inplace=True)
    return missed_summary

def add_opponent_info(df, schedule_df):
    """
    Calculates the number of missing players for opposing teams
    """
    df = df.merge(schedule_df, on='game_id', how='left')
    df['opponent'] = df.apply(lambda row: row['away_team'] if row['team'] == row['home_team'] else row['home_team'], axis=1)
    return df

In [53]:
def import_whoScored_data():
    ws = sd.WhoScored(leagues="US-MLS", seasons=2324)
    mls_schedule = ws.read_schedule()
    missing_players = ws.read_missing_players()
    
    mls_schedule = clean_columns(mls_schedule, ['league', 'season', 'game', 'url', 'stage'])
    missing_players = clean_columns(missing_players, ['league', 'season', 'game'])
    
    missing_players = add_opponent_info(missing_players, mls_schedule)
    total_missed_by_club = calculate_missed_games(missing_players)
    # Calculate missed games by opponent
    missed_by_opponent = (
        missing_players.groupby('opponent')['game_id']
        .count()
        .reset_index(name='total_missed_games_by_opponent')
        .rename(columns={'opponent': 'team'})
    )
    total_missed_by_club = total_missed_by_club.merge(
        missed_by_opponent, on='team', how='left'
    ).fillna({'total_missed_games_by_opponent': 0})
    
    total_missed_by_club.sort_values(by='total_missed_games', ascending=False)
    
    total_missed_by_club.to_csv('datasets/missed_games_by_club.csv', index=False)
    
    return total_missed_by_club

In [52]:
# The soccerdata library is no longer functional for scraping whoScored, so the import of the csv file generated when it was 
# functional is performed while waiting for the anomaly to be corrected

# total_missed_by_club = total_missed_by_club()

In [68]:
total_missed_by_club = pd.read_csv('datasets/missed_games_by_club.csv')
total_missed_by_club.rename(columns={'team': 'Team'}, inplace=True)
total_missed_by_club.head() 

Unnamed: 0,Team,total_missed_games,unique_players_missing,player_most_missed,most_missed_games,total_missed_games_by_opponent
0,Atlanta United,85,19,Osvaldo Alonso,22,105
1,Austin FC,76,16,Zan Kolmanic,24,108
2,CF Montreal,99,17,Romell Quioto,17,89
3,Charlotte FC,113,21,GuzmÃ¡n Corujo,15,121
4,Chicago Fire FC,119,18,Federico Navarro,19,109


In [57]:
compare_teams(df_team_Fbref, total_missed_by_club, df1_name='Fbref Table', df2_name='Missed Table')

Unnamed: 0,Fbref Table Unique,Missed Table Unique
0,Vancouver W'caps,Vancouver Whitecaps
1,St. Louis,St. Louis City
2,Sporting KC,Sporting Kansas City
3,Seattle,Seattle Sounders FC
4,SJ Earthquakes,San Jose Earthquakes
5,Rapids,Real Salt Lake
6,RSL,Philadelphia Union
7,Philadelphia,New England Revolution
8,Nashville,Nashville SC
9,NE Revolution,Minnesota United


In [69]:
team_name_mappingWF = {
    "Rapids": "Colorado Rapids",
    "Nashville": "Nashville SC",
    "Montreal Impact": "CF Montreal",
    "Nashville": "Nashville SC",
    "Fire": "Chicago Fire FC",
}
df_team_Fbref['Team'] = df_team_Fbref['Team'].replace(team_name_mappingWF)
stats_mls = align_and_merge_teams(df_team_Fbref, total_missed_by_club)

All Team are present and correctly named.


In [78]:
stats_mls.to_csv('datasets/stats_mls.csv', index=False)

## Import player list

In [None]:
client = ZenRowsClient("4388747f3784d380412716a437ba6fb1bdba2049")

In [3]:
columns_to_read = ['player_url', 'fifa_version', 'short_name', 'long_name', 'club_name', 'league_name', 'nationality_name']
# dataset from : https://www.kaggle.com/datasets/stefanoleone992/ea-sports-fc-24-complete-player-dataset?resource=download&select=male_players.csv
ea_players = pd.read_csv('datasets/male_players.csv', usecols=columns_to_read, low_memory=False)
mls23_players = ea_players[(ea_players.league_name == 'Major League Soccer') & (ea_players.fifa_version == 24)].copy()
mls23_players.loc[:, 'player_url'] = 'https://sofifa.com' + mls23_players['player_url']
mls23_players.to_csv('datasets/mls23_players.csv', index=False)

In [43]:
statsBomb_clubs = ['Inter Miami', 'New York RB', 'LA Galaxy', 'Charlotte', 'Nashville SC', 'Cincinnati', 'Toronto']
statsBomb_players = mls23_players[mls23_players['club_name'].isin(statsBomb_clubs)].copy()

local_folder = 'datasets/playersImage'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

In [54]:
def download_player_image(row):
    try:
        url = row['player_url']
        response = client.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        script = soup.find('script', type='application/ld+json')
        json_data = script.string
        player_data = json.loads(json_data)

        family_name = player_data['familyName']
        image_url = player_data['image']

        player_id = url.split('/')[4]
        filename = f"{player_id}{family_name}.png"

        image_filename = os.path.normpath(os.path.join(local_folder, filename)).replace("\\", "/")
        response = requests.get(image_url)
        if response.status_code == 200:
            with open(image_filename, 'wb') as f:
                f.write(response.content)
            return image_filename
        else:
            print(f"Failed to download {family_name} image")
            return row['player_url']
    except Exception as e:
        print(f"Error processing {row['short_name']}: {e}")
        return row['player_url']

In [58]:
statsBomb_players['player_url'] = statsBomb_players.apply(download_player_image, axis=1)

Failed to download Jensen image


In [59]:
statsBomb_players.to_csv('datasets/mls23_statsBomb_players.csv', index=False)
statsBomb_players.head()

Unnamed: 0,player_url,fifa_version,short_name,long_name,club_name,league_name,nationality_name
3,datasets/playersImage/158023Messi.png,24.0,L. Messi,Lionel Andrés Messi Cuccittini,Inter Miami,Major League Soccer,Argentina
172,datasets/playersImage/189332Alba Ramos.png,24.0,Jordi Alba,Jordi Alba Ramos,Inter Miami,Major League Soccer,Spain
173,datasets/playersImage/189511Busquets Burgos.png,24.0,Sergio Busquets,Sergio Busquets i Burgos,Inter Miami,Major League Soccer,Spain
329,datasets/playersImage/198219Insigne.png,24.0,L. Insigne,Lorenzo Insigne,Toronto,Major League Soccer,Italy
464,datasets/playersImage/210021Mukhtar.png,24.0,H. Mukhtar,Hany Mukhtar,Nashville SC,Major League Soccer,Germany
