In [1]:
import pandas as pd

In [2]:
# Base names (without the ".csv" extension) of every MLS NEXT Pro export
# this notebook loads, grouped by the entity each file describes.
file_names = [
    # players
    "MLSNP_players",
    "MLSNP_players_xgoals",
    "MLSNP_players_xpass",
    "MLSNP_players_goals-added",
    # goalkeepers
    "MLSNP_goalkeepers_xgoals",
    "MLSNP_goalkeepers_goals-added",
    # teams
    "MLSNP_teams",
    "MLSNP_teams_xgoals",
    "MLSNP_teams_xpass",
    "MLSNP_teams_goals-added",
    # games
    "MLSNP_games",
    "MLSNP_games_xgoals",
    # managers, referees, stadia
    "MLSNP_managers",
    "MLSNP_referees",
    "MLSNP_stadia",
]

In [3]:
# Read every listed CSV from the data/ directory into a DataFrame,
# keyed by its base file name (e.g. data_frames["MLSNP_teams"]).
data_frames = {
    file_name: pd.read_csv(f"data/{file_name}.csv")
    for file_name in file_names
}

In [4]:
# Preview the player table loaded from MLSNP_players.csv
# (shown via the notebook's rich display of the cell's last expression).
data_frames["MLSNP_players"]

Unnamed: 0,player_id,player_name,nationality,primary_broad_position,primary_general_position,season_name,birth_date,height_ft,height_in,weight_lb,secondary_general_position,secondary_broad_position
0,0Oq624kdq6,Christopher Rodgers,USA,DF,CB,2023,,,,,,
1,0Oq624oPq6,Kalani Kossa-Rienzi,USA,MF,W,2024,2002-06-27,5.0,8.0,150.0,,
2,0Oq62blzq6,Christian Tchouante,Cameroon,,,{},2006-01-17,,,,,
3,0Oq62L42q6,Thiago,Brazil,FW,W,2023,2003-03-06,,,,,
4,0Oq62O1rq6,Robert Bailey,USA,,,{},2000-05-19,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,KAqBkZ1vQb,David Duque,USA,MF,CM,2023,2006-06-21,,,,,
996,KAqBkZyvQb,Noah Santos,USA,FW,ST,"['2023', '2024']",2007-01-10,,,,,
997,KAqBNbaWqb,Efrain Morales,USA,DF,CB,"['2023', '2024']",2004-03-04,6.0,3.0,170.0,,
998,KAqBNEB7qb,Kieran Sargeant,USA,FW,W,2024,2003-05-15,,,,CB,DF


In [5]:
# Print each table's name followed by its first two rows, to get a quick
# look at the columns available in every loaded file.
for table_name, table in data_frames.items():
    print(table_name)
    print(table.head(2))

MLSNP_players
    player_id          player_name nationality primary_broad_position  \
0  0Oq624kdq6  Christopher Rodgers         USA                     DF   
1  0Oq624oPq6  Kalani Kossa-Rienzi         USA                     MF   

  primary_general_position season_name  birth_date  height_ft  height_in  \
0                       CB        2023         NaN        NaN        NaN   
1                        W        2024  2002-06-27        5.0        8.0   

   weight_lb secondary_general_position secondary_broad_position  
0        NaN                        NaN                      NaN  
1      150.0                        NaN                      NaN  
MLSNP_players_xgoals
    player_id     team_id general_position  minutes_played  shots  \
0  0Oq624kdq6  0Oq6Yad56D               CB            1214      4   
1  0Oq624oPq6  KXMe8Z2Q64                W             307      7   

   shots_on_target  goals  xgoals  xplace  goals_minus_xgoals  key_passes  \
0                1      0  0.1

## Team Performance Metrics:
1. Goal Difference: Calculate the difference between goals scored and goals conceded.
2. Expected Goals (xGoals): Compare actual goals scored/conceded with expected goals to assess finishing and defensive capabilities.
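3. Pass Completion Percentage: Measure the accuracy of passing, compared against the expected pass completion percentage (xPass).
4. Points Earned: Directly indicates team success in matches; compared against expected points (xPoints) to show over- or under-performance.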

## Player Performance Metrics:
1. Goals Scored: Traditional measure of attacking prowess.
2. Expected Goals (xGoals) for Attackers: To evaluate finishing ability.
3. Pass Completion Percentage: Indicates passing accuracy and involvement in build-up play.
4. Assists: Measure of creative ability.
5. Defensive Contributions: For defenders and midfielders, consider metrics like interceptions, tackles won, and clearances.
6. Goalkeeper Metrics: Save percentage, goals conceded, and expected goals conceded.

In [8]:
# Join the team reference table with its xgoals and xpass tables on team_id
# (default inner join: only teams present in all three tables are kept).
team_metrics = (
    data_frames["MLSNP_teams"]
    .merge(data_frames["MLSNP_teams_xgoals"], on="team_id")
    .merge(data_frames["MLSNP_teams_xpass"], on="team_id")
)

# team metrics: actual-vs-conceded and actual-vs-expected differences
team_metrics = team_metrics.assign(
    goal_difference=lambda frame: frame["goals_for"] - frame["goals_against"],
    xgoal_difference=lambda frame: frame["xgoals_for"] - frame["xgoals_against"],
    points_difference=lambda frame: frame["points"] - frame["xpoints"],
    pass_accuracy_difference=lambda frame: (
        frame["pass_completion_percentage_for"]
        - frame["xpass_completion_percentage_for"]
    ),
)

# classification of teams: a team is 'Bad' when ANY of the three checked
# differences is negative, otherwise 'Good'. xgoal_difference is computed
# above but is not part of the classification. Comparisons against NaN are
# False, so missing values never trigger 'Bad'.
is_underperforming = (
    team_metrics[["goal_difference", "points_difference", "pass_accuracy_difference"]]
    .lt(0)
    .any(axis=1)
)
team_metrics["team_performance"] = "Good"
team_metrics.loc[is_underperforming, "team_performance"] = "Bad"

print(team_metrics)

       team_id                  team_name   team_short_name team_abbreviation  \
0   0Oq6Yad56D            Columbus Crew 2          Columbus               CLB   
1   0x5gb3bM7O         Chicago Fire FC II           Chicago               CHI   
2   2lqRX1AMr0      Minnesota United FC 2         Minnesota               MIN   
3   2vQ14GKqrA    Sporting Kansas City II       Kansas City               SKC   
4   2vQ1XzlqrA           Los Angeles FC 2  Los Angeles FC 2              LAFC   
5   4JMAkpDqKg              Toronto FC II           Toronto               TOR   
6   4wM4E4d5jB               LA Galaxy II      LA Galaxy II               LAG   
7   7VqG1oWMvW          Colorado Rapids 2          Colorado               COL   
8   9Yqdwg85vJ      New York Red Bulls II          New York              NYRB   
9   a35reDLML6            Crown Legacy FC         Charlotte               CLT   
10  BLMv6m3Mxe              Real Monarchs     Real Monarchs               SLC   
11  eV5Dw4EMKn              

In [13]:
# player metrics
# Finishing: actual goals minus expected goals (assign works on a copy,
# so the frame stored in data_frames is left untouched).
player_metrics_xgoals = data_frames["MLSNP_players_xgoals"].assign(
    goal_difference=lambda frame: frame["goals"] - frame["xgoals"]
)

# Passing: actual pass completion minus expected pass completion.
player_metrics_xpass = data_frames["MLSNP_players_xpass"].assign(
    pass_accuracy_difference=lambda frame: (
        frame["pass_completion_percentage"] - frame["xpass_completion_percentage"]
    )
)

# Inner join on player_id; columns that appear in both tables are
# disambiguated with the _xgoals / _xpass suffixes.
merged_player_metrics = player_metrics_xgoals.merge(
    player_metrics_xpass,
    on="player_id",
    suffixes=("_xgoals", "_xpass"),
)

# A player is 'Bad' when either difference is negative, otherwise 'Good'.
# Comparisons against NaN are False, so missing values never trigger 'Bad'.
is_below_expectation = (
    merged_player_metrics[["goal_difference", "pass_accuracy_difference"]]
    .lt(0)
    .any(axis=1)
)
merged_player_metrics["player_performance"] = "Good"
merged_player_metrics.loc[is_below_expectation, "player_performance"] = "Bad"

print(merged_player_metrics)

      player_id                team_id_xgoals general_position_xgoals  \
0    0Oq624kdq6                    0Oq6Yad56D                      CB   
1    0Oq624oPq6                    KXMe8Z2Q64                       W   
2    0Oq62L42q6                    a35reDLML6                       W   
3    0Oq62Oo2q6                    eV5Dw4EMKn                       W   
4    0Oq62Y3zq6                    a35reDLML6                      GK   
..          ...                           ...                     ...   
995  KXMe8mgxQ6                    7VqG1oWMvW                      CM   
996  KXMe8nlPQ6  ['KXMe8Z2Q64', 'eVq3Z0D5WO']                      ST   
997  KXMe8NYXQ6                    jYQJXkP5GR                      FB   
998  KXMe8VRrQ6                    Oa5wDy8q14                      CB   
999  KXMegRk3q6                    kRQaW3L5KZ                      FB   

     minutes_played_xgoals  shots  shots_on_target  goals  xgoals  xplace  \
0                     1214      4             