In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read one CSV per season into its own DataFrame; the targets and the
# paths are listed in the same order, so each name receives its file.
(
    playerstat_fall_2019,
    playerstat_spring_2020,
    playerstat_fall_2020,
    playerstat_spring_2021,
    playerstat_fall_2021,
    playerstat_2022,
    playerstat_2023,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/all_playerstat_fall_2019.csv',
        'data/all_playerstat_spring_2020.csv',
        'data/all_playerstat_fall_2020.csv',
        'data/all_playerstat_spring_2021.csv',
        'data/all_playerstat_fall_2021.csv',
        'data/all_playerstat_2022.csv',
        'data/all_playerstat_2023.csv',
    )
)

In [None]:
# Preview the first rows of the Fall 2019 table to check the columns read from the CSV.
playerstat_fall_2019.head()

In [None]:
# Tag every per-season table with the year and season it covers, so the rows
# stay distinguishable once the tables are concatenated.
# Year is stored as a string because a later cell filters with == '2023'.
season_labels = [
    (playerstat_fall_2019, '2019', 'Fall'),
    (playerstat_spring_2020, '2020', 'Spring'),
    (playerstat_fall_2020, '2020', 'Fall'),
    # Spring 2021 must be labelled on its own frame; writing these labels to
    # playerstat_fall_2021 would leave the Spring 2021 rows without Year/Season.
    (playerstat_spring_2021, '2021', 'Spring'),
    (playerstat_fall_2021, '2021', 'Fall'),
    (playerstat_2022, '2022', 'Fall'),
    (playerstat_2023, '2023', 'Fall'),
]

# One frame per row of the table above, so each frame is labelled exactly once.
for season_frame, year_label, season_label in season_labels:
    season_frame['Year'] = year_label
    season_frame['Season'] = season_label

In [None]:
# Stack the seven per-season tables, in the order listed, into one frame
# with a fresh 0..n-1 index.
season_frames = [
    playerstat_fall_2019,
    playerstat_spring_2020,
    playerstat_fall_2020,
    playerstat_spring_2021,
    playerstat_fall_2021,
    playerstat_2022,
    playerstat_2023,
]
playerstats_df = pd.concat(season_frames, ignore_index=True)

In [None]:
# Show the combined player-stats frame as this cell's output.
playerstats_df

In [None]:
# Swap the abbreviated stat headers for descriptive column names.
# "Team", "Year" and "Season" map to themselves, so those columns keep their names.
new_column_names = {
    "Pos": "Position", "Name": "Player Name", "Team": "Team",
    "GP": "Games Played", "GS": "Games Started", "MIN": "Minutes Played",
    "G": "Goals", "A": "Assists", "SH": "Shots",
    "Y": "Yellow Cards", "R": "Red Cards",
    "Year": "Year", "Season": "Season",
}

playerstats_df = playerstats_df.rename(new_column_names, axis="columns")

In [None]:
# Replacing team abbreviations with full names
team_abbreviations_to_full = {
    'OAK': 'Oakland Roots SC',
    'CUSFC': 'California United Strikers FC',
    'MIA': 'The Miami FC',
    'LAF': 'Los Angeles Force',
    'SDFC': 'San Diego 1904 FC',
    'SA': 'Stumptown Athletic',
    'DCFC': 'Detroit City FC',
    'MBFC': 'Maryland Bobcats FC',
    'MISFC': 'Michigan Stars FC',
    'NAFC': 'New Amsterdam FC',
    'CFC': 'Chattanooga FC',
    'COS': 'The New York Cosmos'
}

# Series.map() yields NaN for every value that is not a key of the dict, so a
# bare map would blank out any club missing from the table and, if this cell
# is run a second time, every name that was already expanded. Falling back to
# the existing value keeps unknown entries and makes the cell re-runnable.
team_codes = playerstats_df['Team']
playerstats_df['Team'] = team_codes.map(team_abbreviations_to_full).fillna(team_codes)

In [None]:
# Print the combined frame's column names, non-null counts, and dtypes.
playerstats_df.info()

In [None]:
# Display summary statistics (describe() with its default arguments) for the combined frame.
playerstats_df.describe()

In [None]:
# Correlation heatmap
# Only numeric columns can be correlated. The frame also holds text columns
# (Position, Player Name, Team, Year, Season), and DataFrame.corr() raises on
# those in pandas >= 2.0, so restrict to numeric dtypes explicitly.
numeric_stats = playerstats_df.select_dtypes(include="number")
corr_matrix = numeric_stats.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Overlay the distributions of the Goals and Assists columns on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
for stat_column, bar_colour in (("Goals", "blue"), ("Assists", "orange")):
    sns.histplot(
        data=playerstats_df,
        x=stat_column,
        bins=10,
        kde=True,
        color=bar_colour,
        label=stat_column,
        ax=ax,
    )
ax.set_title("Distribution of Goals and Assists")
ax.set_xlabel("Count")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()

In [None]:
# All-time leaders: sum each counting stat per player name across every
# season, and keep the last team listed for that player.
summed_stats = ('Goals', 'Assists', 'Yellow Cards', 'Red Cards')
aggregation_spec = {stat: 'sum' for stat in summed_stats}
aggregation_spec['Team'] = 'last'
player_stats_agg = playerstats_df.groupby('Player Name').agg(aggregation_spec)

# Three highest totals for each stat.
top_goals_players = player_stats_agg.nlargest(n=3, columns='Goals')
top_assists_players = player_stats_agg.nlargest(n=3, columns='Assists')
top_yellow_cards_players = player_stats_agg.nlargest(n=3, columns='Yellow Cards')
top_red_cards_players = player_stats_agg.nlargest(n=3, columns='Red Cards')

# Print each leaderboard as (heading, stat column + team).
leaderboards = (
    ("Top 3 Players with Most All-Time Goals:", top_goals_players, 'Goals'),
    ("\nTop 3 Players with Most All-Time Assists:", top_assists_players, 'Assists'),
    ("\nTop 3 Players with Most All-Time Yellow Cards:", top_yellow_cards_players, 'Yellow Cards'),
    ("\nTop 3 Players with Most All-Time Red Cards:", top_red_cards_players, 'Red Cards'),
)
for heading, leaders, stat in leaderboards:
    print(heading)
    print(leaders[[stat, 'Team']])


In [None]:
# 2023 club leaders: for every (Season, Team) group, pick the row holding the
# maximum of each stat and report who it belongs to.
playerstats_2023 = playerstats_df.loc[playerstats_df['Year'] == '2023']

clubs_2023 = playerstats_2023.groupby(['Season', 'Team'])
identity_columns = ['Season', 'Team', 'Player Name']

# Row labels of the per-group maxima (idxmax returns the first occurrence).
goals_max_2023 = clubs_2023['Goals'].idxmax()
assists_max_2023 = clubs_2023['Assists'].idxmax()
yellow_cards_max_2023 = clubs_2023['Yellow Cards'].idxmax()
red_cards_max_2023 = clubs_2023['Red Cards'].idxmax()

# Look the winning rows up and keep only the identifying columns plus the stat.
most_goals_2023 = playerstats_2023.loc[goals_max_2023, identity_columns + ['Goals']]
most_assists_2023 = playerstats_2023.loc[assists_max_2023, identity_columns + ['Assists']]
most_yellow_cards_2023 = playerstats_2023.loc[yellow_cards_max_2023, identity_columns + ['Yellow Cards']]
most_red_cards_2023 = playerstats_2023.loc[red_cards_max_2023, identity_columns + ['Red Cards']]

reports_2023 = (
    ("Most Goals in 2023:", most_goals_2023),
    ("\nMost Assists in 2023:", most_assists_2023),
    ("\nMost Yellow Cards in 2023:", most_yellow_cards_2023),
    ("\nMost Red Cards in 2023:", most_red_cards_2023),
)
for report_title, report_table in reports_2023:
    print(report_title)
    print(report_table)

In [None]:
# Write the combined player-stats frame to CSV; index=False leaves the row index out of the file.
playerstats_df.to_csv('data/complete_player_stats.csv', index=False)