In [3]:
import pandas as pd
import os

In [4]:
# Absolute path of the directory this notebook should run from.
# NOTE(review): this is a machine-specific Windows path; the notebook will
# raise on os.chdir on any machine where this directory does not exist.
new_working_directory = r'C:\cloudresume\react\resume\sports-data'

# Switch the process's current working directory so that relative paths used
# later in the notebook (e.g. 'data/') resolve against it.
os.chdir(new_working_directory)

In [5]:
# Folder containing the filtered CSV exports. It is a relative path, so it is
# resolved against the current working directory.
data_folder = 'data/'

# Read each filtered CSV export into its own DataFrame.
appearances_df = pd.read_csv(f'{data_folder}appearances_filtered.csv')
clubs_df = pd.read_csv(f'{data_folder}clubs_filtered.csv')
players_df = pd.read_csv(f'{data_folder}players_filtered.csv')

In [6]:
# Preview the first rows of the appearances data to check its columns.
appearances_df.head()

Unnamed: 0.1,Unnamed: 0,appearance_id,game_id,player_id,player_club_id,player_current_club_id,date,player_name,competition_id,yellow_cards,red_cards,goals,assists,minutes_played
0,4,2234421_122011,2234421,122011,195,3008,2012-07-05,Markus Henriksen,ELQ,0,0,0,1,90
1,24,2234404_14940,2234404,14940,660,367,2012-07-09,Razvan Rat,UKRS,0,0,0,0,90
2,25,2234404_14942,2234404,14942,660,1390,2012-07-09,Darijo Srna,UKRS,0,0,0,1,90
3,29,2234404_26267,2234404,26267,660,281,2012-07-09,Fernandinho,UKRS,0,0,0,1,90
4,39,2234404_55735,2234404,55735,660,46,2012-07-09,Henrikh Mkhitaryan,UKRS,0,0,0,0,90


In [10]:
# Display clubs_df (the rendering elides the middle rows).
# NOTE(review): the columns rendered for this frame (player_id, yellow_cards,
# red_cards, goals, assists, minutes_played) look like per-appearance stats
# rather than club attributes — confirm clubs_filtered.csv holds the intended data.
clubs_df

Unnamed: 0.1,Unnamed: 0,player_id,yellow_cards,red_cards,goals,assists,minutes_played
0,0,38004,0,0,2,0,90
1,1,79232,0,0,0,0,90
2,2,42792,0,0,0,0,45
3,3,73333,0,0,0,0,90
4,4,122011,0,0,0,1,90
...,...,...,...,...,...,...,...
1530225,1530225,687609,1,0,0,1,76
1530226,1530226,730581,0,0,0,0,90
1530227,1530227,73517,0,0,0,0,90
1530228,1530228,82503,0,0,0,0,90


In [11]:
# Display players_df (the rendering elides the middle rows).
players_df

Unnamed: 0.1,Unnamed: 0,name,current_club_id,country_of_citizenship,date_of_birth,position,foot,height_in_cm,market_value_in_eur,highest_market_value_in_eur,player_id
0,0,Miroslav Klose,398,Germany,1978-06-09,Attack,right,184.0,1000000.0,30000000.0,10
1,1,Roman Weidenfeller,16,Germany,1980-08-06,Goalkeeper,left,190.0,750000.0,8000000.0,26
2,3,Lúcio,506,Brazil,1978-05-08,Defender,,,200000.0,24500000.0,77
3,4,Tom Starke,27,Germany,1981-03-18,Goalkeeper,right,194.0,100000.0,3000000.0,80
4,6,Christoph Metzelder,33,Germany,1980-11-05,Defender,,,1500000.0,9500000.0,123
...,...,...,...,...,...,...,...,...,...,...,...
10987,30421,Yunus Emre Konak,1148,Turkey,2006-01-10,Midfield,right,181.0,1800000.0,1800000.0,1141628
10988,30439,Luck Zogbé,3911,Cote d'Ivoire,2005-03-24,Defender,left,,100000.0,100000.0,1144999
10989,30443,Mateus Lusuardi,8970,Brazil,2004-01-08,Defender,left,190.0,50000.0,50000.0,1159022
10990,30455,Mahamadou Nagida,273,Cameroon,2005-06-28,Defender,left,176.0,150000.0,150000.0,1176345


In [15]:
import pandas as pd

def merge_and_calculate_stats(appearances_df, clubs_df, players_df):
    """Compute per-player totals and per-game averages from appearance rows.

    Only appearances whose ``player_id`` occurs in ``players_df`` are counted
    (the same rows an inner join on ``player_id`` would keep).

    Parameters
    ----------
    appearances_df : pandas.DataFrame
        One row per player appearance. Must contain the columns
        ``player_id``, ``game_id``, ``minutes_played``, ``yellow_cards``
        and ``red_cards``.
    clubs_df : pandas.DataFrame
        Not used by the calculation; accepted only so existing callers that
        pass three frames keep working.
    players_df : pandas.DataFrame
        Player lookup table. Must contain ``player_id`` and ``name``.

    Returns
    -------
    pandas.DataFrame
        One row per player, ordered by ``player_id``, with the columns
        ``player_id``, ``games_played`` (distinct ``game_id`` count),
        ``total_minutes``, ``average_minutes``, ``yellow_cards_sum``,
        ``yellow_cards_avg``, ``red_cards_sum``, ``red_cards_avg`` and
        ``name``. Averages are the totals divided by ``games_played``.
    """
    # Reduce the lookup table to one (player_id, name) row per player. If
    # players_df repeated a player_id, joining against it would multiply that
    # player's appearance rows (inflating every sum while the distinct game
    # count stayed put) and would also repeat the player in the result.
    player_names = players_df[['player_id', 'name']].drop_duplicates(subset='player_id')

    # Keep only appearances of known players. This selects the same rows as an
    # inner merge on player_id, without copying every player column onto each
    # appearance row.
    known_appearances = appearances_df[
        appearances_df['player_id'].isin(player_names['player_id'])
    ]

    # Single pass over the groups instead of one groupby per statistic.
    totals = known_appearances.groupby('player_id').agg(
        games_played=('game_id', 'nunique'),
        total_minutes=('minutes_played', 'sum'),
        yellow_cards_sum=('yellow_cards', 'sum'),
        red_cards_sum=('red_cards', 'sum'),
    )
    games_played = totals['games_played']

    # Assemble the output columns in their published order; reset_index turns
    # the player_id group key back into a regular column.
    player_stats = pd.DataFrame({
        'games_played': games_played,
        'total_minutes': totals['total_minutes'],
        'average_minutes': totals['total_minutes'] / games_played,
        'yellow_cards_sum': totals['yellow_cards_sum'],
        'yellow_cards_avg': totals['yellow_cards_sum'] / games_played,
        'red_cards_sum': totals['red_cards_sum'],
        'red_cards_avg': totals['red_cards_sum'] / games_played,
    }).reset_index()

    # Attach each player's name as the last column.
    return pd.merge(player_stats, player_names, on='player_id')

# Build the per-player statistics table from the frames loaded earlier.
# clubs_df is passed to satisfy the signature; the function body never reads it.
final_stats_df = merge_and_calculate_stats(appearances_df, clubs_df, players_df)

In [16]:
# Display the per-player statistics returned by merge_and_calculate_stats.
final_stats_df

Unnamed: 0,player_id,games_played,total_minutes,average_minutes,yellow_cards_sum,yellow_cards_avg,red_cards_sum,red_cards_avg,name
0,10,136,8808,64.764706,19,0.139706,0,0.000000,Miroslav Klose
1,26,152,13508,88.868421,4,0.026316,2,0.013158,Roman Weidenfeller
2,77,4,307,76.750000,0,0.000000,0,0.000000,Lúcio
3,80,12,1080,90.000000,0,0.000000,0,0.000000,Tom Starke
4,123,7,427,61.000000,0,0.000000,0,0.000000,Christoph Metzelder
...,...,...,...,...,...,...,...,...,...
8071,1111912,4,26,6.500000,0,0.000000,0,0.000000,Adama Bojang
8072,1141628,17,1330,78.235294,2,0.117647,0,0.000000,Yunus Emre Konak
8073,1159022,5,395,79.000000,1,0.200000,0,0.000000,Mateus Lusuardi
8074,1176345,5,201,40.200000,0,0.000000,0,0.000000,Mahamadou Nagida
