In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as sql

In [2]:
# Relative location of the SQLite database file.
path = 'data/database.sqlite'
# Connection object passed to the `pd.read_sql` calls that load the tables.
con = sql.connect(path)

In [3]:
# Load the four tables used below into DataFrames, one `select *` per table.
country, league, player, player_attributes = (
    pd.read_sql(query, con)
    for query in (
        'select * from Country;',
        'select * from League;',
        'select * from Player;',
        'select * from Player_Attributes;',
    )
)

In [11]:
# Convert the `date` column to pandas datetimes so the `.dt` accessor can be used on it.
player_attributes['date'] = pd.to_datetime(player_attributes['date'])

In [4]:
# Read the list of best players per season and league from the Excel sheet.
best_players = pd.read_excel('data/best_players.xlsx')

In [5]:
# Keep only the Player rows whose name appears in the best-players list.
# Matching is by name only, so namesakes are included as well.
player_best = player.loc[player['player_name'].isin(best_players['name'].tolist())]

The filtered table contains two extra rows: two of the names each match two different players.

Let's find and remove the extra players from the player_best table

In [6]:
# Names that occur more than once among the matched players (each repeat is listed once).
duplicate_players = player_best.loc[
    player_best['player_name'].duplicated(), 'player_name'
].tolist()

# Show the award entries for those names to see which season/league they refer to.
best_players.loc[best_players['name'].isin(duplicate_players)]

Unnamed: 0,season,name,club,champion,league
17,2009–10,Lisandro Lopez,Lyon,0,France Ligue 1
48,2008–9,Bruno Alves,Porto,1,Portugal Liga ZON Sagres


In [7]:
# List every matched player carrying one of the repeated names, to compare birthdays.
player_best.loc[player_best['player_name'].isin(duplicate_players)]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
1406,1409,25920,Bruno Alves,138110,1981-11-27 00:00:00,187.96,183
1407,1410,375782,Bruno Alves,210292,1990-06-09 00:00:00,177.8,165
6177,6184,182456,Lisandro Lopez,215051,1989-09-01 00:00:00,187.96,176
6178,6185,30536,Lisandro Lopez,142707,1983-03-02 00:00:00,175.26,163


Lisandro Lopez, the best player of the 2009–10 season in Ligue 1, was born in 1983. Source: https://en.wikipedia.org/wiki/Lisandro_L%C3%B3pez_(footballer,_born_1983)

Bruno Alves, the best player of the 2008–09 season in the Portugal Liga ZON Sagres, was born in 1981. Source: https://en.wikipedia.org/wiki/Bruno_Alves

In [8]:
# Remove the two namesakes who are not the award winners. They are identified by
# `player_api_id` instead of by row label, so the cell does not depend on the
# frame's index and can be re-run safely (dropping missing labels would raise).
NAMESAKE_API_IDS = [
    375782,  # Bruno Alves, born 1990
    182456,  # Lisandro Lopez, born 1989
]
player_best = player_best[~player_best['player_api_id'].isin(NAMESAKE_API_IDS)]

In [9]:
# Attach each award winner's Player record to the award entries, matching on name.
# A player can hold several awards, so this is a many-to-one join; `validate`
# makes the merge raise if a name still matches more than one Player row.
best_players = best_players.merge(
    player_best,
    left_on='name',
    right_on='player_name',
    validate='many_to_one',
)

# All merged columns and the default integer index are kept as they are.
# Parse the birthday values into pandas datetimes.
best_players['birthday'] = pd.to_datetime(best_players['birthday'])

In [10]:
# Check the columns, non-null counts and dtypes after the merge.
best_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 0 to 71
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   season              72 non-null     object        
 1   name                72 non-null     object        
 2   club                72 non-null     object        
 3   champion            72 non-null     int64         
 4   league              72 non-null     object        
 5   id                  72 non-null     int64         
 6   player_api_id       72 non-null     int64         
 7   player_name         72 non-null     object        
 8   player_fifa_api_id  72 non-null     int64         
 9   birthday            72 non-null     datetime64[ns]
 10  height              72 non-null     float64       
 11  weight              72 non-null     int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 7.3+ KB


Now, let's make a 'season' column based on the date feature

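A new season usually starts around July, so dates from July onward are assigned to the season starting that year, and dates up to June are assigned to the season that started the previous year.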

In [12]:
def seasonize_dates(df):
    """Add a 'season' label column derived from the frame's own 'date' column.

    A date in July or later belongs to the season starting that year
    (e.g. 2015-09-12 -> "2015–16"); a date in June or earlier belongs to the
    season that started the previous year (e.g. 2011-02-18 -> "2010–11").

    The label is "<start year>–<start year - 1999>" joined by an en dash, so
    the end year is not zero-padded ("2008–9"). The suffix arithmetic is only
    meaningful for seasons starting in 2000 or later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column holding datetimes (or values that
        `pd.to_datetime` can parse).

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the 'season' column added or overwritten.
    """
    # Use the dates of the frame that was passed in, not a notebook-level global.
    dates = pd.to_datetime(df['date'])

    # Starting year of the season: the current year from July on, otherwise the previous year.
    start_year = pd.Series(
        np.where(dates.dt.month > 6, dates.dt.year, dates.dt.year - 1),
        index=df.index,
    )

    # Append the two-digit-style end year (e.g. 2015 -> 2015 - 1999 = 16 -> "2015–16").
    df['season'] = start_year.astype('str').str.cat(
        (start_year - 1999).astype('str'), sep = '–'
    )

    return df

In [14]:
# Label every attribute record with the season its date falls into.
player_attributes = seasonize_dates(player_attributes)

In [15]:
# Reduce to one record per (season, player): the first row encountered is kept,
# so which record survives depends on the current row order of the frame.
player_attributes = player_attributes.drop_duplicates(
    subset=['season', 'player_api_id'],
    keep='first',
)

In [16]:
# Restrict the award winners to the England Premier League.
pl_best_players = best_players.loc[best_players['league'] == 'England Premier League']

In [19]:
# Add each winner's attribute record for the award season; the left join keeps
# winners even when no attribute record exists for that season.
pl_best_players = pd.merge(
    pl_best_players,
    player_attributes,
    how='left',
    on=['season', 'player_api_id'],
)

In [20]:
# Display the Premier League winners together with their attributes for the award season.
pl_best_players

Unnamed: 0,season,name,club,champion,league,id_x,player_api_id,player_name,player_fifa_api_id_x,birthday,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,2008–9,Nemanja Vidic,Manchester United,1,England Premier League,7846,30865,Nemanja Vidic,140601,1981-10-21,...,63.0,79.0,88.0,90.0,85.0,10.0,23.0,62.0,23.0,23.0
1,2010–11,Nemanja Vidic,Manchester United,1,England Premier League,7846,30865,Nemanja Vidic,140601,1981-10-21,...,63.0,62.0,90.0,94.0,85.0,12.0,5.0,14.0,8.0,6.0
2,2009–10,Wayne Rooney,Manchester United,0,England Premier League,10749,30829,Wayne Rooney,54050,1985-10-24,...,88.0,88.0,37.0,36.0,41.0,11.0,21.0,86.0,21.0,21.0
3,2011–12,Vincent Kompany,Manchester City,1,England Premier League,10645,39027,Vincent Kompany,139720,1986-04-10,...,63.0,63.0,84.0,90.0,85.0,10.0,9.0,5.0,8.0,6.0
4,2012–13,Gareth Bale,Tottenham Hotspur,0,England Premier League,3660,31921,Gareth Bale,173731,1989-07-16,...,78.0,76.0,68.0,75.0,74.0,15.0,15.0,11.0,5.0,6.0
5,2013–14,Luis Suarez,Liverpool,0,England Premier League,6377,40636,Luis Suarez,176580,1987-01-24,...,84.0,85.0,30.0,45.0,38.0,27.0,25.0,31.0,33.0,37.0
6,2014–15,Eden Hazard,Chelsea,1,England Premier League,2838,107417,Eden Hazard,183277,1991-01-07,...,86.0,86.0,25.0,27.0,22.0,11.0,12.0,6.0,8.0,8.0
7,2015–16,Jamie Vardy,Leicester City,1,England Premier League,4623,286119,Jamie Vardy,208830,1987-01-11,...,69.0,76.0,54.0,58.0,56.0,15.0,14.0,7.0,15.0,11.0
