In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
# Loading Data
db = sqlite3.connect('../data/database.sqlite')
country = pd.read_sql_query('SELECT * FROM Country', db)
league = pd.read_sql_query('SELECT * FROM League', db)
match = pd.read_sql_query('SELECT * FROM Match', db)
player = pd.read_sql_query('SELECT * FROM Player', db)
player_attributes = pd.read_sql_query('SELECT * FROM Player_Attributes', db)
team = pd.read_sql_query('SELECT * FROM Team', db)
team_attributes = pd.read_sql_query('SELECT * FROM Team_Attributes', db)
league_season_winners= pd.read_sql_query("""
select league_id, season,team_api_id, team_long_name, max(wins) as wins
from
	(SELECT league_id, season,team_api_id, team_long_name, count(*) as wins
	FROM Match, Team 
	WHERE (home_team_api_id==team_api_id AND home_team_goal > away_team_goal) OR (away_team_api_id == team_api_id AND home_team_goal < away_team_goal)
	GROUP By league_id, season,team_api_id, team_long_name)
where wins>10
GROUP by league_id, season;
 """, db)
db.close()

In [3]:
league_season_winners.head(18)

Unnamed: 0,league_id,season,team_api_id,team_long_name,wins
0,1,2008/2009,8635,RSC Anderlecht,24
1,1,2009/2010,8635,RSC Anderlecht,22
2,1,2010/2011,8635,RSC Anderlecht,19
3,1,2011/2012,8635,RSC Anderlecht,20
4,1,2012/2013,8635,RSC Anderlecht,20
5,1,2014/2015,8342,Club Brugge KV,17
6,1,2015/2016,8342,Club Brugge KV,21
7,1729,2008/2009,10260,Manchester United,28
8,1729,2009/2010,8455,Chelsea,27
9,1729,2010/2011,10260,Manchester United,23


In [4]:
winners_md = league_season_winners.sample(5).to_markdown()
winners_md

'|    |   league_id | season    |   team_api_id | team_long_name   |   wins |\n|---:|------------:|:----------|--------------:|:-----------------|-------:|\n|  1 |           1 | 2009/2010 |          8635 | RSC Anderlecht   |     22 |\n| 77 |       21518 | 2014/2015 |          8633 | Real Madrid CF   |     30 |\n| 23 |        7809 | 2008/2009 |          8721 | VfL Wolfsburg    |     21 |\n|  3 |           1 | 2011/2012 |          8635 | RSC Anderlecht   |     20 |\n| 76 |       21518 | 2013/2014 |          9906 | Atlético Madrid  |     28 |'

In [None]:
home_player_cols = [f'home_player_{i}' for i in range(1,12)]
away_player_cols = [f'away_player_{i}' for i in range(1,12)]

match['home_players'] = match[home_player_cols].agg(list, axis=1)
match['away_players'] = match[away_player_cols].agg(list, axis=1)

In [26]:
match = match[['id', 'league_id', 'season', 'stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id',
               'home_team_goal', 'away_team_goal', 'goal', 'card', 'home_players', 'away_players']].dropna()

match.head(5)

Unnamed: 0,id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,goal,card,home_players,away_players
1728,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[30726.0, 30362.0, 30620.0, 30865.0, 32569.0, ...","[24224.0, 25518.0, 24228.0, 30929.0, 29581.0, ..."
1729,1730,1729,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,1,0,<goal><value><comment>n</comment><stats><goals...,<card />,"[23686.0, 26111.0, 38835.0, 30986.0, 31291.0, ...","[36373.0, 36832.0, 23115.0, 37280.0, 24728.0, ..."
1730,1731,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[32562.0, 38836.0, 24446.0, 24408.0, 36786.0, ...","[30660.0, 37442.0, 30617.0, 24134.0, 414792.0,..."
1731,1732,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[36374.0, 30966.0, 23818.0, 37277.0, 30687.0, ...","[34421.0, 34987.0, 35472.0, 111865.0, 25005.0,..."
1732,1733,1729,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,4,2,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[30380.0, 30357.0, 24658.0, 43280.0, 23282.0, ...","[31432.0, 46403.0, 24208.0, 23939.0, 33963.0, ..."


In [45]:
def unique_players(series):
    players = []
    for entry in series:
        players.extend(entry)

    return list(set(players))

In [48]:
players_per_team_per_season = {}

teams = match.home_team_api_id.append(match.away_team_api_id, ignore_index=True).unique()

for team in teams:
    players_per_team_per_season[team] = {}
    temp = match[match.away_team_api_id == team]
    for season, df in temp.groupby('season'):
        players_per_team_per_season[team][season] = unique_players(df.away_players)

    temp = match[match.away_team_api_id == team]
    for season,df in temp.groupby('season'):
        players_per_team_per_season[team][season].extend(unique_players(df.away_players))
        players_per_team_per_season[team][season] = list(set(players_per_team_per_season[team][season]))

In [44]:
for s in match.away_players:
    print(s)
    break

[24224.0, 25518.0, 24228.0, 30929.0, 29581.0, 38807.0, 40565.0, 30360.0, 33852.0, 34574.0, 37799.0]


In [14]:
for x in match.columns:
    print(x)

id
country_id
league_id
season
stage
date
match_api_id
home_team_api_id
away_team_api_id
home_team_goal
away_team_goal
home_player_X1
home_player_X2
home_player_X3
home_player_X4
home_player_X5
home_player_X6
home_player_X7
home_player_X8
home_player_X9
home_player_X10
home_player_X11
away_player_X1
away_player_X2
away_player_X3
away_player_X4
away_player_X5
away_player_X6
away_player_X7
away_player_X8
away_player_X9
away_player_X10
away_player_X11
home_player_Y1
home_player_Y2
home_player_Y3
home_player_Y4
home_player_Y5
home_player_Y6
home_player_Y7
home_player_Y8
home_player_Y9
home_player_Y10
home_player_Y11
away_player_Y1
away_player_Y2
away_player_Y3
away_player_Y4
away_player_Y5
away_player_Y6
away_player_Y7
away_player_Y8
away_player_Y9
away_player_Y10
away_player_Y11
home_player_1
home_player_2
home_player_3
home_player_4
home_player_5
home_player_6
home_player_7
home_player_8
home_player_9
home_player_10
home_player_11
away_player_1
away_player_2
away_player_3
away_player_4
a

In [51]:
players_per_team_df = pd.DataFrame(players_per_team_per_season).reset_index()

Unnamed: 0,index,10260,9825,8472,8654,10252,8668,8549,8559,8667,...,10179,9824,10192,7896,10199,10190,10191,10243,2183,8673
0,2008/2009,"[34944.0, 40961.0, 30726.0, 67850.0, 30865.0, ...","[39297.0, 23686.0, 30986.0, 27277.0, 30613.0, ...","[nan, nan, nan, 23939.0, nan, nan, nan, nan, n...","[nan, nan, nan, nan, nan, 23818.0, 30734.0, 27...","[23264.0, 40128.0, 34466.0, 23782.0, 33863.0, ...","[109058.0, 24846.0, 30735.0, 26256.0, 30876.0,...","[97932.0, 35608.0, 30892.0, 24753.0, 23352.0, ...","[23937.0, 26244.0, 24336.0, 24372.0, 11576.0, ...","[30595.0, 34437.0, 23946.0, 23438.0, 26143.0, ...",...,,,,,,,,,,
1,2009/2010,"[34944.0, 40961.0, 30726.0, 67850.0, 30865.0, ...","[39297.0, 23686.0, 27277.0, 187668.0, 26005.0,...","[30988.0, 23949.0, 30352.0, 38802.0, 23073.0, ...","[41475.0, 34947.0, 23818.0, 39306.0, 30353.0, ...","[24211.0, 30357.0, 38807.0, 34466.0, 30380.0, ...","[109058.0, 24846.0, 30735.0, 24216.0, 30876.0,...",,"[23937.0, 24336.0, 24728.0, 30753.0, 143793.0,...","[30595.0, 34437.0, 23563.0, 24843.0, 23438.0, ...",...,,,,,,,,,,
2,2010/2011,"[34944.0, 40961.0, 30726.0, 70409.0, 30865.0, ...","[39297.0, 23686.0, 30986.0, 27277.0, 33812.0, ...","[31104.0, 131583.0, 23949.0, 30352.0, 24209.0,...","[24450.0, 23939.0, 23818.0, 30734.0, 26256.0, ...","[161414.0, 161415.0, 23949.0, 189074.0, 24211....","[109058.0, 24846.0, 30735.0, 24216.0, 30876.0,...",,"[23937.0, 24455.0, 24336.0, 24728.0, 34430.0, ...",,...,,,,,,,,,,
3,2011/2012,"[34944.0, 40961.0, 182917.0, 70409.0, 30865.0,...","[30986.0, 26005.0, 35606.0, 214685.0, 63520.0,...","[24064.0, 242308.0, 30352.0, 30362.0, 23073.0,...",,"[161414.0, 161415.0, 282251.0, 23949.0, 29581....","[109058.0, 24846.0, 30735.0, 24216.0, 30876.0,...",,"[191616.0, 23937.0, 33048.0, 24728.0, 30753.0,...",,...,,,,,,,,,"[nan, 114817.0, 163618.0, 13449.0, 13551.0, 68...","[13376.0, nan, 69601.0, nan, nan, nan, nan, na..."
4,2012/2013,"[34944.0, 40961.0, 182917.0, 70409.0, 30865.0,...","[46469.0, 27277.0, 26005.0, 35606.0, 214685.0,...","[30352.0, 46353.0, 23073.0, 24230.0, 36786.0, ...","[23936.0, 295060.0, 47382.0, 30630.0, 35110.0,...","[161414.0, 149895.0, 141576.0, 161415.0, 30602...","[32705.0, 39618.0, 30371.0, 30628.0, 23268.0, ...",,,,...,,,,,,,,,,
5,2013/2014,"[34944.0, 182917.0, 70409.0, 30865.0, 186137.0...","[46469.0, 23688.0, 26005.0, 35606.0, 36378.0, ...","[24064.0, 242308.0, 70297.0, 30362.0, 39331.0,...","[23936.0, 34193.0, 47382.0, 40220.0, 169756.0,...","[150401.0, 161414.0, 149895.0, 30602.0, 69650....","[24846.0, 251925.0, 24216.0, 181276.0, 26151.0...",,,"[163200.0, 42247.0, 35480.0, 23839.0, 24227.0,...",...,,,,,,,,,,
6,2014/2015,"[34944.0, 182917.0, 161035.0, 22543.0, 32657.0...","[46469.0, 23688.0, 50065.0, 35606.0, 192921.0,...","[109058.0, 103428.0, 30348.0, 50189.0, 118929....","[23936.0, nan, 192899.0, 155782.0, 570760.0, 4...","[161414.0, 30602.0, 56972.0, 29581.0, 30352.0,...","[212866.0, 24846.0, 316688.0, 24216.0, 191132....",,,"[163200.0, 42247.0, 35480.0, 190872.0, 23839.0...",...,,,,,,,,,,
7,2015/2016,"[34944.0, 109060.0, 182917.0, 161035.0, 693138...","[46469.0, 23688.0, 30859.0, 50065.0, 35606.0, ...","[109058.0, 166019.0, 103428.0, 30348.0, 118929...","[nan, 192899.0, 155782.0, 49543.0, 37780.0, 47...","[150401.0, 24579.0, 161414.0, 56972.0, 24208.0...","[212866.0, 24846.0, 316688.0, 251925.0, 181276...",,,,...,"[nan, 451335.0, 10637.0, 67349.0, 535957.0, 48...","[37257.0, 358156.0, 41621.0, 154261.0, 56868.0...","[493185.0, 25860.0, 119702.0, 108451.0, 176680...","[406283.0, 429986.0, 114212.0, 408104.0, 28228...","[638593.0, 141704.0, 95257.0, 36382.0, 34082.0...","[638592.0, 566785.0, 210423.0, 67333.0, 278917...","[133126.0, 42257.0, 464665.0, 186524.0, 659742...","[520450.0, 661509.0, 451982.0, 462608.0, 42262...",,


In [54]:
players_per_team_df = players_per_team_df.melt(id_vars='index')
players_per_team_df.columns = ['season', 'team_api_id', 'players']

KeyError: 'index'

In [55]:
print(players_per_team_df.shape)
players_per_team_df.sample(5)

(1616, 3)


Unnamed: 0,season,team_api_id,players
263,2015/2016,8678,"[184321.0, 155913.0, 192780.0, 46349.0, 68237...."
518,2014/2015,6391,
468,2012/2013,10242,"[39303.0, 159883.0, 41101.0, 242958.0, 36123.0..."
1356,2012/2013,8633,"[29590.0, 36378.0, 25759.0, 30889.0, 46509.0, ..."
1033,2009/2010,8640,"[38913.0, 26434.0, 39139.0, 5703.0, 46120.0, 3..."


In [57]:
players_per_team_df.to_csv('players_per_team.csv')