In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
# Loading Data
# Every table is read in full into its own DataFrame. The reads sit inside
# try/finally so the SQLite connection is released even when one of the
# queries raises (otherwise the handle stays open in the kernel).
db = sqlite3.connect('../data/database.sqlite')
try:
    country = pd.read_sql_query('SELECT * FROM Country', db)
    league = pd.read_sql_query('SELECT * FROM League', db)
    match = pd.read_sql_query('SELECT * FROM Match', db)
    player = pd.read_sql_query('SELECT * FROM Player', db)
    player_attributes = pd.read_sql_query('SELECT * FROM Player_Attributes', db)
    team = pd.read_sql_query('SELECT * FROM Team', db)
    team_attributes = pd.read_sql_query('SELECT * FROM Team_Attributes', db)

    # Inner query: number of matches won (home or away) per league/season/team.
    # Outer query: per league and season, keep the row with the highest win
    # count among teams with more than 10 wins. The non-aggregated columns
    # (team_api_id, team_long_name) rely on SQLite's bare-column rule for
    # max(): they are taken from the row that holds the maximum.
    # NOTE(review): this ranks by number of wins, not by league points.
    league_season_winners = pd.read_sql_query("""
select league_id, season,team_api_id, team_long_name, max(wins) as wins
from
	(SELECT league_id, season,team_api_id, team_long_name, count(*) as wins
	FROM Match, Team 
	WHERE (home_team_api_id==team_api_id AND home_team_goal > away_team_goal) OR (away_team_api_id == team_api_id AND home_team_goal < away_team_goal)
	GROUP By league_id, season,team_api_id, team_long_name)
where wins>10
GROUP by league_id, season;
 """, db)
finally:
    db.close()

In [3]:
# Preview the first 18 rows of the per-league, per-season result of the winners query.
league_season_winners.head(18)

Unnamed: 0,league_id,season,team_api_id,team_long_name,wins
0,1,2008/2009,8635,RSC Anderlecht,24
1,1,2009/2010,8635,RSC Anderlecht,22
2,1,2010/2011,8635,RSC Anderlecht,19
3,1,2011/2012,8635,RSC Anderlecht,20
4,1,2012/2013,8635,RSC Anderlecht,20
5,1,2014/2015,8342,Club Brugge KV,17
6,1,2015/2016,8342,Club Brugge KV,21
7,1729,2008/2009,10260,Manchester United,28
8,1729,2009/2010,8455,Chelsea,27
9,1729,2010/2011,10260,Manchester United,23


In [4]:
# Render five sampled rows as a Markdown table string.
# A fixed random_state keeps the sampled rows (and therefore this cell's
# output) identical across Restart & Run All; any integer works as the seed.
winners_md = league_season_winners.sample(5, random_state=0).to_markdown()
winners_md

'|    |   league_id | season    |   team_api_id | team_long_name   |   wins |\n|---:|------------:|:----------|--------------:|:-----------------|-------:|\n|  1 |           1 | 2009/2010 |          8635 | RSC Anderlecht   |     22 |\n| 77 |       21518 | 2014/2015 |          8633 | Real Madrid CF   |     30 |\n| 23 |        7809 | 2008/2009 |          8721 | VfL Wolfsburg    |     21 |\n|  3 |           1 | 2011/2012 |          8635 | RSC Anderlecht   |     20 |\n| 76 |       21518 | 2013/2014 |          9906 | Atlético Madrid  |     28 |'

In [None]:
# Names of the eleven per-slot player-id columns for each side (slots 1..11).
home_player_cols = ['home_player_{}'.format(slot) for slot in range(1, 12)]
away_player_cols = ['away_player_{}'.format(slot) for slot in range(1, 12)]

# Collapse each side's eleven slot columns into one list-valued column per row.
match['home_players'] = match.loc[:, home_player_cols].agg(list, axis=1)
match['away_players'] = match.loc[:, away_player_cols].agg(list, axis=1)

In [26]:
# Narrow the frame to identifiers, score, the goal/card event columns and the
# two lineup lists, then discard every row that has a missing value in any of
# those columns.
# NOTE(review): the lineup columns hold list objects, so a NaN player id
# *inside* a list presumably does not count as missing here - confirm.
match = match.loc[
    :,
    [
        'id', 'league_id', 'season', 'stage', 'date', 'match_api_id',
        'home_team_api_id', 'away_team_api_id',
        'home_team_goal', 'away_team_goal',
        'goal', 'card',
        'home_players', 'away_players',
    ],
].dropna(axis=0, how='any')

match.head()

Unnamed: 0,id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,goal,card,home_players,away_players
1728,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[30726.0, 30362.0, 30620.0, 30865.0, 32569.0, ...","[24224.0, 25518.0, 24228.0, 30929.0, 29581.0, ..."
1729,1730,1729,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,1,0,<goal><value><comment>n</comment><stats><goals...,<card />,"[23686.0, 26111.0, 38835.0, 30986.0, 31291.0, ...","[36373.0, 36832.0, 23115.0, 37280.0, 24728.0, ..."
1730,1731,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[32562.0, 38836.0, 24446.0, 24408.0, 36786.0, ...","[30660.0, 37442.0, 30617.0, 24134.0, 414792.0,..."
1731,1732,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,1,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[36374.0, 30966.0, 23818.0, 37277.0, 30687.0, ...","[34421.0, 34987.0, 35472.0, 111865.0, 25005.0,..."
1732,1733,1729,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,4,2,<goal><value><comment>n</comment><stats><goals...,<card><value><comment>y</comment><stats><ycard...,"[30380.0, 30357.0, 24658.0, 43280.0, 23282.0, ...","[31432.0, 46403.0, 24208.0, 23939.0, 33963.0, ..."


In [45]:
def unique_players(series):
    """Return the distinct player ids found across a collection of lineups.

    Parameters
    ----------
    series : iterable of iterables
        Each entry is one lineup, i.e. an iterable of player ids
        (for example the list-valued ``home_players`` / ``away_players`` columns).

    Returns
    -------
    list
        Every non-missing id exactly once, in arbitrary (set) order.
        An empty input yields an empty list.
    """
    players = set()
    for entry in series:
        # Missing slots (NaN / None) are not players. Distinct NaN objects also
        # never compare equal, so a set would keep one copy per occurrence
        # instead of deduplicating them.
        players.update(player for player in entry if not pd.isna(player))

    return list(players)

In [48]:
# Build {team_api_id: {season: [player ids fielded by that team that season]}}.
players_per_team_per_season = {}

# Every team id that appears on either side of a match.
# Series.append was removed in pandas 2.0; pd.concat works on every version.
teams = pd.concat(
    [match.home_team_api_id, match.away_team_api_id], ignore_index=True
).unique()

# Pair each team-id column with the lineup column for the same side, so home
# matches contribute home lineups and away matches contribute away lineups.
side_columns = (
    ('home_team_api_id', 'home_players'),
    ('away_team_api_id', 'away_players'),
)

# The loop variable is `team_id`, not `team`, so the `team` DataFrame loaded
# from the database is not overwritten by the last id.
for team_id in teams:
    squads_by_season = {}
    for id_column, players_column in side_columns:
        team_matches = match[match[id_column] == team_id]
        for season, season_matches in team_matches.groupby('season'):
            # setdefault: a season may exist on only one side for this team.
            squads_by_season.setdefault(season, set()).update(
                unique_players(season_matches[players_column])
            )

    players_per_team_per_season[team_id] = {
        season: list(squad) for season, squad in squads_by_season.items()
    }

In [44]:
# Debug peek: print only the first away lineup (nothing is printed if the column is empty).
for s in match.away_players.iloc[:1]:
    print(s)

[24224.0, 25518.0, 24228.0, 30929.0, 29581.0, 38807.0, 40565.0, 30360.0, 33852.0, 34574.0, 37799.0]


In [14]:
# List every column name of `match`, one per line.
# NOTE(review): the recorded output includes columns such as country_id and
# home_player_X1, so this cell appears to have been run before `match` was
# trimmed to the reduced column set - confirm the intended cell order.
for x in match.columns:
    print(x)

id
country_id
league_id
season
stage
date
match_api_id
home_team_api_id
away_team_api_id
home_team_goal
away_team_goal
home_player_X1
home_player_X2
home_player_X3
home_player_X4
home_player_X5
home_player_X6
home_player_X7
home_player_X8
home_player_X9
home_player_X10
home_player_X11
away_player_X1
away_player_X2
away_player_X3
away_player_X4
away_player_X5
away_player_X6
away_player_X7
away_player_X8
away_player_X9
away_player_X10
away_player_X11
home_player_Y1
home_player_Y2
home_player_Y3
home_player_Y4
home_player_Y5
home_player_Y6
home_player_Y7
home_player_Y8
home_player_Y9
home_player_Y10
home_player_Y11
away_player_Y1
away_player_Y2
away_player_Y3
away_player_Y4
away_player_Y5
away_player_Y6
away_player_Y7
away_player_Y8
away_player_Y9
away_player_Y10
away_player_Y11
home_player_1
home_player_2
home_player_3
home_player_4
home_player_5
home_player_6
home_player_7
home_player_8
home_player_9
home_player_10
home_player_11
away_player_1
away_player_2
away_player_3
away_player_4
a

In [50]:
# Persist the nested {team: {season: players}} mapping as a CSV in the working
# directory: outer keys become columns, inner keys become the row index.
pd.DataFrame.from_dict(players_per_team_per_season, orient='columns').to_csv(
    'players_per_team.csv'
)