# Deconfounder for soccer data

Example usage for the _sport deconfounder_.

Input:

- the lineups of 11+11 players (home and away) for each match
- the outcome of each match, expressed as the goal difference (home minus away)


In [1]:
import numpy as np
import pandas as pd
import sys
from argparse import ArgumentParser

import FactorLineups as fl
import SportDeconfounder as sd
import OutcomeModel as om



## Input parameters

In [817]:
# --- Data handling -----------------------------------------------------------
col_date='date'              # column used to sort the matches chronologically
match_days=3
test=False                   # forwarded to SD.run_outcome_model

# --- Lineup factorization (FactorLineups / SportDeconfounder) ----------------
algo='BPTF'                  # factorization algorithm; also the key used in SD.Z
segments=False
mask_flag=True
S=10
weighted_by_duration=False

# --- Outcome model (OutcomeModel / ridge regression) -------------------------
recon_flag=True              # True: reconstructed causes; False: substitute confounder
normalize=True
weights=False
min_caps=15
epsilon=0.1
vad_portion=0.1              # held-out fraction
n_iter=1
n_iter_cv=10
n_iter_alpha=10              # passed to OutcomeModel as n_iter_cv
solver='auto'
alphas=None

# --- Reproducibility and logging ---------------------------------------------
seed0=10
verbose=True
verbose_om=False
verbose_fact=False

In [818]:
# League and season identify which pre-computed number of latent factors to load.
league='10257'
season='2014'
segments=False  # same value as in the parameters cell; repeated here
# K: number of latent factors, looked up by SportDeconfounder.import_best_k.
K=sd.import_best_k(league=league,season=season)
F=1
# print() call form works on both Python 2 and Python 3 (the bare
# `print K` statement is a SyntaxError on Python 3).
print(K)

9


## Input data

In [None]:
# Load the raw match table from CSV.
infile='your_path_to_file/filename.csv'
df=pd.read_csv(infile)

# Chronological order is only needed if building an 'intervened test set', so a
# failed sort is reported (with its cause) instead of aborting the notebook.
# `except Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
try:
    df=df.sort_values(by=[col_date])
except Exception as err:
    print('Warning: DataFrame not sorted by date ({}: {})'.format(type(err).__name__,err))

# Names of the columns holding the goals scored by each side.
cols_home_score='home_team_goal'
cols_away_score='away_team_goal'

In [820]:
# Work out which columns hold the lineups and the team identifiers.
cols=df.columns.values
if league!='nba':
    # Default layout: every 'home_player_*' / 'away_player_*' column that is
    # not a '*name*' column is a lineup slot.
    cols_home_team='home_team_api_id'
    cols_away_team='away_team_api_id'
    cols_home=[name for name in cols if 'home_player_' in name and 'name' not in name]
    cols_away=[name for name in cols if 'away_player_' in name and 'name' not in name]
else:
    # NBA layout: five slots per side, named h1..h5 and a1..a5.
    cols_home_team='home_team'
    cols_away_team='away_team'
    cols_home=['h'+str(slot) for slot in range(1,6)]
    cols_away=['a'+str(slot) for slot in range(1,6)]
# Both sides must expose the same number of lineup slots.
assert len(cols_home)==len(cols_away)

In [821]:
# Lookup tables used further down to attach readable names to the player and
# team API ids (see FL.set_players_names / FL.set_team_names).
df_player=pd.read_csv('../data/output/players.csv')
df_team=pd.read_csv('../data/output/teams.csv')

## Step 1:  Factor Model
Fits a factorization model to the input tensor with entries $A_{abi}$.  
This is a quantity processed from the raw lineup data $\ell^{a}_{(a,b)}, \ell^{b}_{(a,b)}$, representing the number of times that player $i$ played for team $a$ against team $b$  

In [822]:
# Build the lineup-factorization object from the match table.
FL=fl.FactorLineups(
    df,
    cols_home=cols_home,
    cols_away=cols_away,
    cols_home_team=cols_home_team,
    cols_away_team=cols_away_team,
    seed_cv=seed0,
    K=K,
    segments=segments,
    mask_flag=mask_flag,
    verbose=verbose,
    verbose_fact=verbose_fact,
    S=S,
)

# Attaching readable names is best-effort: the analysis can proceed without
# them, but the reason for a failure is reported rather than silently dropped.
try:
    FL.set_players_names(df_player,col1='player_api_id',col2='player_name')
    FL.set_team_names(df_team,col1='team_api_id',col2='team_long_name')
    FL.teams_to_players()
except Exception as err:
    print('Warning: player/team names not attached ({}: {})'.format(type(err).__name__,err))

SD=sd.SportDeconfounder(df,FL,verbose=verbose_om,Z=None,A_recon=None,algo=algo,weighted_by_duration=weighted_by_duration) # produces Z and A_recon

Example latent factors.

In [823]:
# Copy entry 2 of the BPTF factor list G_DK_M so the fitted object is not modified.
# NOTE(review): presumably the player-by-group factor matrix — confirm in FactorLineups.
player_membership=np.copy(SD.fact_obj['BPTF'].G_DK_M[2])
player_membership.shape

(504, 9)

In [824]:
# Number of top entries listed per latent group. Defined outside the try block
# because the team-group cell below reuses it.
max_i=10
try:
    player_membership=np.copy(SD.fact_obj['BPTF'].G_DK_M[2])
    D,K=player_membership.shape
    for k in range(K):
        # Indices of the max_i largest loadings in column k, largest first.
        top_words_idx = player_membership[ :,k].argsort()[-max_i:][::-1]
        top_words = "; ".join([FL.idx2player_name[i] for i in top_words_idx])
        print("Player group {}: {}".format(k, top_words))
except Exception as err:
    # Best-effort display: keep going, but show what actually went wrong
    # instead of assuming the names are missing.
    print('missing player name information ({}: {})'.format(type(err).__name__,err))

Player group 0: Lorenzo De Silvestri; Angelo da Costa; Luca Rigoni; Gianluca Pegolo; Manolo Gabbiadini; Angelo Palombo; Shkodran Mustafi; Daniele Gastaldello; None; Nicholas Sebastian Frey
Player group 1: Daniele Padelli; Matteo Darmian; Emiliano Moretti; Luca Rossettini; Alessio Cerci; Davide Astori; Kamil Glik; Giuseppe Vives; Daniele Conti; Daniele Dessena
Player group 2: Danilo; Paul Pogba; Roberto Pereyra; Gianluigi Buffon; Kwadwo Asamoah; Thomas Heurtaux; Giorgio Chiellini; Carlos Tevez; Allan; Antonio Di Natale
Player group 3: Rafael; German Denis; Andrea Consigli; Luca Toni; Fabrizio Cacciatore; Emil Hallfredsson; Carlos Carmona; Romulo; Vangelis Moras; Giacomo Bonaventura
Player group 4: Francesco Bardi; Neto; Paulinho; Gonzalo Rodriguez; Federico Ceccherini; Borja Valero; Stefan Savic; Juan Cuadrado; Alberto Aquilani; Emerson
Player group 5: Mattia Perin; Rodrigo Palacio; Samir Handanovic; Alberto Gilardino; Jose Maria Callejon; Yuto Nagatomo; Esteban Cambiasso; Jonathan; Pep

In [825]:
# List, for each latent group, the teams with the largest loadings.
# Relies on max_i having been set by the player-group cell.
try:
    team_membership=np.copy(SD.fact_obj['BPTF'].G_DK_M[0])
    T,K=team_membership.shape
    for k in range(K):
        # Indices of the max_i largest loadings in column k, largest first.
        top_words_idx = team_membership[ :,k].argsort()[-max_i:][::-1]
        top_words = "; ".join([ FL.team_id2team_name[i] for i in top_words_idx])
        print("Team group {}: {}".format(k, top_words))
except Exception as err:
    # Best-effort display: report the real cause (e.g. an undefined max_i)
    # rather than always blaming missing team names.
    print('missing team name information ({}: {})'.format(type(err).__name__,err))

Team group 0: Sampdoria; Chievo Verona; Sassuolo; Parma; Genoa; Torino; Catania; Cagliari; Napoli; Atalanta
Team group 1: Torino; Cagliari; Sassuolo; Inter; Chievo Verona; Udinese; Fiorentina; Roma; Catania; Sampdoria
Team group 2: Juventus; Udinese; Genoa; Torino; Inter; Milan; Sassuolo; Catania; Lazio; Bologna
Team group 3: Atalanta; Hellas Verona; Parma; Napoli; Chievo Verona; Roma; Inter; Milan; Sassuolo; Catania
Team group 4: Livorno; Fiorentina; Inter; Parma; Milan; Sassuolo; Cagliari; Catania; Lazio; Bologna
Team group 5: Napoli; Genoa; Inter; Sassuolo; Lazio; Juventus; Catania; Hellas Verona; Torino; Roma
Team group 6: Bologna; Lazio; Catania; Parma; Sassuolo; Genoa; Inter; Chievo Verona; Torino; Sampdoria
Team group 7: Roma; Inter; Milan; Cagliari; Genoa; Hellas Verona; Livorno; Sassuolo; Catania; Lazio
Team group 8: Milan; Fiorentina; Inter; Catania; Sassuolo; Bologna; Lazio; Parma; Genoa; Chievo Verona


# Step 3: outcome model
Ridge regression with goal difference as outcomes.  
 - `home`- `away` goals


# Outcome on score difference

$Y^a_{ab}-Y^b_{ab}=\sum_{i\in \text{Home team}} \beta_i \, \ell^a_{(a,b)i} -\sum_{i\in \text{Away team}} \beta_i \, \ell^b_{(a,b)i} +  \gamma^T \, (r_{a}\circ q_b-  r_{b}\circ q_a)$   (substitute confounder)

$Y^a_{ab}-Y^b_{ab}=\sum_{i\in \text{Home team}} \beta_i \, \ell^a_{(a,b)i} -\sum_{i\in \text{Away team}} \beta_i \, \ell^b_{(a,b)i} +  \sum_i \gamma_i(\, \hat{\ell}^a_{(a,b)i} - \, \hat{\ell}^b_{(a,b)i} )$   (reconstructed causes)
- $\lambda \,||\beta||_2^2 +\lambda \,||\gamma||_2^2 $ regularization



In [826]:
# Outcome model built on the same match table and lineup factorization.
# Note that the cross-validation iteration count comes from n_iter_alpha.
OM=om.OutcomeModel(
    df,
    FL,
    cols_home_score=cols_home_score,
    cols_away_score=cols_away_score,
    seed_cv=seed0,
    vad_portion=vad_portion,
    recon_flag=recon_flag,
    normalize=normalize,
    verbose=FL.verbose,
    n_iter=n_iter,
    n_iter_cv=n_iter_alpha,
    min_caps=min_caps,
    epsilon=epsilon,
    weights=weights,
)

Run outcome model

In [827]:
# Fit the outcome model; returns the augmented design matrix, the fitted
# regressor(s) and the selected ridge penalty.
X_aug,regr,alpha_ridge=SD.run_outcome_model(
    df,
    OM,
    FL,
    solver=solver,
    alphas=alphas,
    test=test,
    seed=seed0,
    holdout_portion_alpha_ridge=OM.vad_portion,
)

True
GridSearchCV(cv=ShuffleSplit(380, n_iter=10, test_size=0.1, random_state=1244),
       error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=10, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  10.,  20.,  30.,
        40.,  50.,  60.,  70.,  80.,  90., 100., 110., 120., 130., 140.,
       150., 160., 170., 180., 190., 200.])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
9.0
False
GridSearchCV(cv=ShuffleSplit(380, n_iter=10, test_size=0.1, random_state=1244),
       error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=10, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_job

In [828]:
# Pairwise correlation between the columns of the design matrix OM.X.
rho=np.corrcoef(OM.X.T)
# Sort every row in descending order (negate, sort ascending, negate back) and
# keep the second entry; the first is normally the self-correlation.
descending_rho=-np.sort(-rho,axis=1)
max_rho=descending_rho[:,1]

# OM.X[:,np.where(max_rho>0.96)[0]]=0    # set to zero all games of collinear players

In [829]:
# Shape of the array stored in SD.Z for the selected factorization algorithm.
SD.Z[algo].shape

(20, 20, 9)

In [830]:
# Labels for the fitted coefficients: the intercept, one entry per player, then
# one entry per additional regressor of the chosen outcome model.
# NOTE(review): assumes FL.idx2player_name iterates in player-index order — confirm.
labels=['intercept']
labels.extend(FL.idx2player_name.values())
if OM.recon_flag:
    # Reconstructed causes: one extra regressor per player.
    labels.extend(np.arange(FL.n_players))
else:
    # Substitute confounder: one extra regressor per latent dimension, i.e. the
    # size of the last axis of Z (fixed: was `SD.Z[algo],shape[-1]`, a NameError).
    labels.extend(np.arange(SD.Z[algo].shape[-1]))

# Table of coefficients; the two column names below are used by the later cells.
coef_name='beta'
df_res=sd.extract_coeff_linear_regression(regr,labels=labels,rounding=10,coef_name=coef_name)
c1=coef_name+'_causal'
c0=coef_name+'_non_causal'

In [831]:
# Largest correlation of each player's column with any other column, keyed by name.
player_name2max_corr={FL.idx2player_name[idx]:max_rho[idx] for idx in range(FL.n_players)}

# Attach team, appearance count, relative coefficient change and collinearity
# indicator to the coefficient table.
df_res['team']=df_res['player_name'].map(FL.player_name2team_name)
df_res['caps']=df_res['player_name'].map(OM.player_name2caps)
# Change of the causal coefficient relative to the non-causal one.
df_res['diff']=df_res[c1].sub(df_res[c0]).div(df_res[c0])
df_res['max_corr']=df_res['player_name'].map(player_name2max_corr)

### Top-ranked players whose ranking changes with the deconfounder

In [832]:
# Display thresholds: drop near-collinear players and players with few appearances.
max_rho_disp=0.9
min_caps_viz=10

# Take an explicit copy so the rank columns are written to an independent
# frame rather than to a slice of df_res (avoids SettingWithCopyWarning and
# guarantees the assignments take effect).
res=df_res[((df_res.max_corr)<max_rho_disp) & (np.isfinite(df_res['diff'])) & (df_res['caps']>min_caps_viz)].copy()

# Rank only the two coefficient columns (rank 1 = largest coefficient);
# ranking the text columns as well is wasted work.
rankings=res[[c0,c1]].rank(axis=0,ascending=False)
# Positive value: the player moves up when switching to the causal model.
rankings['diff']=rankings[c0]-rankings[c1]
res['rank_diff']=rankings['diff']
res['rank0']=rankings[c0]
res['rank1']=rankings[c1]

In [833]:
# Players ranked in the top 39 by the non-causal model, largest rank gain first.
res.loc[res['rank0']<40].sort_values(by=['rank_diff'],ascending=False)

Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr,rank_diff,rank0,rank1
437,0.043953,0.065129,Danilo Fernando Avelar,Cagliari,16.0,-0.325148,0.685875,28.0,35.0,7.0
472,0.042865,0.097499,Jorginho,"Napoli,Hellas Verona",30.0,-0.560348,0.57666,11.0,20.0,9.0
221,0.03095,0.064743,Andrea Lazzari,Udinese,18.0,-0.521951,0.700587,9.0,36.0,27.0
308,0.038289,0.090357,Adil Rami,Milan,16.0,-0.576248,0.673871,8.0,23.0,15.0
43,0.04839,0.122344,Andrea Ranocchia,Inter,23.0,-0.604478,0.811349,1.0,6.0,5.0
156,0.050098,0.137119,Carlos Tevez,Juventus,30.0,-0.634642,0.871574,0.0,4.0,4.0
101,0.063298,0.162934,Francesco Totti,Roma,20.0,-0.611508,0.751123,0.0,1.0,1.0
442,0.030982,0.081929,Afriyie Acquah,Parma,19.0,-0.62184,0.698778,0.0,26.0,26.0
341,0.062488,0.16074,Miguel Britos,Napoli,15.0,-0.611249,0.639006,0.0,2.0,2.0
340,0.037902,0.104702,Kevin Strootman,Roma,23.0,-0.638003,0.811305,0.0,17.0,17.0


Players whose evaluation **increases** with the deconfounder, sorted by relative change.

In [834]:
# Display settings for this view.
n_disp=20
max_rho_disp=1.0

# Keep players with a positive causal coefficient and a finite relative change,
# then list the largest relative changes first.
(
    res.loc[(res['max_corr']<max_rho_disp) & (res[c1]>=1e-5) & np.isfinite(res['diff'])]
    .sort_values(by='diff',ascending=False)
    .head(n_disp)
)


Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr,rank_diff,rank0,rank1
420,0.008742,0.003243,Emmanuel Agyemang-Badu,Udinese,20.0,1.69581,0.745465,18.0,100.0,82.0
192,0.002754,0.00116,Luca Antonini,Genoa,25.0,1.37451,0.824253,-2.0,103.0,105.0
166,0.019272,0.008265,Mariano Julio Izco,Catania,26.0,1.331747,0.778225,41.0,94.0,53.0
124,0.024278,0.010747,Paolo Cannavaro,"Sassuolo,Napoli",18.0,1.258995,0.574402,50.0,89.0,39.0
495,0.030104,0.014421,Domenico Berardi,Sassuolo,26.0,1.087533,0.85352,54.0,84.0,30.0
188,0.008199,0.005024,Perparim Hetemaj,Chievo Verona,28.0,0.63206,0.828234,12.0,98.0,86.0
52,0.001643,0.001042,Lorik Cana,Lazio,24.0,0.576681,0.801834,-6.0,104.0,110.0
396,0.015817,0.012368,Innocent Emeghara,Livorno,17.0,0.278821,0.69712,23.0,87.0,64.0
476,0.033475,0.027212,Luca Antei,Sassuolo,22.0,0.23018,0.816529,52.0,72.0,20.0
422,0.02668,0.028015,Nicola Sansone,"Sassuolo,Parma",16.0,-0.047653,0.471542,36.0,71.0,35.0


Players whose coefficient **changes sign** (from non-positive to non-negative) with the deconfounder.

In [835]:
# Display settings for this view.
n_disp=40
max_rho_disp=1.0

# Non-negative causal coefficient together with a non-positive non-causal one,
# ordered by ascending relative change.
(
    res.loc[(res['max_corr']<max_rho_disp) & (res[c1]>=0) & (res[c0]<=0) & np.isfinite(res['diff'])]
    .sort_values(by='diff',ascending=True)
    .head(n_disp)
)


Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr,rank_diff,rank0,rank1
443,0.02194,-0.000668,Alessandro Longhi,Sassuolo,27.0,-33.823126,0.871714,65.0,109.0,44.0
498,0.001124,-7.2e-05,Bruno Fernandes,Udinese,17.0,-16.697339,0.71772,-8.0,107.0,115.0
475,0.021797,-0.002149,Norbert Gyomber,Catania,15.0,-11.142164,0.645426,67.0,112.0,45.0
220,0.010732,-0.001166,Andrea Luci,Livorno,22.0,-10.203411,0.829888,32.0,110.0,78.0
398,0.001186,-0.000404,Sime Vrsaljko,Genoa,22.0,-3.933685,0.771127,-6.0,108.0,114.0
198,0.01687,-0.015025,Paolo Bianco,Sassuolo,15.0,-2.122817,0.675115,69.0,129.0,60.0
164,0.009572,-0.009861,Davide Biondini,"Sassuolo,Genoa",27.0,-1.970661,0.635031,44.0,124.0,80.0
384,0.008448,-0.008836,Lucas Nahuel Castro,Catania,23.0,-1.956039,0.695147,37.0,121.0,84.0
497,0.015032,-0.017853,Gianluca Pegolo,Sassuolo,33.0,-1.841998,0.871714,69.0,135.0,66.0
143,0.006572,-0.012132,Gonzalo Bergessio,Catania,28.0,-1.54167,0.800059,35.0,126.0,91.0


Players whose evaluation **decreases** with the deconfounder, sorted by relative change.

In [836]:
# Display settings for this view.
n_disp=20
max_rho_disp=1.0

# Same-sign coefficients with a positive causal value, most negative relative
# change first. This view is taken from the full table df_res.
(
    df_res.loc[
        (df_res['max_corr']<max_rho_disp)
        & ((df_res[c1]*df_res[c0])>=0)
        & (df_res[c1]>=1e-5)
        & np.isfinite(df_res['diff'])
    ]
    .sort_values(by='diff',ascending=True)
    .head(n_disp)
)


Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr
366,0.001275,0.021676,Juan Cuadrado,Fiorentina,31.0,-0.941157,0.851383
46,0.004329,0.044751,Goran Pandev,Napoli,17.0,-0.903273,0.664263
401,0.005809,0.029504,Stefan Savic,Fiorentina,31.0,-0.803103,0.880536
132,0.008409,0.038131,Raul Albiol,Napoli,30.0,-0.779479,0.93229
33,0.007825,0.034721,Manuel Pasqual,Fiorentina,26.0,-0.774644,0.795622
70,0.00533,0.02208,Alessandro Lucarelli,Parma,34.0,-0.758616,0.914801
365,0.01841,0.073685,Federico Fernandez,Napoli,25.0,-0.750149,0.742618
134,0.005344,0.021314,Christian Abbiati,Milan,28.0,-0.749265,0.801815
111,0.023133,0.09148,Blerim Dzemaili,Napoli,18.0,-0.747125,0.667768
235,0.016607,0.065634,Marek Hamsik,Napoli,23.0,-0.746968,0.799843


Top players **noncausal** model (as in the standard APM model)

In [837]:
# Display settings for this view.
n_disp=20
max_rho_disp=0.9

# Largest non-causal coefficients first, excluding near-collinear players.
(
    df_res.loc[(df_res['max_corr']<max_rho_disp) & (df_res[c1]>=-1e-3) & np.isfinite(df_res['diff'])]
    .sort_values(by=c0,ascending=False)
    .head(n_disp)
)

Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr
101,0.063298,0.162934,Francesco Totti,Roma,20.0,-0.611508,0.751123
341,0.062488,0.16074,Miguel Britos,Napoli,15.0,-0.611249,0.639006
149,0.055454,0.144635,Gervinho,Roma,30.0,-0.616596,0.883062
156,0.050098,0.137119,Carlos Tevez,Juventus,30.0,-0.634642,0.871574
269,0.044514,0.124993,Dries Mertens,Napoli,21.0,-0.643872,0.700006
43,0.04839,0.122344,Andrea Ranocchia,Inter,23.0,-0.604478,0.811349
279,0.042602,0.122296,Arturo Vidal,Juventus,28.0,-0.651645,0.828016
273,0.043017,0.12217,Mehdi Benatia,Roma,32.0,-0.647892,0.884029
105,0.040155,0.117539,Andrea Barzagli,Juventus,26.0,-0.658372,0.820346
125,0.041099,0.11697,Fernando Llorente,Juventus,29.0,-0.648637,0.853841


Top players **causal** model, sport deconfounder

In [838]:
# Display settings stated explicitly (same values as the previous view) so the
# cell does not depend on whichever cell happened to run last.
n_disp=20
max_rho_disp=0.9

# Rank only the two coefficient columns over the whole table (rank 1 = largest
# coefficient); ranking the text columns too is wasted work.
rankings=df_res[[c0,c1]].rank(axis=0,ascending=False)
df_res['rank_causal']=rankings[c1]
df_res['rank_non_causal']=rankings[c0]
# Largest causal coefficients first. The ranks above are computed before the
# display filter, so they count players that the filter hides.
df_res[((df_res.max_corr)<max_rho_disp) & ((df_res[c1])>=1e-5) & (np.isfinite(df_res['diff']))].sort_values(by=[c1],ascending=False).head(n=n_disp)

Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr,rank_causal,rank_non_causal
101,0.063298,0.162934,Francesco Totti,Roma,20.0,-0.611508,0.751123,68.0,2.0
341,0.062488,0.16074,Miguel Britos,Napoli,15.0,-0.611249,0.639006,71.0,3.0
149,0.055454,0.144635,Gervinho,Roma,30.0,-0.616596,0.883062,82.0,4.0
156,0.050098,0.137119,Carlos Tevez,Juventus,30.0,-0.634642,0.871574,91.0,5.0
43,0.04839,0.122344,Andrea Ranocchia,Inter,23.0,-0.604478,0.811349,94.0,7.0
269,0.044514,0.124993,Dries Mertens,Napoli,21.0,-0.643872,0.700006,109.0,6.0
437,0.043953,0.065129,Danilo Fernando Avelar,Cagliari,16.0,-0.325148,0.685875,111.0,42.0
273,0.043017,0.12217,Mehdi Benatia,Roma,32.0,-0.647892,0.884029,113.0,9.0
472,0.042865,0.097499,Jorginho,"Napoli,Hellas Verona",30.0,-0.560348,0.57666,114.0,25.0
279,0.042602,0.122296,Arturo Vidal,Juventus,28.0,-0.651645,0.828016,115.0,8.0


In [839]:
# Filtered table ordered by the causal coefficient, largest first.
res.sort_values(by=[c1],ascending=False)

Unnamed: 0,beta_causal,beta_non_causal,player_name,team,caps,diff,max_corr,rank_diff,rank0,rank1
101,0.063298,0.162934,Francesco Totti,Roma,20.0,-0.611508,0.751123,0.0,1.0,1.0
341,0.062488,0.160740,Miguel Britos,Napoli,15.0,-0.611249,0.639006,0.0,2.0,2.0
149,0.055454,0.144635,Gervinho,Roma,30.0,-0.616596,0.883062,0.0,3.0,3.0
156,0.050098,0.137119,Carlos Tevez,Juventus,30.0,-0.634642,0.871574,0.0,4.0,4.0
43,0.048390,0.122344,Andrea Ranocchia,Inter,23.0,-0.604478,0.811349,1.0,6.0,5.0
269,0.044514,0.124993,Dries Mertens,Napoli,21.0,-0.643872,0.700006,-1.0,5.0,6.0
437,0.043953,0.065129,Danilo Fernando Avelar,Cagliari,16.0,-0.325148,0.685875,28.0,35.0,7.0
273,0.043017,0.122170,Mehdi Benatia,Roma,32.0,-0.647892,0.884029,0.0,8.0,8.0
472,0.042865,0.097499,Jorginho,"Napoli,Hellas Verona",30.0,-0.560348,0.576660,11.0,20.0,9.0
279,0.042602,0.122296,Arturo Vidal,Juventus,28.0,-0.651645,0.828016,-3.0,7.0,10.0
