In [1]:
import pandas as pd
import numpy as np
import os,json

In [2]:
# This can be defined as an external class
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
# Create the transformer to handle the attributes data
class FlattenDict(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens nested dict records into a DataFrame.

    Keys of nested dicts are joined to their parent key with a separator, so
    ``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``. Values that are not dicts
    (including lists) are stored unchanged.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no learned state."""
        return self

    def flat_inner(self, J, dict_out=None, parent_key=None, separator="_"):
        """Recursively copy the leaves of mapping ``J`` into a flat dict.

        Parameters
        ----------
        J : dict
            Possibly nested record to flatten.
        dict_out : dict, optional
            Accumulator that receives the flattened items; a new dict is
            created when omitted.
        parent_key : str, optional
            Prefix prepended (with ``separator``) to every key of ``J``.
        separator : str
            String placed between a parent key and a child key.

        Returns
        -------
        dict
            ``dict_out`` with the flattened items added.
        """
        if dict_out is None:
            dict_out = {}
        for key, value in J.items():
            flat_key = f"{parent_key}{separator}{key}" if parent_key else key
            if isinstance(value, dict):
                # Forward the separator: it used to be dropped here, so every
                # level below the first silently fell back to "_".
                self.flat_inner(J=value, dict_out=dict_out,
                                parent_key=flat_key, separator=separator)
            else:
                dict_out[flat_key] = value
        return dict_out

    def transform(self, X):
        """Flatten every record of ``X`` and return one DataFrame row per record."""
        # Iterate directly instead of indexing X[i], so any iterable of dicts
        # works (not only containers that support positional integer lookup).
        return pd.DataFrame([self.flat_inner(record) for record in X])

In [3]:
# Single flattener instance, reused for the events and the matches records
flatten = FlattenDict()

# Input data structure

##  Importing events

In [4]:
# Read every event file in the events folder. Each file name (without the
# ".json" suffix) is parsed as an integer and stored on every event of that
# file as "match_id".
path_to_json = '/Users/bshakirov/Desktop/TDI/Capstone_project/open-data-master/data/events/'
all_events = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of the resulting frame reproducible between runs and machines.
for file_name in sorted(os.listdir(path_to_json)):
    if not file_name.endswith('.json'):
        continue
    match_id = int(os.path.splitext(file_name)[0])
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(path_to_json, file_name), encoding='utf-8') as json_file:
        match_events = json.load(json_file)
    all_events.extend(dict(event, match_id=match_id) for event in match_events)

In [5]:
# Flatten the nested event records into one wide DataFrame (one row per event)
all_events_df = flatten.transform(all_events)

In [6]:
# Optional backup: uncomment the next line to keep an untouched copy of the flattened events
#all_events_df_2=all_events_df.copy()

## Importing matches

In [7]:
# Collect the match records from every JSON file below the matches directory
all_matches = []
parent_dir = '/Users/bshakirov/Desktop/TDI/Capstone_project/open-data-master/data/matches/'

for dir_path, sub_dirs, file_names in os.walk(parent_dir):
    # Sorting in place makes os.walk descend into sub-directories in a stable
    # order; sorting the file names does the same within each directory.
    sub_dirs.sort()
    for file_name in sorted(file_names):
        if not file_name.endswith('.json'):
            continue
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(os.path.join(dir_path, file_name), encoding='utf-8') as json_file:
            all_matches.extend(json.load(json_file))

In [8]:
# Flatten the nested match records into a DataFrame (one row per match record)
all_matches_df = flatten.transform(all_matches)

## Defining the columns to use

In [9]:
# Event columns kept for the model. The ORDER matters: later cells address
# the aggregated columns by position.
# Trailing tags are the author's category notes (presumably D = defensive,
# FC = foul committed, FW = foul won, A = attack, P = pass, S = shot,
# Beh = behaviour -- confirm).
columns_select = [
    # --- columns needed for grouping or selecting ---
    'team_name',
    'position_id',
    'player_id',
    'match_id',

    # --- boolean flag columns ---
    'ball_recovery_offensive',         # D
    'ball_recovery_recovery_failure',  # D
    'block_deflection',                # D
    'block_offensive',                 # D
    'block_save_block',                # D
    'clearance_aerial_won',            # D
    'counterpress',                    # D
    'foul_committed_advantage',        # FC
    'foul_committed_offensive',        # FC
    'foul_committed_penalty',          # FC
    'foul_won_advantage',              # FW
    'foul_won_defensive',              # FW
    'foul_won_penalty',                # FW

    # --- flag columns with second-order priority ---
    'dribble_nutmeg',                  # A
    'miscontrol_aerial_won',           # D
    'pass_aerial_won',                 # P
    'pass_backheel',                   # P
    'pass_cross',                      # P
    'pass_cut_back',                   # P
    'pass_deflected',                  # P
    'pass_goal_assist',                # P
    'pass_no_touch',                   # P
    'pass_shot_assist',                # P
    'pass_switch',                     # P
    'shot_aerial_won',                 # S
    'shot_deflected',                  # S
    'shot_follows_dribble',            # S
    'shot_one_on_one',                 # S
    'shot_open_goal',                  # S
    'shot_redirect',                   # S

    # --- categorical columns turned into dummy variables ---
    '50_50_outcome_name',              # D
    'duel_outcome_name',               # D
    'bad_behaviour_card_name',         # Beh
    'ball_receipt_outcome_name',
    'foul_committed_card_name',        # Beh
    'foul_committed_type_name',        # Beh
    'interception_outcome_name',
    'pass_outcome_name',               # P
    'shot_outcome_name',               # S (was tagged P; presumably a typo -- confirm)
]

# Column index ranges per skill group, written as "start - stop" with the
# stop excluded. NOTE(review): these look like iloc positions in the
# aggregated counts frame used by the radar chart -- confirm.
#   defensive  2 - 9
#   foul       9 - 15
#   pass      15 - 26
#   shot      26 - 32


In [10]:
# Categorical columns that are expanded into dummy (one-hot) columns.
# The order is kept as-is because it determines the dummy column order.
coded_columns = [
    '50_50_outcome_name',
    'bad_behaviour_card_name',
    'ball_receipt_outcome_name',
    'duel_outcome_name',
    'foul_committed_card_name',
    'foul_committed_type_name',
    'interception_outcome_name',
    'pass_outcome_name',
    'shot_outcome_name',
]

In [11]:
# Columns taken from the matches frame. 'match_id' must stay first: it is the
# merge key, and everything after it is dropped again once the result is known.
match_col_names = [
    'match_id',
    'away_team_away_team_name',
    'home_team_home_team_name',
    'away_score',
    'home_score',
    'home_team_home_team_gender',
]

## Defining functions

In [12]:
#Function for player's position
def set_positions(df):
    """Replace numeric position ids with a compact label and drop unknown rows.

    The ``position_id`` values are cast to ``int`` and mapped as follows
    (the first matching rule wins):

    * ``<= 1`` or ``> 25``      -> ``'Unknown'`` (these rows are removed)
    * ``17`` or ``21`` to ``25`` -> ``'Attack'``
    * ``9`` to ``20`` otherwise  -> ``'Midfield'``
    * ``2`` to ``8``             -> ``'Defense'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``position_id`` column whose values can be cast to int.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) in which ``position_id``
        holds the label, without the ``'Unknown'`` rows. The original index
        labels of the kept rows are preserved.
    """
    position_codes = df['position_id'].astype(int)

    # Conditions are evaluated in order by np.select, so the broad ">8" and
    # ">1" rules only apply to codes not claimed by an earlier rule.
    conditions = [
        (position_codes > 25) | (position_codes <= 1),
        (position_codes > 20) | (position_codes == 17),
        position_codes > 8,
        position_codes > 1,
    ]
    values = ['Unknown', 'Attack', 'Midfield', 'Defense']

    # An explicit string default is required: the implicit default (integer 0)
    # has no common dtype with the string choices and raises on current NumPy.
    labels = np.select(conditions, values, default='Unknown')

    # assign() builds a new frame instead of overwriting the caller's column.
    labelled = df.assign(position_id=labels)
    return labelled[labelled['position_id'] != 'Unknown']

In [13]:
#Function used to create match result column: with logic if win then 1 else 0
def won_condition(df_merge):
    """Return 1 when the row's team won its match, otherwise 0.

    ``df_merge`` is one merged row (anything indexable by column name) with
    the keys ``team_name``, ``away_team_away_team_name``,
    ``home_team_home_team_name``, ``away_score`` and ``home_score``.
    Draws, defeats and rows whose team is neither side all yield 0.
    """
    team = df_merge['team_name']
    # (column with the side's name, goals scored by it, goals scored against it)
    sides = (
        ('away_team_away_team_name', 'away_score', 'home_score'),
        ('home_team_home_team_name', 'home_score', 'away_score'),
    )
    # The generator keeps the lazy, left-to-right evaluation of the checks.
    return int(any(
        team == df_merge[name_col] and df_merge[goals_for] > df_merge[goals_against]
        for name_col, goals_for, goals_against in sides
    ))

In [14]:
def get_weights(df, player=False, position=False, gender=None):
    """Aggregate event flags into count features and attach the match result.

    Uses the module-level ``columns_select``, ``coded_columns``,
    ``match_col_names`` and ``all_matches_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened events; must contain every column in ``columns_select``.
    player : bool
        When True, ``player_id`` is added to the grouping keys.
    position : bool
        When True, ``position_id`` (the compact label produced by
        ``set_positions``) is added to the grouping keys.
    gender : str or None
        ``'male'`` or ``'female'`` keeps only matches whose
        ``home_team_home_team_gender`` equals that value; any other value
        applies no filter.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping keys, the summed numeric feature
        columns and ``match_result`` (1 if the row's team won, else 0).
    """
    # Drop events recorded for position id 1 (presumably goalkeepers -- confirm)
    outfield_events = df[df["position_id"] != 1]

    # Keep only the selected feature columns
    features_df = outfield_events[columns_select]

    # One-hot encode the categorical columns and treat a missing flag as False.
    # infer_objects() makes the object -> bool conversion explicit instead of
    # relying on fillna's implicit downcasting, which pandas has deprecated.
    features_df = (
        pd.get_dummies(features_df, columns=coded_columns)
        .fillna(False)
        .infer_objects()
    )

    # Collapse the detailed position ids into main positions
    features_df = set_positions(features_df)

    # Grouping keys: always match and team, optionally player and position
    group_keys = ['match_id', 'team_name']
    if player:
        group_keys.append('player_id')
    if position:
        group_keys.append('position_id')

    # numeric_only=True: label columns that are not grouping keys (position
    # label, player id) must not be summed. Older pandas dropped them
    # silently; pandas >= 2.0 would try to add them up instead.
    features_df = features_df.groupby(group_keys).sum(numeric_only=True).reset_index()

    # Bring in the match information needed to decide the result
    features_df = pd.merge(features_df, all_matches_df[match_col_names],
                           on='match_id', how='left')

    # Match result for the row's team: 1 for a win, 0 otherwise
    features_df['match_result'] = features_df.apply(won_condition, axis=1)

    # Optional gender filter (any other value leaves the data unfiltered)
    if gender in ('male', 'female'):
        features_df = features_df[features_df['home_team_home_team_gender'] == gender]

    # The match columns (all but the 'match_id' key) are no longer needed
    features_df = features_df.drop(columns=match_col_names[1:]).reset_index(drop=True)
    return features_df

# Data for training has been created at this point

In [15]:
# Aggregated event counts for men's matches:
#   count_all    -> grouped by match and team
#   count_by_ply -> grouped by match, team and player
count_all = get_weights(all_events_df, gender='male')
count_by_ply = get_weights(all_events_df, player=True, gender='male')

In [41]:
#SPYDER CHART

import plotly as plt
import pandas as pd

n = 6  # row position in count_all to plot (the original note suggests a player id could be used as well)

# Half-open column ranges (iloc style) of count_all for each skill group.
# The defensive and pass ranges previously stopped one column early (2:8 and
# 15:25), leaving columns 8 and 25 out of every group; they now match the
# documented ranges 2-9 and 15-26.
skill_slices = {
    'defensive': slice(2, 9),
    'foul': slice(9, 15),
    'pass': slice(15, 26),
    'shot': slice(26, 32),
    'other': slice(32, 39),
}
df_spyder = pd.DataFrame(dict(
        r=[sum(count_all.iloc[n, cols]) for cols in skill_slices.values()],
        theta=list(skill_slices)))
fig = plt.express.line_polar(df_spyder, r='r', theta='theta', line_close=True,
                    title="The skill rating (# of occurence per match)")
fig.show()
# Also write the chart to a standalone HTML file
plt.offline.plot(fig, filename='/Users/bshakirov/Desktop/TDI/Capstone_project/soccer_analytics/fair_player_value/modelling_part/skill_rating.html')


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



'/Users/bshakirov/Desktop/TDI/Capstone_project/soccer_analytics/fair_player_value/modelling_part/skill_rating.html'

In [17]:
# Model inputs: the target is the match_result column; the features are all
# columns after the first two and before the last one.
y = count_all['match_result']
X = count_all.iloc[:, 2:-1]

In [39]:
# Print the notebook's current working directory (IPython automagic command)
pwd

'/Users/bshakirov/Desktop/TDI/Capstone_project/soccer_analytics/fair_player_value/modelling_part'

In [18]:
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
 
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.svm import SVC, LinearSVC

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Hold out 20% of the rows for testing.
# X is deliberately NOT scaled before the split: fitting a scaler on all rows
# would leak test-set statistics into training. The pipeline's StandardScaler
# is fitted on the training data only (and per fold inside the grid search).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

pipe_svc = Pipeline([('stdscaler', StandardScaler()),
                     ('linearSVC', LinearSVC(fit_intercept=True, dual=False))])

# Regularisation strengths tried by the grid search
param_svc = {'linearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100]}

svc_gs = GridSearchCV(pipe_svc, param_svc, cv=5, n_jobs=4)

# Fit the scaler + linear SVM pipeline on the training data (5-fold grid search)
svc_model_gs = svc_gs.fit(X_train, y_train)

prediction = svc_model_gs.predict(X_test)
# Accuracy on the training set, then on the held-out test set
print(svc_model_gs.score(X_train, y_train))
print(svc_model_gs.score(X_test, y_test))
confusion_mat = confusion_matrix(y_test, prediction)
print(confusion_mat)

# Best hyper-parameters found by the grid search
svc_model_gs.best_params_

0.8650980392156863
0.8150470219435737
[[168  27]
 [ 32  92]]


{'linearSVC__C': 1}

In [19]:
# Normalise the linear SVM coefficients so their absolute values add up to 1
svc_coefficients = svc_model_gs.best_estimator_.named_steps.linearSVC.coef_[0]
total = sum(np.abs(svc_coefficients))
weights = svc_coefficients / total

In [20]:
# Index the player-level counts by (match_id, player_id) and drop the columns
# that are not features. The guard makes the cell safe to re-run: once the
# frame is indexed, these columns no longer exist and dropping them again
# would raise a KeyError.
player_index_cols = ['match_id', 'player_id']
if set(player_index_cols).issubset(count_by_ply.columns):
    count_by_ply = (
        count_by_ply
        .drop(columns=['match_result', 'team_name'])
        .set_index(player_index_cols)
    )

In [21]:
# Weighted sum of each row's counts (matrix product with the weight vector),
# with the index levels turned back into ordinary columns
weighted_counts = (count_by_ply @ weights).reset_index()

In [22]:
# Attach the match date to every weighted score, order the rows by date and
# pivot so that each player becomes a column and each row is one match
# (after dropping the match_id level the rows are indexed by match_date).
match_dates = all_matches_df[['match_id', 'match_date']]
dated_scores = weighted_counts.merge(match_dates, on='match_id', how='left')
dated_scores = dated_scores.sort_values('match_date')
indexed_scores = dated_scores.set_index(['match_id', 'match_date', 'player_id'])
sorted_weights = indexed_scores.unstack(level=-1).droplevel(0)

In [23]:
#Ewma calculation
def ewma_calculation(column, alpha=0.5):
    """Return the latest exponentially weighted mean of a column.

    Missing values are dropped and the remaining values are ordered by index
    before the exponentially weighted mean is computed, so the result is the
    smoothed value at the last index label.

    Parameters
    ----------
    column : pandas.Series
        Values indexed by something sortable (here: the match date).
    alpha : float
        Smoothing factor passed to ``Series.ewm``; 0.5 by default.

    Returns
    -------
    float
        The last value of the exponentially weighted mean, or NaN when the
        column holds no non-missing value.
    """
    observed = column.dropna().sort_index()
    if observed.empty:
        return float('nan')
    # .iloc[-1] is positional. A plain [-1] is a label lookup: it raises
    # KeyError on an integer index and its positional fallback on other index
    # types is deprecated in pandas.
    return observed.ewm(alpha=alpha).mean().iloc[-1]

# One EWMA score per player column, stored in a single 'weights' column;
# the outer column level left over from the pivot is dropped from the index.
latest_ewma = sorted_weights.apply(ewma_calculation, axis=0)
ewma_df = pd.DataFrame(latest_ewma, columns=['weights']).droplevel(0)

In [24]:
# Add the player names to the final table and export it to CSV
names_only = all_events_df[['player_id', 'player_name']].drop_duplicates()
# Keep the merged frame in features_df and write the file in a separate step:
# DataFrame.to_csv returns None when given a path, so chaining it onto the
# merge used to leave features_df bound to None.
features_df = pd.merge(ewma_df, names_only, on=['player_id'], how='left')
features_df.to_csv('features.csv')