In [7]:
import math
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.preprocessing import StandardScaler

from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

from sklearn import linear_model
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Data Analysis

### Import cleaned data file

In [2]:
# Location of the cleaned FIFA 21 data file. The default is the original
# local path; set the FIFA21_CLEANED_CSV environment variable to load the
# file from another location without editing the notebook.
DATA_PATH = os.environ.get(
    'FIFA21_CLEANED_CSV',
    'C:/Users/Anne_2/Documents/Ironhack/Fifa_Moneyball-Case-Study/fifa21_cleaned.csv',
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Display the first five rows of the loaded data frame
data.head(n=5)

Unnamed: 0,id,name,age,ova,nationality,club,bp,height,weight,foot,...,gk_reflexes,pac,sho,pas,dri,def,phy,gender,position,market_value
0,16,Luis García,37,71,Spain,KAS Eupen,CM,"5'10""",143lbs,Right,...,11,53,69,73,69,58,63,Male,midfielder,600000
1,41,Iniesta,36,81,Spain,Vissel Kobe,CAM,"5'7""",150lbs,Right,...,7,58,70,85,85,63,59,Male,midfielder,5500000
2,80,E. Belözoğlu,38,77,Turkey,Fenerbahçe SK,CM,"5'7""",159lbs,Left,...,8,44,71,83,77,62,66,Male,midfielder,2800000
3,330,R. Keane,35,80,Republic of Ireland,LA Galaxy,CF,"5'9""",161lbs,Right,...,16,68,82,75,82,31,69,Male,forward,5500000
4,1179,G. Buffon,42,82,Italy,Juventus,GK,"6'4""",203lbs,Right,...,78,77,76,74,78,33,91,Male,keeper,2200000


#### There are 34 'basic' skills in the data set, each scored between 0 and 100. There are also 6 indices (pac, sho, pas, dri, def, phy) in the data, which are calculated from the 34 basic skills and also range from 0 to 100. The original data set contains another 6 indices built from the 34 basic skills in a different composition, but these indices have a different range (values can be higher than 100).

#### Since the calculation method of the indices in the data set isn't 100% clear, the analysis will start with a factor analysis of the 34 basic skills, so that the factors can be used in a regression analysis on market value. This will also allow a more detailed analysis than using the already calculated indices.

## Factor Analysis of basic skills

In [4]:
# List all column names of `data`
data.columns

Index(['id', 'name', 'age', 'ova', 'nationality', 'club', 'bp', 'height',
       'weight', 'foot', 'value', 'wage', 'crossing', 'finishing',
       'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
       'fk_accuracy', 'long_passing', 'ball_control', 'acceleration',
       'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
       'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
       'interceptions', 'positioning', 'vision', 'penalties', 'composure',
       'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving',
       'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes', 'pac',
       'sho', 'pas', 'dri', 'def', 'phy', 'gender', 'position',
       'market_value'],
      dtype='object')

In [5]:
# Standardize the 34 basic skill variables for the factor analysis.
# `factors` ends up as the standardized frame; `transformer` is the fitted scaler.
skill_columns = [
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'composure', 'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
]
transformer = StandardScaler()
scaled_skills = transformer.fit_transform(data[skill_columns])
factors = pd.DataFrame(scaled_skills, columns=skill_columns)

In [13]:
# Bartlett test - are variables uncorrelated?
# Run on the standardized skill frame `factors`.
# calculate_bartlett_sphericity is imported in the imports cell at the top.
chi_square_value, p_value = calculate_bartlett_sphericity(factors)
print('Chi_square:', chi_square_value, '; p_value:', p_value)

Chi_square: 952138.0277382658 ; p_value: 0.0


In [15]:
# KMO test - measure of sampling adequacy for the whole model
# (should be at least 0.5, best between 0.85 and 1).
# calculate_kmo is imported in the imports cell at the top.
kmo_all, kmo_model = calculate_kmo(factors)
kmo_model

0.9705493736318477

In [25]:
# Choosing the number of factors - could choose 6 factors to replicate the
# indices given in the data set, or use the Kaiser criterion and keep the
# factors with eigenvalue > 1.

# Initial factor analysis with as many factors as variables. The count is
# taken from the frame itself so it stays correct if the column selection
# changes.
n_variables = factors.shape[1]
fa = FactorAnalyzer(n_factors=n_variables, rotation=None)
fa.fit(factors)

# Get the eigenvalues. Two arrays are returned; per the factor_analyzer
# documentation these are the original eigenvalues and the common-factor
# eigenvalues.
fa.get_eigenvalues()

(array([17.98661373,  5.19784172,  2.91563142,  1.87221083,  1.38052917,
         0.61371814,  0.4555    ,  0.34778241,  0.31626853,  0.2852002 ,
         0.26526281,  0.24245119,  0.20975631,  0.2005312 ,  0.19672804,
         0.17405565,  0.15603434,  0.13517231,  0.12960107,  0.12096537,
         0.10550594,  0.09148513,  0.08968179,  0.07753629,  0.0652116 ,
         0.06232297,  0.06099747,  0.05682985,  0.03820303,  0.03450464,
         0.03282465,  0.03227828,  0.02830259,  0.02246133]),
 array([ 1.79099501e+01,  5.11655207e+00,  2.80153128e+00,  1.77032205e+00,
         1.15796352e+00,  3.45479859e-01,  3.30515608e-01,  2.08300464e-01,
         1.82563363e-01,  1.53034109e-01,  1.14337684e-01,  9.21019168e-02,
         6.82995744e-02,  5.80376739e-02,  4.91818457e-02,  3.85031327e-02,
         3.63517019e-02,  2.89004907e-02,  2.78784030e-02,  2.42126333e-02,
         1.59102622e-02,  1.33937693e-02,  1.22325425e-02,  1.04167953e-02,
         9.98371433e-03,  8.68282912e-03,  8

#### --> there are 5 factors with eigenvalue > 1

In [37]:
# Factor analysis with 5 factors and varimax rotation on all 34 skills
fa = FactorAnalyzer(rotation='varimax', n_factors=5)
fa.fit(factors)

skill_names = factors.columns

# Loadings of the variables on the factors
loadings_table = pd.DataFrame(fa.loadings_, index=skill_names)
print(loadings_table)

# Variance explained by the factors
variance_table = pd.DataFrame(
    fa.get_factor_variance(),
    index=['Variance', 'Proportional Var', 'Cumulative Var'],
)
print(variance_table)

# Communalities of the variables (variance of each variable explained by the factors)
communalities_table = pd.DataFrame(
    fa.get_communalities(),
    index=skill_names,
    columns=['Communalities'],
)
print(communalities_table)

                         0         1         2         3         4
crossing          0.655835  0.294058  0.342049  0.415070 -0.085587
finishing         0.794527  0.398334 -0.215675  0.257865  0.099057
heading_accuracy  0.305318  0.596600  0.337186 -0.033688  0.548396
short_passing     0.693477  0.353689  0.467815  0.233759  0.082598
volleys           0.819917  0.362684 -0.110022  0.200410  0.107258
dribbling         0.713601  0.442842  0.168820  0.437089  0.006071
curve             0.805691  0.271257  0.166112  0.292788 -0.079672
fk_accuracy       0.789374  0.237666  0.185099  0.173127 -0.121719
long_passing      0.651425  0.178810  0.577740  0.154064 -0.026365
ball_control      0.716699  0.460366  0.285233  0.333129  0.093460
acceleration      0.293300  0.294282  0.007132  0.864324  0.056841
sprint_speed      0.264539  0.328250  0.008706  0.795958  0.143656
agility           0.481959  0.182706  0.042331  0.751699 -0.079582
reactions         0.586567 -0.315867  0.283751  0.035608  0.40

#### --> the last 5 variables - the goalkeeping skills - all load negatively on the extracted factors, especially on factor 2, which has only "heading_accuracy" loading above 0.5 in the positive direction. This makes the factor hard to interpret - 'bad goalkeeper' isn't a desirable skill. An explanation could be that goalkeeper skills are so different from, and contrary to, all other skills that good keepers are always terrible forwards/midfielders/defenders and the other way around. Therefore the goalkeeping skills are taken out of the factor analysis and treated as their own skill set.

In [41]:
# Split the skills into two subsets, factors_player and factors_keeper.
# This cell builds factors_player: the 29 skills without the gk_* columns.
# The columns are taken straight from `data`, i.e. without the StandardScaler
# step that was applied to `factors`.
player_skill_columns = [
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'composure', 'marking', 'standing_tackle', 'sliding_tackle',
]
factors_player = data[player_skill_columns]

# Bartlett test - are the variables uncorrelated?
chi_square_value, p_value = calculate_bartlett_sphericity(factors_player)
print(f'Chi_square: {chi_square_value} ; p_value: {p_value}')

# KMO test - sampling adequacy for the whole model
# (should be at least 0.5, best between 0.85 and 1)
kmo_all, kmo_model = calculate_kmo(factors_player)
print(f'KMO: {kmo_model}')

Chi_square: 710654.9723194506 ; p_value: 0.0
KMO: 0.9595284119697213


In [43]:
# Choosing the number of factors (eigenvalue > 1): fit an unrotated model
# with as many factors as there are player-skill variables. The count is
# read from the frame instead of being hard-coded.
n_player_variables = factors_player.shape[1]
fa = FactorAnalyzer(n_factors=n_player_variables, rotation=None)
fa.fit(factors_player)
fa.get_eigenvalues()

(array([14.89284686,  5.04034595,  2.29505466,  1.47448115,  0.98138129,
         0.60512217,  0.41971542,  0.34380248,  0.31611884,  0.28504897,
         0.26366392,  0.24202135,  0.2092924 ,  0.19954051,  0.19093151,
         0.16383489,  0.14617914,  0.13228622,  0.12525016,  0.10948954,
         0.09313374,  0.08974319,  0.0782919 ,  0.0651808 ,  0.0623171 ,
         0.06103464,  0.05681458,  0.0345929 ,  0.02248371]),
 array([ 1.48027871e+01,  4.95443006e+00,  2.16715711e+00,  1.28438854e+00,
         7.74483328e-01,  3.47328493e-01,  3.05679942e-01,  2.07181986e-01,
         1.79455534e-01,  1.53575547e-01,  9.63514810e-02,  7.98005906e-02,
         6.44667704e-02,  5.75337046e-02,  4.32722986e-02,  3.91999811e-02,
         3.52665213e-02,  3.05293206e-02,  2.56474457e-02,  2.54308440e-02,
         1.39438129e-02,  1.24969873e-02,  8.94086152e-03,  8.74469150e-03,
         8.01239005e-03,  5.93214620e-03,  3.50779439e-03,  1.79925857e-03,
        -4.61959179e-06]))

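#### --> only 4 factors have an eigenvalue > 1 (14.89, 5.04, 2.30, 1.47); the fifth (0.98) falls just below the Kaiser threshold. The 5-factor solution is nevertheless used below.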

In [44]:
# Factor analysis of the 29 non-goalkeeping skills: 5 factors, varimax rotation
fa = FactorAnalyzer(rotation='varimax', n_factors=5)
fa.fit(factors_player)

# Loadings of the variables on the factors
player_loadings = pd.DataFrame(fa.loadings_, index=factors_player.columns)
print(player_loadings)

                         0         1         2         3         4
crossing          0.683450  0.381139  0.431838 -0.092719  0.095612
finishing         0.887506 -0.138818  0.299890  0.144991 -0.017827
heading_accuracy  0.454115  0.455810  0.063565  0.625094 -0.155963
short_passing     0.716411  0.511980  0.269745  0.084742  0.159626
volleys           0.889768 -0.043229  0.236195  0.140423  0.041398
dribbling         0.795473  0.240297  0.481376  0.035242  0.010594
curve             0.827785  0.203814  0.305668 -0.082671  0.130261
fk_accuracy       0.805842  0.218430  0.180919 -0.124103  0.127168
long_passing      0.613177  0.584697  0.160324 -0.060167  0.263296
ball_control      0.790675  0.356966  0.384364  0.122839  0.048888
acceleration      0.351250  0.048942  0.890639  0.054312 -0.017694
sprint_speed      0.336230  0.060537  0.831988  0.152463 -0.047435
agility           0.480719  0.052314  0.759297 -0.109506  0.145158
reactions         0.368190  0.194834 -0.016288  0.275523  0.65

#### --> Factor 1: crossing, finishing, short_passing, volleys, dribbling, curve, fk_accuracy, long_passing, ball_control, shot_power, long_shots, positioning, vision, penalties, composure --> BALLPOWER/ PLAYING_THE_BALL/ TECHNIC
#### --> Factor 2: (short_passing - though higher load on factor 1), (long_passing - though higher load on factor 1), aggression, interceptions, marking, standing_tackle, sliding_tackle, (stamina - though load is a bit below 0.5) --> TACKLING
#### --> Factor 3: acceleration, sprint_speed, agility, balance, (aggression - though higher load on factor 2) --> MOVEMENT
#### --> Factor 4: heading_accuracy, strength, (jumping - though load is a bit below 0.5) ---> HEADER&STRENGTH
#### --> Factor 5: reactions --> REACTIONS

In [45]:
# Variance explained by the factors: absolute, proportional and cumulative
variance_rows = ['Variance', 'Proportional Var', 'Cumulative Var']
player_factor_variance = pd.DataFrame(fa.get_factor_variance(), index=variance_rows)
print(player_factor_variance)

                          0         1         2         3         4
Variance          10.573061  5.853452  4.117074  1.886104  1.179924
Proportional Var   0.364588  0.201843  0.141968  0.065038  0.040687
Cumulative Var     0.364588  0.566431  0.708400  0.773438  0.814125


#### --> In total, 81% of the variance is explained by the 5 factors

In [46]:
# Communalities: share of each variable's variance explained by the factors
player_communalities = pd.DataFrame(
    fa.get_communalities(),
    index=factors_player.columns,
    columns=['Communalities'],
)
print(player_communalities)

                  Communalities
crossing               0.816593
finishing              0.918211
heading_accuracy       0.833091
short_passing          0.880792
volleys                0.870777
dribbling              0.923598
curve                  0.844003
fk_accuracy            0.761398
long_passing           0.816505
ball_control           0.917807
acceleration           0.922273
sprint_speed           0.834415
agility                0.843421
reactions              0.680424
balance                0.704612
shot_power             0.722154
jumping                0.323643
stamina                0.704270
strength               0.671740
long_shots             0.892005
aggression             0.742877
interceptions          0.929890
positioning            0.888427
vision                 0.790142
penalties              0.795285
composure              0.765902
marking                0.896068
standing_tackle        0.964535
sliding_tackle         0.954756


#### --> very high communalities: for most variables more than 70% of the variance is explained by the factors - jumping is the only variable with low explained variance (0.32)