# EDA & Profiling

This EDA is based on the <a href="https://www.kaggle.com/hugomathien/soccer">European Soccer Database</a> with more than 25,000 matches and more than 10,000 players for European professional soccer seasons from 2008 to 2016.

### Import Libraries

customplot: contains functions written for this notebook

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

In [None]:
!find . | grep customplot

In [None]:
%%!
mkdir customplot && touch ./customplot/__init__.py
cp ../_pycode/customplot.py ./customplot/customplot.py

In [None]:
from customplot import *

### Data 

Download the data from: <a href="https://www.kaggle.com/hugomathien/soccer">https://www.kaggle.com/hugomathien/soccer</a>

#### Ingest Data

In [None]:
%find.. 'database.sqlite'

In [None]:
# Create your connection.
cnx = sqlite3.connect('../_data/database.sqlite')

In [None]:
c = cnx.cursor()

In [None]:
c.execute('select name from sqlite_master where type = "table";').fetchall()

### Players

In [None]:
# Read the complete Player table and preview five random rows.
player_query = "SELECT * FROM Player"
df_player = pd.read_sql_query(player_query, cnx)
df_player.sample(n=5)

### Player Attributes

In [None]:
df_attr = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)
df_attr.sample(5)

In [None]:
df_attr.columns

### Feature stats

In [None]:
# Summary statistics of the numeric columns, one row per attribute.
df_attr.describe().transpose()

#### Check nulls, NaN's, etc.

In [None]:
# True if the table contains at least one missing value anywhere.
df_attr.isnull().any().any()
# Percentage of rows that have at least one missing value (rows with a null / total rows).
'percentage null: '; df_attr.isnull().any(axis=1).sum() / df_attr.shape[0] * 100

#### Percentage nulls

In [None]:
%precision 2
df_attr.isnull().sum(axis=0).describe()
df_attr.isnull().sum(axis=0).max() * 100 / df_attr.shape[0], 'max % NaN'

#### Drop nulls

In [None]:
# Discard every row that has a missing value in any column.
df_attr = df_attr.dropna(axis=0, how='any')

##### Sanity check

In [None]:
# Re-compute the per-column missing-value counts; the worst column should now be at 0 %.
remaining_nulls = df_attr.isnull().sum(axis=0)
remaining_nulls.max() * 100 / df_attr.shape[0], 'max % NaN'
df_attr.info()

#### Shuffle df

In [None]:
# Draw a random ordering of the existing index labels and reorder the rows to match.
shuffled_index = np.random.permutation(df_attr.index)
df_attr = df_attr.reindex(shuffled_index)

### Predicting: 'overall_rating' of a player

In [None]:
# Five random rows, to eyeball the target column next to the other attributes.
df_attr.sample(n=5)

### Feature Correlation Analysis 
Next, we will check whether 'penalties' is correlated with 'overall_rating'. We use a similar selection operation, but this time over all rows and inside the correlation function.

In [None]:
df_attr[:10][['penalties', 'overall_rating']]

In [None]:
df_attr['overall_rating'].corr(df_attr['penalties'])

### Create a list of potentially correlated features

In [None]:
# Hand-picked attributes to test for correlation with overall_rating.
potentialFeatures = [
    'acceleration',
    'curve',
    'free_kick_accuracy',
    'ball_control',
    'shot_power',
    'stamina',
]

#### Check correlation coefficient of "overall_rating" of a player with each feature we added to the list as potential.

In [None]:
for f in potentialFeatures:
    related = df_attr['overall_rating'].corr(df_attr[f])
    print("%s: %f" % (f, related))

In [None]:
df_attr.columns.values.shape

In [None]:
# Attributes whose correlation with overall_rating is analysed below.
cols = [
    'potential', 'crossing', 'finishing', 'heading_accuracy', 'short_passing',
    'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing',
    'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
    'balance', 'shot_power', 'jumping', 'stamina', 'strength',
    'long_shots', 'aggression', 'interceptions', 'positioning', 'vision',
    'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
]

In [None]:
corr_list = [(f, df_attr['overall_rating'].corr(df_attr[f])) for f in cols]

In [None]:
df_corr = pd.DataFrame(corr_list, columns=['attributes', 'correlation'])

In [None]:
df_corr.sample(5)

In [None]:
# Quartiles of the correlation coefficients, looked up by label from a single
# describe() call (positional [0] access on a label-indexed Series is deprecated).
corr_stats = df_corr['correlation'].describe()
p25, p50, p75 = corr_stats['25%'], corr_stats['50%'], corr_stats['75%']
p25, p50, p75

### Visualisation of correlations

In [None]:
def plot_dataframe(df, y_label, quartiles=None):
    """Plot the 'correlation' column of *df* as a line with quartile reference lines.

    Parameters
    ----------
    df : DataFrame with an 'attributes' column (used as x tick labels) and a
        'correlation' column (the plotted values).
    y_label : text used as the plot title.
    quartiles : optional (q25, q50, q75) tuple for the horizontal reference
        lines. When omitted, the quartiles are computed from df['correlation'],
        so the function no longer depends on module-level variables.
    """
    if quartiles is None:
        stats = df['correlation'].describe()
        quartiles = (stats['25%'], stats['50%'], stats['75%'])
    q25, q50, q75 = quartiles

    # Reuse the current figure and stretch it to a wide format.
    fig = plt.gcf()
    fig.set_size_inches(20, 6)
    plt.title(y_label)

    ax = df['correlation'].plot(linewidth=3.3, color='coral')

    # Median in black, lower/upper quartile in gray.
    ax.axhline(q25, c='gray')
    ax.axhline(q50, c='k')
    ax.axhline(q75, c='gray')

    # One vertical grid line and one rotated label per attribute.
    ax.xaxis.grid()
    ax.set_xticks(df.index)
    ax.set_xticklabels(df.attributes, rotation=75)
    plt.show()

In [None]:
# Draw the correlation profile of every attribute against the overall rating.
plot_dataframe(df_corr, "Player's Overall Rating")

### Correlation heatmap

The features with the highest correlation coefficients are indicative of a high Overall Rating. However, the coefficients alone cannot tell us whether the top features are independent of each other!

In [None]:
import seaborn as sns

# Create the canvas first, then apply seaborn's white theme and a diverging palette.
plt.figure(figsize=(20, 12))
sns.set(style="white")
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Pairwise correlations between all attributes listed in cols.
cor = df_attr[cols].corr()
cor.shape

# Flag the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.zeros_like(cor)
upper_triangle = np.triu_indices_from(mask)
mask[upper_triangle] = True

ax = sns.heatmap(data=cor, mask=mask, cmap=cmap, vmax=.85);

## Clustering Players into similar groups

We can group similar players based on certain features.

<b>Note:</b> Generally, someone with domain knowledge needs to define the important features. We could also have selected the features with the highest correlation with overall_rating. However, that does not always guarantee the best outcome, because we cannot be sure that the top five features are independent. For example, if 4 of the 5 features depend on the remaining one, taking all 5 adds no new information.

#### Select features for clustering - looking for a young midfield player

In [None]:
# Attributes used to describe a player for clustering.
sel_features = [
    'reactions',
    'short_passing',
    'long_passing',
    'vision',
    'interceptions',
    'standing_tackle',
    'potential',
]

In [None]:
df_select = df_attr[sel_features].copy(deep=True)

In [None]:
df_select.head()

### Perform K-Means Clustering

We use K-Means to cluster the selected features in K clusters.

In [None]:
# Perform scaling on the dataframe containing the features
data = scale(df_select)

# Define number of clusters
k = 6

# Train a model. The fixed random_state keeps the cluster numbering identical
# from run to run; without it the integer label of a given group of players is
# arbitrary, and later cells select a profile by its integer label.
model = KMeans(init='k-means++', n_clusters=k, n_init=20, random_state=42).fit(data)

### DataFrame with feature coords for each cluster center

In [None]:
# One row per cluster centre, one column per selected feature (scaled units).
df_km = pd.DataFrame(model.cluster_centers_)
df_km.columns = sel_features
# Players per cluster. Series.value_counts replaces the deprecated top-level
# pd.value_counts; the result is indexed by cluster label, so the assignment
# aligns each count with the matching row.
df_km['players'] = pd.Series(model.labels_).value_counts(sort=False)
df_km['cluster'] = df_km.index.astype(int)
df_km

## Cluster profiles
We have K clusters based on the selected features and visualise them as profiles for similar groups of players. Each point is the average value of the cluster for that feature.

In [None]:
# Broadcast colors over data set: cycle through the palette, one entry per row of data
palette = 'brgykcm'
my_colors = [palette[position % len(palette)] for position in range(len(data))]

In [None]:
from pandas.plotting import parallel_coordinates

plt.figure(figsize=(15,8)).gca().axes.set_ylim([-2.5, +2.5])
# Plot every column except the 'players' count. Selecting columns (instead of
# popping 'players' out of df_km) leaves df_km intact, so this cell can be re-run.
profile_columns = [column for column in df_km.columns if column != 'players']
parallel_coordinates(df_km[profile_columns], 'cluster', color=my_colors, marker='o');

### Predict profile for all players

In [None]:
sel_features = ['reactions', 'short_passing', 'long_passing', 'vision', 'interceptions', 'standing_tackle', 'potential']
df_select = df_attr[sel_features].copy(deep=True)
data = scale(df_select)

In [None]:
pred = model.predict(data)
df_attr['profile'] = pred
df_attr.loc[:,['player_api_id', 'profile']].sample(10)

In [None]:
profile = df_attr['profile']==2

In [None]:
df_player.info()

### Merge Players and Players Attributes data

In [None]:
# Right join: keep every attribute record of the selected profile and attach
# the player's name, birthday, height and weight.
player_columns = ['player_api_id', 'player_name', 'birthday', 'height', 'weight']
df_merge = pd.merge(
    df_player[player_columns],
    df_attr[profile],
    how='right',
    left_on='player_api_id',
    right_on='player_api_id',
)
df_merge.sample(5)
df_merge.info()

### Convert dates

In [None]:
df_merge['age'] = pd.to_datetime(df_merge['birthday']).dt.strftime('%Y')
df_merge.pop('birthday')
df_merge['date'] = pd.to_datetime(df_merge['date']);

In [None]:
df_merge['age'] = 2018 - df_merge['age'].astype('int')

## Best midfield players fitting the profile, ranked by age and overall rating

In [None]:
M = (df_merge['overall_rating']>65.) & (df_merge['potential']>70.) # & (df_merge['preferred_foot']=='left')

In [None]:
df_candidates = df_merge[M].groupby('player_api_id').min().sort_values(['age', 'potential', 'overall_rating'], ascending=[True, False, False])

In [None]:
features = ['player_name', 'age', 'height', 'weight', 'preferred_foot', 'overall_rating'] + sel_features
features

In [None]:
df_candidates.head()

## Feature importance in relation to overall-rating

In [None]:
# Predictor columns for the regression on overall_rating.
features = [
    'height', 'weight', 'potential', 'crossing',
    'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing',
    'ball_control', 'acceleration', 'sprint_speed', 'agility',
    'reactions', 'balance', 'shot_power', 'jumping',
    'stamina', 'strength', 'long_shots', 'aggression',
    'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle', 'age',
]

In [None]:
y = df_candidates.pop('overall_rating')

In [None]:
X = df_candidates.loc[:, features]

In [None]:
X.info();

### Split data sets in train and test

In [None]:
from sklearn.model_selection import train_test_split

# Random split with scikit-learn's default proportions; show the shape of each part.
splits = train_test_split(X, y)
X_train, X_test, y_train, y_test = splits
tuple(part.shape for part in splits)

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)

In [None]:
lr.coef_

In [None]:
df_skills = pd.DataFrame(list(zip(X_train.columns.tolist(), lr.coef_)), columns=['skill', 'importance'])
df_skills = df_skills.set_index('skill')
df_skills.head()

In [None]:
plt.xkcd();

In [None]:
# Pandas plot: one horizontal bar per regression coefficient
df_skills.plot.barh(figsize=(16, 12))
plt.title('Soccer skill weights (/1000)', size=24, loc='left', ha='center')

# Plot axes handle (OOP)
ax = plt.gca()

# Spines: hide the frame on all four sides
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# Legend
ax.legend([])

# Axis labels
ax.set_xlabel("")
ax.set_ylabel("")

# Text: print each coefficient, multiplied by 1000 and truncated to an integer,
# just to the right of its value on the x axis
for i, v in enumerate(df_skills['importance']):
    ax.text(v+.002, i-.2, str(int(v *1000)), color='k', size=10)

# Ticks & labels
ax.yaxis.set_ticklabels(df_skills.index, size=14, ha='center')
# Remove the x ticks altogether. No x tick labels are set afterwards: with zero
# tick locations, Matplotlib rejects a non-empty label list with a ValueError.
ax.xaxis.set_ticks([]);

## Model metrics: train & test prediction score

http://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [None]:
y_pred = lr.predict(X_train)
print("Train - Mean squared error: {:.2f}".format(mean_squared_error(y_train, y_pred)))

# Explained variance score: 1 is perfect prediction
print('Train - Variance score: {:.2f}'.format(r2_score(y_train, y_pred)))

In [None]:
y_pred = lr.predict(X_test)
print("Test - Mean squared error: {:.2f}".format(mean_squared_error(y_test, y_pred)))

# Explained variance score: 1 is perfect prediction
print('Test - Variance score: {:.2f}'.format(r2_score(y_test, y_pred)))