In [60]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan

In [61]:
df = pd.read_csv("../Datasets/pass_types.csv")
df2 = pd.read_csv("../Datasets/passing_stats.csv")

### About the dataset (pass_types.csv) df
- **Nation:** 
  - Records in international play at senior level
  - Records in international play at youth level
  - Citizenship presented on Wikipedia
  - Birthplace when available

- **Pos:** Position most commonly played:
  - GK: Goalkeepers
  - DF: Defenders (FB, LB, RB, CB)
  - MF: Midfielders (DM, CM, LM, RM, WM, LW, RW, AM)
  - FW: Forwards

- **Age:** Age at the start of the season

- **90s:** 90s played (minutes played divided by 90)

- **Passing Statistics:**
  - Passes Attempted (Att) by type:
    - Live-ball Passes
    - Dead-ball Passes (includes free kicks, corner kicks, kickoffs, throw-ins, and goal kicks)
    - Passes from Free Kicks (FK)
    - Through Balls (TB)
    - Switches (Sw)
    - Crosses (Crs)
    - Throw-ins Taken (TI)
    - Corner Kicks (CK) with subcategories:
      - Inswinging (In)
      - Outswinging (Out)
      - Straight (Str)

- **Outcomes of Passes:**
  - Passes Completed (Cmp)
  - Offsides (Off)
  - Passes Blocked (Blocks)

In [62]:
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Pass Types,Pass Types.1,Pass Types.2,...,Pass Types.5,Pass Types.6,Pass Types.7,Corner Kicks,Corner Kicks.1,Corner Kicks.2,Outcomes,Outcomes.1,Outcomes.2,Unnamed: 20_level_0
0,,Player,Nation,Pos,Age,90s,Att,Live,Dead,FK,...,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks,Matches
1,0.0,David de Gea,es ESP,GK,31,58.3,1581.0,1265.0,308.0,77.0,...,0,0.0,0.0,0.0,0.0,0.0,1148.0,8.0,1.0,Matches
2,1.0,Bruno Fernandes,pt POR,"MF,FW",27,57.3,2581.0,2330.0,235.0,43.0,...,243,36.0,109.0,26.0,36.0,2.0,1919.0,16.0,82.0,Matches
3,2.0,Marcus Rashford,eng ENG,FW,24,47.6,1020.0,982.0,33.0,3.0,...,38,13.0,0.0,0.0,0.0,0.0,787.0,5.0,38.0,Matches
4,3.0,Casemiro,br BRA,MF,30,43.3,2166.0,2115.0,41.0,36.0,...,24,4.0,0.0,0.0,0.0,0.0,1686.0,10.0,45.0,Matches


### About the dataset (Passing_stats.csv) df2
- **Nation:** 
  - Records in international play at senior level
  - Records in international play at youth level
  - Citizenship presented on Wikipedia
  - Birthplace when available

- **Pos:** Position most commonly played:
  - GK: Goalkeepers
  - DF: Defenders (including FB, LB, RB, CB)
  - MF: Midfielders (including DM, CM, LM, RM, WM, LW, RW, AM)
  - FW: Forwards

- **Age:** Age at the start of the season

- **90s:** 90s played (minutes played divided by 90)

- **Passing Statistics:**
  - Total Passes Completed (Cmp) and Attempted (Att)
  - Pass Completion Percentage (Cmp%)
  - Total Passing Distance (TotDist)
  - Progressive Passing Distance (PrgDist)
  - Short Passes (between 5 and 15 yards)
    - Passes Completed (Cmp)
    - Passes Attempted (Att)
    - Pass Completion Percentage (Cmp%)
  - Medium Passes (between 15 and 30 yards)
    - Passes Completed (Cmp)
    - Passes Attempted (Att)
    - Pass Completion Percentage (Cmp%)
  - Long Passes (longer than 30 yards)
    - Passes Completed (Cmp)
    - Passes Attempted (Att)
    - Pass Completion Percentage (Cmp%)

- **Assists and Expected Assists (xA):**
  - Total Assists (Ast)
  - Expected Assisted Goals (xAG)
  - Expected Assists (xA)
  - Assists minus Expected Goals Assisted (A-xAG)

- **Key Passes (KP):** Passes that directly lead to a shot

- **Passes into Final Third (1/3):** Completed passes that enter the 1/3 of the pitch closest to the goal, not including set pieces

- **Passes into Penalty Area (PPA):** Completed passes into the 18-yard box, not including set pieces

- **Crosses into Penalty Area (CrsPA):** Completed crosses into the 18-yard box, not including set pieces

- **Progressive Passes (PrgP):** Completed passes that move the ball towards the opponent's goal line, excluding passes from the defending 40% of the pitch

In [63]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Total,Total.1,Total.2,Total.3,...,Unnamed: 19_level_0,Unnamed: 20_level_0,Expected,Expected.1,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0
0,,Player,Nation,Pos,Age,90s,Cmp,Att,Cmp%,TotDist,...,Ast,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP,Matches
1,0.0,David de Gea,es ESP,GK,31,58.3,1148.0,1581.0,72.6,29543.0,...,0,0.1,0.2,-0.1,1.0,9.0,1.0,0.0,0.0,Matches
2,1.0,Bruno Fernandes,pt POR,"MF,FW",27,57.3,1919.0,2581.0,74.4,32978.0,...,13,21.7,14.9,-8.7,150.0,201.0,115.0,15.0,327.0,Matches
3,2.0,Marcus Rashford,eng ENG,FW,24,47.6,787.0,1020.0,77.2,11143.0,...,9,4.3,3.7,4.7,40.0,45.0,63.0,3.0,105.0,Matches
4,3.0,Casemiro,br BRA,MF,30,43.3,1686.0,2166.0,77.8,30359.0,...,6,4.4,4.9,1.6,41.0,214.0,30.0,5.0,232.0,Matches


### Preprocessing the datasets to name the first row as the columns

In [64]:
## real column names are those in the first row
real_column_names = df.iloc[0].values

## renaming the column names
df.columns = real_column_names

##dropping the first row as it consists of the actual column names
df = df.drop(df.index[0])

##Droping the first column as it's name is nan and consists of unrequired values
df=df.drop(columns=df.columns[0])

##we will drop the last column 'Matches' as it does not add any value
df=df.drop(columns=df.columns[-1]).reset_index()

df = df.drop(columns=df.columns[0])

df.head()

Unnamed: 0,Player,Nation,Pos,Age,90s,Att,Live,Dead,FK,TB,Sw,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks
0,David de Gea,es ESP,GK,31,58.3,1581.0,1265.0,308.0,77.0,1.0,3.0,0,0.0,0.0,0.0,0.0,0.0,1148.0,8.0,1.0
1,Bruno Fernandes,pt POR,"MF,FW",27,57.3,2581.0,2330.0,235.0,43.0,42.0,48.0,243,36.0,109.0,26.0,36.0,2.0,1919.0,16.0,82.0
2,Marcus Rashford,eng ENG,FW,24,47.6,1020.0,982.0,33.0,3.0,6.0,9.0,38,13.0,0.0,0.0,0.0,0.0,787.0,5.0,38.0
3,Casemiro,br BRA,MF,30,43.3,2166.0,2115.0,41.0,36.0,18.0,22.0,24,4.0,0.0,0.0,0.0,0.0,1686.0,10.0,45.0
4,Luke Shaw,eng ENG,DF,27,40.5,2469.0,2096.0,363.0,44.0,4.0,14.0,129,238.0,47.0,23.0,10.0,0.0,2048.0,10.0,48.0


In [68]:
## real column names are those in the first row
real_column_names = df2.iloc[0].values

## renaming the column names
df2.columns = real_column_names

##dropping the first row as it consists of the actual column names
df2 = df2.drop(df2.index[0])

##Droping the first column as it's name is nan and consists of unrequired values
df2=df2.drop(columns=df2.columns[0]).reset_index()

# df2
##we will drop the last column 'Matches' as it does not add any value
df2=df2.drop(columns=df2.columns[-1])

df2 = df2.drop(columns=df2.columns[:2])

df2.head()

Unnamed: 0,David de Gea,es ESP,GK,31,58.3,1148.0,1581.0,72.6,29543.0,21024.0,...,44.8,0,0.1,0.2,-0.1,1.0,9.0,1.0.1,0.0,0.0.1
0,Bruno Fernandes,pt POR,"MF,FW",27,57.3,1919.0,2581.0,74.4,32978.0,11448.0,...,55.6,13,21.7,14.9,-8.7,150.0,201.0,115.0,15.0,327.0
1,Marcus Rashford,eng ENG,FW,24,47.6,787.0,1020.0,77.2,11143.0,2620.0,...,64.9,9,4.3,3.7,4.7,40.0,45.0,63.0,3.0,105.0
2,Casemiro,br BRA,MF,30,43.3,1686.0,2166.0,77.8,30359.0,11348.0,...,64.1,6,4.4,4.9,1.6,41.0,214.0,30.0,5.0,232.0
3,Luke Shaw,eng ENG,DF,27,40.5,2048.0,2469.0,82.9,32145.0,12375.0,...,52.5,6,4.7,4.2,1.3,43.0,195.0,37.0,15.0,172.0
4,Lisandro Martínez,ar ARG,DF,24,37.9,1888.0,2169.0,87.0,34120.0,12255.0,...,55.2,0,0.8,2.3,-0.8,12.0,155.0,11.0,0.0,128.0


In [53]:
df2.drop(columns='NaN',axis=1)

KeyError: "['NaN'] not found in axis"

In [67]:
df2.columns[:2]

Index(['index', nan], dtype='object')