## Cluster Analysis on English Premier League Teams From 1993 - 2022

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
import warnings
# Silence every warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")
# Default size for all matplotlib figures: 15 wide by 5 high (inches).
plt.rcParams['figure.figsize'] = (15, 5)
from sklearn.cluster import KMeans
from sklearn import metrics

In [None]:
# Load the dataset.
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\D" / "\E" escape sequences of a backslash-separated literal.
EPL = pd.read_csv("English Premier League Data Analytics/Data/EPL_full.csv")

In [None]:
# Preview the first ten rows of the raw data
display(EPL.head(10))

In [None]:
# Generate a statistical report: run Sweetviz's analysis over the full EPL
# DataFrame, then render the resulting report inside the notebook.
report = sv.analyze(EPL)
report.show_notebook()

#### Clustering Based on All Variables

In [None]:
# Columns used as clustering features
feature_columns = [
    'GF', 'GA', 'W', 'D', 'L', 'GP',
    'win_pct', 'pyth_win_pct', 'Goal_Diff',
]
x = EPL[feature_columns]
x.head()

In [None]:
# Elbow method: collect the within-cluster sum of squares (WCSS) for
# k = 1..10 clusters fitted on the feature frame `x`.
#
# random_state pins the centroid initialisation so the curve is reproducible
# between runs; n_init is passed explicitly because its default value differs
# across scikit-learn versions.
#
# NOTE(review): the features are not standardised before clustering; columns
# with larger numeric ranges will presumably dominate the distances — confirm
# whether scaling is wanted.
wcss = []

for k in range(1, 11):
    # Fit a model with k clusters and record its inertia (WCSS)
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

In [None]:
# Plot the WCSS for each number of clusters.
# The x-axis is derived from len(wcss) so it always matches the number of
# models that were fitted, instead of repeating the loop bounds by hand.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fit the final model with 5 clusters (the number the author selected as best).
# A fixed random_state keeps the cluster labels stable across re-runs, which
# matters because later cells inspect clusters by their numeric label.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(x)

In [None]:
# Identify the cluster of each row.
# `kmeans` has already been fitted on `x`, so its labels are read directly
# instead of re-fitting the whole model with fit_predict.
identified_cluster = kmeans.labels_

In [None]:
# Work on an independent copy so the raw EPL frame stays untouched
data = EPL.copy(deep=True)

In [None]:
# Store the cluster label of every row in a new 'Clusters' column
data["Clusters"] = identified_cluster

In [None]:
# First ten rows, now including the cluster label
data.head(10)

In [None]:
# Rows assigned to cluster 0, highest points first
data.loc[data['Clusters'].eq(0)].sort_values('PTS', ascending=False)

In [None]:
# Summary statistics (all column types) for cluster 0
data.loc[data['Clusters'].eq(0)].describe(include='all')

In [None]:
# Rows assigned to cluster 1, highest points first
data.loc[data['Clusters'].eq(1)].sort_values('PTS', ascending=False)

In [None]:
# Summary statistics (all column types) for cluster 1
data.loc[data['Clusters'].eq(1)].describe(include='all')

In [None]:
# Rows assigned to cluster 2, highest points first
data.loc[data['Clusters'].eq(2)].sort_values('PTS', ascending=False)

In [None]:
# Summary statistics (all column types) for cluster 2
data.loc[data['Clusters'].eq(2)].describe(include='all')

In [None]:
# Teams belonging to cluster 3, highest points first.
# Sorted descending like the listings for the other clusters so the tables
# can be compared side by side.
data[data['Clusters'] == 3].sort_values(by='PTS', ascending=False)

In [None]:
# Summary statistics (all column types) for cluster 3
data.loc[data['Clusters'].eq(3)].describe(include='all')

In [None]:
# Rows assigned to cluster 4, highest points first
data.loc[data['Clusters'].eq(4)].sort_values('PTS', ascending=False)

In [None]:
# Summary statistics (all column types) for cluster 4
data.loc[data['Clusters'].eq(4)].describe(include='all')

In [None]:
# Bar chart of how many rows fall into each cluster
ax = sns.countplot(data=data, x='Clusters')
ax.set_title('Frequency of Clusters of English Premier League Teams From 1993 - 2022', fontsize=20)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Team Clusters', fontsize=15)

The teams can be grouped into 2, 3, 4, 5, or 6 clusters, but the most suitable number of clusters is 5. These clusters are based on the number of games played and the number of points amassed. Surprisingly,
Manchester City is not among the teams with the highest points and number of games played. Instead, they belong to a group of teams with a fairly high number of games played, which includes Everton, Newcastle
United, and Southampton.

#### Clustering Based on Win Percentage and the Number of Games Played

In [None]:
# Elbow search on win percentage and games played: record the WCSS for
# k = 1..9 clusters. random_state makes the curve reproducible; n_init is
# explicit because its default differs across scikit-learn versions.
win = data[['win_pct', 'GP']]
wcss = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(win)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve for the current WCSS values. The x-axis is derived from
# len(wcss) so it cannot drift out of step with the number of fitted models.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final 4-cluster model on win percentage and games played.
# fit_predict fits and labels in a single pass (a separate fit call followed
# by fit_predict would train the model twice); random_state keeps the labels
# stable between runs.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(win)

# Attach the team name and cluster label to a copy of the features
y = win.copy()
y['Teams'] = data['Team']
y['Clusters'] = clusters
y.head()

In [None]:
# Bar chart of cluster sizes for the win-percentage / games-played model
ax = sns.countplot(data=y, x='Clusters')
ax.set_title('Frequency of Clusters of English Premier League Teams From 1993 - 2022 Based on Win Percentage and Games Played', fontsize=20)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Team Clusters', fontsize=15)

In [None]:
# Elbow search on points and games played: record the WCSS for k = 1..9
# clusters. random_state makes the curve reproducible; n_init is explicit
# because its default differs across scikit-learn versions.
points = data[['PTS', 'GP']]
wcss = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(points)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve for the current WCSS values. The x-axis is derived from
# len(wcss) so it cannot drift out of step with the number of fitted models.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final 5-cluster model on points and games played.
# fit_predict fits and labels in a single pass (a separate fit call followed
# by fit_predict would train the model twice); random_state keeps the labels
# stable between runs.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(points)

# Attach the team name and cluster label to a copy of the features
y = points.copy()
y['Teams'] = data['Team']
y['Clusters'] = clusters
y.head()

In [None]:
# Bar chart of cluster sizes for the points / games-played model
ax = sns.countplot(data=y, x='Clusters')
ax.set_title('Frequency of Clusters of English Premier League Teams From 1993 - 2022 Based on Points and Games Played', fontsize=20)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Team Clusters', fontsize=15)

In [None]:
# Elbow search on the two win-percentage columns: record the WCSS for
# k = 1..9 clusters. random_state makes the curve reproducible; n_init is
# explicit because its default differs across scikit-learn versions.
pyth_win = data[['win_pct', 'pyth_win_pct']]
wcss = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(pyth_win)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve for the current WCSS values. The x-axis is derived from
# len(wcss) so it cannot drift out of step with the number of fitted models.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final 6-cluster model on the two win-percentage columns.
# fit_predict fits and labels in a single pass (a separate fit call followed
# by fit_predict would train the model twice); random_state keeps the labels
# stable between runs.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
clusters = kmeans.fit_predict(pyth_win)

# Attach the team name and cluster label to a copy of the features
y = pyth_win.copy()
y['Teams'] = data['Team']
y['Clusters'] = clusters
y.head()

In [None]:
# Bar chart of cluster sizes for the win-percentage model
ax = sns.countplot(data=y, x='Clusters')
ax.set_title('Frequency of Clusters of English Premier League Teams From 1993 - 2022 Based on Win Percentages', fontsize=20)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Team Clusters', fontsize=15)

In [None]:
# Elbow search on both win-percentage columns plus games played: record the
# WCSS for k = 1..9 clusters. random_state makes the curve reproducible;
# n_init is explicit because its default differs across scikit-learn versions.
pyth_win_GP = data[['win_pct', 'pyth_win_pct', 'GP']]
wcss = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(pyth_win_GP)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve for the current WCSS values. The x-axis is derived from
# len(wcss) so it cannot drift out of step with the number of fitted models.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final 5-cluster model on both win-percentage columns plus games played.
# fit_predict fits and labels in a single pass (a separate fit call followed
# by fit_predict would train the model twice); random_state keeps the labels
# stable between runs.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(pyth_win_GP)

# Attach the team name and cluster label to a copy of the features
y = pyth_win_GP.copy()
y['Teams'] = data['Team']
y['Clusters'] = clusters
y.head()

In [None]:
# Bar chart of cluster sizes for the win-percentages / games-played model
ax = sns.countplot(data=y, x='Clusters')
ax.set_title('Frequency of Clusters of English Premier League Teams From 1993 - 2022 Based on Win Percentages and Games Played', fontsize=20)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Team Clusters', fontsize=15)

### Conclusions

Overall, the teams fall into roughly 5 clusters across the different sets of variables used. Some of the models grouped the data into 6 clusters and others into 5.
Five is the best number of clusters for the EPL teams from 1993 to 2022. There are only 5 top teams based on their points and the number of games they have played since 1993, which makes
them the most consistent teams, having played in almost every season from 1993 to 2022.