# ENGLISH PREMIER LEAGUE 2021-2022 ANALYSIS 

In this notebook, I analyze the 2021-22 Premier League season, which crowned Manchester City for the 3rd time in 4 years. The title race between ManCity and Liverpool was once again epic.  

The dataset is from Kaggle and gathers the 380 games that took place during the season. For each game, it gives the home and away teams, goals scored, shots taken, shots on target, corners, cards...

First of all, my analysis will focus on the rivalry between ManCity and Liverpool, the 2 best teams in the league for the last 4 years. Then, I will look at the impact of leading at HalfTime.  

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the season's match results from the CSV file into a DataFrame.
df = pd.read_csv('soccer21-22.csv')

## English Premier League overall

In [3]:
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,13/08/2021,Brentford,Arsenal,2,0,H,1,0,H,M Oliver,...,3,4,12,8,2,5,0,0,0,0
1,14/08/2021,Man United,Leeds,5,1,H,1,0,H,P Tierney,...,8,3,11,9,5,4,1,2,0,0
2,14/08/2021,Burnley,Brighton,1,2,A,1,0,H,D Coote,...,3,8,10,7,7,6,2,1,0,0
3,14/08/2021,Chelsea,Crystal Palace,3,0,H,2,0,H,J Moss,...,6,1,15,11,5,2,0,0,0,0
4,14/08/2021,Everton,Southampton,3,1,H,0,1,A,A Madley,...,6,3,13,15,6,8,2,0,0,0


In [None]:
df.columns

In [None]:
df.shape

In [None]:
# Number of distinct clubs seen as home side and as away side.
for side_column in ('HomeTeam', 'AwayTeam'):
    print(df[side_column].nunique(dropna=False))

In [None]:
# Headline figures for the season: games played and goals scored by both sides.
n_games = len(df)
n_goals = df['FTHG'].sum() + df['FTAG'].sum()
print('Number of games: ', n_games)
print('Number of goals scored: ', n_goals)


In [None]:
# Flag the winner of each game from the full-time goals (1 = win, 0 otherwise).
# A draw leaves both flags at 0.
home_goals = df['FTHG']
away_goals = df['FTAG']
df['Hwin'] = np.where(away_goals < home_goals, 1, 0)
df['Awin'] = np.where(away_goals > home_goals, 1, 0)
# Constant helper column, one per game.
df['count'] = 1

In [None]:
# Per-team totals over home games. FTAG and AST are the visitors' goals and shots on
# target, i.e. what the home team conceded.
# The columns are selected with a list: selecting several columns from a GroupBy with a
# bare tuple is no longer supported by recent pandas versions.
home_columns = ['Hwin', 'FTHG', 'FTAG', 'HTHG', 'HS', 'HST', 'AST', 'HF', 'HC', 'HY', 'HR']
home_stats = df.groupby('HomeTeam')[home_columns].sum().reset_index()

In [None]:
# Per-team totals over away games. FTHG and HST are the hosts' goals and shots on
# target, i.e. what the away team conceded.
# The columns are selected with a list: selecting several columns from a GroupBy with a
# bare tuple is no longer supported by recent pandas versions.
away_columns = ['Awin', 'FTAG', 'FTHG', 'HTAG', 'AS', 'AST', 'HST', 'AF', 'AC', 'AY', 'AR']
away_stats = df.groupby('AwayTeam')[away_columns].sum().reset_index()

In [None]:
# Give both tables a common 'Team' key and rename the opponent's columns to "conceded".
home_renames = {'HomeTeam': 'Team', 'FTAG': 'HG_Conceded', 'AST': 'HST_Conceded'}
away_renames = {'AwayTeam': 'Team', 'FTHG': 'AG_Conceded', 'HST': 'AST_Conceded'}
home_stats = home_stats.rename(columns=home_renames)
away_stats = away_stats.rename(columns=away_renames)


In [None]:
# One row per team: home-game totals joined with away-game totals on the team name.
team_stats = home_stats.merge(away_stats, on='Team')

In [None]:
team_stats

In [None]:
# Season totals: each new column is the home-game sum plus the matching away-game sum.
season_totals = {
    'Goals_Scored': ('FTHG', 'FTAG'),                     # goals scored
    'Total_wins': ('Hwin', 'Awin'),                       # wins
    'Total_shots': ('HS', 'AS'),                          # shots attempted
    'Shots_on_target': ('HST', 'AST'),                    # shots on target
    'Goals_Conceded': ('HG_Conceded', 'AG_Conceded'),     # goals conceded
    'ST_Conceded': ('HST_Conceded', 'AST_Conceded'),      # shots on target conceded
}
for total_column, (home_column, away_column) in season_totals.items():
    team_stats[total_column] = team_stats[home_column] + team_stats[away_column]

In [None]:
team_stats.head()

We would like to see the ranking's evolution during the season.

In [None]:
teams = list(df['HomeTeam'].unique()) # every team that hosted at least one game
game_list = []

# For each team, build the chronological list of its games with the points earned in
# each one and the running total.
for pl_team in teams:
    # .rename() returns a new frame, so the columns added below are never written into
    # a slice of df.
    home = df.loc[df['HomeTeam'] == pl_team, ['Date','HomeTeam','FTR']].rename(columns={'HomeTeam':'Team'}) # Home games for one team
    away = df.loc[df['AwayTeam'] == pl_team, ['Date','AwayTeam','FTR']].rename(columns={'AwayTeam':'Team'}) # Away games for one team

    # Points given according to the result: 3 for a win, 0 for a loss, 1 otherwise (draw).
    home['points'] = np.where(home['FTR'] == 'H', 3, np.where(home['FTR'] == 'A', 0, 1))
    away['points'] = np.where(away['FTR'] == 'A', 3, np.where(away['FTR'] == 'H', 0, 1))

    game = pd.concat([home, away])
    # Dates are stored as day/month/year text (e.g. 13/08/2021). An explicit format
    # avoids any day/month guessing and fails loudly on an unexpected value.
    game['Date'] = pd.to_datetime(game['Date'], format='%d/%m/%Y')
    game = game.sort_values(by='Date') # Games sorted by date, first to last

    game['cum_points'] = game['points'].cumsum() # Running points total after each game

    game_list.append(game)
    

In [None]:
# Stack the per-team frames: one row per (team, game) with the points taken in that game
# and the team's running total. The original df index is kept, so index values repeat.
team_cum_pts = pd.concat(game_list) # Games played by each team with the points taken on the given game

In [None]:
# Column 'Day' = rank of the game within each team's own season (1 for its first game).
# It is derived from the data rather than a hard-coded 38 x 20 grid, so it stays correct
# if the number of teams or games differs. It relies on each team's rows already being
# in date order, which is how the per-team frames were built.
team_cum_pts['Day'] = team_cum_pts.groupby('Team').cumcount().to_numpy() + 1

In [None]:
team_cum_pts

In [None]:
# Wide table: one row per team, one column per Day, cells hold the cumulative points.
# NOTE(review): pivot_table aggregates with the mean by default; this presumably leaves the
# values unchanged because each (Team, Day) pair should occur once — confirm.
team_cum_pts_table = team_cum_pts.pivot_table(columns='Day', values = 'cum_points', index = 'Team')

In [None]:
team_cum_pts_table

In [None]:
# Turn the 'Team' index back into a regular column so rows are addressed by position 0..n-1.
team_cum_pts_table = team_cum_pts_table.reset_index()

In [None]:
# Cumulative points of every team after each game, one line per team.
# The day axis is read from the table's columns instead of a hard-coded 1..38 range.
day_columns = [col for col in team_cum_pts_table.columns if col != 'Team']

plt.figure(figsize=(12,15))
for position, (_, team_row) in enumerate(team_cum_pts_table.iterrows()):
    # The first ten teams use circles, the others triangles; presumably this keeps the
    # lines distinguishable once the colour cycle starts repeating.
    line_style = 'o-' if position < 10 else '^-'
    plt.plot(day_columns, team_row[day_columns].to_numpy(dtype=float), line_style,
             label=team_row['Team'])

# Axis labels and legend only need to be set once, after all lines are drawn.
plt.xlabel('Day')
plt.ylabel('Points')
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
    

What is clear from the graph above is the gap between ManCity and Liverpool on one side and the rest of the league on the other. It seems that during the 2nd part of the season, they played in their own league. The gap with the rest of the league got bigger after each game.

Let's focus on what makes these 2 teams so much better than the rest.

## City and Liverpool's domination

In [None]:
# Final points totals (column 38), highest first.
final_points_desc = team_cum_pts_table[38].sort_values(ascending=False)
ordered_pts = final_points_desc.tolist()

In [None]:
# Points gap between the leader (first entry) and every other team, in ranking order.
gaps = [ordered_pts[0] - chaser_pts for chaser_pts in ordered_pts[1:]]

In [None]:
gaps

In [None]:
# Team names ordered by final points total (column 38), best first.
final_ranking = team_cum_pts_table.sort_values(by=38, ascending=False)
ranked_teams = final_ranking['Team'].tolist()

In [None]:
# One bar per chasing team: how many points it finished behind the leader.
chasing_teams = ranked_teams[1:]
plt.figure(figsize=(25,8))
plt.bar(x=chasing_teams, height=gaps)
plt.title('Gap points with Man City');

In [None]:
# Goals scored per team, sorted ascending so the highest total is drawn at the top.
goals_scored_ranking = team_stats[['Team','Goals_Scored']].sort_values(by='Goals_Scored', ascending=True)

plt.figure(figsize=(15,10))
plt.barh(y=goals_scored_ranking['Team'], width=goals_scored_ranking['Goals_Scored'])
plt.xlabel('Goals scored');

In [None]:
# Goals conceded per team, sorted descending so the lowest total is drawn at the top.
goals_conceded_ranking = team_stats[['Team','Goals_Conceded']].sort_values(by='Goals_Conceded', ascending=False)

plt.figure(figsize=(20,10))
plt.barh(y=goals_conceded_ranking['Team'], width=goals_conceded_ranking['Goals_Conceded'])
plt.xlabel('Goals conceded');

In [None]:
# Shots on target per team, sorted ascending so the highest total is drawn at the top.
shots_on_target_ranking = team_stats[['Team','Shots_on_target']].sort_values(by='Shots_on_target', ascending=True)

plt.figure(figsize=(20,10))
plt.barh(y=shots_on_target_ranking['Team'], width=shots_on_target_ranking['Shots_on_target'])
plt.xlabel('Number of shots on target');

In [None]:
# Shots on target conceded per team, sorted descending so the lowest total is drawn at the top.
st_conceded_ranking = team_stats[['Team','ST_Conceded']].sort_values(by='ST_Conceded', ascending=False)

plt.figure(figsize=(20,10))
plt.barh(y=st_conceded_ranking['Team'], width=st_conceded_ranking['ST_Conceded'])
plt.title('Number of shots on target conceded');

Liverpool and ManCity dominated the season in every statistical metric. Whether offensively or defensively, they have been above the rest of the league. They scored many more goals, they defended much better, they were more accurate with their shots and they allowed their opponents fewer shots on target. 

## The impact of leading at Half Time ?

Another aspect that I would like to focus on is the 2nd half. In fact, a lot of games are won in the 2nd half because coaches can adjust their teams. The players are also more tired, which leads to more mistakes. 

In [None]:
# Half-time and full-time scores for every game.
# .copy() makes this an independent frame: a column is added to it afterwards, and
# writing into a slice of df triggers pandas' SettingWithCopyWarning.
ht_ft_score = df[['HomeTeam','AwayTeam','FTHG','FTAG','HTHG','HTAG','FTR']].copy()

In [None]:
# Half-time result code from the half-time goals: 'H' home ahead, 'A' away ahead, 'D' level.
home_ahead_at_ht = ht_ft_score['HTHG'] > ht_ft_score['HTAG']
away_ahead_at_ht = ht_ft_score['HTHG'] < ht_ft_score['HTAG']
ht_ft_score['HTR'] = np.select([home_ahead_at_ht, away_ahead_at_ht], ['H', 'A'], default='D')

In [None]:
ht_ft_score

In [None]:
# Games whose full-time result code equals the half-time one. This includes games that
# were level at half time and ended in a draw, not only games won by the half-time leader.
ht_ft_score[ht_ft_score['FTR'] == ht_ft_score['HTR']]

In 59% of the games, the result at Half Time (home lead, draw or away lead) was also the result at Full Time. That still leaves 41% of the games whose outcome changed during the second half. 

In [None]:
# Games whose result code changed between half time and full time.
ht_ft_score[ht_ft_score['HTR'] != ht_ft_score['FTR']]

In [None]:
# All the games whose Half Time result (HTR) differs from the Full Time result (FTR).
# With the three result codes 'H', 'D' and 'A', a single inequality covers the same six
# (HTR, FTR) combinations that would otherwise have to be listed one by one.
# .copy() makes this an independent frame, so columns can be added to it afterwards
# without writing into a slice of ht_ft_score.
changing_score = ht_ft_score[ht_ft_score['HTR'] != ht_ft_score['FTR']].copy()

In [None]:
changing_score

Now, we are going to determine which team won the most points in the 2nd half, depending on whether they played at home or away. The points won during the 2nd half depend on the result at HT.

- If a team was leading at HT and lost at the end, they lost 3pts and the opponent won 3pts.
- If a team was leading at HT and drew at the end, they lost 2pts and the opponent won 1pt.

We will make the case for every scenario possible as long as the HT and FT results are different.

In [None]:
# Points each side would hold for a given result code.
home_points_for = {'H': 3, 'D': 1, 'A': 0}
away_points_for = {'H': 0, 'D': 1, 'A': 3}

# Points taken in the 2nd half = points held at full time minus points held at half time.
# This gives the same values as spelling out every (HTR, FTR) pair (e.g. H -> A is -3 for
# the home side and +3 for the away side), without a catch-all branch that would hand
# +/-3 to any row not explicitly listed.
# .assign() builds a new frame, so nothing is written into a slice of another DataFrame.
changing_score = changing_score.assign(
    Pts_Taken_2ndH_Home=(changing_score['FTR'].map(home_points_for)
                         - changing_score['HTR'].map(home_points_for)),
    Pts_Taken_2ndH_Away=(changing_score['FTR'].map(away_points_for)
                         - changing_score['HTR'].map(away_points_for)),
)

In [None]:
changing_score

In [None]:
# Net points taken in the 2nd half per team, separately for home games and away games.
# Only games present in changing_score contribute, so a team with no such game on one
# side is absent from the corresponding Series.
pts_won_2nd_H = changing_score.groupby('HomeTeam')['Pts_Taken_2ndH_Home'].sum()
pts_won_2nd_A = changing_score.groupby('AwayTeam')['Pts_Taken_2ndH_Away'].sum()

In [None]:
# Share of each result at half time and at full time.
# Slice labels are looked up from the result code of each slice, so they stay correct
# whatever order value_counts() returns (it sorts by frequency, not by code).
ht_result_labels = {'D': 'Draw', 'H': 'Home Team Lead', 'A': 'Away Team Lead'}
ft_result_labels = {'H': 'Home Team Wins', 'A': 'Away Team Wins', 'D': 'Draw'}
result_panels = [
    ('HTR', ht_result_labels, 'Half Time Results'),
    ('FTR', ft_result_labels, 'Full Time Results'),
]

plt.figure(figsize=(15,15))

for panel_position, (result_column, code_labels, panel_title) in enumerate(result_panels, start=1):
    result_shares = df[result_column].value_counts(normalize=True)
    plt.subplot(2, 2, panel_position)
    plt.pie(result_shares.values,
            labels=[code_labels.get(code, code) for code in result_shares.index],
            autopct='%1.1f%%',
            shadow=True, textprops={'color':'grey'})
    plt.title(label=panel_title, color='grey')

plt.show()

As we said, games are generally decided during the 2nd half: in the 2021-22 season, the most common situation at HT was a tie. At Full Time, the most common outcome was a Home Team win. 

In [None]:
# Full-time result distribution, split by the situation at half time.
# Slice labels are looked up from the result code of each slice, so they stay correct
# whatever order value_counts() returns (it sorts by frequency, not by code).
ft_outcome_labels = {'H': 'HomeTeam Wins', 'D': 'Draw', 'A': 'AwayTeam Wins'}
half_time_panels = [
    ('H', 'FT Result when HomeTeam leads at HT'),
    ('D', 'FT Result when draw at HT'),
    ('A', 'FT Result when AwayTeam leads at HT'),
]

plt.figure(figsize=(20,10))

for panel_position, (ht_code, panel_title) in enumerate(half_time_panels, start=1):
    ft_shares = df.loc[df['HTR'] == ht_code, 'FTR'].value_counts(normalize=True)
    plt.subplot(1, 3, panel_position)
    plt.pie(ft_shares,
            labels=[ft_outcome_labels.get(code, code) for code in ft_shares.index],
            autopct='%1.1f%%',
            shadow=True, textprops={'color':'grey'})
    plt.title(label=panel_title, color='grey')

plt.show()


These 3 pies show that in most cases, when a team leads at HT, whether at home or away, they win the game. When the game is tied at HT, usually the home team wins. 

In [None]:
# Side-by-side bar charts of the points taken in the 2nd half, at home and away.
second_half_panels = [
    (221, pts_won_2nd_H, 'Points won in 2nd Half when playing at Home'),
    (222, pts_won_2nd_A, 'Points won in 2nd Half when playing Away'),
]

plt.figure(figsize=(15,20))

for subplot_code, team_points, panel_title in second_half_panels:
    plt.subplot(subplot_code)
    team_points.plot(kind='barh')
    plt.ylabel('')
    plt.xticks(rotation=80)
    plt.title(panel_title)

plt.show()


In [None]:
# Total points taken in the 2nd half (home + away) per team.
# .add(..., fill_value=0) treats a team missing from one of the two Series as 0 there;
# a plain "+" would turn that team's total into NaN and drop its bar from the chart.
pts_won_2nd_A.add(pts_won_2nd_H, fill_value=0).plot(kind='barh', figsize=(10,10))
plt.xlabel('')
plt.title('Points won in 2nd Half')

A couple of points can be made from these graphs:

-  Liverpool and ManCity are the 2 teams that took the most points in the 2nd half after drawing or losing at HT. This shows their superiority in several aspects of the game such as resilience and mental strength. They also have deeper squads and their coaches can make impactful substitutions. 

- Arsenal might have missed the 4th place because of their performances away. While they have been able to win many points in the 2nd half at home, they gave up points when playing away after leading or drawing at HT.

- Southampton almost got relegated and this is probably because they were a very bad 2nd half team. They managed to lose more than 10 pts after leading or drawing at HT. 

## Clustering

Now, I will try to classify the teams into groups according to their performances over the season. I already established that Man City and Liverpool are above the rest, but how do the other teams compare with each other? 

In [None]:
team_stats.head()

In [None]:
# Select the optimal number of clusters (elbow method): fit KMeans for k = 1..10 and
# record the within-cluster sum of squares (inertia) of each fit.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Features = every column except the team name. Columns that later cells add to
# team_stats (cluster label, final points) are dropped too, so re-running this cell
# does not feed them back into the clustering.
X = team_stats.drop(columns=['Team', 'no_cluster', 'final_pts'], errors='ignore')
sc = StandardScaler()
X_norm = sc.fit_transform(X)

wcss = []
k = []

for n_clusters in range(1, 11):
    # n_init is set explicitly because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(X_norm)

    wcss.append(kmeans.inertia_)
    k.append(n_clusters)

    print(f"for {n_clusters} cluster, wcss --> {kmeans.inertia_}")

In [None]:
plt.plot(k,wcss);

With the plot, we can determine graphically that the optimal k is either 3 or 4. We will take k = 4 

In [None]:
# Standardise the features again before the final fit (same X and same transformation
# as in the cluster-number search cell).
sc = StandardScaler()
X_norm = sc.fit_transform(X)

In [None]:
# Final model with k = 4, the value retained from the elbow plot. Cluster labels then run
# from 0 to 3, which is what the interpretation of the clusters refers to.
# n_init is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(X_norm)

In [None]:
# Store each team's cluster label right after the team name.
# insert() raises if the column already exists, so on a re-run the labels are overwritten.
if 'no_cluster' in team_stats.columns:
    team_stats['no_cluster'] = kmeans.labels_
else:
    team_stats.insert(1, 'no_cluster', kmeans.labels_)

In [None]:
team_stats.head()

In [None]:
# Final points total (column 38) of each team, matched on the team name rather than on
# row position, so it stays correct even if the two tables are not in the same order.
final_points_by_team = team_cum_pts_table.set_index('Team')[38]
team_stats['final_pts'] = team_stats['Team'].map(final_points_by_team)

In [None]:
a = team_stats[['Team','final_pts']].sort_values(ascending=False, by = 'final_pts')

In [None]:
# Final ranking coloured by cluster.
# Team, points and cluster label are taken from one sorted frame, so every bar gets its
# own team's cluster without relying on index alignment between differently ordered objects.
cluster_ranking = team_stats[['Team', 'final_pts', 'no_cluster']].sort_values(by='final_pts', ascending=False)

plt.figure(figsize=(15,10))

# dodge=False: hue only colours the bars; each team keeps a single full-width bar.
sns.barplot(data=cluster_ranking,
            y='Team',
            x='final_pts',
            hue='no_cluster',
            orient='h',
            dodge=False)
plt.show()

As expected, the clusters follow the points won during the season. Thus, Liverpool and City form their own cluster because they finished far ahead of the rest. Chelsea, Tottenham and Arsenal are in the same cluster (cluster n°3) as they finished respectively 3rd, 4th and 5th. Surprisingly, West Ham is also in cluster n°3 although they finished below Man United. However, looking at basic stats, we noticed that West Ham have a better defence and offence than Man United. 