### English Premier League (EPL) Matches Data Preprocessing and Analysis
This project preprocesses and analyses data on EPL matches from the 1993/1994 season to the 2021/2022 season.

#### Importing relevant libraries

In [None]:
#Importing relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

#### Importing Data

In [None]:
# Load the raw match records from CSV and preview the first ten rows
data = pd.read_csv('EPL Matches.csv')
display(data.head(10))

## Data Preprocessing and Exploratory Data Analysis (EDA)

In [None]:
# Add a constant 'count' column (1 on every row) used to track the number of matches played
data = data.assign(count=1)
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns of the raw data
data.describe()

In [None]:
# Bar chart of how many matches fall under each full-time result code in 'FTR'
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=data, x='FTR', ax=ax)
ax.set_title('Count of Match / Game Outcome', fontsize=20)
ax.set_xlabel('Match Outcome (Home Win, Draw, Away Win)', fontsize=15)
ax.set_ylabel('Count', fontsize=15)

### Data Preprocessing and Preparation

In [None]:
# Home-side view of the matches: rows ordered by Year then Month, keyed by the home team
home_columns = ['Year', 'Month', 'Home', 'HomeGoals', 'AwayGoals', 'FTR', 'count']
home = data.sort_values(by=['Year', 'Month']).loc[:, home_columns]
display(home.head(10))

In [None]:
# Away-side view of the matches: rows ordered by Year then Month, keyed by the away team
away_columns = ['Year', 'Month', 'Away', 'HomeGoals', 'AwayGoals', 'FTR', 'count']
away = data.sort_values(by=['Year', 'Month']).loc[:, away_columns]
display(away.head(10))

In [None]:
# Derive the home team's win / draw / loss flags, points and win value from the full-time result (FTR)
home_result = home['FTR']
home_won = home_result.eq('H')
home_drew = home_result.eq('D')
home_lost = home_result.eq('A')

home['HW'] = home_won.astype(int)
home['HD'] = home_drew.astype(int)
home['HL'] = home_lost.astype(int)
# Points: 3 for a win, 1 for a draw, 0 otherwise
home['HPTS'] = np.select([home_won, home_drew], [3, 1], default=0)
# Win value: 1 for a win, .5 for a draw, 0 otherwise
home['hwinvalue'] = np.select([home_won, home_drew], [1, .5], default=0)
display(home.head(10))

In [None]:
# Derive the away team's win / draw / loss flags, points and win value from the full-time result (FTR)
away_result = away['FTR']
away_won = away_result.eq('A')
away_drew = away_result.eq('D')
away_lost = away_result.eq('H')

away['AW'] = away_won.astype(int)
away['AD'] = away_drew.astype(int)
away['AL'] = away_lost.astype(int)
# Points: 3 for a win, 1 for a draw, 0 otherwise
away['APTS'] = np.select([away_won, away_drew], [3, 1], default=0)
# Win value: 1 for a win, .5 for a draw, 0 otherwise
away['awinvalue'] = np.select([away_won, away_drew], [1, .5], default=0)
display(away.head(10))


In [None]:
# Sum each home team's figures per (Year, Month).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names was deprecated and raises a ValueError in pandas >= 2.0.
# NOTE(review): 'FTR' holds result codes, so its "sum" is not a numeric total;
# it is kept only so the output columns stay the same as before.
home_sum_columns = ['HomeGoals', 'AwayGoals', 'FTR', 'count', 'HW',
                    'HD', 'HL', 'HPTS', 'hwinvalue']
home = home.groupby(['Year', 'Month', 'Home'])[home_sum_columns].sum().reset_index()
home[0:10]

In [None]:
# Relabel the home-side columns: 'Home' becomes the shared 'Team' key, goals and
# match count get home-specific names
home_labels = {
    'Home': 'Team',
    'HomeGoals': 'Goals_for_h',
    'AwayGoals': 'Goals_against_a',
    'count': 'GPh',
}
home = home.rename(columns=home_labels)
home.head()


In [None]:
# Sum each away team's figures per (Year, Month).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names was deprecated and raises a ValueError in pandas >= 2.0.
# NOTE(review): 'FTR' holds result codes, so its "sum" is not a numeric total;
# it is kept only so the output columns stay the same as before.
away_sum_columns = ['HomeGoals', 'AwayGoals', 'FTR', 'count', 'AW',
                    'AD', 'AL', 'APTS', 'awinvalue']
away = away.groupby(['Year', 'Month', 'Away'])[away_sum_columns].sum().reset_index()
away[0:10]

In [None]:
# Relabel the away-side columns: 'Away' becomes the shared 'Team' key, goals and
# match count get away-specific names
away_labels = {
    'Away': 'Team',
    'HomeGoals': 'Goals_against_h',
    'AwayGoals': 'Goals_for_a',
    'count': 'GPa',
}
away = away.rename(columns=away_labels)
away.head()


In [None]:
# Write the aggregated home and away tables to CSV, leaving out the index column
for frame, filename in ((home, 'home_teams.csv'), (away, 'away_teams.csv')):
    frame.to_csv(filename, index=False)

In [None]:
# Merge the home and away tables on (Year, Month, Team).
# An outer join is used so that a team with only home games or only away games
# in a given month keeps its row; the default inner join would silently drop
# those matches and undercount games, goals and points.
EPL = pd.merge(home, away, on=['Year', 'Month', 'Team'], how='outer')
# Numeric columns missing on one side mean "no games on that side": fill with 0
numeric_cols = EPL.select_dtypes(include='number').columns
EPL[numeric_cols] = EPL[numeric_cols].fillna(0)
display(EPL[0:10])

In [None]:
# Save the merged month-level table (keyed by Year, Month and Team) to CSV,
# without the index column, before it is aggregated further
EPL.to_csv('EPL_Unagg_data.csv', index = False)

In [None]:
# Roll the month-level rows up to one row per (Year, Team).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names was deprecated and raises a ValueError in pandas >= 2.0.
# NOTE(review): this treats 'Year' as the season identifier; if it is the
# calendar year, each row mixes two seasons. Confirm against the dataset.
total_columns = ['Goals_for_h', 'Goals_against_a', 'GPh', 'HW',
                 'HD', 'HL', 'HPTS', 'hwinvalue', 'Goals_against_h', 'Goals_for_a',
                 'GPa', 'AW', 'AD', 'AL', 'APTS', 'awinvalue']
EPL = EPL.groupby(['Year', 'Team'])[total_columns].sum().reset_index()

In [None]:
# Preview the first rows of the per-(Year, Team) aggregated table
EPL.head()

In [None]:
# Combine each team's home and away figures into overall totals
EPL['PTS'] = EPL['HPTS'] + EPL['APTS']
# Goals scored: 'Goals_for_h' (HomeGoals in home fixtures) plus
# 'Goals_for_a' (AwayGoals in away fixtures)
EPL['GF'] = EPL['Goals_for_h'] + EPL['Goals_for_a']
# Goals conceded: 'Goals_against_a' (AwayGoals in home fixtures) plus
# 'Goals_against_h' (HomeGoals in away fixtures)
EPL['GA'] = EPL['Goals_against_a'] + EPL['Goals_against_h']
EPL['W'] = EPL['HW'] + EPL['AW']
# Draws use the away-draw count 'AD' (previously added away wins by mistake)
EPL['D'] = EPL['HD'] + EPL['AD']
EPL['L'] = EPL['HL'] + EPL['AL']
EPL['WV'] = EPL['hwinvalue'] + EPL['awinvalue']
EPL['GP'] = EPL['GPa'] + EPL['GPh']
display(EPL.head(10))

In [None]:
# Win percentage (wins / games played) and the Pythagorean win percentage
# GF^2 / (GF^2 + GA^2)
EPL['win_pct'] = EPL['W'].div(EPL['GP'])
goals_for_sq = EPL['GF'].pow(2)
goals_against_sq = EPL['GA'].pow(2)
EPL['pyth'] = goals_for_sq / (goals_for_sq + goals_against_sq)
EPL.head()

In [None]:
# Correlation coefficients between the numeric columns.
# Non-numeric columns such as the 'Team' label are excluded explicitly, because
# DataFrame.corr() raises on them in pandas >= 2.0 instead of dropping them.
EPL.select_dtypes(include='number').corr()

In [None]:
# Save the per-(Year, Team) table, including the derived columns, to CSV
# without the index column
EPL.to_csv('EPL_Matches_93_22.csv', index = False)

### Exploratory Data Analysis (EDA)

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns such as the 'Team' label are excluded explicitly, because
# DataFrame.corr() raises on them in pandas >= 2.0 instead of dropping them.
plt.figure(figsize = (20, 8))
sns.heatmap(EPL.select_dtypes(include='number').corr(), annot = True, cmap = 'coolwarm')

Most of the variables are highly correlated, some negatively and others positively. These values indicate how
the variables relate to each other.

In [None]:
# Distribution plot for every column except the team label.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves an empty figure behind and its figsize
# is ignored. The 15 x 8 size is requested through height/aspect instead.
for feat in EPL.columns:
    if feat not in ['Team', 'Long']:
        sns.displot(EPL[feat], height=8, aspect=15 / 8)

In [None]:
# Distribution plot for every column except the team label.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves an empty figure behind and its figsize
# is ignored. The 15 x 8 size is requested through height/aspect instead.
for feat in EPL.columns:
    if feat not in ['Team', 'Long']:
        sns.displot(EPL[feat], height=8, aspect=15 / 8)

In [None]:
# Regression plot of every other column (y) against win percentage (x), one figure each
win_pct_targets = [col for col in EPL.columns if col not in ('Team', 'Long', 'win_pct')]
for feat in win_pct_targets:
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.regplot(data=EPL, x='win_pct', y=feat, ax=ax)

In [None]:
# Regression plot of every other column (y) against the Pythagorean win percentage (x), one figure each
pyth_excluded = {'Team', 'Long', 'pyth'}
for feat in EPL.columns:
    if feat in pyth_excluded:
        continue
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.regplot(data=EPL, x='pyth', y=feat, ax=ax)


In [None]:
# Violin plot for every column except the team label, one figure each
violin_features = [col for col in EPL.columns if col not in ('Team', 'Long')]
for feat in violin_features:
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.violinplot(data=EPL, x=feat, ax=ax)

In [None]:
# Box plot for every column except the team label, one figure each
box_excluded = ('Team', 'Long')
for feat in EPL.columns:
    if feat in box_excluded:
        continue
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.boxplot(data=EPL, x=feat, ax=ax)

## Conclusions

The data contains a few outliers because some teams played fewer than 460 games in the top-flight league. These teams include Barnsley, Charlton Athletic and a few others.