## Regression Analysis on EPL match data using Pythagorean Winning Percentage and Actual Winning Percentage.

### Importing relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

In [None]:
# Load the EPL dataset from its path relative to the working directory.
# Forward slashes work on every OS (Windows included) and avoid the invalid
# "\D" / "\E" escape sequences that a backslash path produces in a plain string.
EPL = pd.read_csv('English Premier League Data Analytics/Data/EPL_Matches_93_22.csv')
# Preview the first rows.
EPL.head()

## Data Preprocessing and Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics of the raw dataset
EPL.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the raw dataset
EPL.info()

### Splitting the dataset into two periods (1993 - 2000 and 2001 - 2022)

In [None]:
# Keep only the rows whose Year lies in 1993..2000 (both ends included)
_93_2000 = EPL[EPL['Year'].between(1993, 2000)]

In [None]:
# Keep only the rows whose Year lies in 2001..2022 (both ends included)
in_period = EPL['Year'].between(2001, 2022)
_2001_2022 = EPL[in_period]
_2001_2022.head()

In [None]:
# (rows, columns) of the 1993 - 2000 subset
_93_2000.shape

In [None]:
# Discard the win_pct and pyth columns from the 1993 - 2000 subset
_93_2000 = _93_2000.drop(columns=['win_pct', 'pyth'])

In [None]:
# Discard the win_pct and pyth columns from the 2001 - 2022 subset
_2001_2022 = _2001_2022.drop(columns=['win_pct', 'pyth'])

In [None]:
# Data aggregation for 1993 - 2000: one row per team, summing the selected columns.
# The columns are selected with a list (double brackets); indexing a GroupBy
# with a bare tuple of names is deprecated and raises on pandas >= 2.0.
EPL_93 = _93_2000.groupby(['Team'])[
    ['PTS', 'GF', 'GA', 'W', 'D', 'L', 'WV', 'GP']
].sum().reset_index()

In [None]:
# Data aggregation for 2001 - 2022: one row per team, summing the selected columns.
# The columns are selected with a list (double brackets); indexing a GroupBy
# with a bare tuple of names is deprecated and raises on pandas >= 2.0.
EPL_2001_2022 = _2001_2022.groupby(['Team'])[
    ['PTS', 'GF', 'GA', 'W', 'D', 'L', 'WV', 'GP']
].sum().reset_index()

In [None]:
# Derived columns for 1993 - 2000:
#   win_pct      = W / GP
#   pyth_win_pct = GF^2 / (GF^2 + GA^2)
EPL_93['win_pct'] = EPL_93['W'].div(EPL_93['GP'])
EPL_93['pyth_win_pct'] = EPL_93['GF'].pow(2).div(
    EPL_93['GF'].pow(2) + EPL_93['GA'].pow(2)
)

In [None]:
# Derived columns for 2001 - 2022:
#   win_pct      = W / GP
#   pyth_win_pct = GF^2 / (GF^2 + GA^2)
EPL_2001_2022['win_pct'] = EPL_2001_2022['W'].div(EPL_2001_2022['GP'])
EPL_2001_2022['pyth_win_pct'] = EPL_2001_2022['GF'].pow(2).div(
    EPL_2001_2022['GF'].pow(2) + EPL_2001_2022['GA'].pow(2)
)
EPL_2001_2022.head()

In [None]:
# Summary statistics of the per-team 1993 - 2000 aggregates
EPL_93.describe()

In [None]:
# Summary statistics of the per-team 2001 - 2022 aggregates
EPL_2001_2022.describe()

In [None]:
# First rows of the per-team 1993 - 2000 aggregates
EPL_93.head()

In [None]:
# First rows of the per-team 2001 - 2022 aggregates
EPL_2001_2022.head()

### Exploratory Data Analysis (EDA)

In [None]:
#Correlation Matrix for data from 1993 to 2000
sns.set_style("whitegrid")
sns.set_style("ticks" , {"xtick.major.size" : 8, "ytick.major.size":8})
plt.figure(figsize = (10, 8))
# EPL_93 still carries the 'Team' group key (presumably team names); corr() raises
# on non-numeric columns in pandas >= 2.0, so correlate the numeric columns only.
sns.heatmap(EPL_93.select_dtypes(include='number').corr(), annot = True, cmap = 'coolwarm')
plt.title('Correlation Coffiecent Matrix for 1993 to 2000 Season', fontsize = 20)

In [None]:
# Scatter of win_pct (x) against pyth_win_pct (y) for 1993 - 2000, with a fitted regression line
plt.figure(figsize=(10, 8))
ax = sns.regplot(data=EPL_93, x='win_pct', y='pyth_win_pct')
ax.set_title('Winning Percentage vs Pythogorean Winning Percentage for 1993 - 2000', fontsize=15)
ax.set_xlabel('Winning Percentage', fontsize=10)
ax.set_ylabel('Pythogorean Winning Percentge', fontsize=10)

In [None]:
#Correlation Matrix for data from 2001 to 2022
plt.figure(figsize = (10, 8))
# EPL_2001_2022 still carries the 'Team' group key (presumably team names); corr() raises
# on non-numeric columns in pandas >= 2.0, so correlate the numeric columns only.
sns.heatmap(EPL_2001_2022.select_dtypes(include='number').corr(), annot=True, cmap = 'coolwarm')
plt.title('Correlation Coffiecent Matrix for 2001 to 2022 Season', fontsize=20)

In [None]:
# Scatter of win_pct (x) against pyth_win_pct (y) for 2001 - 2022, with a fitted regression line
plt.figure(figsize=(10, 8))
ax = sns.regplot(data=EPL_2001_2022, x='win_pct', y='pyth_win_pct')
ax.set_title('Winning Percentage vs Pythogorean Winning Percentage for 2001 - 2022', fontsize=15)
ax.set_xlabel('Winning Percentage', fontsize=10)
ax.set_ylabel('Pythogorean Winning Percentge', fontsize=10)

In [None]:
# One distribution plot per column of the 1993 - 2000 aggregates; 'Team' is skipped
for col in EPL_93.columns:
    if col == 'Team':
        continue
    sns.displot(EPL_93[col])
    plt.title(f"Distribution of {col} 1993 - 2000", fontsize=20)
    plt.xlabel(col, fontsize=15)
    plt.ylabel('Count', fontsize=15)

In [None]:
# Distributions of values for the 2001 - 2022 aggregates ('Team' is skipped).
# Iterate over the columns of the frame that is actually plotted, so the loop
# does not depend on EPL_93 having the same columns.
for feat in EPL_2001_2022.columns:
    if feat != 'Team':
        sns.displot(EPL_2001_2022[feat])
        plt.title(f"Distribution of {feat} 2001 - 2022", fontsize=20)
        plt.xlabel(feat, fontsize=15)
        plt.ylabel('Count', fontsize=15)

In [None]:
# Top 10 teams by winning percentage, 1993 - 2000 (highest first)
EPL_93.sort_values(by = 'win_pct', ascending = False).head(10)

In [None]:
# Bottom 10 teams by winning percentage, 1993 - 2000 (lowest first)
EPL_93.sort_values(by='win_pct', ascending=True).head(10)

In [None]:
# Top 10 teams by points (PTS), 1993 - 2000
EPL_93.sort_values(by='PTS', ascending=False).head(10)

In [None]:
# Top 10 teams by goals (GF), 1993 - 2000
EPL_93.sort_values(by='GF', ascending=False).head(10)

In [None]:
# Top 10 teams by wins (W), 1993 - 2000
EPL_93.sort_values(by='W', ascending=False).head(10)


In [None]:
# Top 10 teams by Pythagorean win percentage, 1993 - 2000
EPL_93.sort_values(by='pyth_win_pct', ascending=False).head(10)

In [None]:
# Top 10 teams by winning percentage, 2001 - 2022 (highest first)
EPL_2001_2022.sort_values(by='win_pct', ascending=False).head(10)

In [None]:
# Top 10 teams by wins (W), 2001 - 2022; ties ordered by GP, both descending
EPL_2001_2022.sort_values(by=['W','GP'], ascending=[False, False]).head(10)

In [None]:
# Top 10 teams by losses (L), 2001 - 2022; ties ordered by GP, both descending
EPL_2001_2022.sort_values(by=['L','GP'], ascending=[False, False]).head(10)

In [None]:
# Top 10 teams by goals (GF), 2001 - 2022; ties ordered by GP, both descending.
# The tie-breaker is GP, matching the wins/losses rankings: repeating 'GF' as
# the second key added nothing to the ordering.
EPL_2001_2022.sort_values(by=['GF', 'GP'], ascending=[False, False]).head(10)

In [None]:
# Top 10 teams by points (PTS), 2001 - 2022; ties ordered by GP, then GA, then GF, all descending
EPL_2001_2022.sort_values(by=['PTS', 'GP', 'GA', 'GF'], ascending = [False, False, False, False]).head(10)

In [None]:
# Top 10 teams by Pythagorean win percentage, 2001 - 2022
EPL_2001_2022.sort_values(by='pyth_win_pct', ascending=False).head(10)

In [None]:
# Linear-model plots of win_pct (x) against every other column except 'Team', 1993 - 2000
for col in EPL_93.columns:
    if col in ('Team', 'win_pct'):
        continue
    sns.lmplot(data=EPL_93, x='win_pct', y=col)
    plt.title(f"Regression of Win Percentage vs {col} 1993 - 2000", fontsize=20)
    plt.xlabel('Win Percentage', fontsize=15)
    plt.ylabel(col, fontsize=15)

In [None]:
# Linear-model plots of pyth_win_pct (x) against every other column except 'Team', 1993 - 2000
for col in EPL_93.columns:
    if col in ('Team', 'pyth_win_pct'):
        continue
    sns.lmplot(data=EPL_93, x='pyth_win_pct', y=col)
    plt.title(f"Regression of Pyth Win Percentage vs {col} 1993 - 2000", fontsize=20)
    plt.xlabel('Pyth Win Percentage', fontsize=15)
    plt.ylabel(col, fontsize=15)

In [None]:
# Linear-model plots of win_pct (x) against every other column except 'Team', 2001 - 2022
for col in EPL_2001_2022.columns:
    if col in ('Team', 'win_pct'):
        continue
    sns.lmplot(data=EPL_2001_2022, x='win_pct', y=col)
    plt.title(f"Regression of Win Percentage vs {col} 2001 - 2022", fontsize=20)
    plt.xlabel('Win Percentage', fontsize=15)
    plt.ylabel(col, fontsize=15)


In [None]:
#Linear Model Plots between pyth. win pct vs all variables / features from 2001 - 2022
# This cell covers the 2001 - 2022 aggregates, so both the plotted data and the
# period in the title refer to EPL_2001_2022 (the 1993 - 2000 plots have their own cell).
for feat in EPL_2001_2022.columns:
    if feat not in ['Team', 'pyth_win_pct']:
        sns.lmplot(x='pyth_win_pct', y=feat, data=EPL_2001_2022)
        plt.title(f"Regression of Pyth Win Percentage vs {feat} 2001 - 2022", fontsize=20)
        plt.xlabel('Pyth Win Percentage', fontsize=15)
        plt.ylabel(feat, fontsize=15)

### Regression Analysis

In [None]:
# OLS of win_pct on pyth_win_pct, 1993 - 2000 aggregates
reg1 = smf.ols('win_pct ~ pyth_win_pct', data=EPL_93).fit()
print(reg1.summary())

In [None]:
# OLS of win_pct on pyth_win_pct, 2001 - 2022 aggregates
reg2 = smf.ols('win_pct ~ pyth_win_pct', data=EPL_2001_2022).fit()
print(reg2.summary())

In [None]:
# OLS of win_pct on GF, 1993 - 2000 aggregates
reg3 = smf.ols('win_pct ~ GF', data=EPL_93).fit()
print(reg3.summary())

In [None]:
# OLS of win_pct on GF, 2001 - 2022 aggregates
reg4 = smf.ols('win_pct ~ GF', data=EPL_2001_2022).fit()
print(reg4.summary())

In [None]:
# OLS of win_pct on GF and GP, 1993 - 2000 aggregates
reg5_1 = smf.ols('win_pct ~ GF+GP', data=EPL_93).fit()
print(reg5_1.summary())

In [None]:
# OLS of win_pct on GF and GP, 2001 - 2022 aggregates
reg5 = smf.ols('win_pct ~ GF+GP', data=EPL_2001_2022).fit()
print(reg5.summary())

In [None]:
# OLS of win_pct on GF, GP, GA, WV, L and D, 1993 - 2000 aggregates
reg5__ = smf.ols('win_pct ~ GF+GP+GA+WV+L+D', data=EPL_93).fit()
print(reg5__.summary())

In [None]:
# OLS of win_pct on GF, GP, GA, WV, L and D, 2001 - 2022 aggregates
reg5__1 = smf.ols('win_pct ~ GF+GP+GA+WV+L+D', data=EPL_2001_2022).fit()
print(reg5__1.summary())

In [None]:
# Weighted least squares of win_pct on GF, GP and D for the 2001 - 2022 aggregates,
# weighting each team by 1 / PTS.
# The keyword is `weights` (plural): a misspelled `weight=` is not a WLS argument
# and would leave the fit unweighted.
# NOTE(review): a team with PTS == 0 would get an infinite weight; confirm none exist.
reg6 = smf.wls(formula='win_pct ~ GF+GP+D', weights=1 / EPL_2001_2022['PTS'], data=EPL_2001_2022).fit()
print(reg6.summary())

In [None]:
# Weighted least squares of win_pct on GF, GP and D for the 1993 - 2000 aggregates,
# weighting each team by 1 / PTS.
# The keyword is `weights` (plural): a misspelled `weight=` is not a WLS argument
# and would leave the fit unweighted.
# NOTE: this rebinds reg6, replacing the 2001 - 2022 fit stored under the same name.
# NOTE(review): a team with PTS == 0 would get an infinite weight; confirm none exist.
reg6 = smf.wls(formula='win_pct ~ GF+GP+D', weights=1 / EPL_93['PTS'], data=EPL_93).fit()
print(reg6.summary())
