### Exploratory Data Analysis (EDA) and Regression Analysis for Home and Away Teams for EPL 1993 - 2022 Seasons

In [None]:
#Importing Relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Loading data
# Forward slashes are used on purpose: in a normal string literal a backslash
# starts an escape sequence, and '\a' (as in '...\away_teams.csv') is the BEL
# control character, so the original away path did not name the real file.
# Forward slashes are accepted by pandas on Windows as well as on POSIX systems.
DATA_DIR = 'English Premier League Data Analytics/Data'
home = pd.read_csv(f'{DATA_DIR}/home_teams.csv')
away = pd.read_csv(f'{DATA_DIR}/away_teams.csv')
display(home)

In [None]:
# Show the away-team table as loaded, before any renaming or aggregation.
display(away)

In [None]:
# Summary statistics of the raw home-team table.
home.describe()

In [None]:
# Summary statistics of the raw away-team table.
away.describe()

In [None]:
# Pairwise correlations between the numeric columns of the raw home table.
# Non-numeric columns (presumably Team) are dropped explicitly, because
# DataFrame.corr() no longer ignores them silently on pandas >= 2.0.
home.select_dtypes(include='number').corr()

In [None]:
# Pairwise correlations between the numeric columns of the raw away table.
# Non-numeric columns (presumably Team) are dropped explicitly, because
# DataFrame.corr() no longer ignores them silently on pandas >= 2.0.
away.select_dtypes(include='number').corr()

In [None]:
#Correlation for home teams
# Only numeric columns are correlated; passing a frame that still contains
# string columns to corr() raises on pandas >= 2.0.
plt.figure(figsize=(15,8))
sns.heatmap(home.select_dtypes(include='number').corr(), annot = True, cmap = 'coolwarm')
plt.title('Correlation Matrix', fontsize=20)

In [None]:
#Correlation for away teams
# Only numeric columns are correlated; passing a frame that still contains
# string columns to corr() raises on pandas >= 2.0.
plt.figure(figsize=(15, 8))
sns.heatmap(away.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix', fontsize = 20)

In [None]:
# Give the home table the short, venue-neutral column names used by the later cells.
home = home.rename(
    columns={
        "Goals_for_h": 'HomeGoals',
        'Goals_against_a': 'AwayGoals',
        'GPh': 'GP',
        'HW': 'W',
        'HD': 'D',
        'HL': 'L',
        'HPTS': 'PTS',
        'hwinvalue': 'WV',
    }
)
home.head()

In [None]:
# Give the away table the same short column names as the home table.
away = away.rename(
    columns={
        "Goals_for_a": 'AwayGoals',
        'Goals_against_h': 'HomeGoals',
        'GPa': 'GP',
        'AW': 'W',
        'AD': 'D',
        'AL': 'L',
        'APTS': 'PTS',
        'awinvalue': 'WV',
    }
)
away.head()

In [None]:
#Data Aggregation for home and away team data
# Sum the listed stat columns into one row per team. The columns are selected
# with a list: indexing a GroupBy with a bare tuple of names
# (groupby(...)['a', 'b']) raises on pandas >= 2.0.
stat_columns = ['HomeGoals', 'AwayGoals', 'GP', 'W', 'D', 'L', 'PTS', 'WV']
home = home.groupby(['Team'])[stat_columns].sum().reset_index()
away = away.groupby(['Team'])[stat_columns].sum().reset_index()
display(home[0:10])

In [None]:
# From the home side's point of view, home goals are goals scored and away goals are goals conceded.
home = home.rename(
    columns={
        'HomeGoals': 'Goals_Scored',
        'AwayGoals': 'Goals_Conced',
    }
)
home.head(3)

In [None]:
# First ten rows of the per-team aggregated away table.
display(away[0:10])

In [None]:
# From the away side's point of view, away goals are goals scored and home goals are goals conceded.
away = away.rename(
    columns={
        'HomeGoals': 'Goals_conced',
        'AwayGoals': 'Goals_scored',
    }
)
away.head(3)

In [None]:
# Keep only the analysis columns, with goals scored placed before goals conceded.
away = away.loc[
    :,
    ['Team', 'Goals_scored', 'Goals_conced', 'GP', 'W', 'D', 'L', 'PTS', 'WV'],
]
away.head()

In [None]:
# Derived home metrics:
#   win_pct      = wins / games played
#   pyth_win_pct = scored^2 / (scored^2 + conceded^2)
home['win_pct'] = home['W'].div(home['GP'])
home['pyth_win_pct'] = home['Goals_Scored'].pow(2) / (
    home['Goals_Scored'].pow(2) + home['Goals_Conced'].pow(2)
)
display(home.head(4))

In [None]:
# Summary statistics of the aggregated home table, now including win_pct and pyth_win_pct.
home.describe()

In [None]:
# Derived away metrics:
#   win_pct      = wins / games played
#   pyth_win_pct = scored^2 / (scored^2 + conceded^2)
away['win_pct'] = away['W'].div(away['GP'])
away['pyth_win_pct'] = away['Goals_scored'].pow(2) / (
    away['Goals_scored'].pow(2) + away['Goals_conced'].pow(2)
)
display(away.head(4))

In [None]:
#Identifying teams with more than 260 home games played
# Vectorised comparison instead of a per-row .loc lookup: 'Yes' where GP > 260, otherwise 'No'.
home['Long'] = home['GP'].gt(260).map({True: 'Yes', False: 'No'})

In [None]:
# Identifying teams with more than 260 away games played
# Vectorised comparison instead of a per-row .loc lookup: 'Yes' where GP > 260, otherwise 'No'.
away['Long'] = away['GP'].gt(260).map({True: 'Yes', False: 'No'})

In [None]:
# Goal difference = goals scored minus goals conceded, computed per venue table.
home['Goals_diff'] = home['Goals_Scored'].sub(home['Goals_Conced'])
away['Goals_diff'] = away['Goals_scored'].sub(away['Goals_conced'])

In [None]:
#Summary statistics of the aggregated away table, including the derived win_pct, pyth_win_pct and Goals_diff columns
away.describe()

In [None]:
#Correlations
# home now holds the string columns Team and Long; they are excluded before
# calling corr(), which raises on non-numeric data on pandas >= 2.0.
plt.figure(figsize = (15, 8))
sns.heatmap(home.select_dtypes(include='number').corr(), cmap ='coolwarm', annot = True)
plt.title('Correlation Coffiecent Matrix For Home ', fontsize = 15)

In [None]:
# Home table: scatter of win percentage (x) against Pythagorean win percentage (y) with a fitted line.
plt.figure(figsize=(15, 8))
sns.regplot(
    data=home,
    x='win_pct',
    y='pyth_win_pct',
)
plt.title(
    'Winning Percentage vs Pythogorean Winning Percentage For Home ',
    fontsize=15,
)
plt.xlabel('Winning Percentage', fontsize=15)
plt.ylabel('Pythogorean Winning Percentage', fontsize=15)

In [None]:
# Ten teams with the most home wins; ties are broken by more games played.
home.sort_values(['W', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the most goals scored at home; ties are broken by more games played.
home.sort_values(['Goals_Scored', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the most goals conceded at home; ties are broken by more games played.
home.sort_values(['Goals_Conced', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the most home losses; ties are broken by more games played.
home.sort_values(['L', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the largest home goal difference; ties are broken by more games played.
home.sort_values(['Goals_diff', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the most home draws; ties are broken by more games played.
home.sort_values(['D', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the highest summed home win value (WV); ties are broken by more games played.
home.sort_values(['WV', 'GP'], ascending=False).head(10)

In [None]:
# Ten teams with the highest home win percentage; ties are broken by more games played.
home.sort_values(['win_pct', 'GP'], ascending=False).head(10)

### Away Teams

In [None]:
#Correlations
# away now holds the string columns Team and Long; they are excluded before
# calling corr(), which raises on non-numeric data on pandas >= 2.0.
plt.figure(figsize=(15, 8))
sns.heatmap(away.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True)
plt.title('Correlation Coffiecent Matrix For Away', fontsize=15)

In [None]:
# Away table: scatter of win percentage (x) against Pythagorean win percentage (y) with a fitted line.
plt.figure(figsize=(15, 8))
sns.regplot(
    data=away,
    x='win_pct',
    y='pyth_win_pct',
)
plt.title('Winning Percentage vs Pythogorean Winning Percentage For Away ', fontsize=15)
plt.xlabel('Winning Percentage', fontsize=15)
plt.ylabel('Pythogorean Winning Percentage', fontsize=15)


In [None]:
# Ten teams with the most away wins.
away.sort_values('W', ascending=False).head(10)

In [None]:
# Ten teams with the most goals scored away from home.
away.sort_values('Goals_scored', ascending=False).head(10)


In [None]:
# Ten teams with the highest away win percentage.
away.sort_values('win_pct', ascending=False).head(10)


In [None]:
# Ten teams with the most goals conceded away from home.
away.sort_values('Goals_conced', ascending=False).head(10)

In [None]:
# Ten teams with the largest away goal difference; ties are broken by more games played.
away.sort_values(['Goals_diff', 'GP'], ascending=False).head(10)

In [None]:
# Top 10 teams with most away losses
# Shows 10 rows, matching this cell's own heading and every other "Top 10" cell
# (the original displayed 15).
away.sort_values(by=['L'], ascending=False).head(10)

In [None]:
# Ten teams with the most away draws.
away.sort_values('D', ascending=False).head(10)

In [None]:
# Ten teams with the highest summed away win value (WV).
# Per the original note, WV combines wins and draws (1 for a win, 0.5 for a draw).
away.sort_values('WV', ascending=False).head(10)

In [None]:
#Creating a dataframe to hold some more relevant data for teams in home and in away
# Home and away rows are matched on the team name rather than on row position,
# so a team can never be paired with another team's away figures if the two
# tables differ in order or membership. how='left' keeps every home team
# (away values become NaN when a team has no away row), and
# validate='one_to_one' fails loudly if a team appears twice in either table.
home_side = home[['Team', 'Long', 'win_pct', 'pyth_win_pct']].rename(
    columns={'Team': 'Teams', 'win_pct': 'H_win_pct', 'pyth_win_pct': 'H_pyth'}
)
away_side = away[['Team', 'win_pct', 'pyth_win_pct']].rename(
    columns={'Team': 'Teams', 'win_pct': 'A_win_pct', 'pyth_win_pct': 'A_pyth'}
)
EPL = home_side.merge(away_side, on='Teams', how='left', validate='one_to_one')
display(EPL[0:10])

In [None]:
# Scatter of home win percentage (x) against away win percentage (y) per team, with a fitted line.
plt.figure(figsize=(15, 8))
sns.regplot(
    data=EPL,
    x='H_win_pct',
    y='A_win_pct',
)
plt.title('Home Win Percentage vs Away Win Percentage', fontsize=20)
plt.xlabel('Win Percentage (Home)', fontsize=15)
plt.ylabel('Win Percentage (Away)', fontsize=15)

In [None]:
#Regression plot for win percentage for both home and away, for teams with more games played and less games played
# The title and axis labels now describe what is actually plotted
# (x = H_win_pct, y = A_win_pct); the original labels referred to a
# home win-percentage vs home Pythagorean plot.
sns.set_style('whitegrid')
sns.lmplot(x = 'H_win_pct', y = 'A_win_pct', data = EPL, hue = 'Long', palette = 'coolwarm', height = 6, aspect = 1, fit_reg = True)
plt.title('Regression Plot of Home Win Percentage vs Away Win Percentage', fontsize = 20)
plt.xlabel('Home Win Percentage', fontsize = 15)
plt.ylabel("Away Win Percentage", fontsize = 15)

In [None]:
#Regression plot for pythagorean win percentage for both home and away, for teams with more games played and less games played
# The title and axis labels now describe what is actually plotted
# (x = H_pyth, y = A_pyth); the original labels referred to an
# away win-percentage vs away Pythagorean plot.
sns.set_style('whitegrid')
sns.lmplot(x = 'H_pyth', y = 'A_pyth', data = EPL, hue = 'Long', palette = 'coolwarm', height = 6, aspect = 1, fit_reg = True)
plt.title('Regression Plot of Home vs Away Pyth Win Percentage', fontsize=20)
plt.xlabel('Home Pyth Win Percentage', fontsize = 15)
plt.ylabel("Away Pyth Win Percentage", fontsize = 15)

In [None]:
# Overlaid histograms of home win percentage, one colour per value of the Long flag.
sns.set_style('darkgrid')
g = sns.FacetGrid(
    EPL,
    hue='Long',
    palette='coolwarm',
    height=6,
    aspect=2,
).map(plt.hist, 'H_win_pct', alpha=0.7)
plt.title('Home Teams Win Percentage', fontsize=20)
plt.xlabel('Win Percentage', fontsize=15)
plt.legend()

In [None]:
# Overlaid histograms of away win percentage, one colour per value of the Long flag.
sns.set_style('darkgrid')
g = sns.FacetGrid(
    EPL,
    hue='Long',
    palette='coolwarm',
    height=6,
    aspect=2,
).map(plt.hist, 'A_win_pct', alpha=0.7)
plt.title('Away Teams Win Percentage', fontsize=20)
plt.xlabel('Win Percentage', fontsize=15)
plt.legend()

In [None]:
import statsmodels.formula.api as smf

# OLS: home win percentage explained by home Pythagorean win percentage.
reg1 = smf.ols('H_win_pct ~ H_pyth', data=EPL).fit()
reg1.summary()

In [None]:
# OLS: home win percentage explained by away win percentage.
reg2 = smf.ols('H_win_pct ~ A_win_pct', data=EPL).fit()
reg2.summary()

In [None]:
# OLS: home win percentage explained by home Pythagorean win percentage plus the categorical Long flag.
reg3 = smf.ols('H_win_pct ~ H_pyth + C(Long)', data=EPL).fit()
reg3.summary()

In [None]:
# OLS: home win percentage explained by home Pythagorean win percentage and away win percentage.
reg4 = smf.ols('H_win_pct ~ H_pyth+ A_win_pct', data=EPL).fit()
reg4.summary()

In [None]:
# OLS: home win percentage explained by home Pythagorean, away win percentage and away Pythagorean.
reg5 = smf.ols('H_win_pct ~ H_pyth+ A_win_pct+A_pyth', data=EPL).fit()
reg5.summary()

In [None]:
# OLS: home win percentage explained by home Pythagorean, away win percentage and the
# categorical Long flag (the model has a term for Long, not one per team).
reg6 = smf.ols('H_win_pct ~ H_pyth+ A_win_pct + C(Long)', data=EPL).fit()
reg6.summary()

In [None]:
# OLS: home win percentage explained by home Pythagorean, away win percentage and their interaction.
reg7 = smf.ols('H_win_pct ~ H_pyth+A_win_pct+H_pyth*A_win_pct', data=EPL).fit()
reg7.summary()

In [None]:
#Away win percentage explained by away pythagorean win percentage and home win percentage
# Stored as reg8 so that it no longer overwrites the interaction model held in reg7.
reg8 = smf.ols(formula='A_win_pct ~ A_pyth+ H_win_pct', data=EPL).fit()
reg8.summary()

### Conclusions

The regression plots show a very good relationship for teams that have played more than 260 home games in the top-flight league (the `Long` flag used above) compared to the ones with fewer games. This shows that the win percentage and the Pythagorean win percentage give a near-perfect regression line for teams with more games.

From the regression results, the home and away win percentages are best determined by a combination of the Pythagorean win percentage and the win percentages from both the home and away stats. In the home win percentage regression, the R-squared and the adjusted R-squared are 0.914 and 0.911 respectively; for the away win percentage regression, they are 0.931 and 0.927 respectively.