# **Importing Libraries for our Code**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as mplt # we only need pyplot
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from pylab import plt
import math

# **Importing Dataset**

In [None]:
# Read the match-results table from results.csv in the working directory
pl_results = pd.read_csv("results.csv")
# Preview the first rows
pl_results.head()

In [None]:
# Keep only the matches whose season label is "2017-2018"
pl_results2018 = pl_results[pl_results['season'].eq("2017-2018")]
pl_results2018.head()

In [None]:
# Column names, non-null counts and dtypes of the 2017-2018 results
pl_results2018.info()

In [None]:
# Read the team-statistics table from stats.csv in the working directory
pl_stats = pd.read_csv("stats.csv")
# Preview the first rows
pl_stats.head()

In [None]:
# Keep only the 2017-2018 rows. An explicit copy is taken because later cells
# add columns to this frame; writing into a slice of `pl_stats` would raise
# SettingWithCopyWarning and may not stick.
pl_stats2018 = pl_stats.loc[pl_stats['season'] == "2017-2018"].copy()
pl_stats2018.head()

In [None]:
# Column names, non-null counts and dtypes of the 2017-2018 team stats
pl_stats2018.info()

# **Exploratory Data Analysis**

# 1. Pie chart of home goals and away goals

In [None]:
# Total of the home_goals column over all 2017-2018 matches
home_goals_sum = pl_results2018['home_goals'].sum()
print(home_goals_sum)

In [None]:
# Total of the away_goals column over all 2017-2018 matches
away_goals_sum = pl_results2018['away_goals'].sum()
print(away_goals_sum)

In [None]:
# Creating labels
venue_goals = ['Home', 'Away']

# Slice sizes come from the totals computed in the cells above, so the chart
# cannot drift out of sync with the data the way hand-typed numbers can.
data = [home_goals_sum, away_goals_sum]

# Creating autocpt arguments
def func(pct, allvalues):
    """Build the label shown inside one pie slice.

    Parameters
    ----------
    pct : float
        Share of the slice in percent (0-100), as passed to `autopct`.
    allvalues : sequence of numbers
        All slice values; their sum is used to convert `pct` back to a count.

    Returns
    -------
    str
        "<pct with one decimal>%" on the first line and "(<count> goals)" on
        the second.
    """
    # Round instead of truncating: pct is a float, so pct/100*total can land
    # just below the true integer (e.g. 581.9998) and int() alone would
    # report one goal too few.
    absolute = int(np.round(pct / 100. * np.sum(allvalues)))
    return "{:.1f}%\n({:d} goals)".format(pct, absolute)

# Draw the pie; each slice is annotated via func with its share and goal count
fig, ax = plt.subplots(figsize=(10, 7))
wedges, texts, autotexts = ax.pie(
    data,
    labels=venue_goals,
    autopct=lambda share: func(share, data),
)

# Render the figure
plt.show()

In [None]:
fig, ax6 = plt.subplots(1, 1)
fig.set_size_inches(10, 7)
# Select the two goal columns by name rather than by dtype: a dtype filter
# silently yields no boxes if the goals load as integers, and extra boxes if
# any other float column is present. The order matches the tick labels below.
goal_columns = ['home_goals', 'away_goals']
sb.boxplot(data=pl_results2018[goal_columns], ax=ax6, showmeans=True)
ax6.set_title("Goal Distribution")
ax6.set_xticklabels(['Home Team', "Away Team"])
ax6.set_ylabel('Avg Goal/Game')
plt.show()

In [None]:
# Count matches per (home team, result, season)
home_result = (
    pl_results2018[['home_team', 'result', 'season', 'home_goals']]
    .groupby(['home_team', 'result', 'season'])
    .count()
    .sort_values(by=['home_team', 'season', 'home_goals'])
    .reset_index()
)
# Re-express the result from the home side's point of view:
# 'H' -> win, 'A' -> loss, anything else -> draw
home_result['result'] = home_result['result'].map(lambda code: {'H': 'W', 'A': 'L'}.get(code, 'D'))
# The counted column is a number of games, not goals
home_result.columns = list(home_result.columns[:3]) + ['NumOfGames']
home_result.head()

In [None]:
# Build the per-team home record from `home_result` (computed in the previous
# cell) instead of a hand-typed table, so the chart always matches the data.
home_results = home_result.pivot_table(index='home_team', columns='result',
                                       values='NumOfGames', aggfunc='sum',
                                       fill_value=0)
# Guarantee all three outcome columns exist (a team may have no losses, say)
# and fix their order for the stacked bars.
home_results = home_results.reindex(columns=['W', 'L', 'D'], fill_value=0)
home_results = home_results.reset_index().rename(
    columns={'home_team': 'Team', 'W': 'Wins', 'L': 'Losses', 'D': 'Draws'})
home_results.columns.name = None

# view data
print(home_results)

# plot data in stack manner of bar type
home_results.plot(x='Team', kind='bar', stacked=True,
        title='Team record at home in 2017-2018 season')
plt.show()

In [None]:
# Count matches per (away team, result, season)
away_result = (
    pl_results2018[['away_team', 'result', 'season', 'away_goals']]
    .groupby(['away_team', 'result', 'season'])
    .count()
    .sort_values(by=['away_team', 'season', 'away_goals'])
    .reset_index()
)
# Re-express the result from the away side's point of view:
# 'A' -> win, 'H' -> loss, anything else -> draw
away_result['result'] = away_result['result'].map(lambda code: {'A': 'W', 'H': 'L'}.get(code, 'D'))
# The counted column is a number of games, not goals
away_result.columns = list(away_result.columns[:3]) + ['NumOfGames']
away_result.head()

In [None]:
# Build the per-team away record from `away_result` (computed in the previous
# cell) instead of a hand-typed table, so the chart always matches the data.
away_results = away_result.pivot_table(index='away_team', columns='result',
                                       values='NumOfGames', aggfunc='sum',
                                       fill_value=0)
# Guarantee all three outcome columns exist and fix their order for the
# stacked bars.
away_results = away_results.reindex(columns=['W', 'L', 'D'], fill_value=0)
away_results = away_results.reset_index().rename(
    columns={'away_team': 'Team', 'W': 'Wins', 'L': 'Losses', 'D': 'Draws'})
away_results.columns.name = None

# view data
print(away_results)

# plot data in stack manner of bar type
away_results.plot(x='Team', kind='bar', stacked=True,
        title='Team record away in 2017-2018 season')
plt.show()

# **Data Preparation and Cleaning**

In [None]:
# Draws per team: whatever is left of 38 once wins and losses are subtracted
# NOTE(review): presumably 38 is the number of league matches each team plays in a season — confirm
draws = 38-pl_stats2018['wins']-pl_stats2018['losses']
print(draws)

In [None]:
# Creating new predictor variable: wins as a percentage of all games played.
# `assign` returns a new frame, which avoids writing into a slice
# (SettingWithCopyWarning) and the chained `fillna(inplace=True)` on a
# selected column, which has no effect under pandas copy-on-write.
games_played = draws + pl_stats2018['losses'] + pl_stats2018['wins']
pl_stats2018 = pl_stats2018.assign(
    win_pct=(pl_stats2018['wins'] * 100 / games_played).fillna(0)
)
print(pl_stats2018["win_pct"])

In [None]:
# Restrict to numeric columns: the frame also holds text columns (team,
# season), and pandas >= 2.0 raises on them instead of silently skipping.
print(pl_stats2018.corr(numeric_only=True))

In [None]:
f = plt.figure(figsize=(50, 50))
# numeric_only=True skips the text columns (team, season), which would make
# corr() raise on pandas >= 2.0.
sb.heatmap(pl_stats2018.corr(numeric_only=True), vmin = -1, vmax = 1, linewidths = 1,
           annot = True, fmt = ".2f", annot_kws = {"size": 16}, cmap = "RdBu")

We used a heatmap to check the correlation of each of the 40 factors with the win percentage. We treated factors whose correlation coefficient lies between -0.6 and 0.6 as less relevant than those with a coefficient below -0.6 or above 0.6. Hence, we kept only the relevant factors, i.e. those with a correlation below -0.6 or above 0.6.

In [None]:
# Columns kept for modelling, plus the identifying team/season columns
columns_needed = ["team","wins","losses","goals","total_scoring_att","ontarget_scoring_att","att_hd_goal", "hit_woodwork",
               "att_ibox_goal","att_obox_goal","clean_sheet","goals_conceded","saves",
               "outfielder_block","total_clearance","head_clearance","total_pass","total_through_ball",
               "total_long_balls","backward_pass","corner_taken","touches","big_chance_missed","win_pct","season"]

# Keep only those columns and drop every row with a missing value in any of
# them (a single dropna replaces the per-column `isnull()==False` filter loop).
pl_stats2018 = pl_stats2018[columns_needed].dropna(subset=columns_needed)

In [None]:
# Confirm which columns remain and that none has missing values
pl_stats2018.info()

# **Renaming the Columns (for easier understanding)**

In [None]:
# Original column name -> more readable name
readable_names = {
    'total_scoring_att': 'total_shots',
    'ontarget_scoring_att': 'shots_on_target',
    'att_hd_goal': 'header_goals',
    'hit_woodwork': 'hit_goalpost',
    'att_ibox_goal': 'insidebox_goals',
    'att_obox_goal': 'outsidebox_goals',
    'outfielder_block': 'blocks',
}
pl_stats2018 = pl_stats2018.rename(columns=readable_names)

In [None]:
# Check the renamed columns
pl_stats2018.info()

We narrowed the 40 factors down to the 21 factors that had a strong correlation with win percentage.

# **Merging of Datasets**

In [None]:
# Renumber the rows from 0, discarding the index carried over from the full table
pl_stats2018 = pl_stats2018.reset_index(drop=True)
pl_stats2018

In [None]:
# Attach the season stats twice: first those of the home team, then those of
# the away team. Overlapping column names receive pandas' default _x / _y suffixes.
merged_left = pl_results2018.merge(pl_stats2018, how='left',
                                   left_on='home_team', right_on='team')
pl_combined = merged_left.merge(pl_stats2018, how='left',
                                left_on='away_team', right_on='team')
pl_combined

In [None]:
# Inspect the merged columns
pl_combined.info()

In [None]:
# Remove the duplicated season/team key columns left by the merges, and the
# per-match goal counts
pl_combined.drop(
    columns=['season_x', 'season_y', 'season', 'team_x', 'team_y',
             'home_goals', 'away_goals'],
    inplace=True,
)

In [None]:
# Inspect the columns left after the drop
pl_combined.info()

In [None]:
# Columns at positions 3..25 are renamed, in order, to "home_<stat>"
stat_names = ['wins', 'losses', 'goals', 'total_shots', 'shots_on_target', 'header_goals',
              'hit_goalpost', 'insidebox_goals', 'outsidebox_goals', 'clean_sheet',
              'goals_conceded', 'saves', 'blocks', 'total_clearance', 'head_clearance',
              'total_pass', 'total_through_ball', 'total_long_balls', 'backward_pass',
              'corner_taken', 'touches', 'big_chance_missed', 'win_pct']
column_indices = list(range(3, 26))
new_names = ['home_' + stat for stat in stat_names]
old_names = pl_combined.columns[column_indices]
pl_combined.rename(columns=dict(zip(old_names, new_names)), inplace=True)

In [None]:
# Columns at positions 26..48 are renamed, in order, to "away_<stat>"
stat_names = ['wins', 'losses', 'goals', 'total_shots', 'shots_on_target', 'header_goals',
              'hit_goalpost', 'insidebox_goals', 'outsidebox_goals', 'clean_sheet',
              'goals_conceded', 'saves', 'blocks', 'total_clearance', 'head_clearance',
              'total_pass', 'total_through_ball', 'total_long_balls', 'backward_pass',
              'corner_taken', 'touches', 'big_chance_missed', 'win_pct']
column_indices = list(range(26, 49))
new_names = ['away_' + stat for stat in stat_names]
old_names = pl_combined.columns[column_indices]
pl_combined.rename(columns=dict(zip(old_names, new_names)), inplace=True)

In [None]:
# Inspect the final home_/away_ column names
pl_combined.info()

# **Possible Outcomes of Winner**

In [None]:
# List the distinct values found in the result column
cleandataset = pd.DataFrame(data=pl_results2018)
possible_outcomes = cleandataset["result"].unique()
print(possible_outcomes)

# **Creating new predictor variables**
The current columns are specific to each side, Home and Away. For predicting the outcome, it is more useful to have variables that reflect the differences between the two teams, so we created additional comparison columns from the columns provided by the dataset.

In [None]:
# (statistic, name of its home-minus-away column). The output names are kept
# exactly as they were, including the mixed "_diff" / "_dif" spelling, because
# later cells refer to them.
diff_columns = [
    ("goals", "goals_diff"),
    ("total_shots", "total_shots_diff"),
    ("shots_on_target", "shots_on_target_dif"),
    ("header_goals", "header_goals_dif"),
    ("hit_goalpost", "hit_goalpost_dif"),
    ("insidebox_goals", "insidebox_goals_dif"),
    ("outsidebox_goals", "outsidebox_goals_dif"),
    ("clean_sheet", "clean_sheet_dif"),
    ("goals_conceded", "goals_conceded_dif"),
    ("saves", "saves_dif"),
    ("blocks", "blocks_dif"),
    ("total_clearance", "total_clearance_dif"),
    ("head_clearance", "head_clearance_dif"),
    ("total_pass", "total_pass_dif"),
    ("total_through_ball", "total_through_ball_dif"),
    ("total_long_balls", "total_long_balls_dif"),
    ("backward_pass", "backward_pass_dif"),
    ("corner_taken", "corner_taken_dif"),
    ("touches", "touches_dif"),
    ("big_chance_missed", "big_chance_missed_dif"),
    ("win_pct", "win_pct_diff"),
]
for stat, diff_name in diff_columns:
    pl_combined[diff_name] = pl_combined["home_" + stat] - pl_combined["away_" + stat]
pl_combined.head(20)

In [None]:
# Summary of the combined frame, now including the difference columns
pl_combined.info()

# **Generating Train - Test Split**

We split our dataset into 75% train data and 25% test data, and fixed the random seed of the split to ensure consistency in the results obtained.

In [None]:
# Predictors: the home-minus-away difference columns.
# NOTE(review): 'shots_on_target_dif' is computed earlier but is not in this
# list — confirm whether leaving it out is intentional.
predictor_columns = [
    'goals_diff', 'total_shots_diff', 'header_goals_dif', 'hit_goalpost_dif',
    'insidebox_goals_dif', 'outsidebox_goals_dif', 'clean_sheet_dif',
    'goals_conceded_dif', 'saves_dif', 'blocks_dif', 'total_clearance_dif',
    'head_clearance_dif', 'total_pass_dif', 'total_through_ball_dif',
    'total_long_balls_dif', 'backward_pass_dif', 'corner_taken_dif',
    'touches_dif', 'big_chance_missed_dif', 'win_pct_diff',
]

# Response and predictors
y = pl_combined["result"]
X = pl_combined[predictor_columns]

# 75 % train / 25 % test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

# Check the sample sizes
print("Train Set :", y_train.shape, X_train.shape)
print("Test Set  :", y_test.shape, X_test.shape)

In [None]:
# Show the predictor rows held out for testing
X_test

In [None]:
# Number of training rows per result value
y_train.value_counts()

# **Training Our Machine Learning Model**

**Decision Tree Classification Model**

We used a decision tree classifier to get a predicted outcome, since our outcome is a categorical value with three possible classes (home win, away win, draw) rather than a binary one.

In [None]:
# Create the decision tree object. random_state is fixed because scikit-learn
# shuffles the features at every split, so ties between equally good splits
# can otherwise yield a different tree on each run.
dectree = DecisionTreeClassifier(max_depth = 3, random_state = 2)
dectree.fit(X_train, y_train)                    # train the decision tree model

In [None]:
f = mplt.figure(figsize=(60,20))
# class_names must follow the ascending order of the fitted classes
# (dectree.classes_); a hand-written ["H","A","D"] mislabels the nodes.
# feature_names is passed as a plain list of strings.
plot_tree(dectree, filled=True, rounded=True,
          feature_names=list(X_train.columns),
          class_names=[str(label) for label in dectree.classes_])

In [None]:
def _print_class_precision(cm, dataset_name):
    """Print and return the per-class precision of a confusion matrix.

    Parameters
    ----------
    cm : array-like, shape (3, 3)
        Confusion matrix in scikit-learn's layout: rows are actual classes,
        columns are predicted classes. The three headers printed below assume
        the class order away win, draw, home win.
    dataset_name : str
        Text appended to each header line (e.g. "Train Dataset").

    Returns
    -------
    numpy.ndarray
        Precision of each class, in matrix order.
    """
    cm = np.asarray(cm)
    true_positives = cm.diagonal()
    # Precision = TP / (TP + FP), i.e. the diagonal over the *column* totals
    # (everything predicted as that class). Dividing by row totals would give
    # recall instead.
    predicted_totals = cm.sum(axis=0)
    # A class that is never predicted has no defined precision; report it as 0
    # rather than dividing by zero.
    precisions = np.divide(true_positives, predicted_totals,
                           out=np.zeros(len(true_positives), dtype=float),
                           where=predicted_totals != 0)
    headers = ("Precision Rate of Away Win \t",
               "Precision Rate of Draw \t\t",
               "Precision Rate of Home Win \t")
    for header, precision in zip(headers, precisions):
        print(header + dataset_name)
        print(f"Precision:\t\t\t{precision:.2%}")
        print()
    return precisions


y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Passing the labels explicitly pins the row/column order of both matrices
# to the order of the fitted classes.
class_labels = list(dectree.classes_)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree.score(X_train, y_train))
print()
train = confusion_matrix(y_train, y_train_pred, labels=class_labels)
_print_class_precision(train, "Train Dataset")

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree.score(X_test, y_test))
print()
test = confusion_matrix(y_test, y_test_pred, labels=class_labels)
precision_away, precision_draw, precision_home = _print_class_precision(test, "Test Dataset")

# Plot the Confusion Matrix for Train and Test
f, axes = mplt.subplots(1, 2, figsize=(12, 4))
for cm_ax, cm_values, cm_title in zip(axes, (train, test), ("Train", "Test")):
    sb.heatmap(cm_values, annot = True, fmt=".0f", annot_kws={"size": 18}, ax = cm_ax,
               xticklabels=class_labels, yticklabels=class_labels)
    cm_ax.set(title=cm_title, xlabel="Predicted", ylabel="Actual")

In [None]:
# Actual results of the held-out matches, as a plain list
list(y_test)

# **Beyond the Course: Logistic Regression**
Logistic regression estimates the probability of an event occurring, based on a given dataset of independent variables.

In [None]:
logmodel = LogisticRegression(max_iter = 60000)
lm = logmodel.fit(X_train,y_train)
predictions = lm.predict(X_test)
print("Accuracy = ",lm.score(X_test,y_test))

# Confusion matrix of the logistic-regression predictions themselves (rows =
# actual, columns = predicted, in lm.classes_ order). Without this line the
# figures below would be computed from a matrix left over from an earlier cell.
test_logistic = confusion_matrix(y_test, predictions, labels=lm.classes_)

# Precision = TP / (TP + FP): diagonal over the column totals. A class that is
# never predicted is reported as 0 instead of dividing by zero.
predicted_totals = test_logistic.sum(axis=0)
class_precision = np.divide(test_logistic.diagonal(), predicted_totals,
                            out=np.zeros(len(predicted_totals), dtype=float),
                            where=predicted_totals != 0)
# The unpacking and headers assume three classes ordered away win, draw, home win
precision_away, precision_draw, precision_home = class_precision

print("Precision Rate of Away Win \tTest Dataset")
print(f"Precision:\t\t\t{precision_away:.2%}")
print()

print("Precision Rate of Draw \t\tTest Dataset")
print(f"Precision:\t\t\t{precision_draw:.2%}")
print()

print("Precision Rate of Home Win \tTest Dataset")
print(f"Precision:\t\t\t{precision_home:.2%}")

print()
hm = sb.heatmap(data = test_logistic, annot = True, fmt = "d",
                xticklabels = lm.classes_, yticklabels = lm.classes_)
hm.set(xlabel="Predicted", ylabel="Actual")

# displaying the plotted heatmap
plt.show()

# **Probability**
Predicting the probability of an away win, a draw, and a home win for each match in the test set.

In [None]:
# Class probabilities for every test row, rounded to three decimals
probability = np.round(lm.predict_proba(X_test), 3)
probability

# **DataFrame of all the prediction information from Logistic Regression Model**

In [None]:
# Class probabilities, computed once (one column per class)
y_prob_logistic = lm.predict_proba(X_test).round(3)

# Actual results and predicted results, both indexed like X_test
Winner_df_logistic = y_test
Predicted_Winner_logistic_df = pd.DataFrame(predictions,columns = ["Predicted_Result"],index = X_test.index)

# NOTE(review): the column names assume lm.classes_ is ordered away win,
# draw, home win — confirm against lm.classes_.
probability_columns = pd.DataFrame(
    y_prob_logistic,
    columns = ["Probability_Of_Away_Win", "Probability_Of_Draw", "Probability_Of_Home_Win"],
    index = X_test.index,
)

# Side-by-side table: actual result, predicted result, then the probabilities
compare_prob_logistic = pd.concat([Winner_df_logistic, Predicted_Winner_logistic_df, probability_columns], axis = 1)

compare_prob_logistic

In [None]:
# Display the combined match/statistics frame
pl_combined

In [None]:

# Predicted results for the test rows, indexed like X_test
Predicted_Winner_logistic_df = pd.Series(
    predictions, index=X_test.index, name="Predicted_Result"
).to_frame()

# Join on the row index: rows that were not in the test set get NaN
pl_prediction = pl_combined.join(Predicted_Winner_logistic_df)

pl_prediction

In [None]:
# Function to extract result
def f(x,y):
    """Return the recorded result of the match with home team `x` and away team `y`.

    Parameters
    ----------
    x : str
        Home team name, compared for exact equality with `home_team`.
    y : str
        Away team name, compared for exact equality with `away_team`.

    Returns
    -------
    pandas.DataFrame
        The `result` column of the matching rows of `pl_combined`; empty when
        no row matches.
    """
    is_fixture = (pl_combined['home_team'] == x) & (pl_combined['away_team'] == y)
    return pl_combined.loc[is_fixture, ['result']]

# Get the teams from the user; surrounding whitespace is removed because the
# lookup is an exact string comparison
x = input("Enter Home Team: ").strip()
y = input("\nEnter Away Team: ").strip()

print()

# Print the result, or say so explicitly when the fixture is not in the data
fixture_result = f(x,y)
if fixture_result.empty:
    print(f"No match found with home team '{x}' and away team '{y}'.")
else:
    print(fixture_result)


