# Import required modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import cross_validation
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import seaborn as sns
import glob
import os
from sklearn.cross_validation import cross_val_score
plt.style.use('ggplot')



# Define Functions

In [2]:
#A function to calculate the results
def result(x):
	conditions = [
	(x['GoalsFor'] > x['GoalsAgainst']),
	(x['GoalsFor'] == x['GoalsAgainst']),
	(x['GoalsFor'] < x['GoalsAgainst'])
	]
	choices = [1,0,-1]
	x['Result'] = np.select(conditions, choices)
	return x

In [3]:
def joined_result(x):
    if x == 'H':
        return 1
    elif x == 'D':
        return 0
    elif x == 'A':
        return -1

# Data Preperation

Read CSV Data

In [4]:
full_results=pd.read_csv(r'C:\Users\dbrown\Documents\Misc\Football\16_17\overall.csv')
fixtures = pd.read_csv(r'C:\Users\dbrown\Documents\Misc\Football\17_18\fixtures.csv')

Define base dataframes

In [5]:
full_stats = full_results[['Date','Div','HomeTeam','AwayTeam','FTHG','FTAG','FTR']]
fixtures = fixtures[['Date','Div','HomeTeam','AwayTeam','FTHG','FTAG','FTR']]
full_results_combo = full_stats.append(fixtures)
full_results_combo.drop_duplicates(subset=['Date','Div','HomeTeam','AwayTeam'],keep='last',inplace=True)
home_stats = full_results_combo.set_index(['Date','Div','HomeTeam'])
away_stats = full_results_combo.set_index(['Date','Div','AwayTeam'])

Work on dataframes

In [6]:
home_stats = home_stats[['FTHG','FTAG']]
home_stats.rename(columns={'FTHG':'GoalsFor','FTAG':'GoalsAgainst'},inplace='True')
home_stats['GoalDiff'] = home_stats['GoalsFor'] - home_stats['GoalsAgainst']
away_stats = away_stats[['FTAG','FTHG']]
away_stats.rename(columns={'FTAG':'GoalsFor','FTHG':'GoalsAgainst'},inplace='True')
away_stats['GoalDiff'] = away_stats['GoalsFor'] - away_stats['GoalsAgainst']
#Set dataframes with the result
home_stats = result(home_stats)
away_stats = result(away_stats)
#Rename columns in Home and Away stats
home_stats.rename(columns={'GoalsFor':'HomeGoalsFor','GoalsAgainst':'HomeGoalsAgainst','GoalDiff':'HomeGoalDiff','Result':'HomeResult'},inplace='True')
away_stats.rename(columns={'GoalsFor':'AwayGoalsFor','GoalsAgainst':'AwayGoalsAgainst','GoalDiff':'AwayGoalDiff','Result':'AwayResult'},inplace='True')

Set 2-6 game form values

In [7]:
for num in range(2,7):
        home = 'Home'
        away = 'Away'
        HomeGameForm="%s%sGameForm" % (home,num)
        HomeGameGoalsFor="%s%sGameGoalsFor" % (home,num)
        HomeGameGoalDiff="%s%sGameGoalDiff" % (home,num)
        AwayGameForm="%s%sGameForm" % (away,num)
        AwayGameGoalsFor="%s%sGameGoalsFor" % (away,num)
        AwayGameGoalDiff="%s%sGameGoalDiff" % (away,num)
        home_stats[HomeGameForm] = home_stats['HomeResult'].groupby(level='HomeTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))
        home_stats[HomeGameGoalsFor] = home_stats['HomeGoalsFor'].groupby(level='HomeTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))
        home_stats[HomeGameGoalDiff] = home_stats['HomeGoalDiff'].groupby(level='HomeTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))
        away_stats[AwayGameForm] = away_stats['AwayResult'].groupby(level='AwayTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))
        away_stats[AwayGameGoalsFor] = away_stats['AwayGoalsFor'].groupby(level='AwayTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))
        away_stats[AwayGameGoalDiff] = away_stats['AwayGoalDiff'].groupby(level='AwayTeam').apply(lambda x: pd.rolling_mean(x.shift(),num))

Merge Home and Away Stats

In [8]:
home_noindex = home_stats.reset_index()
away_noindex = away_stats.reset_index()
home_merge = pd.merge(full_results_combo, home_noindex, on=['Date','Div','HomeTeam'])
home_merge.rename(columns={'HomeTeam_x':'HomeTeam','AwayTeam_x':'AwayTeam'},inplace=True)
away_merge = pd.merge(full_results_combo, away_noindex, on=['Date','Div','AwayTeam'])
away_merge.rename(columns={'HomeTeam_x':'HomeTeam','AwayTeam_x':'AwayTeam'},inplace=True)
home_away_merge = pd.merge(home_merge, away_merge, on=['Date','Div','AwayTeam'])
home_away_merge.rename(columns={'HomeTeam_x':'HomeTeam'},inplace=True)
home_away_merge = home_away_merge[['Date', 'Div','HomeTeam','AwayTeam','HomeGoalsFor', 'HomeGoalsAgainst', 'HomeGoalDiff', 'HomeResult','Home2GameForm', 'Home2GameGoalsFor', 'Home2GameGoalDiff','Home3GameForm', 'Home3GameGoalsFor', 'Home3GameGoalDiff',
       'Home4GameForm', 'Home4GameGoalsFor', 'Home4GameGoalDiff',
       'Home5GameForm', 'Home5GameGoalsFor', 'Home5GameGoalDiff',
       'Home6GameForm', 'Home6GameGoalsFor', 'Home6GameGoalDiff',
       'AwayGoalsFor', 'AwayGoalsAgainst',
       'AwayGoalDiff', 'AwayResult', 'Away2GameForm', 'Away2GameGoalsFor',
       'Away2GameGoalDiff', 'Away3GameForm', 'Away3GameGoalsFor',
       'Away3GameGoalDiff', 'Away4GameForm', 'Away4GameGoalsFor',
       'Away4GameGoalDiff', 'Away5GameForm', 'Away5GameGoalsFor',
       'Away5GameGoalDiff', 'Away6GameForm', 'Away6GameGoalsFor',
       'Away6GameGoalDiff']]

Set relevant columns and clean data

In [9]:
home_away_merge_idx = home_away_merge.set_index(['Date','Div','HomeTeam','AwayTeam'])
#home_away_merge_idx_relevant = home_away_merge_idx[['HomeGoalsFor','AwayGoalsFor','HomeResult','3GameFormDiff']]
#home_away_merge_cleaned = home_away_merge_idx_relevant.dropna()

Set 2-6 game form difference values

In [10]:
for num in range(2,7):
        home = 'Home'
        away = 'Away'
        HomeGameForm="%s%sGameForm" % (home,num)
        HomeGameGoalsFor="%s%sGameGoalsFor" % (home,num)
        HomeGameGoalDiff="%s%sGameGoalDiff" % (home,num)
        AwayGameForm="%s%sGameForm" % (away,num)
        AwayGameGoalsFor="%s%sGameGoalsFor" % (away,num)
        AwayGameGoalDiff="%s%sGameGoalDiff" % (away,num)
        GameFormDiff = "%sGameFormDiff" % (num)
        GameGoalsForDiff = "%sGameGoalsForDiff" % (num)
        GameGoalDiffDiff = "%sGameGoalDiffDiff" % (num)
        home_away_merge_idx[GameFormDiff] = home_away_merge_idx[HomeGameForm] - home_away_merge_idx[AwayGameForm]
        home_away_merge_idx[GameGoalsForDiff] = home_away_merge_idx[HomeGameGoalsFor] - home_away_merge_idx[AwayGameGoalsFor]
        home_away_merge_idx[GameGoalDiffDiff] = home_away_merge_idx[HomeGameGoalDiff] - home_away_merge_idx[AwayGameGoalDiff]

Deal with NaN values

In [11]:
home_away_merge_idx_zeroed = home_away_merge_idx.fillna(0)
home_away_merge_idx_dropped = home_away_merge_idx.dropna()
home_away_merge_idx_zeroed_reset = home_away_merge_idx_zeroed.reset_index()

Get the latest stats

In [12]:
home_stats_latest = home_away_merge_idx_zeroed_reset.groupby('HomeTeam').last()
home_stats_latest.reset_index(inplace=True)
home_stats_latest.set_index(['Date','Div','HomeTeam','AwayTeam'],inplace=True)
latest_fixtures = home_stats_latest[home_stats_latest["HomeResult"]==0]

In [13]:
latest_fixtures

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HomeGoalsFor,HomeGoalsAgainst,HomeGoalDiff,HomeResult,Home2GameForm,Home2GameGoalsFor,Home2GameGoalDiff,Home3GameForm,Home3GameGoalsFor,Home3GameGoalDiff,...,3GameGoalDiffDiff,4GameFormDiff,4GameGoalsForDiff,4GameGoalDiffDiff,5GameFormDiff,5GameGoalsForDiff,5GameGoalDiffDiff,6GameFormDiff,6GameGoalsForDiff,6GameGoalDiffDiff
Date,Div,HomeTeam,AwayTeam,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
18/02/18,G1,AEK,Xanthi,0.0,0.0,0.0,0,1.0,1.5,1.5,1.000000,2.000000,1.666667,...,2.333333,1.00,1.00,2.00,1.0,1.2,2.0,0.833333,1.000000,1.833333
17/02/18,E2,AFC Wimbledon,Bristol Rvs,0.0,0.0,0.0,0,0.0,1.5,0.0,0.333333,1.666667,0.666667,...,-0.333333,0.00,0.25,0.00,-0.2,0.2,-0.8,-0.166667,0.166667,-0.333333
10/02/18,N1,AZ Alkmaar,VVV Venlo,0.0,0.0,0.0,0,0.5,2.5,1.0,0.000000,2.000000,0.333333,...,1.000000,0.00,2.00,1.75,0.2,1.6,1.8,0.333333,1.666667,1.666667
17/02/18,SC2,Airdrie Utd,Queens Park,0.0,0.0,0.0,0,0.0,0.0,0.0,0.333333,0.666667,0.666667,...,0.333333,0.00,-1.00,-0.50,0.4,-0.8,0.0,0.666667,-0.333333,0.666667
16/02/18,F2,Ajaccio GFCO,Reims,0.0,0.0,0.0,0,0.0,1.5,0.5,0.000000,1.333333,0.333333,...,-1.000000,-1.00,-0.50,-1.50,-1.0,-0.4,-1.4,-0.833333,-0.333333,-1.166667
18/02/18,SP1,Alaves,La Coruna,0.0,0.0,0.0,0,0.5,2.0,0.5,0.666667,1.666667,0.666667,...,4.333333,1.50,1.00,4.50,1.6,1.2,4.4,1.333333,0.833333,3.666667
17/02/18,SC2,Albion Rvs,Raith Rvs,0.0,0.0,0.0,0,-0.5,1.5,-1.0,0.000000,2.000000,-0.333333,...,-0.666667,-0.50,0.00,-1.25,-0.8,0.2,-1.4,-0.833333,-0.166667,-1.333333
17/02/18,SP2,Alcorcon,Tenerife,0.0,0.0,0.0,0,0.5,1.0,0.5,0.666667,1.333333,1.000000,...,1.333333,1.00,0.50,1.75,1.0,0.2,1.6,1.000000,0.333333,1.833333
17/02/18,EC,Aldershot,Macclesfield,0.0,0.0,0.0,0,0.0,0.5,0.0,0.333333,1.333333,0.666667,...,1.666667,-0.25,-0.75,0.75,0.0,-0.4,1.0,0.000000,-0.333333,0.666667
17/02/18,F1,Amiens,Toulouse,0.0,0.0,0.0,0,0.0,1.5,0.0,0.000000,1.333333,0.000000,...,0.666667,0.25,0.25,0.50,0.2,0.4,0.4,0.500000,0.500000,0.833333


# Logistic Regression on HomeResult

In [14]:
y = home_away_merge_idx_dropped['HomeResult']
X = home_away_merge_idx_dropped[[
    '6GameFormDiff', '6GameGoalsForDiff', '6GameGoalDiffDiff']]
logreg = LogisticRegression()

In [15]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8)
logreg.fit(X_train, y_train)
logreg.score(X_train, y_train)
cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()

0.47645009915160674

In [16]:
y = home_away_merge_idx_dropped['HomeResult']
X = home_away_merge_idx_dropped[[
    '6GameFormDiff', '6GameGoalsForDiff', '6GameGoalDiffDiff']]

logreg = LogisticRegression()

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.9)
logreg.fit(X, y)
logreg.score(X, y)

y_pred = logreg.predict(X)
#metrics.accuracy_score(y_test, y_pred)

#cross_val_score(logreg, X, y, cv=10, scoring='accuracy')

#cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()

#y_pred_prob = logreg.predict_proba(X_test)[:, -1]

#logreg.predict_proba(X_test)[:, -3]

#y_pred_prob

latest_fixtures_cut = latest_fixtures[[
    '6GameFormDiff', '6GameGoalsForDiff', '6GameGoalDiffDiff']]
probs = logreg.predict_proba(latest_fixtures_cut)
probs_df = pd.DataFrame(probs)
probs_df.rename(columns={0:'AwayWinProb',1:'DrawProb',2:'HomeWinProb'},inplace='True')
latest_fixtures_cut_noind = latest_fixtures_cut.reset_index()
pred_join = latest_fixtures_cut_noind.join(probs_df)
pred_join_cut = pred_join[["Date","Div","HomeTeam","AwayTeam","HomeWinProb"]]

In [18]:
pred_join_cut[pred_join_cut["HomeWinProb"]>0.65]

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,HomeWinProb
5,18/02/18,SP1,Alaves,La Coruna,0.675833
32,17/02/18,P1,Benfica,Boavista,0.681948
95,18/02/18,N1,Feyenoord,Heracles,0.695649
162,16/02/18,F1,Monaco,Dijon,0.682071
173,16/02/18,F2,Nimes,Tours,0.689446
185,17/02/18,F1,Paris SG,Strasbourg,0.732428
190,17/02/18,SC3,Peterhead,Berwick,0.677059
