# Import required modules

In [1]:
import warnings
warnings.filterwarnings('ignore')

import glob
import os
import pickle
import urllib
import urllib.error  # explicit: `import urllib` alone does not guarantee the `error` submodule is loaded

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): pandas.tools.plotting and sklearn.cross_validation only exist in
# older pandas / scikit-learn releases; confirm against the pinned environment.
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
from sklearn.cross_validation import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

plt.style.use('ggplot')



# Define Functions

In [2]:
#A function to calculate the results
def result(x):
    """Add a numeric ``Result`` column computed from goals for and against.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``GoalsFor`` and ``GoalsAgainst`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object ``x`` (it is modified in place) with ``Result`` set to
        1 where GoalsFor > GoalsAgainst, 0 where they are equal and -1 where
        GoalsFor < GoalsAgainst. Rows matching none of the comparisons (for
        example a NaN goal count) receive ``np.select``'s default of 0.
    """
    scored, conceded = x['GoalsFor'], x['GoalsAgainst']
    outcome_masks = [scored > conceded, scored == conceded, scored < conceded]
    outcome_codes = [1, 0, -1]
    x['Result'] = np.select(outcome_masks, outcome_codes)
    return x

In [3]:
#A function to calculate the results
def HomeResult(x):
    """Add a ``HomeResult`` column scoring each match from the home side.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``HomeGoals`` and ``AwayGoals`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object ``x`` (modified in place); ``HomeResult`` is 1 when
        HomeGoals > AwayGoals, 0 when equal, -1 when HomeGoals < AwayGoals.
        Rows where no comparison holds (e.g. NaN goals) get ``np.select``'s
        default value 0.
    """
    home_goals = x['HomeGoals']
    away_goals = x['AwayGoals']
    x['HomeResult'] = np.select(
        [home_goals > away_goals, home_goals == away_goals, home_goals < away_goals],
        [1, 0, -1],
    )
    return x

def AwayResult(x):
    """Add an ``AwayResult`` column scoring each match from the away side.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``HomeGoals`` and ``AwayGoals`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object ``x`` (modified in place); ``AwayResult`` is 1 when
        AwayGoals > HomeGoals, 0 when equal, -1 when AwayGoals < HomeGoals.
        Rows where no comparison holds (e.g. NaN goals) get ``np.select``'s
        default value 0.
    """
    visitors = x['AwayGoals']
    hosts = x['HomeGoals']
    away_win, level, away_loss = visitors > hosts, visitors == hosts, visitors < hosts
    x['AwayResult'] = np.select([away_win, level, away_loss], [1, 0, -1])
    return x

In [4]:
def joined_result(x):
    """Translate the letter-coded ``Result`` column into ``ResultValue``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``Result`` column holding 'H', 'D' or 'A'.

    Returns
    -------
    pandas.DataFrame
        The same object ``x`` (modified in place); ``ResultValue`` is 1 for
        'H', 0 for 'D' and -1 for 'A'. Any other value (including NaN) falls
        through to ``np.select``'s default of 0.
    """
    letter_codes = [('H', 1), ('D', 0), ('A', -1)]
    letter_masks = [x['Result'] == letter for letter, _ in letter_codes]
    numeric_codes = [code for _, code in letter_codes]
    x['ResultValue'] = np.select(letter_masks, numeric_codes)
    return x

# Data Preparation

Read CSV Data

In [5]:
#HistoricalFootballResults=pd.read_csv(r'C:\Users\dbrown\Documents\Misc\Football\16_17\overall.csv')

# Download one results CSV per (season, division) from football-data.co.uk and
# stack them into `full_stats`; then download the upcoming-fixtures CSV.
seasons = ['1011','1112','1213','1314','1415','1516','1617','1718','1819']
divisions = ['E0','E1','E2','E3','SC0','SC1','SC2','SC3','B1','F1','F2','SP1','SP2','I1','I2','D1','D2','N1','P1','G1','T1']
# Same column subset is read from the historical files and the fixtures file.
wanted_columns = ['Date','Div','HomeTeam','AwayTeam','FTHG','FTAG','FTR','B365H','B365D','B365A']

# Collect the per-file frames and concatenate once: appending to a DataFrame
# inside the loop copies all accumulated rows on every iteration, and
# DataFrame.append no longer exists in recent pandas.
season_frames = []
for year in seasons:
    for div in divisions:
        url = 'http://www.football-data.co.uk/mmz4281/%s/%s.csv' % (year,div)
        try:
            season_frames.append(pd.read_csv(url, usecols=wanted_columns))
        except urllib.error.HTTPError:
            # Not every division/season combination is published; report and move on.
            print("URL", url, "could not be read.")
            continue

# Keep the original "empty frame" result when nothing could be downloaded
# (pd.concat raises on an empty list).
full_stats = pd.concat(season_frames) if season_frames else pd.DataFrame()

fixtures_url="http://www.football-data.co.uk/fixtures.csv"
fixtures=pd.read_csv(fixtures_url, usecols=wanted_columns)

Define base dataframes

In [6]:
# Build the base match table: historical results followed by upcoming fixtures,
# one row per (Date, Div, HomeTeam, AwayTeam).
match_key = ['Date','Div','HomeTeam','AwayTeam']
full_results_combo = (
    pd.concat([full_stats, fixtures])  # DataFrame.append is removed in recent pandas
    # keep='last': when a match appears in both sources the later row (fixtures) wins
    .drop_duplicates(subset=match_key, keep='last')
    .set_index(match_key)
    # the original passed inplace='True' (a string); a chained rename avoids the flag entirely
    .rename(columns={'FTHG':'HomeGoals','FTAG':'AwayGoals','FTR':'Result'})
)
#full_results_combo2.fillna(0,inplace=True)

# Numeric outcome columns (these helpers add a column in place and return the frame).
full_results_combo = joined_result(full_results_combo)
full_results_combo = HomeResult(full_results_combo)
full_results_combo = AwayResult(full_results_combo)
full_results_combo['HomeGoalDiff'] = full_results_combo['HomeGoals'] - full_results_combo['AwayGoals']
full_results_combo['AwayGoalDiff'] = full_results_combo['AwayGoals'] - full_results_combo['HomeGoals']

# Head-to-head: mean ResultValue over earlier meetings of the same home/away
# pairing. shift() excludes the current match; the 10000-row window with
# min_periods=3 needs at least three earlier meetings, otherwise the value is NaN.
# Series.rolling(...).mean() replaces the removed pd.rolling_mean, and transform
# keeps the result aligned with the original row index.
full_results_combo['H2H'] = (
    full_results_combo['ResultValue']
    .groupby(level=['HomeTeam','AwayTeam'])
    .transform(lambda s: s.shift().rolling(10000, min_periods=3).mean())
)

Set 2-6 game form values

In [7]:
# For each window size 2..6 add, per side, the rolling mean over that side's
# previous `num` rows of: result, goals scored and goal difference.
# Home columns are grouped by the HomeTeam index level, away columns by AwayTeam,
# so "Home<N>Game*" only looks at a team's home matches and "Away<N>Game*" only
# at its away matches.
form_sources = [
    # (column-name suffix, source-column suffix)
    ('GameForm', 'Result'),
    ('GameGoalsFor', 'Goals'),
    ('GameGoalDiff', 'GoalDiff'),
]
for num in range(2,7):
    for side in ('Home', 'Away'):
        team_level = '%sTeam' % side
        for feature_suffix, source_suffix in form_sources:
            feature_name = "%s%s%s" % (side, num, feature_suffix)
            source_name = "%s%s" % (side, source_suffix)
            # shift() drops the current match from its own window;
            # Series.rolling(num).mean() replaces the removed pd.rolling_mean,
            # and transform returns values aligned to the original rows.
            full_results_combo[feature_name] = (
                full_results_combo[source_name]
                .groupby(level=team_level)
                .transform(lambda s: s.shift().rolling(num).mean())
            )

Set 2-6 game form difference values

In [8]:
# For every window size 2..6, store home-minus-away for each rolling feature
# as "<N>GameFormDiff", "<N>GameGoalsForDiff" and "<N>GameGoalDiffDiff".
for num in range(2,7):
    for metric in ('GameForm', 'GameGoalsFor', 'GameGoalDiff'):
        home_column = "Home%s%s" % (num, metric)
        away_column = "Away%s%s" % (num, metric)
        diff_column = "%s%sDiff" % (num, metric)
        full_results_combo[diff_column] = full_results_combo[home_column] - full_results_combo[away_column]

Deal with NaN values

In [9]:
# Outcome columns that are empty for rows without a result; they are filled
# with 0 so those rows survive the dropna() on the "zeroed" branch.
outcome_columns = ['HomeGoals', 'AwayGoals', 'Result', 'ResultValue',
                   'HomeResult', 'AwayResult', 'HomeGoalDiff', 'AwayGoalDiff']
zero_fill = dict.fromkeys(outcome_columns, 0)

# Branch 1: drop every row with any NaN.
full_results_combo_dropped = full_results_combo.dropna()

# Branch 2: zero the outcome columns first, then drop rows that still have NaN
# in any remaining column.
full_results_combo_zeroed = full_results_combo.fillna(value=zero_fill)
full_results_combo_zeroed_dropped = full_results_combo_zeroed.dropna()
full_results_combo_zeroed_dropped_reset = full_results_combo_zeroed_dropped.reset_index()

In [10]:
# Take the last row (in current row order) for each home team, then restore the
# four-level match index.
stats_latest = (
    full_results_combo_zeroed_dropped_reset
    .groupby('HomeTeam')
    .last()
    .reset_index()
    .set_index(['Date','Div','HomeTeam','AwayTeam'])
)
# Result == 0 marks rows whose Result was missing and got zero-filled.
has_no_result = stats_latest["Result"] == 0
latest_fixtures = stats_latest[has_no_result]

In [11]:
# Preview the last five rows of the zero-filled table.
full_results_combo_zeroed.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HomeGoals,AwayGoals,Result,B365H,B365D,B365A,ResultValue,HomeResult,AwayResult,HomeGoalDiff,...,3GameGoalDiffDiff,4GameFormDiff,4GameGoalsForDiff,4GameGoalDiffDiff,5GameFormDiff,5GameGoalsForDiff,5GameGoalDiffDiff,6GameFormDiff,6GameGoalsForDiff,6GameGoalDiffDiff
Date,Div,HomeTeam,AwayTeam,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
09/09/18,SP2,Extremadura UD,Granada,0.0,0.0,0,3.6,2.8,2.3,0,0,0,0.0,...,,,,,,,,,,
09/09/18,SP2,La Coruna,Sp Gijon,0.0,0.0,0,2.1,3.0,4.0,0,0,0,0.0,...,0.0,0.5,1.25,0.5,0.4,1.2,0.4,0.166667,1.0,0.166667
09/09/18,SP2,Numancia,Elche,0.0,0.0,0,1.8,3.3,4.75,0,0,0,0.0,...,1.333333,0.75,0.5,1.0,1.0,0.8,1.6,1.166667,1.0,1.833333
09/09/18,SP2,Reus Deportiu,Albacete,0.0,0.0,0,3.1,2.9,2.5,0,0,0,0.0,...,0.666667,0.25,-0.5,0.75,0.6,-0.4,1.0,0.666667,-0.333333,1.0
10/09/18,SP2,Rayo Majadahonda,Lugo,0.0,0.0,0,2.62,3.0,2.87,0,0,0,0.0,...,,,,,,,,,,


Get the latest stats

# Logistic Regression on HomeResult

In [12]:
#y = home_away_merge_idx_dropped['HomeResult']
#X = home_away_merge_idx_dropped[[
#    '3GameFormDiff', '3GameGoalsForDiff', '3GameGoalDiffDiff']]
#logreg = LogisticRegression()

In [13]:
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8)
#logreg.fit(X_train, y_train)
#logreg.score(X_train, y_train)
#cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()

In [14]:
# Model features: head-to-head plus the 3-game window of every form feature.
# The 2-, 4-, 5- and 6-game variants of each group exist under the same naming
# pattern (e.g. 'Home5GameForm', '5GameFormDiff') and can be added here.
head_to_head_features = ['H2H']
home_form_features = ['Home3GameForm', 'Home3GameGoalsFor', 'Home3GameGoalDiff']
away_form_features = ['Away3GameForm', 'Away3GameGoalsFor', 'Away3GameGoalDiff']
form_diff_features = ['3GameFormDiff', '3GameGoalsForDiff', '3GameGoalDiffDiff']

columns = (
    head_to_head_features
    + home_form_features
    + away_form_features
    + form_diff_features
)

In [15]:
# Target is the home-side outcome (1 / 0 / -1); features are the selected
# `columns`, both taken from the rows with no NaN.
y = full_results_combo_dropped['HomeResult']
X = full_results_combo_dropped[columns]

logreg = LogisticRegression()

logreg.fit(X, y)
# NOTE(review): this accuracy is computed on the same rows used for fitting and
# the value is discarded (it is neither assigned nor the cell's last expression).
logreg.score(X, y)

# Predictions on the training rows.
y_pred = logreg.predict(X)

In [16]:
# Preview the last five rows of the feature matrix.
X.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,H2H,Home3GameForm,Home3GameGoalsFor,Home3GameGoalDiff,Away3GameForm,Away3GameGoalsFor,Away3GameGoalDiff,3GameFormDiff,3GameGoalsForDiff,3GameGoalDiffDiff
Date,Div,HomeTeam,AwayTeam,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
01/09/18,T1,Akhisar Belediyespor,Sivasspor,0.8,0.0,1.333333,0.0,-0.666667,1.0,-2.0,0.666667,0.333333,2.0
01/09/18,T1,Fenerbahce,Kayserispor,0.714286,1.0,2.333333,1.0,-0.666667,0.666667,-1.0,1.666667,1.666667,2.0
01/09/18,T1,Trabzonspor,Galatasaray,0.125,0.333333,2.666667,0.666667,1.0,2.0,1.333333,-0.666667,0.666667,-0.666667
02/09/18,T1,Antalyaspor,Rizespor,0.0,-0.666667,1.333333,-1.0,0.0,1.666667,-0.333333,-0.666667,-0.333333,-0.666667
02/09/18,T1,Bursaspor,Besiktas,-0.625,0.0,1.0,-0.333333,0.333333,2.0,0.333333,-0.333333,-1.0,-0.666667


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Random forest with 1000 trees; random_state is fixed so refits are repeatable.
rf=RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Fitted on every row of X / y (no hold-out split in this cell).
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [19]:
# Mean accuracy of the forest on X / y.
# NOTE(review): X / y are the rows the forest was fitted on, so this is a
# training-set score, not an estimate of out-of-sample accuracy.
rf_score = rf.score(X, y)
rf_score

0.97507395639311933

In [20]:
# Class predictions from the logistic-regression model on X.
# NOTE(review): this repeats the earlier logreg.predict(X) call and uses
# `logreg`, not `rf`, although it follows the random-forest cells — confirm
# which model was intended.
y_pred = logreg.predict(X)
y_pred

array([-1,  1, -1, ...,  1, -1, -1])

In [21]:
# Score the upcoming fixtures with the random forest and attach one probability
# column per outcome class.
latest_fixtures_cut = latest_fixtures[columns]
probs = rf.predict_proba(latest_fixtures_cut)

# predict_proba returns one column per entry of rf.classes_, in that order.
# Label the columns from the classes themselves instead of assuming that
# positions 0/1/2 are away win / draw / home win.
class_to_column = {-1: 'AwayWinProb', 0: 'DrawProb', 1: 'HomeWinProb'}
probs_df = pd.DataFrame(probs, columns=[class_to_column[label] for label in rf.classes_])

# Both frames now carry a fresh 0..n-1 index, so join() pairs rows by position.
latest_fixtures_cut_noind = latest_fixtures_cut.reset_index()
pred_join = latest_fixtures_cut_noind.join(probs_df)

In [34]:
#pred_join_cut2 = pd.merge(pred_join_cut,latest_fixtures['B365H'],on='key')
# Bet365 home/draw/away odds for the latest fixtures, with the match key as columns.
# NOTE(review): this rebinds `fixtures`, which earlier held the raw fixtures CSV;
# re-running the base-dataframe cell after this one would pick up this frame instead.
fixtures = latest_fixtures[['B365H','B365D','B365A']].reset_index()

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,B365H,B365D,B365A
0,08/09/18,E2,Accrington,Burton,2.5,3.4,3.0
1,08/09/18,E2,Bristol Rvs,Plymouth,1.9,3.8,4.2
2,08/09/18,E3,Cambridge,Carlisle,2.0,3.6,4.0
3,09/09/18,SP2,Cordoba,Alcorcon,2.4,2.9,3.3
4,08/09/18,E3,Exeter,Notts County,2.2,3.6,3.4


In [40]:
# Home-win probability per fixture, merged with the Bet365 odds on the columns
# the two frames share, then filtered to fixtures with HomeWinProb above 0.65.
home_prob_columns = ["Date","Div","HomeTeam","AwayTeam","HomeWinProb"]
pred_join_cut = pred_join[home_prob_columns].merge(fixtures)

is_likely_home_win = pred_join_cut["HomeWinProb"] > 0.65
LatestHomePredictions = (
    pred_join_cut[is_likely_home_win]
    .sort_values(by=['Date','Div','HomeTeam'])
)
LatestHomePredictions

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,HomeWinProb,B365H,B365D,B365A
0,08/09/18,E2,Accrington,Burton,0.674,2.5,3.4,3.0
2,08/09/18,E3,Cambridge,Carlisle,0.763281,2.0,3.6,4.0
8,09/09/18,SP2,Numancia,Elche,0.746667,1.8,3.3,4.75


In [36]:
# Away-win probability per fixture; display those above 0.5, ordered by
# division, then date, then home team.
away_prob_columns = ["Date","Div","HomeTeam","AwayTeam","AwayWinProb"]
pred_join_cut_away = pred_join[away_prob_columns]

is_likely_away_win = pred_join_cut_away["AwayWinProb"] > 0.5
pred_join_cut_away[is_likely_away_win].sort_values(by=['Div','Date','HomeTeam'])

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,AwayWinProb
5,08/09/18,E2,Gillingham,AFC Wimbledon,0.508667


In [37]:
# Fixtures whose Bet365 home odds lie in the closed interval [1.9, 2.1].
fixtures[fixtures["B365H"].between(1.9, 2.1)]

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,B365H,B365D,B365A
1,08/09/18,E2,Bristol Rvs,Plymouth,1.9,3.8,4.2
2,08/09/18,E3,Cambridge,Carlisle,2.0,3.6,4.0
6,09/09/18,SP2,La Coruna,Sp Gijon,2.1,3.0,4.0
10,08/09/18,SP2,Oviedo,Zaragoza,2.0,3.0,4.33
11,08/09/18,E2,Portsmouth,Shrewsbury,2.1,3.3,3.4


In [42]:
# Persist the filtered home-win predictions. The with-block closes the file
# even if pickle.dump raises.
with open("football_prediction.pickle","wb") as home_prediction_pickle_out:
    pickle.dump(LatestHomePredictions, home_prediction_pickle_out)

In [43]:
# Read the predictions back to confirm the round trip. The original never
# closed this handle; the with-block does.
# NOTE: pickle.load can execute arbitrary code — only load files written by
# this notebook.
with open("football_prediction.pickle","rb") as home_prediction_pickle_in:
    home_prediction_pickle = pickle.load(home_prediction_pickle_in)
home_prediction_pickle

Unnamed: 0,Date,Div,HomeTeam,AwayTeam,HomeWinProb,B365H,B365D,B365A
0,08/09/18,E2,Accrington,Burton,0.674,2.5,3.4,3.0
2,08/09/18,E3,Cambridge,Carlisle,0.763281,2.0,3.6,4.0
8,09/09/18,SP2,Numancia,Elche,0.746667,1.8,3.3,4.75


In [48]:
# Write each listed notebook variable to "<name>.pickle".
variables = ['LatestHomePredictions','full_stats','fixtures','full_results_combo','full_results_combo_dropped']

# NOTE(review): `fixtures` here is whatever the name is bound to when this cell
# runs (it is reassigned to the odds table earlier in the notebook) — confirm
# that is the frame meant to be saved.
notebook_namespace = globals()
for variable in variables:
    # Look the object up by name: the original dumped the name string itself,
    # and `pickle_out.close` without parentheses never closed the file.
    with open("%s.pickle" % (variable),"wb") as pickle_out:
        pickle.dump(notebook_namespace[variable], pickle_out)