<a href="https://colab.research.google.com/github/Yun5141/comp0036/blob/master/FootballResultPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Introduction [*Terry*]

# 2. Data Import  [*Yun*]

In [ ]:
# ------------------- import packages -----------------------
import pandas as pd 

import numpy as np

import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from collections import deque

#!pip3 install geopy
from geopy.distance import geodesic 
from geopy.distance import great_circle 

#!pip3 install sklearn
import sklearn
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# --------------------- import data -------------------------
# Every CSV below is read over HTTP from the project's GitHub repository.

# training data set
url="https://raw.githubusercontent.com/Yun5141/comp0036/master/data/epl-training.csv"
raw_training_data=pd.read_csv(url)

# test set
url = 'https://raw.githubusercontent.com/Yun5141/comp0036/master/data/epl-test.csv'
rawData_toPred = pd.read_csv(url)

# 2019 up-to-date data (from http://www.football-data.co.uk)
url = 'https://raw.githubusercontent.com/Yun5141/comp0036/master/data/epl2019.csv'
rawData_2019_uptodate = pd.read_csv(url)

# geographic information of teams (stadium GPS coordinates),
# used to calculate the distance the away team needs to travel
url = "https://raw.githubusercontent.com/Yun5141/comp0036/master/data/stadiums-with-GPS-coordinates.csv"
geometricData = pd.read_csv(url)

# 3. Data Transformation & Exploration [*Yun*]

### 3.1 Data Cleaning

In [ ]:
# discard the columns whose header contains 'Unnamed:'
column_names = raw_training_data.columns
is_unnamed = column_names.str.contains('Unnamed:')
raw_training_data = raw_training_data[column_names[~is_unnamed]]


def removeInvalidData(data):
    """Return the rows of ``data`` that hold no missing or infinite value.

    A row is discarded when any of its cells is ``None``/``NaN`` or
    ``+inf``/``-inf``.  The frame passed in is left untouched; callers must
    use the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the valid rows (original index labels kept).
    """
    # dropna() treats both None and NaN as missing; it is not run in place so
    # that the caller's frame is never modified behind its back
    cleaned = data.dropna(axis=0, how='any')

    # infinities are not "missing" for dropna(), so they are filtered separately.
    # `axis` is passed by keyword: pandas >= 2.0 rejects the positional any(1).
    has_infinite = cleaned.isin([np.inf, -np.inf]).any(axis=1)

    return cleaned[~has_infinite]

# sanity check: cleaning the raw training data must not remove any row,
# i.e. it holds no None, NaN, infinite or overflowed values
n_rows_before = raw_training_data.shape[0]
n_rows_after = removeInvalidData(raw_training_data).shape[0]
assert n_rows_before == n_rows_after

In [ ]:
# unify the different date formats and convert the type from str to timestamp
def unifyDate(data):
    """Parse the string dates in ``data['Date']`` into timestamps, in place.

    Three day-first layouts are recognised, told apart by string length:
    ``dd/mm/yy`` (8 characters), ``dd Mon yy`` (9) and ``dd/mm/yyyy`` (10).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date`` column.  The column is overwritten.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself.  It is returned unchanged when it is empty or when
        the first date is not a string (i.e. already converted).

    Raises
    ------
    ValueError
        If a date string has a length matching none of the known layouts.
    """
    # .iloc[0] looks at the first row whatever the index labels are
    if len(data) == 0 or not isinstance(data.Date.iloc[0], str):
        return data

    formatByLength = {
        8: "%d/%m/%y",
        9: "%d %b %y",      # the date format in test data
        10: "%d/%m/%Y",
    }

    newDate = []
    for rawDate in data.Date:
        dateFormat = formatByLength.get(len(rawDate))
        if dateFormat is None:
            # skipping the row would make newDate shorter than the frame and
            # fail later with an unrelated length-mismatch error
            raise ValueError("Unrecognised date format: {!r}".format(rawDate))
        newDate.append(pd.to_datetime(rawDate, format=dateFormat))

    # .values drops the fresh 0..n-1 index so the assignment is positional
    data['Date'] = pd.Series(newDate).values

    return data

# unify the date formats of the training data for later exploration and transformation
# (unifyDate overwrites the 'Date' column of the frame it is given)
unifyDate(raw_training_data)

### 3.2 Initial Data Exploration

##### 3.2.1 Number of matches per season

In [ ]:
# to see the number of matches each year (season)
def separateData(data, firstSeason=2008, lastSeason=2018):
    """Split the matches into seasons, keyed by the year each season starts.

    Season ``year`` holds the rows whose ``Date`` lies strictly between
    1 August ``year`` and 1 June ``year + 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a timestamp ``Date`` column.
    firstSeason, lastSeason : int
        First and last season start year, both inclusive.  The defaults
        reproduce the original fixed range 2008-2018.

    Returns
    -------
    dict
        ``{season start year: DataFrame of that season's rows}``.
    """
    dataframe_collection = {}

    for year in range(firstSeason, lastSeason + 1):
        seasonStart = dt.datetime(year, 8, 1)
        seasonEnd = dt.datetime(year + 1, 6, 1)
        inSeason = (data.Date > seasonStart) & (data.Date < seasonEnd)
        dataframe_collection[year] = data[inSeason]

    return dataframe_collection

# season start year -> DataFrame holding that season's matches
collection = separateData(raw_training_data)

In [ ]:
# one line per season: start year followed by the shape of its frame
for season_start, season_matches in collection.items():
    n_rows, n_cols = season_matches.shape
    print("{} [{} rows x {} columns]".format(season_start, n_rows, n_cols))

# The result shows that the number of matches each season stays the same (380).

##### 3.2.2 Percentage of match result

In [ ]:
def getPercentageOfMatchResult(data, matchResult):
    """Return the fraction of matches in ``data`` that ended with ``matchResult``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``FTR`` (full-time result) column.
    matchResult : str
        ``'H'`` (home win), ``'A'`` (away win) or ``'D'`` (draw).

    Returns
    -------
    float
        Value in [0, 1]; ``0.0`` when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``matchResult`` is not one of the three codes.
    """
    if matchResult not in ['H', 'A', 'D']:
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError('The second argument should only take values within [“H”,“A”,“D”]')

    n_total = data.shape[0]
    if n_total == 0:
        # an empty selection (e.g. a season without data) has no matches of any kind
        return 0.0

    n_matching = len(data[data.FTR == matchResult])

    return n_matching / n_total


In [ ]:
def _printResultBreakdown(matches):
    # one line per possible full-time result, as a percentage of `matches`
    print("home team wins: {:.3f}%".format(getPercentageOfMatchResult(matches,"H")*100))
    print("away team wins: {:.3f}%".format(getPercentageOfMatchResult(matches,"A")*100))
    print("draw: {:.3f}%".format(getPercentageOfMatchResult(matches,"D")*100))


# the average percentage of each match result per season
for key, season in collection.items():
    print("\n" +"="*40)
    print("{} [{}]".format(key,len(season)))
    print("-"*40)
    _printResultBreakdown(season)

# the average percentage over the 11 years
print("\n" +"="*40)
print("Overall [{}]".format(len(raw_training_data)))
print("-"*40)
_printResultBreakdown(raw_training_data)

# The results show that a home win is the most likely outcome in every season, and that 'H':'A':'D' $\approx$ 5:3:2 overall.

##### 3.2.3 Relationship between attributes

In [ ]:
# plot a Pearson correlation heatmap of the columns most correlated with the match result FTR
def plotGraph(X_all, Y_all):
    """Show a heatmap of the 11 columns with the largest |correlation| to FTR.

    X_all holds the candidate attributes and Y_all the numerically encoded
    FTR label; the two are concatenated column-wise before correlating.
    """
    combined = pd.concat([X_all, Y_all], axis=1)

    plt.figure(figsize=(12, 12))

    # rank the columns by absolute correlation with FTR and keep the top ones
    n_columns = 11
    abs_corr = abs(combined.astype(float).corr())
    top_columns = abs_corr.nlargest(n_columns, 'FTR')['FTR'].index

    # correlation matrix restricted to the selected columns
    corr_matrix = np.corrcoef(combined[top_columns].values.T)
    tick_labels = top_columns.values

    sns.set(font_scale=1.25)
    sns.heatmap(corr_matrix, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 12}, yticklabels=tick_labels, xticklabels=tick_labels)
    plt.show()

# attributes fed to the heatmap: everything except the identifier columns and the label.
# `axis` is given by keyword because pandas >= 2.0 no longer accepts it positionally.
attributes = raw_training_data.drop(['Date','HomeTeam', 'AwayTeam', 'Referee','FTR'], axis=1)
# half-time and full-time results share the same numeric encoding
attributes['HTR'] = attributes['HTR'].map({'H':1,'A':0,'D':2})
label = raw_training_data['FTR']
label = label.map({'H':1,'A':0,'D':2})
plotGraph(attributes,label)

### 3.2 Feature Construction

In [ ]:
# As within the top 10 features
# there are two pairs of data highly correlated (see details in report),
# so we just pick [FTHG, FTAG, HS, AS, HR, AR] from the top 10 features,
# additional with [Date, HomeTeam, AwayTeam, FTR], to derive our features.
selectedAttributes = ["Date","HomeTeam", "AwayTeam","FTR","FTHG","FTAG","HS","AS","HR","AR"]
# .copy() makes training_data an independent frame: new columns are added to it
# in place further down, which on a plain column selection triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified
training_data = raw_training_data[selectedAttributes].copy()

# -------- Cumulative full time goal difference [HCGD, ACGD]-------------
def getCumulativeGoalsDiff(data, matchesPerSeason=380):
    """Add each team's season-to-date goal difference before the match.

    ``HCGD`` / ``ACGD`` hold the goal difference (scored minus conceded) the
    home / away team has accumulated in the current season *before* the match
    of that row; a team's first match of a season therefore gets 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches in playing order with columns ``HomeTeam``, ``AwayTeam``,
        ``FTHG`` and ``FTAG``.  Modified in place.
    matchesPerSeason : int
        A new season is assumed to start every ``matchesPerSeason`` rows
        (380 in the training data, see section 3.2.1).

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the two new columns.
    """
    runningDiff = {}    # team -> goal difference so far this season
    HCGD = []
    ACGD = []

    matches = zip(data['HomeTeam'].tolist(), data['AwayTeam'].tolist(),
                  data['FTHG'].tolist(), data['FTAG'].tolist())

    # for each match
    for i, (home, away, FTHG, FTAG) in enumerate(matches):
        if i % matchesPerSeason == 0:
            runningDiff = {}

        # each team is looked up on its own: a team without history starts at 0
        # without wiping the total already accumulated by its opponent
        cgd_h = runningDiff.get(home, 0)
        cgd_a = runningDiff.get(away, 0)

        HCGD.append(cgd_h)
        ACGD.append(cgd_a)

        runningDiff[home] = cgd_h + FTHG - FTAG
        runningDiff[away] = cgd_a + FTAG - FTHG

    # index=data.index keeps the values in row order whatever the index labels are
    data.loc[:,'HCGD'] = pd.Series(HCGD, index=data.index)
    data.loc[:,'ACGD'] = pd.Series(ACGD, index=data.index)

    return data

# --------- Average number of shots on goal in the past 3 matches [HAHS, AAHS] ----------
def getAverageShotsOnGoalInPast3Matches(data, matchesPerSeason=380):
    """Add each team's mean shot count over its previous three matches.

    ``HAHS`` / ``AAHS`` hold the mean of the values the home / away team
    recorded (``HS`` when it played at home, ``AS`` when away) in its last
    three matches of the current season.  The value is NaN until the team
    has played three matches in the season.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches in playing order with columns ``HomeTeam``, ``AwayTeam``,
        ``HS`` and ``AS``.  Modified in place.
    matchesPerSeason : int
        A new season is assumed to start every ``matchesPerSeason`` rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the two new columns.
    """
    recentShots = {}    # team -> its last (up to) three shot counts, oldest first
    HAHS = []
    AAHS = []

    def averageOfLast3(team):
        shots = recentShots.get(team, ())
        if len(shots) < 3:
            return None
        return np.mean(list(shots))

    matches = zip(data['HomeTeam'].tolist(), data['AwayTeam'].tolist(),
                  data['HS'].tolist(), data['AS'].tolist())

    # for each match
    for i, (home, away, homeShots, awayShots) in enumerate(matches):
        if i % matchesPerSeason == 0:
            recentShots.clear()

        # evaluated per team, so one side lacking history no longer blanks
        # the average of the other side
        HAHS.append(averageOfLast3(home))
        AAHS.append(averageOfLast3(away))

        # maxlen=3 discards the oldest entry automatically
        recentShots.setdefault(home, deque(maxlen=3)).append(homeShots)
        recentShots.setdefault(away, deque(maxlen=3)).append(awayShots)

    # dtype=float turns the None placeholders into NaN; index=data.index keeps row order
    data.loc[:,'HAHS'] = pd.Series(HAHS, index=data.index, dtype=float)
    data.loc[:,'AAHS'] = pd.Series(AAHS, index=data.index, dtype=float)

    return data

# ----------- Delta time from last match [HDT, ADT] ---------
def getDeltaTime(data, matchesPerSeason=380):
    """Add the number of days since each team's previous match.

    ``HDT`` / ``ADT`` hold the whole days between the match of the row and the
    previous match of the home / away team in the current season; NaN for a
    team's first match of the season.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches in playing order with columns ``HomeTeam``, ``AwayTeam`` and
        a timestamp ``Date``.  Modified in place.
    matchesPerSeason : int
        A new season is assumed to start every ``matchesPerSeason`` rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the two new columns.
    """
    lastMatchDate = {}  # team -> date of its most recent match this season

    HDT = []
    ADT = []

    def daysSinceLastMatch(team, currentDate):
        previousDate = lastMatchDate.get(team)
        if previousDate is None:
            return None
        return (currentDate - previousDate).days

    matches = zip(data['HomeTeam'].tolist(), data['AwayTeam'].tolist(),
                  data['Date'].tolist())

    for i, (home, away, currentDate) in enumerate(matches):
        if i % matchesPerSeason == 0:
            lastMatchDate.clear()

        # computed per team, so one side lacking history no longer blanks the
        # value of the other side
        HDT.append(daysSinceLastMatch(home, currentDate))
        ADT.append(daysSinceLastMatch(away, currentDate))

        lastMatchDate[home] = currentDate
        lastMatchDate[away] = currentDate

    # dtype=float turns the None placeholders into NaN; index=data.index keeps row order
    data.loc[:,'HDT'] = pd.Series(HDT, index=data.index, dtype=float)
    data.loc[:,'ADT'] = pd.Series(ADT, index=data.index, dtype=float)

    return data

# -------------- Distance needed to travel for the away team [DIS] ----------
#The geometricData contains the latitude and longitude of teams
def getDistance(data, geometricData):
    """Add the distance in km between the home and away teams' coordinates.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches with columns ``HomeTeam`` and ``AwayTeam``.  Modified in place.
    geometricData : pandas.DataFrame
        One row per team with columns ``Team``, ``Latitude`` and ``Longitude``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the new float32 column ``DIS``.

    Raises
    ------
    KeyError
        If a team in ``data`` has no entry in ``geometricData``.
    """
    # build the lookup once instead of scanning geometricData four times per match;
    # coordinates are kept as float32, as before
    location = {}
    for team, latitude, longitude in zip(geometricData['Team'],
                                         geometricData['Latitude'],
                                         geometricData['Longitude']):
        location.setdefault(team, (np.float32(latitude), np.float32(longitude)))

    distanceByFixture = {}  # (home, away) -> km; the same fixture recurs every season
    DIS = []
    for home, away in zip(data['HomeTeam'], data['AwayTeam']):
        fixture = (home, away)
        if fixture not in distanceByFixture:
            for team in fixture:
                if team not in location:
                    raise KeyError("No stadium coordinates for team {!r}".format(team))
            distanceByFixture[fixture] = np.float32(
                geodesic(location[home], location[away]).km)
        DIS.append(distanceByFixture[fixture])

    # index=data.index keeps the values in row order whatever the index labels are
    data.loc[:,'DIS'] = pd.Series(DIS, index=data.index, dtype=np.float32)

    return data

# -------- Performances of last 3 matches [HM1, AM1, HM2, AM2, HM3, AM3] -------
def getPerformanceOfLast3Matches(data, matchesPerSeason=380):
    """Add the results of each team's previous three matches.

    ``HM1``/``HM2``/``HM3`` hold the result (``'W'``, ``'L'`` or ``'D'``) of
    the home team's latest, 2nd-latest and 3rd-latest match in the current
    season; ``AM1``/``AM2``/``AM3`` do the same for the away team.  Slots for
    which the team has no match yet are ``None``.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches in playing order with columns ``HomeTeam``, ``AwayTeam`` and
        ``FTR``.  Modified in place.
    matchesPerSeason : int
        A new season is assumed to start every ``matchesPerSeason`` rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the six new columns.
    """
    HM1 = []    # result of the last match of home team
    AM1 = []    # result of the last match of away team

    HM2 = []    # result of the 2nd last match of home team
    AM2 = []

    HM3 = []    # result of the 3rd last match of home team
    AM3 = []

    form = {}   # team -> deque [3rd, 2nd, latest result]

    def recentForm(team):
        # created on first use, so a team that only appears as the away side
        # is handled as well
        if team not in form:
            form[team] = deque([None, None, None], maxlen=3)
        return form[team]

    matches = zip(data['HomeTeam'].tolist(), data['AwayTeam'].tolist(),
                  data['FTR'].tolist())

    for i, (home, away, FTR) in enumerate(matches):

        if i % matchesPerSeason == 0:
            form.clear()

        homeForm = recentForm(home)
        awayForm = recentForm(away)

        HM3.append(homeForm[0])
        AM3.append(awayForm[0])
        HM2.append(homeForm[1])
        AM2.append(awayForm[1])
        HM1.append(homeForm[2])
        AM1.append(awayForm[2])

        if FTR == 'H':
            # home win: home team records a win, away team a loss
            homeResult, awayResult = 'W', 'L'
        elif FTR == 'A':
            # away win: home team records a loss, away team a win
            homeResult, awayResult = 'L', 'W'
        else:
            # draw
            homeResult, awayResult = 'D', 'D'

        # maxlen=3 pushes the oldest result out
        homeForm.append(homeResult)
        awayForm.append(awayResult)

    data.loc[:,'HM1'] = HM1
    data.loc[:,'AM1'] = AM1
    data.loc[:,'HM2'] = HM2
    data.loc[:,'AM2'] = AM2
    data.loc[:,'HM3'] = HM3
    data.loc[:,'AM3'] = AM3

    return data

##### 3.2.6 Derive features and remove invalid data

In [ ]:
# construct features (each call adds its columns to training_data in place)
getCumulativeGoalsDiff(training_data)   # FTHG, FTAG -> HCGD, ACGD
getAverageShotsOnGoalInPast3Matches(training_data)  # HS, AS -> HAHS, AAHS
getDeltaTime(training_data)     # Date -> HDT, ADT
getDistance(training_data,geometricData)    # HomeTeam, AwayTeam -> DIS
getPerformanceOfLast3Matches(training_data) # FTR -> HM1,AM1, HM2,AM2, HM3,AM3 [latest,2nd,3rd]

# remove invalid data
# Due to the lack of data in the beginning of each year, now there are rows containing empty values.
training_data = removeInvalidData(training_data)

# remove intermediate data: every selected raw attribute except HR and AR,
# which are kept as features
dropedAttributes = selectedAttributes.copy()
dropedAttributes.remove("HR")
dropedAttributes.remove("AR")
# `axis` is given by keyword because pandas >= 2.0 no longer accepts it positionally
data = training_data.drop(dropedAttributes, axis=1)

### 3.3 Second Data Exploration – Analyse Numerical Features

In [ ]:
numList = ['HR', 'AR', 'HCGD', 'ACGD', 'HAHS', 'AAHS', 'HDT', 'ADT', 'DIS']

# descriptive statistics of every numerical feature
for col in numList:
    values = data[col].tolist()
    print("\n" +"="*40)
    print("{} [size: {}]".format(col,len(values)))
    print("-"*40)
    print("min: {:.4f} \nmax: {:.4f} \nmedian:{:.4f}".format(np.min(values),np.max(values),np.median(values)))
    # variance and standard deviation both use ddof=1 (sample statistics) so
    # that the printed deviation is the square root of the printed variance
    print("mean: {:.4f} \nvariance: {:.4f} \nstandard deviation: {:.4f}".format(np.mean(values),np.var(values, ddof=1), np.std(values, ddof=1)))

# The conclusions drawn from these results are included in the report.

### 3.4 Data Transformation

In [ ]:
# Separate the training set into feature set and label:
X_all = data.copy()
y_all = training_data['FTR']

# map string label into number
rule = {'H':1, 'A':0, 'D':2}
y_all = y_all.map(rule)

# rescale and standardize numerical features
# NOTE(review): both scalers are fitted on all rows, before the train/test
# split made later in this file, so the held-out rows contribute to the
# scaling statistics — confirm this is intended.
# z-score standardization
stdScaler = StandardScaler().fit(X_all[numList])
X_all[numList] = stdScaler.transform(X_all[numList])

# min-max scaling (applied on top of the standardized values)
minmaxScaler = preprocessing.MinMaxScaler().fit(X_all[numList])
X_all[numList] = minmaxScaler.transform(X_all[numList])

# transform categorical features
def transformCategoricalFeature(data,categoricalFeatureNames):
    """Return a copy of ``data`` with its categorical columns one-hot encoded.

    The columns named in ``categoricalFeatureNames`` are converted to strings
    and replaced by dummy columns called ``<column>_<value>``; any other
    column of dtype ``object`` is encoded the same way.  All remaining columns
    are carried over unchanged, in their original order.  ``data`` itself is
    not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame.
    categoricalFeatureNames : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        New frame sharing the index of ``data``.

    Raises
    ------
    KeyError
        If a name in ``categoricalFeatureNames`` is not a column of ``data``.
    """
    categorical = set(categoricalFeatureNames)
    missing = [name for name in categoricalFeatureNames if name not in data.columns]
    if missing:
        raise KeyError("Columns not found in data: {}".format(missing))

    output = pd.DataFrame(index=data.index)

    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed
    for col_name, col_data in data.items():
        if col_name in categorical:
            # transform feature to string
            col_data = col_data.astype('str')
        # testing the name as well keeps listed columns encoded even when the
        # string conversion does not yield the 'object' dtype
        if col_name in categorical or col_data.dtype == 'object':
            col_data = pd.get_dummies(col_data, prefix = col_name)
        output = output.join(col_data)

    return output
# the six "result of a previous match" columns are the categorical features
categList = ["HM1","AM1", "HM2","AM2", "HM3","AM3"]
X_all = transformCategoricalFeature(X_all, categList)

# 4. Methodology Overview [*Yanke*]

In [ ]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
#Split data set
# 70 % training / 30 % test, stratified on the label and seeded for reproducibility
# NOTE(review): the split is random rather than chronological although the
# features are built from match history — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,test_size = 0.3,random_state = 2,stratify = y_all)
print(X_test, y_test)

In [ ]:
#remove warning to see clear result
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import UndefinedMetricWarning


# silence the two scikit-learn warning categories imported above
for _category in (ConvergenceWarning, UndefinedMetricWarning):
    warnings.filterwarnings(action='ignore', category=_category)


In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score

def _holdoutAccuracy(model):
    # fit on the training split, then score the predictions for the held-out split
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


#Naive Bayes
gaussian = GaussianNB()
y_gaussian, accuracy1 = _holdoutAccuracy(gaussian)

#Logistic Regression
# solver and multi_class are given explicitly to reduce warnings
lr = LogisticRegression(solver='lbfgs', multi_class = 'multinomial')
y_lr, accuracy2 = _holdoutAccuracy(lr)

#Linear Discriminant Analysis
lda = LDA()
y_lda, accuracy3 = _holdoutAccuracy(lda)

#Quadratic Discriminant Analysis
qda = QDA()
y_qda, accuracy4 = _holdoutAccuracy(qda)

#Decision Tree
dtc = DecisionTreeClassifier()
y_dtc, accuracy5 = _holdoutAccuracy(dtc)

#Multilayer Perceptron, a feedforward artificial neural network model
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(15,), random_state=1)
y_nn, accuracy6 = _holdoutAccuracy(nn)

# baseline accuracies, in the order the models were fitted above
result=[accuracy1,accuracy2, accuracy3, accuracy4, accuracy5,accuracy6]
labels = ['Gaussian Naive Bayes:',
          'Logistic Regression:',
          'Linear Discriminant Analysis:',
          'Quadratic Discriminant Analysis:',
          'Decision Tree:',
          'Multilayer Perceptron(Neural Network):']
for _label, _accuracy in zip(labels, result):
    print("{:<38}{}".format(_label, _accuracy))

In [ ]:
from time import time
from sklearn.metrics import f1_score
# train classifier
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given data and print how long the fit took."""
    t0 = time()
    clf.fit(X_train, y_train)
    elapsed = time() - t0
    print("time for training: {:.4f} sec".format(elapsed))

# predict using the classifier
def predict_labels(clf, features, target):
    """Predict ``features`` with ``clf`` and score the result against ``target``.

    Prints the prediction time and returns ``(weighted F1 score, accuracy)``.
    """
    t0 = time()
    y_pred = clf.predict(features)
    elapsed = time() - t0
    print("time for prediction: {:.4f} sec".format(elapsed))

    weighted_f1 = f1_score(target, y_pred, pos_label=1, average="weighted")
    # accuracy = share of predictions equal to the true label
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))
    return weighted_f1, accuracy

# print out the performance of each classifier
def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and print its F1 score and accuracy on both data splits."""
    print("Classifier: {} [sample size: {}]".format(clf.__class__.__name__, len(X_train)))

    train_classifier(clf, X_train, y_train)

    # evaluate the fitted model on the train set first, then on the test set
    splits = (
        ("[on train set]", X_train, y_train),
        ("[on test set]", X_test, y_test),
    )
    f1_by_split = []
    for banner, features, target in splits:
        print(banner)
        f1, acc = predict_labels(clf, features, target)
        print("F1 score: {:.4f} ".format(f1))
        print("accuracy: {:.4f}".format(acc))
        f1_by_split.append(f1)

    f1a, f1b = f1_by_split
    print("average F1: {:.4f}".format((f1a+f1b)/2))

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# tune the hyperparameters of a model with a grid search scored by f1_scorer
def adjustClassifier(clf, f1_scorer, param, X_train, y_train):
    """Return the best estimator found by a 5-fold grid search over ``param``."""
    search = GridSearchCV(clf, scoring=f1_scorer, param_grid=param, cv=5)
    search.fit(X_train, y_train)
    return search.best_estimator_


In [ ]:
#cross-validation with KFold
# 10 consecutive, unshuffled folds. random_state is left out on purpose: it
# only has an effect together with shuffle=True, and recent scikit-learn
# releases raise a ValueError when it is set while shuffle is False.
kfold_ada = model_selection.KFold(n_splits=10)

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
#Logistic Regression
#Score Before optimization
print('Before optimization')
clf2 = LogisticRegression(solver='lbfgs', multi_class = 'multinomial')
# mean score over the folds of kfold_ada; no `scoring` is passed, so the
# estimator's default score is what gets averaged
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

#Score after optimization
print('After Optimization')
clf = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial')
f1_scorer = make_scorer(f1_score, average = 'weighted')   #f1-scorer, need to set average to 'weighted' since target is multi class
# set hyper parameter to be optimised
# NOTE(review): scikit-learn documents intercept_scaling as useful only with
# the 'liblinear' solver — confirm this grid dimension has any effect here.
parameters = { 
              'C' :[1.0, 100.0, 1000.0],
              'max_iter':[100,200,300, 400, 500],
              'intercept_scaling':[0.1, 0.5, 1.0]
             }
# grid search selects by weighted F1, the cross-validation below reports the default score
lr_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(lr_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(lr_2, X_train, y_train, X_test, y_test)




In [ ]:
# GaussianNB
# baseline: default hyperparameters, cross-validated on the training split
print('Before optimization')
clf2 = GaussianNB()
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

# tuned: var_smoothing chosen by grid search on weighted F1
print('After optimization')
clf = GaussianNB()
f1_scorer = make_scorer(f1_score, average = 'weighted')
parameters = { 
              'var_smoothing': [1e-09, 1e-07, 1e-05, 1e-11, 1e-13]
             }
gaussian_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(gaussian_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(gaussian_2, X_train, y_train, X_test, y_test)




In [ ]:
#LDA
# baseline: default hyperparameters, cross-validated on the training split
print('Before optimization')
clf2 = LDA()
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

print('After optimization')
# NOTE(review): `tol` is the only hyperparameter searched; check in the
# scikit-learn docs whether it affects the fitted model for the solver in use.
parameters = { 
              'tol': [ 0.001, 0.0001, 0.00001]
             }
clf = LDA()
f1_scorer = make_scorer(f1_score, average = 'weighted')
# use grid search to optimise the hyperparameter
lda_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(lda_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(lda_2, X_train, y_train, X_test, y_test)


In [ ]:
#QDA()
# baseline: default hyperparameters, cross-validated on the training split
print('Before optimization')
clf2 = QDA()
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

print('After optimization')
# 4 x 3 = 12 combinations of regularisation strength and tolerance
parameters = { 
              'reg_param': [0, 0.1, 0.01, 0.001],
              'tol': [0.001, 0.0001, 0.00001]
             }
clf = QDA()
f1_scorer = make_scorer(f1_score, average = 'weighted')
# use grid search to optimise hyper parameter
qda_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(qda_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(qda_2, X_train, y_train, X_test, y_test)


In [ ]:
#Decision Tree
# baseline: default hyperparameters, cross-validated on the training split
# (no random_state is set, so repeated runs may give different trees)
print('Before optimization')
clf2 = DecisionTreeClassifier()
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

print('After optimization')
# min_impurity_decrease is the only hyperparameter searched
parameters = { 
                'min_impurity_decrease':[0, 0.1, 1]
             }
clf = DecisionTreeClassifier()
f1_scorer = make_scorer(f1_score, average = 'weighted')
# use grid search to optimise hyper parameter
dtc_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(dtc_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(dtc_2, X_train, y_train, X_test, y_test)


In [ ]:
#MLP Classifier cannot be directly used as a base estimator in Ada Boosting estimator(sample_weight not available)
#so this custom classifier will fix this problem
#This can be found at: https://stackoverflow.com/questions/55632010/using-scikit-learns-mlpclassifier-in-adaboostclassifier
class customMLPClassifier(MLPClassifier):
    """MLPClassifier whose ``fit`` accepts ``sample_weight``.

    The weights are not passed to the network; they are emulated by drawing a
    bootstrap sample of the training data in which each row is picked with
    probability proportional to its weight.
    """

    def resample_with_replacement(self, X_train, y_train, sample_weight):
        """Draw ``len(X_train)`` rows with replacement, weighted by ``sample_weight``.

        Returns ``(X, y)`` as numpy arrays: X cast to float32 and y to int,
        as in the original implementation.  The draw uses numpy's global
        random state, so it is not controlled by ``self.random_state``.
        """
        # np.asarray makes the positional indexing below valid for pandas input too
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        sample_weight = np.asarray(sample_weight, dtype=np.float64)

        # normalize sample_weights if not already
        sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

        # one vectorised draw instead of one np.random.choice call per sample
        n_samples = len(X_train)
        drawn = np.random.choice(n_samples, size=n_samples, replace=True, p=sample_weight)

        X_train_resampled = X_train[drawn].astype(np.float32)
        # builtin int replaces the np.int alias, which NumPy 1.24 removed
        y_train_resampled = y_train[drawn].astype(int)

        return X_train_resampled, y_train_resampled


    def fit(self, X, y, sample_weight=None):
        """Fit the network, resampling the data first when weights are given."""
        if sample_weight is not None:
            X, y = self.resample_with_replacement(X, y, sample_weight)

        # delegates to MLPClassifier's private _fit, as the referenced recipe does
        return self._fit(X, y, incremental=(self.warm_start and
                                            hasattr(self, "classes_")))


In [ ]:
#Neural Network
# The class defined above is customMLPClassifier; the previous spelling
# "customMLPClassifer" raised a NameError.

print('Before optimization')
clf2 = customMLPClassifier()
results = model_selection.cross_val_score(clf2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(clf2, X_train, y_train, X_test, y_test)
print("\n")

print('After optimization')
parameters = { 
              'alpha': [ 1e-03, 1e-05, 1e-07],
              'hidden_layer_sizes':[ (5,), (10,), (15,)],
              'learning_rate_init':[0.01, 0.001, 0.0001],
             }
clf = customMLPClassifier()
f1_scorer = make_scorer(f1_score, average = 'weighted')
# use grid search to optimise hyper parameter
nn_2 = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)
results = model_selection.cross_val_score(nn_2,X_train, y_train, cv=kfold_ada)
print(results.mean())
train_predict(nn_2, X_train, y_train, X_test, y_test)

In [ ]:
#accuracy result after optimization
#compare with previous result
print("{:<32}{:20}{:20}".format("classifier", "before","after" ))
estimators = [lr_2, gaussian_2, lda_2, qda_2, dtc_2, nn_2]
# `result` was filled in the order naive Bayes, logistic regression, LDA, QDA,
# decision tree, MLP, whereas `estimators` starts with logistic regression.
# The baseline accuracies are therefore re-ordered so that every row compares
# a model with its own earlier score.
baseline = [result[1], result[0], result[2], result[3], result[4], result[5]]
for estimator, before in zip(estimators, baseline):
  estimator.fit(X_train, y_train)
  y = estimator.predict(X_test)
  accuracy = accuracy_score(y_test, y)
  print("{:<32}{:<20}{:<20}".format(estimator.__class__.__name__+':', before, accuracy ))

# We can see that most classifiers perform slightly better after hyperparameter optimization.

In [ ]:
#Ensemble model use Ada boosting method
#LDA and QDA cannot be used as base_estimator of ada boosting in scikit-learn, so we cannot ensemble these 2 estimators
from sklearn.ensemble import AdaBoostClassifier

# tuned models used as the base estimator of an AdaBoost ensemble, one at a time
model1 = [lr_2, gaussian_2, dtc_2]
# random_state is left out: it only matters with shuffle=True, and recent
# scikit-learn releases raise a ValueError when it is set without shuffling
kfold_ada = model_selection.KFold(n_splits=10)
for estimator in model1:
  # the base estimator is passed positionally because its keyword was renamed
  # from `base_estimator` to `estimator` in newer scikit-learn releases
  model_ada = AdaBoostClassifier(estimator, n_estimators=30, random_state=10)
  results_ada = model_selection.cross_val_score(model_ada,X_train, y_train, cv=kfold_ada)
  print(results_ada.mean())
  train_predict(model_ada, X_train, y_train, X_test, y_test)
  print('\n')


# clf = AdaBoostClassifier(n_estimators=30, random_state=10, base_estimator = nn_2)
# results_ada = model_selection.cross_val_score(clf,X_train, y_train, cv=kfold_ada)




In [ ]:
#final model, this gives better cross_validation_score and f1_score for test set
# the base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases
final_model = AdaBoostClassifier(lr_2, n_estimators=30, random_state=10)


# 5. Model Training & Validation [*Yanke*]

# 6. Result [*Yi*]

# 7. Final Predictions on Test Set [*Yusi*]