In [196]:
# ------------------- import packages -----------------------

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from os import chdir

# !pip install sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, roc_curve, precision_recall_curve

import io

# !pip install geopy
from geopy.distance import geodesic 
from geopy.distance import great_circle 

import time
import datetime as dt
from datetime import datetime

import math

from collections import deque

# ------------------- import data -------------------------
url="https://raw.githubusercontent.com/Yun5141/comp0036/master/elp_up-to-date_data.csv"
training_data=pd.read_csv(url)
url = "https://raw.githubusercontent.com/Yun5141/comp0036/master/stadiums-with-GPS-coordinates.csv"
geometricData = pd.read_csv(url)

# ------------------ helper functions -------------------
# ********************************************
# to remove data that contains None, NaN, infinite or overflowed
def removeInvalidData(data):

    # remove data which contains None
    data.dropna(axis=0, how='any',inplace=True)

    # remove data which contains NaN, infinite or overflowed number 
    indices_to_keep = ~data.isin([np.nan, np.inf, -np.inf]).any(1)
    data = data[indices_to_keep]

    return data

# !!! 【report中: 第一步先检查有无空值】
assert training_data.shape[0] == removeInvalidData(training_data).shape[0]
#result: there is no empty value at the initial stage 

# ********************************************
# unify the different date formats and convert the type from str to timestamp   [done]
def unifyDate(data):

    if not isinstance(data.Date[0],str):
        return

    newDate = []
    for _, matchInfo in data.iterrows():
        if len(matchInfo.Date) == 8 :
            newDate.append( pd.to_datetime(matchInfo.Date, format="%d/%m/%y" ))
        elif len(matchInfo.Date) == 9 :
            newDate.append( pd.to_datetime(matchInfo.Date, format="%d %b %y" ))
        elif len(matchInfo.Date) == 10 :
            newDate.append(  pd.to_datetime(matchInfo.Date, format="%d/%m/%Y" ))
    
    data['Date'] = pd.Series(newDate).values

#unifyDate(training_data)

# ------------------ Inital Data Exploration -------------------  
# ********************************************
# to see the number of matches each year / season
def separateData(data):
    dataframe_collection = {}

    for year in range(2008, 2020):
        dataframe_collection[year] = data[(data.Date > dt.datetime(year,8,1,0,0) ) & (data.Date < dt.datetime(year+1, 6, 1,0,0))]

    return dataframe_collection

'''
data = separateData(training_data)
for key in data.keys():
    print("\n" +"="*40)
    print(key)
    print("-"*40)
    print(data[key])
'''
#result: 380 rows * 11 dataframes + 170 rows * 1 dataframes = 4350 rows

# ********************************************
# !!! 【在数据正式处理前后各用一次这个函数，即两次的data exploration section】
def checkAverageWinRate(data, resultWinner):

    if resultWinner not in ['H', 'A', 'D']:
        raise Exception('The second argument should only take values within [“H”,“A”,“D”]')
    
    n_wins = len(data[data.FTR == resultWinner])

    return n_wins / data.shape[0]

#prediction = checkAverageWinRate(training_data, 'H')

#results of raw data (ie, when nothing applied to the training data): 
#total number of matches = 4350
#home team = 0.4606896551724138 ~ 0.461; 
#away team = 0.2910344827586207 ~ 0.291;
#draw = 0.2482758620689655 ～ 0.248

# ------------------- Feature Construction ------------------——————————
#*******************************
# get the distance needed to travel for the away team   [done] 
def getDistance(data, geometricData):
  array = []
  for x in data.iterrows():
   
    home_lat = (geometricData.loc[geometricData['Team'] == x[1].HomeTeam]).Latitude
    home_long = (geometricData.loc[geometricData['Team'] == x[1].HomeTeam]).Longitude
    home_location = (np.float32(home_lat), np.float32(home_long))
    
    away_lat = (geometricData.loc[geometricData['Team'] == x[1].AwayTeam]).Latitude
   
    away_long = (geometricData.loc[geometricData['Team'] == x[1].AwayTeam]).Longitude
    away_location = (np.float32(away_lat), np.float32(away_long))
    array.append(np.float32(geodesic(home_location, away_location).km))
  
  
  DIS = pd.Series(array)
  data.loc[:,'DIS'] = DIS

  return data

#getDistance(training_data, geometricData) 
#print(training_data)

#*******************************
# get match week    [done]
def getMW(data, startYear):  
    MW = []
    Flag = 0
    year = startYear

    for _, matchInfo in data.iterrows():
        checkYear = (matchInfo.Date > dt.datetime(year,8,1,0,0)) & (matchInfo.Date < dt.datetime(year+1, 6, 1,0,0)) 
        
        if not checkYear:
            year += 1
            Flag = 0
    
        if (Flag == 0):
            firstDate = matchInfo.Date
            Flag = 1

        week = (matchInfo.Date - firstDate).days // 7 +1
        
        MW.append(week) 

    data.loc[:,'MW'] = pd.Series(MW).values

    return data

'''
使用方法1: 赛季分开成12张分表，则
    unifyDateformat(data)
    separate(data)
然后
for key in data.keys():
    print("\n" +"="*40)
    print(key)
    print("-"*40)
    #print(data[key])
    print(getMW(data[key], key))

使用方法2: 不分赛季，使用完整的表，则
    unifyDateFormat(data)
然后
print(getMW(data, 2008))
'''

#*******************************
# calculate the delta time from last match for home team and away team  [done]
def getDeltaTime(data):
    
    teams = {}

    HDT = []
    ADT = []

    for i in range(len(data)):
        if (i % 380 == 0):
            for name in data.groupby('HomeTeam').mean().T.columns:
                teams[name] = []    # to store last match date

        currentDate = data.iloc[i].Date

        try:
            homeLastMatchDate = teams[data.iloc[i].HomeTeam].pop()
            awayLastMatchDate = teams[data.iloc[i].AwayTeam].pop()
        except:
            homeLastMatchDate = currentDate
            awayLastMatchDate = currentDate

        hdt = currentDate - homeLastMatchDate
        adt = currentDate - awayLastMatchDate

        HDT.append(hdt.days)
        ADT.append(adt.days)

        teams[data.iloc[i].HomeTeam].append(currentDate)
        teams[data.iloc[i].AwayTeam].append(currentDate)

    data.loc[:,'HDT'] = HDT
    data.loc[:,'ADT'] = ADT

    return data

#unifyDateFormat(training_data)
#getMW(training_data,2008)
#getDeltaTime(training_data)
#training_data.loc[377:400,["Date","HomeTeam","AwayTeam","MW","HDT","ADT"]]

#*****************************
# calculate the cumulative goal difference (before this match) scored by home team and away team    [done]
def getCumulativeGoalsDiff(data):
    teams = {}
    HCGD = [] 
    ACGD = []   

    # for each match
    for i in range(len(data)):
        
        if (i % 380 == 0):
            for name in data.groupby('HomeTeam').mean().T.columns:
                teams[name] = []

        FTHG = data.iloc[i]['FTHG']
        FTAG = data.iloc[i]['FTAG']

        try:
            cgd_h = teams[data.iloc[i].HomeTeam].pop()
            cgd_a = teams[data.iloc[i].AwayTeam].pop()
        except:
            cgd_h = 0
            cgd_a = 0

        HCGD.append(cgd_h)
        ACGD.append(cgd_a)
        cgd_h = cgd_h + FTHG - FTAG
        teams[data.iloc[i].HomeTeam].append(cgd_h)
        cgd_a = cgd_a + FTAG - FTHG
        teams[data.iloc[i].AwayTeam].append(cgd_a)

    data.loc[:,'HCGD'] = HCGD
    data.loc[:,'ACGD'] = ACGD
    return data

#getCumulativeGoalsDiff(training_data)
#training_data

#****************************
# get average goal difference per week
def getAverageGD(data):

    data.eval('HAGD = HCGD / MW', inplace=True)
    data.eval('AAGD = ACGD / MW', inplace=True)

    return data

# !!!【必须有了CGD与MW之后再写这一个；在第二次explore画图时舍弃CGD，AGD其中一个】
# unifyDateFormat(training_data)
# getMW(training_data,2008)
# getCumulativeGoalsDiff(training_data)
# getAverageGD(training_data)

#****************************
# !!!【 写report时在代码块外提一句: 因为在最开始用separateData()已发现，每年比赛数都是固定的380场，所以循环里可直接用i%380==0来初始化】
# 统计每支队伍最近三场比赛的表现    [done]
def getPerformanceOfLast3Matches(data):
    HM1 = []    # result of the last match of home team
    AM1 = []    # result of the last match of away team

    HM2 = []    # result of the 2nd last match of home team
    AM2 = []

    HM3 = []    # result of the 3rd last match of home team
    AM3 = []

    teams = {}

    for i in range(len(data)):
        
        if (i % 380 == 0):
            for name in data.groupby('HomeTeam').mean().T.columns:
                teams[name] = deque([None, None, None])  #[3rd, 2nd, latest data]

        HM3.append(teams[data.iloc[i].HomeTeam].popleft())
        AM3.append(teams[data.iloc[i].AwayTeam].popleft())
        HM2.append(teams[data.iloc[i].HomeTeam][0])
        AM2.append(teams[data.iloc[i].AwayTeam][0])
        HM1.append(teams[data.iloc[i].HomeTeam][1])
        AM1.append(teams[data.iloc[i].AwayTeam][1])

        if data.iloc[i].FTR == 'H':
            # 主场 赢，则主场记为赢，客场记为输
            teams[data.iloc[i].HomeTeam].append('W')
            teams[data.iloc[i].AwayTeam].append('L')
        elif data.iloc[i].FTR == 'A':
            # 客场 赢，则主场记为输，客场记为赢
            teams[data.iloc[i].AwayTeam].append('W')
            teams[data.iloc[i].HomeTeam].append('L')
        else:
            # 平局
            teams[data.iloc[i].AwayTeam].append('D')
            teams[data.iloc[i].HomeTeam].append('D')

    data.loc[:,'HM1'] = HM1
    data.loc[:,'AM1'] = AM1
    data.loc[:,'HM2'] = HM2
    data.loc[:,'AM2'] = AM2
    data.loc[:,'HM3'] = HM3
    data.loc[:,'AM3'] = AM3

    return data

#getPerformanceOfLast3Matches(training_data)
#print(training_data)




In [197]:
# ----create features-----

getDistance(training_data,geometricData)

unifyDate(training_data)
getMW(training_data,2008)
getDeltaTime(training_data)
getCumulativeGoalsDiff(training_data)
getAverageGD(training_data)
getPerformanceOfLast3Matches(training_data)

training_data


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HCGD,ACGD,HAGD,AAGD,HM1,AM1,HM2,AM2,HM3,AM3
0,2008-08-16,Arsenal,West Brom,1,0,H,1,0,H,H Webb,...,0,0,0.000000,0.000000,,,,,,
1,2008-08-16,Bolton,Stoke,3,1,H,3,0,H,C Foy,...,0,0,0.000000,0.000000,,,,,,
2,2008-08-16,Everton,Blackburn,2,3,A,1,1,D,A Marriner,...,0,0,0.000000,0.000000,,,,,,
3,2008-08-16,Hull,Fulham,2,1,H,1,1,D,P Walton,...,0,0,0.000000,0.000000,,,,,,
4,2008-08-16,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,...,0,0,0.000000,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4345,2019-12-14,Southampton,West Ham,0,1,A,0,1,A,M Atkinson,...,-17,-10,-0.894737,-0.526316,L,L,W,L,W,W
4346,2019-12-15,Man United,Everton,1,1,D,0,1,A,M Oliver,...,6,-9,0.315789,-0.473684,W,W,W,L,D,L
4347,2019-12-15,Wolves,Tottenham,1,2,A,0,1,A,S Attwell,...,4,7,0.210526,0.368421,D,W,W,L,D,W
4348,2019-12-15,Arsenal,Man City,0,3,A,0,3,A,P Tierney,...,0,25,0.000000,1.315789,W,L,L,W,D,D


In [212]:
# --------------- 删除中间数据 -----------------
# !!!【在notebook中不写成函数，直接写里面的代码】
def removeIntermediateData(data):   # or removeUnwantedData(data)
    data = data[data.MW > 3]
    
    data = removeInvalidData(data)

    return data

training_data = removeIntermediateData(training_data)

# (--------------- Progress Summary -----------------)

# !!!【不必写成函数；重点是给一个总结，并且说明feature是28个，因为FTR是标签不是feature】
def printOutSummary(data):
    n_matches = data.shape[0]
    n_features = data.shape[1] - 1  # FTR is a label, not feature

    print("total number of matches: {}".format(n_matches))
    print("number of features: {}".format(n_features)) 
    print("home team win rate: {}".format(checkAverageWinRate(training_data,'H')))
    # print ("away team xxxxxxxx")
    # print ("draw xxxxxxxx")

#results of processed data:
#total number of matches = 3981
#number of features = 28  
#home team = 0.4654609394624466 ~ 0.4655; 
#away team = 0.2868625973373524 ~ 0.2869;
#draw = 0.24767646320020095 ~ 0.2477

# !!!【 in report: From these results, we can find that the processed data is still imbalanced. We chose to make it binary.】

# --------------- Data Transformation -----------------
# ********************************
data = training_data.copy()
data.drop(['Date','HomeTeam', 'AwayTeam', 'Referee','FTHG', 'FTAG', 'MW'],1, inplace=True)

# ********************************
# simplify to a binary problem, make the target be FTR == 'H'
def simplifyLabel(label):
    if label == 'H':
        return 'H'
    else:
        return 'NH'

#data['FTR'] = data.FTR.apply(simplifyLabel)
#data['HTR'] = data.HTR.apply(simplifyLabel)

# ********************************
# separate the training data into : feature set, label
X_all = data.drop(['FTR'],1)
Y_all = data['FTR']

# map the label into 0, 1, 2 [multi-class classification]
multiR = {'H':1, 'A':0, 'D':2}
Y_all_multi = Y_all.map(d)

'''
# map the label into 0, 1 [binary class classification]
Y_all_bi = Y_all.FTR.apply(simplifyLabel)
biR = {'NH':0,'H':1}
Y_all_bi = Y_all_bi.map(biR)
'''

# separate the columns in feature set by types: 
categList = ["HTR", "HM1","AM1", "HM2","AM2", "HM3","AM3"]
numList = list(set(X_all.columns.tolist()).difference(set(categList)))


# ********************************
# rescale data
def rescale(data, cols):
    for col in cols:
        max = data[col].max()
        min = data[col].min()
        data[col] = (data[col] - min) / (max - min)
    return data

rescale(X_all,numList)   #[not sure if needed to be the whole numList]

# ********************************
# standardization
from sklearn.preprocessing import scale
def standardize(data,cols):
    for col in cols:
        data[col] = scale(data[col])

standardize(X_all, numList)

# ********************************
# transform categorical features
def transformCategoricalFeature(data,categoricalFeatureNames):
    # 把这些特征转换成字符串类型
    for col in categoricalFeatureNames:
        data[col] = data[col].astype('str')
    
    output = pd.DataFrame(index=data.index)

    for col_name, col_data in data.iteritems():
        if col_data.dtype == 'object':
            col_data = pd.get_dummies(col_data, prefix = col_name)
        output = output.join(col_data)
    
    return output

X_all = transformCategoricalFeature(X_all, categList)



In [223]:
#X_all[X_all.HTR_A == 1].HTR_A
#data.AM1

In [224]:
# --------------- Visualization -----------------
import matplotlib.pyplot as plt
import seaborn as sns
# ************************************
# plot all the features with Pearson correlation heatmap
def plotGraph(X_all, Y_all):

    train_data=pd.concat([X_all,Y_all],axis=1)

    colormap = plt.cm.RdBu
    plt.figure(figsize=(21,18))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    sns.heatmap(train_data.astype(float).corr(),linewidths=0.1,vmax=1.0,
                square=True, cmap=colormap, linecolor='white', annot=True)

# plotGraph(X_all, Y_all)
# !!!【in report: found that HAGD & HCGD, AAGD & ACGD are highly correlated, so drop HCGD, ACGD】
# X_all = X_all.drop(["HCGD","ACGD"], axis=1)

# *************************************
# plot the top 10 features related to FTR
def plotGraph2(X_all, Y_all):
    train_data=pd.concat([X_all,Y_all],axis=1)

    #FTR correlation matrix
    plt.figure(figsize=(14,12))
    k = 10 # number of variables for heatmap
    cols = abs(train_data.astype(float).corr()).nlargest(k, 'FTR')['FTR'].index
    cm = np.corrcoef(train_data[cols].values.T)
    sns.set(font_scale=1.25)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
    plt.show()


# plotGraph2(X_all, Y_all)
# !!! 【in report: give a few comment of this graph】

# X_all = X_all["A", "B", "C", xxxx] 
# select the top 10 features according to the graph2, drop others




In [252]:
# -------------------- Classifiers ------------------------- 

from sklearn.naive_bayes import GaussianNB
clf1 = GaussianNB()

from sklearn.linear_model import LogisticRegression
clf2 = LogisticRegression(solver='lbfgs', multi_class = 'multinomial')

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
clf3 = LDA()

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
clf4 = QDA()

from sklearn.tree import DecisionTreeClassifier
clf5 = DecisionTreeClassifier()

from sklearn.neural_network import MLPClassifier
clf6 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)

clfs = [clf1, clf2, clf3, clf4, clf5, clf6]


# -------------------- Evaluation ------------------------- 
# ********************************
# split data
X_train, X_test, y_train, y_test = train_test_split(X_all, Y_all, test_size = 50,random_state = 2,stratify = Y_all)

# ********************************
from time import time
from sklearn.metrics import f1_score
# train classifier
def train_classifier(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print("time for training: {:.4f} sec".format(end - start))

# predict using the classifier
def predict_labels(clf, features, target):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print("time for prediction: {:.4f} sec".format(end - start))
    return f1_score(target, y_pred, pos_label=1, average="weighted"), sum(target == y_pred) / float(len(y_pred))

# print out the performance of each classifer
def train_predict(clf, X_train, y_train, X_test, y_test):

    print("Classifier: {} [sample size: {}]".format(clf.__class__.__name__, len(X_train)))

    train_classifier(clf, X_train, y_train)

    # evaluate model on train set
    print("[on train set]")
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score: {:.4f} ".format(f1))
    print("accuracy: {:.4f}".format(acc))

    # evaluate model on test set
    print("[on test set]")
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score: {:.4f} ".format(f1))
    print("accuracy: {:.4f}".format(acc))

'''
for clf in clfs:
    train_predict(clf, X_train, y_train, X_test, y_test)
    print("\n")
'''
# [in report: xxx takes the shortest time for training; xxx has the highest accuracy; xxx [give comments to the result]]
# [in report: so we choose to adjust xxxx (the relatively best one among them) with hyperparameters]


'\nfor clf in clfs:\n    train_predict(clf, X_train, y_train, X_test, y_test)\n    print("\n")\n'

In [257]:

#xtrain = X_train.copy()
#xtest = X_test.copy()
'''
for clf in clfs:
    train_predict(clf, X_train, y_train, X_test, y_test)
    print("\n")
'''
#xtrain = xtrain[["DIS","HDT","ADT", "HCGD", "ACGD", "HAGD", "AAGD", "HM1_L", "HM2_L", "HM3_L","AM1_L", "AM2_L", "AM3_L", "HM1_W", "HM2_W", "HM3_W","AM1_W", "AM2_W", "AM3_W", "HM1_D", "HM2_D", "HM3_D","AM1_D", "AM2_D", "AM3_D"]]
#xtest = xtest[["DIS","HDT","ADT", "HCGD", "ACGD", "HAGD", "AAGD", "HM1_L", "HM2_L", "HM3_L","AM1_L", "AM2_L", "AM3_L", "HM1_W", "HM2_W", "HM3_W","AM1_W", "AM2_W", "AM3_W", "HM1_D", "HM2_D", "HM3_D","AM1_D", "AM2_D", "AM3_D"]]


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# adjust the model with hyperparameter      【如果只一个model的话可以不用写成函数】
def adjustClassifier(clf, f1_scorer, param, X_train, y_train):

    grid_obj = GridSearchCV(clf,scoring=f1_scorer,param_grid=param,cv=5)
    grid_obj = grid_obj.fit(X_train,y_train)

    clf = grid_obj.best_estimator_

    return clf

clf = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial')
f1_scorer = make_scorer(f1_score, average = 'weighted')
parameters = { 
              'C' :[1.0, 100.0, 1000.0],
              'max_iter':[100,200,300, 400, 500],
              'intercept_scaling':[0.1, 0.5, 1.0]
             }
clf = adjustClassifier(clf, f1_scorer, parameters, X_train, y_train)

train_predict(clf, X_train, y_train, X_test, y_test)

print("\n")
from sklearn.linear_model import LogisticRegression
clf2 = LogisticRegression(solver='lbfgs', multi_class = 'multinomial')

train_predict(clf2, X_train, y_train, X_test, y_test)


Classifier: LogisticRegression [sample size: 3931]
time for training: 0.0580 sec
[on train set]
time for prediction: 0.0014 sec
F1 score: 0.6480 
accuracy: 0.6640
[on test set]
time for prediction: 0.0008 sec
F1 score: 0.6318 
accuracy: 0.6800


Classifier: LogisticRegression [sample size: 3931]
time for training: 0.0963 sec
[on train set]
time for prediction: 0.0014 sec
F1 score: 0.6484 
accuracy: 0.6642
[on test set]
time for prediction: 0.0011 sec
F1 score: 0.6318 
accuracy: 0.6800


In [258]:
# ------------ Derive Feature of Test Sample --------------------
# 把test sample里的名字查出，在此之前的比赛的各特征的平均值，distance现场算出

# read data
url = 'https://raw.githubusercontent.com/Yun5141/comp0036/master/epl-test.csv'
rawData_toPred = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/Yun5141/comp0036/master/2019EPL.csv'
preData2019 = pd.read_csv(url)

# join the two dataframe together
data2019 = pd.concat([preData_2019,X_sample],ignore_index=True,sort=False)

In [0]:
print(rawData_toPred)
print(preData2019)

In [259]:
'''
getDistance(training_data,geometricData)

unifyDateFormat(training_data)
getMW(training_data,2008)
getDeltaTime(training_data)
getCumulativeGoalsDiff(training_data)
getAverageGD(training_data)
getPerformanceOfLast3Matches(training_data)

'''
# derive feature
unifyDate(data2019)
getDistance(data2019,geometricData) # HomeTeam,AwayTeam -> DIS
getMW(data2019,2019)    # Date -> MW
getDeltaTime(data2019)  # Date -> HDT, ADT
getCumulativeGoalsDiff(data2019)    # FTHG, FTAG -> HCGD, ACGD
getAverageGD(data2019)      # HCGD, ACGD, MW -> HAGD, AAGD
getPerformanceOfLast3Matches(data2019)  # FTR -> HM1, AM1, HM2, AM2, HM3, AM3


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HCGD,ACGD,HAGD,AAGD,HM1,AM1,HM2,AM2,HM3,AM3
0,2019-08-09,Liverpool,Norwich,4.0,1.0,H,4.0,0.0,H,M Oliver,...,0.0,0.0,0.000000,0.000000,,,,,,
1,2019-08-10,West Ham,Man City,0.0,5.0,A,0.0,1.0,A,M Dean,...,0.0,0.0,0.000000,0.000000,,,,,,
2,2019-08-10,Bournemouth,Sheffield United,1.0,1.0,D,0.0,0.0,D,K Friend,...,0.0,0.0,0.000000,0.000000,,,,,,
3,2019-08-10,Burnley,Southampton,3.0,0.0,H,0.0,0.0,D,G Scott,...,0.0,0.0,0.000000,0.000000,,,,,,
4,2019-08-10,Crystal Palace,Everton,0.0,0.0,D,0.0,0.0,D,J Moss,...,0.0,0.0,0.000000,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,2020-01-11,Leicester,Southampton,,,,,,,,...,27.0,-13.0,1.173913,-0.565217,W,W,W,D,L,W
215,2020-01-11,Man United,Norwich,,,,,,,,...,7.0,-19.0,0.304348,-0.826087,L,D,W,D,W,L
216,2020-01-11,Sheffield United,West Ham,,,,,,,,...,2.0,-7.0,0.086957,-0.304348,L,W,L,L,D,L
217,2020-01-11,Tottenham,Liverpool,,,,,,,,...,6.0,35.0,0.260870,1.521739,L,W,D,W,W,W


In [260]:
# select feature
l = ["DIS","HDT","ADT", "HCGD", "ACGD", "HAGD", "AAGD", "HM1", "HM2", "HM3","AM1", "AM2", "AM3"]
data2019_selectedFeature = data2019[l]
data2019_selectedFeature

Unnamed: 0,DIS,HDT,ADT,HCGD,ACGD,HAGD,AAGD,HM1,HM2,HM3,AM1,AM2,AM3
0,300.215302,0,0,0.0,0.0,0.000000,0.000000,,,,,,
1,265.040710,0,0,0.0,0.0,0.000000,0.000000,,,,,,
2,297.773346,0,0,0.0,0.0,0.000000,0.000000,,,,,,
3,325.889374,0,0,0.0,0.0,0.000000,0.000000,,,,,,
4,299.905975,0,0,0.0,0.0,0.000000,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,191.525848,10,10,27.0,-13.0,1.173913,-0.565217,W,W,L,W,D,W
215,258.923889,10,10,7.0,-19.0,0.304348,-0.826087,L,W,W,D,D,L
216,230.038284,9,10,2.0,-7.0,0.086957,-0.304348,L,L,D,W,L,L
217,282.755493,10,9,6.0,35.0,0.260870,1.521739,L,D,W,W,W,W


In [261]:
# remove data containing empty value
data2019_removeEmpty = removeInvalidData(data2019_selectedFeature)
data2019_removeEmpty

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,DIS,HDT,ADT,HCGD,ACGD,HAGD,AAGD,HM1,HM2,HM3,AM1,AM2,AM3
30,291.123566,7,7,-2.0,3.0,-0.500000,0.750000,W,L,L,L,D,W
31,228.692932,7,7,-3.0,0.0,-0.750000,0.000000,W,D,L,L,W,D
32,174.849289,7,8,0.0,-1.0,0.000000,-0.250000,W,L,D,W,L,L
33,217.818192,7,6,1.0,-1.0,0.250000,-0.250000,W,D,D,L,W,D
34,325.604492,6,7,7.0,1.0,1.750000,0.250000,W,D,W,L,D,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,191.525848,10,10,27.0,-13.0,1.173913,-0.565217,W,W,L,W,D,W
215,258.923889,10,10,7.0,-19.0,0.304348,-0.826087,L,W,W,D,D,L
216,230.038284,9,10,2.0,-7.0,0.086957,-0.304348,L,L,D,W,L,L
217,282.755493,10,9,6.0,35.0,0.260870,1.521739,L,D,W,W,W,W


In [262]:
# data transformation
numList1 = ["DIS","HDT","ADT", "HCGD", "ACGD", "HAGD", "AAGD"]
rescale(data2019_selectedFeature,numList1)   
standardize(data2019_selectedFeature, numList1)

categList1 = ["HM1", "HM2", "HM3","AM1", "AM2", "AM3"]
data2019_selectedFeature = transformCategoricalFeature(data2019_selectedFeature, categList1)
data2019_selectedFeature

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Unnamed: 0,DIS,HDT,ADT,HCGD,ACGD,HAGD,AAGD,HM1_D,HM1_L,HM1_W,...,HM3_W,AM1_D,AM1_L,AM1_W,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W
30,1.082232,-0.100742,-0.127943,-0.172498,0.282329,-0.688127,1.056763,0,0,1,...,0,0,1,0,1,0,0,0,0,1
31,0.485296,-0.100742,-0.127943,-0.263312,0.000000,-1.040985,-0.009968,0,0,1,...,0,0,1,0,0,0,1,1,0,0
32,-0.029534,-0.100742,0.159929,0.009129,-0.094110,0.017590,-0.365544,0,0,1,...,0,0,0,1,0,1,0,0,1,0
33,0.381316,-0.100742,-0.415816,0.099943,-0.094110,0.370449,-0.365544,0,0,1,...,0,0,1,0,0,0,1,1,0,0
34,1.411924,-0.389229,-0.127943,0.644825,0.094110,2.487600,0.345609,0,0,1,...,1,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.129920,0.764721,0.735674,2.461098,-1.223424,1.674491,-0.813880,0,0,1,...,0,0,0,1,1,0,0,0,0,1
215,0.774352,0.764721,0.735674,0.644825,-1.788081,0.447157,-1.184917,0,1,0,...,1,1,0,0,1,0,0,0,1,0
216,0.498160,0.476233,0.735674,0.190757,-0.658767,0.140324,-0.442844,0,1,0,...,0,0,0,1,0,1,0,0,1,0
217,1.002220,0.764721,0.447801,0.554011,3.293834,0.385791,2.154413,0,1,0,...,1,0,0,1,0,0,1,0,0,1


In [302]:
train_classifier(clf2,xtrain,y_train)     # train the classifer
sample = data2019_selectedFeature.tail(10)
sample1 = xtest.sample(n=10, random_state=1)
sample2 = data2019_selectedFeature
sample3 = xtrain.tail(179)

y_pred_2019InTrain = clf2.predict(sample3)
print(y_pred_2019InTrain)

print("\n")
y_pred_2019Data = clf2.predict(sample2)
print(y_pred_2019Data)
len(sample2)

def foo(x):
    if x == 2:
        return 1
    else:
        return 0

print(y_pred_2019InTrain.map({0:0,1:0,2:1}).sum())

time for training: 0.1022 sec
[1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0
 1 0 0 2 1 1 1 0 1 1 1 0 1 0 1 1 2 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 1 1 0 1 1
 1 0 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0
 1 0 2 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 2 1 1 1 1 1 0 0 1 1 0 0 1 1
 1 0 1 1 1 2 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0]


[0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1
 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1
 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0
 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 1 1 1
 1 1 0 1]


AttributeError: 'numpy.ndarray' object has no attribute 'map'

In [287]:
print(clf)      #调过参数后
print(clf2)    #普通

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=0.1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
