In [75]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
% matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

### Load the training data

In [165]:
fifa_his=pd.read_csv('Training_fifa2.csv')

### create a inverted copy of the training data
Here we swap home and away team. We invert the features and change the label correspondingly.

In [197]:
fifa_his2=fifa_his.copy()
fifa_his2["home_team"] = fifa_his["away_team"]
fifa_his2["away_team"] = fifa_his["home_team"]

fifa_his2["rank_dif"] = -fifa_his["rank_dif"]
fifa_his2["rating_dif"] = -fifa_his["rating_dif"]

z={0:1,1:0,2:2}
fifa_his2["label"]=fifa_his["label"].map(z)

### combine the original and the inverted dataset

In [198]:
fifa_his3=pd.concat([fifa_his,fifa_his2])

In [199]:
fifa_his3.shape

(300, 6)

### create features and label

In [200]:
X=fifa_his3.loc[:,['rank_dif','rating_dif']]
X.shape

(300, 2)

In [201]:
# from sklearn.preprocessing import PolynomialFeatures
# pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
# X = pf.fit_transform(X)

In [202]:
X.shape

(300, 2)

In [203]:
y=fifa_his3.loc[:,'label']
y.shape

(300,)

### split test and train set

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [205]:
y_train.value_counts()

1    100
0     98
2     42
Name: label, dtype: int64

In [206]:
y_test.value_counts()

0    25
1    23
2    12
Name: label, dtype: int64

### train Logistic Regression

In [207]:
logreg = linear_model.LogisticRegression(C=1e5)

In [208]:
logreg.fit(X_train,y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [209]:
y_testpredlog = logreg.predict(X_test)
y_testpredlogproba = logreg.predict_proba(X_test)

In [210]:
logreg.coef_

array([[-0.02001743,  0.09370787],
       [ 0.01893316, -0.08648877],
       [ 0.00073205, -0.00597413]])

In [211]:
y_testpredlogproba

array([[0.32642578, 0.49164912, 0.1819251 ],
       [0.47683121, 0.34769823, 0.17547056],
       [0.40001713, 0.42015818, 0.17982469],
       [0.28009023, 0.5376513 , 0.18225847],
       [0.12531822, 0.69486042, 0.17982136],
       [0.50661334, 0.31892232, 0.17446434],
       [0.56868843, 0.26158736, 0.16972421],
       [0.53973225, 0.28808517, 0.17218257],
       [0.37119161, 0.44977279, 0.17903559],
       [0.41984723, 0.40045704, 0.17969573],
       [0.07012692, 0.75629507, 0.17357801],
       [0.44029282, 0.38244036, 0.17726681],
       [0.33785251, 0.48129763, 0.18084986],
       [0.33267628, 0.48440118, 0.18292254],
       [0.57816746, 0.25209802, 0.16973453],
       [0.43882381, 0.38400249, 0.1771737 ],
       [0.36710046, 0.4520593 , 0.18084024],
       [0.32895302, 0.48935771, 0.18168928],
       [0.80292554, 0.05483243, 0.14224203],
       [0.27481358, 0.54384214, 0.18134428],
       [0.5380199 , 0.29055319, 0.17142691],
       [0.33821687, 0.47891512, 0.18286801],
       [0.

In [212]:
y_test

53     1
116    0
2      2
9      2
83     1
76     0
46     1
109    0
5      0
25     2
87     1
57     2
68     1
45     1
32     0
71     0
139    0
61     0
148    0
15     1
78     0
113    2
99     1
100    0
104    0
42     0
131    1
145    0
7      1
88     2
17     2
14     1
33     0
24     1
65     1
119    2
7      0
90     0
46     0
73     2
93     2
76     1
136    0
60     0
77     0
63     1
84     2
79     0
111    1
81     1
30     0
144    1
89     1
75     0
147    1
128    1
97     1
92     0
42     1
25     2
Name: label, dtype: int64

### calculate confusion matrix

In [213]:
confusion_matrix(y_test, y_testpredlog)

array([[14, 11,  0],
       [ 3, 20,  0],
       [ 5,  7,  0]])

### calculate accuracy score and compare with naive solutions

#### the model

In [214]:
accuracy_score(y_test, y_testpredlog)

0.5666666666666667

#### all win

In [215]:
accuracy_score(y_test, [0]*len(y_test))

0.4166666666666667

#### all loose

In [216]:
accuracy_score(y_test, [1]*len(y_test))

0.38333333333333336

#### random

In [217]:
accuracy_score(y_test, np.random.randint(0, 3, len(y_test)))

0.2833333333333333

### Train Random Forest

In [218]:
clf = RandomForestClassifier(max_depth=2, random_state=0,n_estimators=100)

In [219]:
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [220]:
clf.score(X_test,y_test)

0.5333333333333333

In [221]:
clf.predict_proba([[14,-67]])

array([[0.1216183 , 0.7016362 , 0.17674549]])

In [222]:
clf.classes_

array([0, 1, 2])