In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report,r2_score
from sklearn.preprocessing import MinMaxScaler


In [2]:
""" Field Name	Order	Type (Format)	Description
Date	1	date (%Y-%m-%d)	Match Date (dd/mm/yy)
HomeTeam	2	string (default)	Home Team
AwayTeam	3	string (default)	Away Team
FTHG	4	integer (default)	Full Time Home Team Goals
FTAG	5	integer (default)	Full Time Away Team Goals
FTR	6	string (default)	Full Time Result (H=Home Win, D=Draw, A=Away Win)
HTHG	7	integer (default)	Half Time Home Team Goals
HTAG	8	integer (default)	Half Time Away Team Goals
HTR	9	string (default)	Half Time Result (H=Home Win, D=Draw, A=Away Win)
HS	10	integer (default)	Home Team Shots
AS	11	integer (default)	Away Team Shots
HST	12	integer (default)	Home Team Shots on Target
AST	13	integer (default)	Away Team Shots on Target
HF	14	integer (default)	Home Team Fouls Committed
AF	15	integer (default)	Away Team Fouls Committed
HC	16	integer (default)	Home Team Corners
AC	17	integer (default)	Away Team Corners
HY	18	integer (default)	Home Team Yellow Cards
AY	19	integer (default)	Away Team Yellow Cards
HR	20	integer (default)	Home Team Red Cards
AR	21	integer (default)	Away Team Red Cards """


squadre =  {
                'Genoa' : 0,
                'Udinese' : 1,
                'Napoli'  : 2,
                'Empoli'  : 3,
                'Milan'   : 4,
                'Atalanta': 5,
                'Chievo'  : 6,
                'Juventus': 7,
                'Lazio'   : 8,
                'Spal'    : 9,
                'Parma'   : 10,
                'Cagliari': 11,
                'Sassuolo': 12,
                'Torino'  : 13,
                'Inter'   : 14,
                'Fiorentina': 15,
                'Bologna'   : 16,
                'Roma'      : 17,
                'Sampdoria' : 18,
                'Frosinone' : 19
            }

In [4]:
df = pd.read_csv("season-1819_csv.csv")

In [9]:
print(df.shape)
df.drop(['Date'], axis='columns', inplace=True) #Tolgo la colonna Date
print(df.shape)

(259, 20)


KeyError: "labels ['Date'] not contained in axis"

In [10]:
df.drop(['FTR'], axis='columns', inplace=True) #Tolgo la colonna FTR
df.drop(['HTR'], axis='columns', inplace=True) #Tolgo la colonna HTR

In [13]:
df.shape
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,6,7,2,3,1,1,7,23,2,11,7,9,0,8,2,0,0,0
1,8,2,1,2,1,1,9,11,5,6,8,5,4,7,0,0,0,0
2,16,9,0,1,0,0,8,10,3,5,16,11,7,0,4,2,1,0
3,3,11,2,0,1,0,9,12,4,5,19,19,6,6,3,3,0,0
4,10,1,2,2,1,0,9,16,6,6,10,13,4,5,2,2,0,0


In [8]:
df['HomeTeam'] = df['HomeTeam'].map(squadre)
df['AwayTeam'] = df['AwayTeam'].map(squadre)

In [7]:
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)

In [14]:
array_scaled = scaler.fit_transform(df)

In [15]:
print(array_scaled)

[[0.31578947 0.36842105 0.33333333 ... 0.         0.         0.        ]
 [0.42105263 0.10526316 0.16666667 ... 0.         0.         0.        ]
 [0.84210526 0.47368421 0.         ... 0.28571429 0.5        0.        ]
 ...
 [0.47368421 0.94736842 0.16666667 ... 0.42857143 0.5        0.        ]
 [0.68421053 0.31578947 0.5        ... 0.57142857 0.         0.        ]
 [0.05263158 0.84210526 0.33333333 ... 0.42857143 0.         0.        ]]


In [16]:
df_scaled = pd.DataFrame(array_scaled)

In [20]:

df_scaled.columns = df.columns

In [21]:
df_scaled.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0.315789,0.368421,0.333333,0.5,0.25,0.333333,0.25,1.0,0.133333,0.846154,0.15,0.217391,0.0,0.615385,0.333333,0.0,0.0,0.0
1,0.421053,0.105263,0.166667,0.333333,0.25,0.333333,0.333333,0.428571,0.333333,0.461538,0.2,0.043478,0.2,0.538462,0.0,0.0,0.0,0.0
2,0.842105,0.473684,0.0,0.166667,0.0,0.0,0.291667,0.380952,0.2,0.384615,0.6,0.304348,0.35,0.0,0.666667,0.285714,0.5,0.0
3,0.157895,0.578947,0.333333,0.0,0.25,0.0,0.333333,0.47619,0.266667,0.384615,0.75,0.652174,0.3,0.461538,0.5,0.428571,0.0,0.0
4,0.526316,0.052632,0.333333,0.333333,0.25,0.0,0.333333,0.666667,0.4,0.461538,0.3,0.391304,0.2,0.384615,0.333333,0.285714,0.0,0.0


In [146]:
print(df_scaled.iloc[:,:2])
print(df_scaled.iloc[:,2:18])
print(type(df.iloc[:,2:6]))

     HomeTeam  AwayTeam
0    0.315789  0.368421
1    0.421053  0.105263
2    0.842105  0.473684
3    0.157895  0.578947
4    0.526316  0.052632
5    0.631579  0.736842
6    0.684211  0.894737
7    0.263158  1.000000
8    0.368421  0.421053
9    0.105263  0.210526
10   0.578947  0.631579
11   0.789474  0.315789
12   1.000000  0.842105
13   0.000000  0.157895
14   0.736842  0.684211
15   0.473684  0.526316
16   0.052632  0.947368
17   0.894737  0.263158
18   0.210526  0.894737
19   0.842105  0.736842
20   0.526316  0.368421
21   0.263158  0.578947
22   0.315789  0.157895
23   0.789474  0.052632
24   0.421053  1.000000
25   0.947368  0.105263
26   0.631579  0.000000
27   0.684211  0.473684
28   1.000000  0.947368
29   0.736842  0.526316
..        ...       ...
229  0.684211  0.052632
230  0.368421  1.000000
231  0.263158  0.210526
232  0.578947  0.526316
233  0.157895  0.631579
234  0.000000  0.421053
235  0.736842  0.947368
236  0.105263  0.684211
237  0.473684  0.789474
238  0.052632  0

In [119]:
lr = LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:2], df.iloc[:,2:18], test_size=0.2, shuffle=False)

In [120]:
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [121]:
pred = lr.predict(x_test)

In [122]:
print(pred)

[[ 1.49970325  1.22254316  0.6797705   0.466393   11.36428768  9.77698499
   5.80536028  4.94427339 12.51929814 12.97892451  5.64249537  4.98929895
   2.14259164  2.53426748  0.08680639  0.12581026]
 [ 1.44211705  1.22618624  0.61760139  0.48135708 11.06862026  9.51797617
   5.62523416  4.8393575  12.9586063  13.84171352  5.6772446   4.74254279
   2.16048235  2.58576906  0.09404499  0.08488555]
 [ 1.59171822  1.19422908  0.79280512  0.43264077 11.99595209 10.17461301
   6.1482875   5.11054575 11.62058577 11.21567414  5.68237163  5.49282411
   2.10315022  2.46109397  0.07934178  0.20764735]
 [ 1.4964243   1.14964855  0.7207452   0.43525965 11.86494308  9.70947977
   5.97421163  4.93385012 11.90485077 11.77800314  5.95452667  5.33028586
   2.10833325  2.56683689  0.10054908  0.17692608]
 [ 1.46199555  1.17431956  0.66987969  0.4540478  11.52894608  9.57086057
   5.81141053  4.87249365 12.36425498 12.67846449  5.87990118  5.07351686
   2.12988396  2.58850882  0.10077508  0.1360137 ]
 [ 1.

In [137]:
for i in range(len(y_test.values.T)): #verifico l'accuratezza di ogni variabile predetta per tutti i casi di test
    print(r2_score(y_true=y_test.values[:,i], y_pred=pred[:,i]))

-0.06433957170865057
0.0016739409049633114
-0.06544238460254248
-0.0440792235349301
-0.06571580751603823
-0.08571971046619109
-0.02114320290620908
-0.018374636312701087
-0.031126460283667612
-0.021047174012766012
-0.011678783291133144
-0.08758854517982306
0.0017301740362112294
0.006010411113237857
-0.009676734707195322
-0.012399756800896888


In [124]:
print(pred[:,10])
y_test.values[:,10]

[5.64249537 5.6772446  5.68237163 5.95452667 5.87990118 6.56285228
 6.41872833 5.7039474  4.9675907  5.88794761 6.49079031 5.39704314
 5.0209963  5.49837143 6.26143093 5.00490344 5.86929124 5.96257309
 4.85016955 6.11730698 5.58396274 5.42374594 5.08757536 6.28813373
 5.73577723 5.623839   6.2266817  6.23729164 5.79466575 6.33605642
 5.97866595 6.08255775 5.51190077 5.7544336  5.42922885 5.26901205
 5.16768376 5.490325   5.54921351 5.32498116 5.22108936 5.51446428
 5.80527569 6.13596336 6.13339984 6.16266616 6.23472813 6.73624254
 6.06390138 6.11474347 5.31437122 6.53614948]


array([ 1,  6,  1, 10, 12,  5,  4,  6, 10, 11,  1,  2,  2,  3,  7,  5,  2,
        9, 13,  2,  9,  8,  9,  5,  5,  1,  3,  7,  9, 15,  3,  6,  9,  6,
        4,  4,  4,  4,  9,  5,  3,  3,  1,  8,  5,  6,  8,  8, 12,  5,  7,
        6], dtype=int64)

In [125]:
x_test.values.shape
pred.shape

(52, 16)

In [126]:
final_array = np.concatenate((x_test.values, pred), axis = 1)

In [127]:
df_final = pd.DataFrame(final_array)

In [128]:
df_final.columns = df.columns

In [129]:
final_array_true = np.concatenate((x_test.values, y_test.values), axis = 1)

In [130]:
df_final_true = pd.DataFrame(final_array_true)

In [131]:
df_final_true.columns = df.columns

In [141]:
nomi_squadre = dict((v,k) for k,v in squadre.items())
df_final['HomeTeam'] = df_final['HomeTeam'].map(nomi_squadre)
df_final['AwayTeam'] = df_final['AwayTeam'].map(nomi_squadre)
df_final_true['HomeTeam'] = df_final_true['HomeTeam'].map(nomi_squadre)
df_final_true['AwayTeam'] = df_final_true['AwayTeam'].map(nomi_squadre)

In [142]:
df_final_true.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Parma,Spal,2,3,1,0,9,15,5,6,9,13,1,10,2,4,0,0
1,Torino,Inter,1,0,1,0,7,9,4,4,19,12,6,7,3,3,0,1
2,Empoli,Genoa,1,3,0,1,13,12,7,7,8,7,1,3,1,2,0,0
3,Empoli,Chievo,2,2,1,2,11,10,9,4,9,12,10,6,0,2,0,0
4,Juventus,Parma,3,3,1,0,18,11,9,5,6,11,12,4,1,2,0,0


In [143]:
df_final.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Parma,Spal,1.499703,1.222543,0.679771,0.466393,11.364288,9.776985,5.80536,4.944273,12.519298,12.978925,5.642495,4.989299,2.142592,2.534267,0.086806,0.12581
1,Torino,Inter,1.442117,1.226186,0.617601,0.481357,11.06862,9.517976,5.625234,4.839358,12.958606,13.841714,5.677245,4.742543,2.160482,2.585769,0.094045,0.084886
2,Empoli,Genoa,1.591718,1.194229,0.792805,0.432641,11.995952,10.174613,6.148287,5.110546,11.620586,11.215674,5.682372,5.492824,2.10315,2.461094,0.079342,0.207647
3,Empoli,Chievo,1.496424,1.149649,0.720745,0.43526,11.864943,9.70948,5.974212,4.93385,11.904851,11.778003,5.954527,5.330286,2.108333,2.566837,0.100549,0.176926
4,Juventus,Parma,1.461996,1.17432,0.66988,0.454048,11.528946,9.570861,5.811411,4.872494,12.364255,12.678464,5.879901,5.073517,2.129884,2.588509,0.100775,0.136014


In [144]:
lr.intercept_

array([ 1.56989283,  1.15343556,  0.7949243 ,  0.41985909, 12.18244533,
       10.0460108 ,  6.18335039,  5.06821528, 11.41816511, 10.82149263,
        5.87441826,  5.60413173,  2.0895787 ,  2.49771149,  0.08977593,
        0.222971  ])

In [145]:
lr.coef_

array([[ 0.00727513, -0.01588232],
       [ 0.01359784, -0.00743009],
       [-0.00070639, -0.01200999],
       [ 0.00426056,  0.00043648],
       [-0.06216441, -0.02183484],
       [ 0.0428674 , -0.07752221],
       [-0.01168763, -0.02901264],
       [ 0.01411016, -0.02944927],
       [ 0.06747355,  0.0473775 ],
       [ 0.13139384,  0.0937215 ],
       [-0.06401554,  0.04535917],
       [-0.03710254, -0.02708971],
       [ 0.00452384,  0.00086384],
       [-0.01220584,  0.01762382],
       [-0.00347805,  0.00353455],
       [-0.00510788, -0.00512021]])