In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 

from sklearn.datasets import make_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Columns removed right after loading; the remaining columns are what the
# following feature-engineering cell works from.
unused_columns = [
    'Unnamed: 0',
    'Home Team', 'Away Team',
    'Score', 'Half Time Score',
    'Home Team Rating', 'Away Team Rating',
    'Home Team Off Target Shots', 'Home Team On Target Shots', 'Home Team Blocked Shots',
    'Home Team Corners', 'Home Team Throw Ins', 'Home Team Aerials Won', 'Home Team Clearances',
    'Home Team Yellow Cards', 'Home Team Second Yellow Cards', 'Home Team Red Cards',
    'Away Team Off Target Shots', 'Away Team On Target Shots', 'Away Team Blocked Shots',
    'Away Team Corners', 'Away Team Throw Ins', 'Away Team Aerials Won', 'Away Team Clearances',
    'Away Team Yellow Cards', 'Away Team Second Yellow Cards', 'Away Team Red Cards',
    'Home Team Goals Conceeded', 'Away Team Goals Conceeded',
    'year', 'league',
]

raw_dataframe = pd.read_csv('combined_data.csv')

# `drop` returns a new frame, so `raw_dataframe` keeps every original column.
necessary_data = raw_dataframe.drop(columns=unused_columns)

In [4]:
# Collapse the per-team statistics into match-level features.
#
# `assign` builds a new frame, so `necessary_data` is left untouched. The
# previous `final_data = necessary_data` only created an alias, which meant the
# derived columns were silently written into `necessary_data` as well.
per_team_columns = [
    'Home Team Possession %', 'Away Team Possession %',
    'Home Team Total Shots', 'Away Team Total Shots',
    'Home Team Pass Success %', 'Away Team Pass Success %',
    'Home Team Fouls', 'Away Team Fouls',
    'Home Team Goals Scored', 'Away Team Goals Scored',
]

# The dict order below fixes the order of the new columns, and therefore the
# column order of the feature matrix built from `final_data` later on.
final_data = necessary_data.assign(**{
    'Total Goal Number':
        necessary_data['Home Team Goals Scored'] + necessary_data['Away Team Goals Scored'],
    'Total Shot Number':
        necessary_data['Home Team Total Shots'] + necessary_data['Away Team Total Shots'],
    'Mean Pass Success Rate %':
        (necessary_data['Home Team Pass Success %'] + necessary_data['Away Team Pass Success %']) / 2,
    'Total Foul Number':
        necessary_data['Home Team Fouls'] + necessary_data['Away Team Fouls'],
    'Possession Difference %':
        (necessary_data['Home Team Possession %'] - necessary_data['Away Team Possession %']).abs(),
}).drop(columns=per_team_columns)

In [5]:
# Feature matrix: every column of `final_data` except the target.
# `DataFrame.to_numpy()` is already 2-D (n_samples, n_features), so no reshape
# is needed; the previous hard-coded `.reshape(-1, 5)` would break as soon as
# a feature column is added or removed.
X = final_data.drop(columns=['Match Excitement']).to_numpy()
y = final_data['Match Excitement'].to_numpy()

# First hold out 40 % of the rows, then halve that hold-out into validation and
# test sets (60 / 20 / 20 overall). Fixed seeds keep the split reproducible.
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X, y, test_size=0.4, random_state=55)
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=55)

In [6]:
# Sweep polynomial degrees 2..10 and record train / validation MSE for each.
poly_tr_errors = []
poly_val_errors = []
poly_degrees = []

# No separate intercept: PolynomialFeatures (default include_bias=True) already
# adds a constant column that plays that role.
lin_regr = LinearRegression(fit_intercept=False)

for degree in range(2, 11):
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    lin_regr.fit(X_train_poly, y_train)

    y_pred_poly_train = lin_regr.predict(X_train_poly)
    poly_tr_error = mean_squared_error(y_train, y_pred_poly_train)

    # Only *transform* held-out data with the expansion fitted on the training
    # split; a transformer must never be re-fitted on validation data.
    X_val_poly = poly.transform(X_val)
    y_pred_poly__val = lin_regr.predict(X_val_poly)
    poly_val_error = mean_squared_error(y_val, y_pred_poly__val)

    poly_tr_errors.append(poly_tr_error)
    poly_val_errors.append(poly_val_error)
    poly_degrees.append(degree)

print(poly_tr_errors)
print(poly_val_errors)

[0.41334252802528054, 0.40038616916264536, 0.3898514732704131, 0.3763167637055534, 0.36377746647030806, 0.4847522911028888, 0.5118291842563393, 0.30762010159589914, 0.26384143789993714]
[0.383290017116679, 0.37187031670863824, 0.37682854415298445, 0.39574072243378655, 0.4679851780423501, 8.184278265081966, 45.33783690306802, 169.71623819770352, 2094.3932116470046]


In [7]:
# Refit the polynomial model at degree 3, the degree with the lowest validation
# MSE in the sweep output above. `poly` and `lin_regr` from this cell are the
# objects reused later for the test-set evaluation.
lin_regr = LinearRegression(fit_intercept=False)

poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
lin_regr.fit(X_train_poly, y_train)

y_pred_poly_train = lin_regr.predict(X_train_poly)
poly_tr_error = mean_squared_error(y_train, y_pred_poly_train)

# Reuse the expansion fitted on the training split instead of re-fitting the
# transformer on validation data.
X_val_poly = poly.transform(X_val)
y_pred_poly_val = lin_regr.predict(X_val_poly)
poly_val_error = mean_squared_error(y_val, y_pred_poly_val)

print(poly_tr_error)
print(poly_val_error)

0.40038616916264536
0.37187031670863824


In [8]:
# Sweep decision-tree depths 1..9 and record train / validation MSE for each.
dec_tr_errors = []
dec_val_errors = []
max_depths = []

# NOTE(review): no random_state is set on the tree; confirm whether exact
# run-to-run reproducibility of these numbers is required.
for depth in range(1, 10):
    decst_regr = DecisionTreeRegressor(max_depth=depth)
    decst_regr.fit(X_train, y_train)

    y_pred_dec_train = decst_regr.predict(X_train)
    dec_tr_error = mean_squared_error(y_train, y_pred_dec_train)
    y_pred_dec_val = decst_regr.predict(X_val)
    dec_val_error = mean_squared_error(y_val, y_pred_dec_val)

    dec_tr_errors.append(dec_tr_error)
    dec_val_errors.append(dec_val_error)
    # Record this loop's own depth. The previous code appended `degree`, a
    # stale variable left over from the polynomial sweep.
    max_depths.append(depth)

print(dec_tr_errors)
print(dec_val_errors)

[1.2200990901425692, 0.7696351725956955, 0.5909809130812894, 0.47937193939718287, 0.4336013298018965, 0.4034267412081519, 0.37555575193449114, 0.34688842400155984, 0.30966956227106385]
[1.1943334500621763, 0.7256690152146563, 0.5724756731027912, 0.4614396303080739, 0.41914992423967856, 0.4088869361908845, 0.4166061907760947, 0.44129967390200064, 0.4619947152649854]


In [9]:
# Refit the tree at max_depth=6, the depth with the lowest validation MSE in
# the sweep output above, and report its train / validation MSE.
# `fit` returns the estimator itself, so construction and fitting can be chained.
decst_regr = DecisionTreeRegressor(max_depth=6).fit(X_train, y_train)

# Train-split error.
y_pred_dec_train = decst_regr.predict(X_train)
dec_tr_error = mean_squared_error(y_train, y_pred_dec_train)

# Validation-split error.
y_pred_dec_val = decst_regr.predict(X_val)
dec_val_error = mean_squared_error(y_val, y_pred_dec_val)

print(dec_tr_error, dec_val_error, sep='\n')

0.4034267412081519
0.4088869361908845


In [13]:
# Final comparison of the two selected models on the untouched test split.
# Relies on `poly` / `lin_regr` (degree-3 polynomial model) and `decst_regr`
# (depth-6 tree) as left by the preceding cells when run top to bottom.

# Apply the already-fitted polynomial expansion; the transformer must not be
# re-fitted on test data.
X_test_poly = poly.transform(X_test)
y_poly_test_pred = lin_regr.predict(X_test_poly)
poly_test_error = mean_squared_error(y_test, y_poly_test_pred)

y_dec_test_pred = decst_regr.predict(X_test)
dec_test_error = mean_squared_error(y_test, y_dec_test_pred)

print(poly_test_error)
print(dec_test_error)

0.41518273813915835
0.4437461186524315
