# Baseline Model

## Logistical Regression

In [4]:
import pandas as pd
import plotly_express as plt
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
x = pd.read_csv('../preparation/dataframes/cleaned_dataset.csv')
x

Unnamed: 0.1,Unnamed: 0,elo_home,elo_away,outcome,average_recent_home_scored,average_recent_home_conceeded,average_recent_away_scored,average_recent_away_conceeded,home_points_sofar,away_points_sofar,home_form,away_form
0,0,78.0,82.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,77.0,65.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,66.0,86.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,80.0,81.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,70.0,85.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
120576,120576,87.0,86.0,-1,1.9,1.4,1.6,1.4,54.0,45.0,2.0,-1.0
120577,120577,56.0,84.0,-1,1.0,1.3,2.0,1.1,58.0,35.0,0.0,4.0
120578,120578,79.0,87.0,0,1.1,1.4,2.3,1.0,50.0,52.0,0.0,3.0
120579,120579,56.0,61.0,1,1.3,1.2,0.8,0.7,44.0,46.0,-1.0,0.0


In [2]:
cleaned_dataset = pd.read_csv('../preparation/dataframes/cleaned_dataset.csv', index_col=0)
X = cleaned_dataset.drop('outcome', axis=1)
y = cleaned_dataset['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [3]:
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')
dump(model, 'baseline.joblib') 
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average="macro")}')
print(f'Recall: {recall_score(y_test, y_pred, average="macro")}')
print(f'F1: {f1_score(y_test, y_pred, average="macro")}')

Accuracy: 0.4940498403615707
Precision: 0.4307023554272625
Recall: 0.40107861711362786
F1: 0.34355318698691023


## Comparing tuned supervised model to baseline model

In [10]:
baseline_model = {'Accuracy': 0.4940498403615707, 'Precision': 0.4307023554272625, 'Recall': 0.40107861711362786, 'F1': 0.34355318698691023}
knn = {'Accuracy': 0.4903180329228345, 'Precision': 0.4169630125072051, 'Recall': 0.3849724513610399, 'F1': 0.3226800571601976}
decision_tree = {'Accuracy': 0.49118878799187293, 'Precision': 0.3221714938904668, 'Recall': 0.38347301355219865, 'F1': 0.3139962889965292}
svm = {'Accuracy': 0, 'Precision': 0, 'Recall': 0, 'F1': 0}

In [16]:
fig = plt.bar(
    x=['Baseline Model', 'KNN Classifier', 'Decision Tree Classifier', 'SVM'],
    y=[baseline_model['Accuracy'], knn['Accuracy'], decision_tree['Accuracy'], svm['Accuracy']],
    labels = {'x': 'Supervised Model', 'y': 'Accuracy Score'},
    title='The accuracy scores of tuned supervised models in comparison to the baseline model.'
)
fig.update_yaxes(range=[0.485, 0.495]) 
fig

In [20]:
fig = plt.bar(
    x=['Baseline Model', 'KNN Classifier', 'Decision Tree Classifier', 'SVM'],
    y=[baseline_model['Precision'], knn['Precision'], decision_tree['Precision'], svm['Precision']],
    labels = {'x': 'Supervised Model', 'y': 'Precision Score'},
    title='The precision scores of tuned supervised models in comparison to the baseline model.'
)
fig.update_yaxes(range=[0.3, 0.45]) 
fig