In [35]:
#general package
import pandas as pd
import numpy as np
import json
import os
import warnings
import matplotlib.pyplot as plt
import matplotlib.cm as cm 
import seaborn as sns

In [36]:
data= pd.read_csv('/Users/quinne/Desktop/final_dataframe.csv')

In [37]:
# Filter the dataframe to only include the columns that will be available in 2023
columns_to_include = ['Result', 
                      'match_number', 
                      'team', 'opponent', 
                      'gender', 'season', 
                      'month', 'day', 
                      'home_advantage', 'venue', 'city']
filtered_data = data[columns_to_include]
filtered_data.head()

Unnamed: 0,Result,match_number,team,opponent,gender,season,month,day,home_advantage,venue,city
0,loose,1,Manchester Originals,Oval Invincibles,female,2021,7,21,no,"Kennington Oval, London",London
1,win,1,Oval Invincibles,Manchester Originals,male,2021,7,22,yes,"Kennington Oval, London",London
2,loose,2,Birmingham Phoenix,London Spirit,female,2021,7,23,yes,"Edgbaston, Birmingham",Birmingham
3,loose,2,London Spirit,Birmingham Phoenix,male,2021,7,23,no,"Edgbaston, Birmingham",Birmingham
4,win,3,Southern Brave,Trent Rockets,female,2021,7,24,no,"Trent Bridge, Nottingham",Nottingham


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder


categorical_vars = ['team', 'opponent', 'gender', 'home_advantage', 'venue', 'city']
ordinal_vars = ['gender']
target_var = 'Result'

data_preprocessed = filtered_data.copy()

# Perform label encoding for the target variable
le = LabelEncoder()
data_preprocessed[target_var] = le.fit_transform(data_preprocessed[target_var])

# Perform one-hot encoding for categorical variables
ohe = OneHotEncoder(drop='first')
encoded_categorical = ohe.fit_transform(data_preprocessed[categorical_vars]).toarray()
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=ohe.get_feature_names(categorical_vars))

# Drop original categorical columns from data
data_preprocessed = data_preprocessed.drop(columns=categorical_vars)

# Concatenate original data and encoded categorical data
data_preprocessed = pd.concat([data_preprocessed, encoded_categorical_df], axis=1)

# Split data into features and target
X = data_preprocessed.drop(columns=target_var)
y = data_preprocessed[target_var]

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()



Unnamed: 0,match_number,season,month,day,team_London Spirit,team_Manchester Originals,team_Northern Superchargers,team_Oval Invincibles,team_Southern Brave,team_Trent Rockets,...,"venue_Old Trafford, Manchester","venue_Sophia Gardens, Cardiff","venue_The Rose Bowl, Southampton","venue_Trent Bridge, Nottingham",city_Cardiff,city_Leeds,city_London,city_Manchester,city_Nottingham,city_Southampton
42,24,2021,8,10,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
118,32,2022,8,31,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
94,20,2022,8,20,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
36,21,2021,8,7,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
90,18,2022,8,18,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98 entries, 42 to 102
Data columns (total 33 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   match_number                      98 non-null     int64  
 1   season                            98 non-null     int64  
 2   month                             98 non-null     int64  
 3   day                               98 non-null     int64  
 4   team_London Spirit                98 non-null     float64
 5   team_Manchester Originals         98 non-null     float64
 6   team_Northern Superchargers       98 non-null     float64
 7   team_Oval Invincibles             98 non-null     float64
 8   team_Southern Brave               98 non-null     float64
 9   team_Trent Rockets                98 non-null     float64
 10  team_Welsh Fire                   98 non-null     float64
 11  opponent_London Spirit            98 non-null     float64
 12  opponent

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Store the results
model_results = []

# Define a function to apply a model
def apply_model(model, parameters, model_name, X_train, y_train, X_val, y_val):
    # Grid Search for the best parameters
    grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Use the model with the best parameters to make predictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    # Store the results
    model_results.append({
        'Model': model_name,
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'F1 Score': f1
    })

In [41]:
# Apply Naive Bayes
apply_model(GaussianNB(), {}, 'Naive Bayes', X_train, y_train, X_val, y_val)

model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454}]

In [42]:
# Apply logistic regression (full model)
apply_model(LogisticRegression(max_iter=1000, solver='liblinear'), {}, 
            'Logistic Regression (full model)', X_train, y_train, X_val, y_val)

# Apply logistic regression with forward feature selection
# Create a pipeline with feature selection and the model
pipe = Pipeline([
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('model', LogisticRegression(max_iter=1000, solver='liblinear'))
])

# Define the parameters for the grid search
parameters = {
    'feature_selection__k': list(range(1, X_train.shape[1] + 1))  # Try all possible numbers of features
}

apply_model(pipe, parameters, 'Logistic Regression (forward selection)', X_train, y_train, X_val, y_val)

model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454},
 {'Model': 'Logistic Regression (full model)',
  'Best Parameters': {},
  'Accuracy': 0.56,
  'F1 Score': 0.4210526315789474},
 {'Model': 'Logistic Regression (forward selection)',
  'Best Parameters': {'feature_selection__k': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385}]

In [43]:
# Apply decision tree (unpruned)
apply_model(DecisionTreeClassifier(random_state=42), {}, 'Decision Tree (unpruned)', X_train, y_train, X_val, y_val)

# Apply decision tree (pruned)
# Define the parameters for the grid search
parameters = {
    'max_depth': list(range(1, 20)),  # Try different maximum depths
    'min_samples_leaf': list(range(1, 20)),  # Try different minimum samples per leaf
}

apply_model(DecisionTreeClassifier(random_state=42), parameters, 'Decision Tree (pruned)', X_train, y_train, X_val, y_val)

model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454},
 {'Model': 'Logistic Regression (full model)',
  'Best Parameters': {},
  'Accuracy': 0.56,
  'F1 Score': 0.4210526315789474},
 {'Model': 'Logistic Regression (forward selection)',
  'Best Parameters': {'feature_selection__k': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Decision Tree (unpruned)',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3},
 {'Model': 'Decision Tree (pruned)',
  'Best Parameters': {'max_depth': 1, 'min_samples_leaf': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385}]

In [44]:
# Apply Bagging
apply_model(BaggingClassifier(random_state=42), {}, 'Bagging', X_train, y_train, X_val, y_val)

# Apply Random Forest
# Define the parameters for the grid search
parameters = {
    'n_estimators': [50, 100, 150, 200],  
    'max_depth': [None, 5, 10, 15, 20], 
    'min_samples_leaf': [1, 2, 5, 10],  
}

apply_model(RandomForestClassifier(random_state=42), parameters, 'Random Forest', X_train, y_train, X_val, y_val)

# Apply Boosting
# Define the parameters for the grid search
parameters = {
    'n_estimators': [50, 100, 150, 200],  
    'learning_rate': [0.01, 0.1, 0.2, 0.3],  
    'max_depth': [3, 5, 7, 9],  
}

apply_model(GradientBoostingClassifier(random_state=42), parameters, 'Boosting', X_train, y_train, X_val, y_val)
model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454},
 {'Model': 'Logistic Regression (full model)',
  'Best Parameters': {},
  'Accuracy': 0.56,
  'F1 Score': 0.4210526315789474},
 {'Model': 'Logistic Regression (forward selection)',
  'Best Parameters': {'feature_selection__k': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Decision Tree (unpruned)',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3},
 {'Model': 'Decision Tree (pruned)',
  'Best Parameters': {'max_depth': 1, 'min_samples_leaf': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Bagging',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3636363636363636},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': None,
   'min_samples_leaf': 1,
   'n_estimators': 50},
  'Accuracy': 0.6,
  'F1 Score': 0.5},
 {'Model': 'Boosting',
  'Best Parameters': {'learning_rate': 0.2,
   'max_depth': 9,
   

In [45]:
# Apply Random Forest with a smaller parameter space
# Define the parameters for the grid search
parameters = {
    'n_estimators': [50, 100],  # Try a smaller number of trees
    'min_samples_leaf': [5, 10, 15],  # Try a larger minimum samples per leaf
}

apply_model(RandomForestClassifier(random_state=42), parameters, 'Random Forest', X_train, y_train, X_val, y_val)

# Apply Boosting
# Define the parameters for the grid search
parameters = {
    'n_estimators': [50, 100],  # Try a smaller number of trees
    'learning_rate': [0.1, 0.2],  # Try a smaller number of learning rates
    'max_depth': [3, 5],  # Try a smaller number of maximum depths
}

apply_model(GradientBoostingClassifier(random_state=42), parameters, 'Boosting', X_train, y_train, X_val, y_val)

model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454},
 {'Model': 'Logistic Regression (full model)',
  'Best Parameters': {},
  'Accuracy': 0.56,
  'F1 Score': 0.4210526315789474},
 {'Model': 'Logistic Regression (forward selection)',
  'Best Parameters': {'feature_selection__k': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Decision Tree (unpruned)',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3},
 {'Model': 'Decision Tree (pruned)',
  'Best Parameters': {'max_depth': 1, 'min_samples_leaf': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Bagging',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3636363636363636},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': None,
   'min_samples_leaf': 1,
   'n_estimators': 50},
  'Accuracy': 0.6,
  'F1 Score': 0.5},
 {'Model': 'Boosting',
  'Best Parameters': {'learning_rate': 0.2,
   'max_depth': 9,
   

In [46]:
# Apply SVM again
# Define the parameters for the grid search
parameters = {
    'model__C': [0.1, 1, 10],  # Try different values of C
    'model__kernel': ['linear', 'rbf', 'poly'],  # Try different kernels
    'model__gamma': ['scale', 'auto']  # Try different values of gamma
}

# Standardize the features before applying SVM
pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=42))
])

apply_model(pipe_svm, parameters, 'SVM', X_train, y_train, X_val, y_val)

# Apply KNN again
# Define the parameters for the grid search
parameters = {
    'model__n_neighbors': list(range(1, 20)),  # Try different numbers of neighbors
    'model__weights': ['uniform', 'distance'],  # Try different weights
    'model__metric': ['euclidean', 'manhattan', 'minkowski'],  # Try different metrics
    'model__p': [1, 2]  # Try different values of p for the Minkowski metric
}

# Standardize the features before applying KNN
pipe_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier())
])

apply_model(pipe_knn, parameters, 'KNN', X_train, y_train, X_val, y_val)

model_results

[{'Model': 'Naive Bayes',
  'Best Parameters': {},
  'Accuracy': 0.6,
  'F1 Score': 0.5454545454545454},
 {'Model': 'Logistic Regression (full model)',
  'Best Parameters': {},
  'Accuracy': 0.56,
  'F1 Score': 0.4210526315789474},
 {'Model': 'Logistic Regression (forward selection)',
  'Best Parameters': {'feature_selection__k': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Decision Tree (unpruned)',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3},
 {'Model': 'Decision Tree (pruned)',
  'Best Parameters': {'max_depth': 1, 'min_samples_leaf': 1},
  'Accuracy': 0.56,
  'F1 Score': 0.15384615384615385},
 {'Model': 'Bagging',
  'Best Parameters': {},
  'Accuracy': 0.44,
  'F1 Score': 0.3636363636363636},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': None,
   'min_samples_leaf': 1,
   'n_estimators': 50},
  'Accuracy': 0.6,
  'F1 Score': 0.5},
 {'Model': 'Boosting',
  'Best Parameters': {'learning_rate': 0.2,
   'max_depth': 9,
   

In [47]:
# Load the prediction set
data_2023 = pd.read_csv('/Users/quinne/Desktop/df_2023.csv')

# Check the first few rows of the prediction set
data_2023.head()

Unnamed: 0,Result,match_number,team,opponent,gender,season,month,day,home_advantage,venue,city,time
0,,1,Trent Rockets,Southern Brave,female,2023,8,1,yes,"Trent Bridge, Nottingham",Nottingham,15:00
1,,1,Trent Rockets,Southern Brave,male,2023,8,1,yes,"Trent Bridge, Nottingham",Nottingham,18:30
2,,2,Welsh Fire,Manchester Orignials,female,2023,8,2,yes,"Sophia Gardens, Cardiff",Cardiff,11:30
3,,2,Welsh Fire,Manchester Orignials,male,2023,8,2,yes,"Sophia Gardens, Cardiff",Cardiff,15:00
4,,3,London Spirit,Oval Invincibles,female,2023,8,2,yes,"Lord's, London",London,15:00


In [48]:
data_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Result          0 non-null      float64
 1   match_number    68 non-null     int64  
 2   team            68 non-null     object 
 3   opponent        68 non-null     object 
 4   gender          68 non-null     object 
 5   season          68 non-null     int64  
 6   month           68 non-null     int64  
 7   day             68 non-null     int64  
 8   home_advantage  68 non-null     object 
 9   venue           68 non-null     object 
 10  city            68 non-null     object 
 11  time            68 non-null     object 
dtypes: float64(1), int64(4), object(7)
memory usage: 6.5+ KB
