In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV, cross_val_score,TimeSeriesSplit
from sklearn.pipeline import Pipeline
from tensorflow.keras.regularizers import l2
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import uniform, randint
from sklearn.compose import ColumnTransformer

In [36]:
# Load the dataset
match_results = pd.read_csv('afl_match_results_cleaned.csv')

# Define the features and the target variable
weather_dummies = pd.get_dummies(match_results['weather.weatherType'])
X = match_results.drop(columns=['match.homeTeam.name', 'match.awayTeam.name','venue.name','Margin','Result','weather.weatherType']).astype('float64')  # Drop irrelevant columns
X = pd.concat([X, weather_dummies], axis=1)
y = match_results['Result']  # BW, LW, D, LL, BL

# Assuming 'weather_columns' is a list of your dummy weather variables
weather_columns = weather_dummies.columns  # Replace with actual weather columns
discrete_columns = ['Home.Team.Venue.Win.Streak', 'Away.Team.Venue.Win.Streak','Home.Win.Streak'] 
continuous_columns = [col for col in X.columns if col not in weather_columns and col not in discrete_columns]

# ColumnTransformer to apply StandardScaler only to continuous features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_columns),
        ('disc', MinMaxScaler(), discrete_columns),
        ('weather', 'passthrough', weather_columns)  # Weather columns are passed through unchanged
    ]
)

# Initialize LabelEncoder
encoder = LabelEncoder()

# Fit and transform the target variable
y_encoded = encoder.fit_transform(y)

cutoff_index = int(0.8 * len(match_results))

# Create the training set (first 80%) and validation set (last 20%)
X_train = X
y_train = y

y_train_encoded = encoder.fit_transform(y_train)

# Standardize the features
X_train = preprocessor.fit_transform(X_train)

### Run at beginning of season

In [13]:
# Function to create the Keras model
def create_model(optimizer='adam', dropout_rate=0.3, neurons=64, learn_rate=0.01):
    model = Sequential([
        Dense(neurons, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.001), input_dim=X_train.shape[1]),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(32, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(16, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(5, kernel_initializer='he_uniform', activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learn_rate, name=optimizer), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Create the KerasClassifier wrapper
model = KerasClassifier(build_fn=create_model, verbose=0)

# Define the grid of hyperparameters to search
param_grid = {
    'batch_size': [32, 64, 128],
    'epochs': [20, 30, 50],
    'optimizer': ['adam', 'sgd'],
    'dropout_rate': [0.2, 0.3, 0.5],
    'neurons': [32, 64, 128],
    'learn_rate': [0.001, 0.01, 0.1]
}

# Use KFold cross-validation
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

# Fit the model using GridSearchCV
grid_result = grid.fit(X_train, y_train)

# Summarize results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

# Evaluate on the test set
best_model = grid_result.best_estimator_.model
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

  model = KerasClassifier(build_fn=create_model, verbose=0)


Best: 0.7580301364262899 using {'batch_size': 64, 'dropout_rate': 0.3, 'epochs': 50, 'learn_rate': 0.1, 'neurons': 128, 'optimizer': 'adam'}
Test Accuracy: 78.16%


### Continue programming

In [40]:
# Define artificial neural network
def create_simplified_model(input_dim):
    model = Sequential([
        Dense(128, kernel_initializer='he_uniform', activation='relu', input_dim=input_dim, kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(5, kernel_initializer='he_uniform', activation='softmax')
    ])
    
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1,name='adam'), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define TimeSeriesSplit with the desired number of splits
n_splits = 5  # You can adjust the number of splits
tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize a list to store accuracies from each fold
accuracy_scores = []

# Perform time-aware cross-validation
for train_index, test_index in tscv.split(X_train):
    # Split the data into training and validation sets based on the time split
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = y_train_encoded[train_index], y_train_encoded[test_index]

    # Define your model (make sure it's the simplified version with regularization)
    model = create_simplified_model(input_dim=X_train_cv.shape[1])

    # Add early stopping and learning rate reduction as callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

    # Compile the model (ensure to add L2 regularization inside the create_model function)
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1,name='adam'), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Fit the model on the training split
    history = model.fit(
        X_train_cv, y_train_cv,
        validation_split=0.2,  # Further split training data into training/validation
        epochs=100,
        callbacks=[early_stopping, reduce_lr],
        batch_size=64,
        verbose=0  # Turn off verbose output to prevent clutter
    )

    # Evaluate the model on the test split
    test_loss, test_accuracy = model.evaluate(X_test_cv, y_test_cv, verbose=0)

    # Store the accuracy score
    accuracy_scores.append(test_accuracy)

# After cross-validation, calculate the average accuracy
average_accuracy = np.mean(accuracy_scores)
print(f'Average Accuracy from TimeSeriesSplit CV: {average_accuracy * 100:.2f}%')

Average Accuracy from TimeSeriesSplit CV: 72.25%


In [41]:
# Combine train and test data for final training
y_combined = np.hstack((y_train_encoded))

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

# Retrain the model on the combines data
final_model = create_simplified_model(input_dim)

X_combined = preprocessor.fit_transform(X)

final_model.fit(
    X_combined,y_combined,
    epochs=100,
    callbacks=[early_stopping, reduce_lr],
    batch_size=64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x21f422652e0>

In [42]:
final_model.save('final_model.h5')
import pickle
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)