In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import xgboost as xgb
import warnings
import shap
import matplotlib.pyplot as plt
import joblib
import sklearn
import pickle
import os
import lime
import lime.lime_tabular
import re

from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV, cross_val_score,TimeSeriesSplit
from sklearn.pipeline import Pipeline
from tensorflow.keras.regularizers import l2
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import uniform, randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from IPython.display import display, clear_output

warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
match_results = pd.read_csv('afl_match_results_cleaned.csv')

# Define the features and the target variable
weather_dummies = pd.get_dummies(match_results['weather.weatherType'])
X = match_results.drop(columns=['match.homeTeam.name', 'match.awayTeam.name','venue.name','Margin','Result','weather.weatherType']).astype('float64')  # Drop irrelevant columns
X = pd.concat([X, weather_dummies], axis=1)
y = match_results['Result']  # BW, LW, D, LL, BL

# Assuming 'weather_columns' is a list of your dummy weather variables
weather_columns = weather_dummies.columns  # Replace with actual weather columns
discrete_columns = ['Home.Team.Venue.Win.Streak', 'Away.Team.Venue.Win.Streak','Home.Win.Streak'] 
continuous_columns = [col for col in X.columns if col not in weather_columns and col not in discrete_columns]

# ColumnTransformer to apply StandardScaler only to continuous features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_columns),
        ('disc', MinMaxScaler(), discrete_columns),
        ('weather', 'passthrough', weather_columns)  # Weather columns are passed through unchanged
    ]
)

# Initialize LabelEncoder
encoder = LabelEncoder()

# Fit and transform the target variable
y_encoded = encoder.fit_transform(y)

cutoff_index = int(0.8 * len(match_results))


# Train-test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


y_train = encoder.fit_transform(y)
#y_test = encoder.fit_transform(y_test)


# Standardize the features
X_train = preprocessor.fit_transform(X)
#X_test = preprocessor.fit_transform(X_test)

### Run at beginning of season

### Continue programming

In [3]:
# Define the neural network model function
def create_simple_model(input_dim):
    model = Sequential([
        Dense(128, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.001), input_dim=X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.5),
        Dense(32, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(16, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(5, kernel_initializer='he_uniform', activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1, name='adam'), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Initialize base models
nn_model = KerasClassifier(build_fn=lambda: create_simple_model(X_train.shape[1]), epochs=100, batch_size=32, verbose=0)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', colsample_bytree=0.8,
                          learning_rate=0.2, max_depth=3, n_estimators=100, subsample=0.8)

# Fit the base models
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
fold_accuracies = []

for train_index, val_index in tscv.split(X_train):
    X_t, X_val = X_train[train_index], X_train[val_index]
    y_t, y_val = y_train[train_index], y_train[val_index]
    
    # Re-instantiate KerasClassifier for each fold with proper input_dim
    nn_model = KerasClassifier(build_fn=lambda: create_simple_model(X_train.shape[1]), epochs=100, batch_size=32, verbose=0)
    
    # Fit base models
    rf_model.fit(X_t, y_t)
    xgb_model.fit(X_t, y_t)
    
    # Get predictions from the neural network for the validation fold
    nn_model.fit(X_t, y_t)  # Fit the neural network on the training fold
    nn_predictions_val = nn_model.predict_proba(X_val)  # Get predictions for the validation fold
    
    # Get predictions from the other base models for the validation fold
    rf_predictions_val = rf_model.predict_proba(X_val)
    xgb_predictions_val = xgb_model.predict_proba(X_val)
    
    # Combine predictions from all models for stacking
    meta_features_val = np.hstack([rf_predictions_val, xgb_predictions_val, nn_predictions_val])
    
    # Train a meta-model (LogisticRegression in this case) on the combined predictions
    meta_model = LogisticRegression(C=100, max_iter=100, solver='liblinear')
    meta_model.fit(meta_features_val, y_val)
    
    # Evaluate the meta-model on the validation set
    meta_predictions_val = meta_model.predict(meta_features_val)
    accuracy = accuracy_score(y_val, meta_predictions_val)
    fold_accuracies.append(accuracy)

# After cross-validation, calculate the average accuracy
average_accuracy = np.mean(fold_accuracies)
print(f'Average Accuracy from TimeSeriesSplit CV: {average_accuracy:.4f}')

Average Accuracy from TimeSeriesSplit CV: 0.7813


In [4]:
# Combine scaled features for the entire dataset
X_full_scaled = preprocessor.fit_transform(X)
y_encoded = encoder.fit_transform(y)

# Train base models on the full dataset
rf_model.fit(X_full_scaled, y_encoded)
xgb_model.fit(X_full_scaled, y_encoded)
nn_model_full = KerasClassifier(build_fn=lambda: create_simple_model(X_full_scaled.shape[1]), 
                                epochs=100, batch_size=32, verbose=0)
nn_model_full.fit(X_full_scaled, y_encoded)

# Generate predictions (probabilities) for stacking
rf_predictions_full = rf_model.predict_proba(X_full_scaled)
xgb_predictions_full = xgb_model.predict_proba(X_full_scaled)
nn_predictions_full = nn_model_full.predict_proba(X_full_scaled)

# Stack predictions to form meta-features
meta_features_full = np.hstack([rf_predictions_full, xgb_predictions_full, nn_predictions_full])

# Train meta-model (Logistic Regression) on the full dataset's meta-features
meta_model_full = LogisticRegression()
meta_model_full.fit(meta_features_full, y_encoded)

print("Final ensemble model trained on the full dataset.")

Final ensemble model trained on the full dataset.


In [5]:
#final_model.save('final_model.h5')
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
with open('accuracy.pkl', 'wb') as f:
    pickle.dump(average_accuracy, f)

# Save RandomForest model
joblib.dump(rf_model, 'rf_model.pkl')

# Save XGBoost model
joblib.dump(xgb_model, 'xgb_model.pkl')

# Save the neural network model
save_model(nn_model_full.model, 'nn_model.h5')

# Save the meta-model (Logistic Regression)
joblib.dump(meta_model_full, 'meta_model.pkl')

print("Models saved successfully.")

Models saved successfully.


In [13]:
from sklearn.metrics import accuracy_score

def drop_column_importance(ensemble_model, X_train, y_train, X_val, y_val, features):
    # Step 1: Train the model with all features and calculate baseline performance
    ensemble_model.fit(X_train, y_train)
    baseline_accuracy = accuracy_score(y_val, ensemble_model.predict(X_val))

    importances = {}

    # Step 2: For each feature, drop it, retrain the model, and calculate new performance
    for feature in features:
        print(f"Dropping feature: {feature}")
        
        # Drop the feature from the training and validation sets
        X_train_dropped = X_train.drop(columns=[feature])
        X_val_dropped = X_val.drop(columns=[feature])
        
        # Re-train the ensemble model without this feature
        # Generate predictions (probabilities) for stacking
        rf_pred_dropped = rf_model.predict_proba(X_train_dropped)
        xgb_pred_dropped = xgb_model.predict_proba(X_train_dropped)
        nn_pred_dropped = nn_model_full.predict_proba(X_train_dropped)

        # Stack predictions to form meta-features
        meta_feat_dropped = np.hstack([rf_pred_dropped, xgb_pred_dropped, nn_pred_dropped])
        
        ensemble_model.fit(meta_feat_dropped, y_train)
        
        # Calculate accuracy without this feature
        # Generate predictions (probabilities) for stacking
        rf_pred_dropped = rf_model.predict_proba(X_val_dropped)
        xgb_pred_dropped = xgb_model.predict_proba(X_val_dropped)
        nn_pred_dropped = nn_model_full.predict_proba(X_val_dropped)

        # Stack predictions to form meta-features
        meta_feat_dropped = np.hstack([rf_pred_dropped, xgb_pred_dropped, nn_pred_dropped])
        
        new_accuracy = accuracy_score(y_val, ensemble_model.predict(meta_feat_dropped))
        
        # Store the performance drop (baseline - new accuracy)
        importances[feature] = baseline_accuracy - new_accuracy

    return importances

# Assuming you have X_train, y_train, X_val, y_val, and the ensemble_model already set up
feature_names = X.columns  # Adjust if you're using a numpy array
X_t_df = pd.DataFrame(X_t,columns = feature_names)
X_val_df = pd.DataFrame(X_val,columns = feature_names)

feature_importances = drop_column_importance(meta_model_full, X_t_df, y_t, X_val_df, y_val, feature_names)

# Convert to a DataFrame for easier viewing
import pandas as pd
importance_df = pd.DataFrame(list(feature_importances.items()), columns=['Feature', 'Importance'])
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display feature importance
print(importance_df)


Dropping feature: match.homeTeam.Total.kicks
Dropping feature: match.homeTeam.Total.handballs
Dropping feature: match.homeTeam.Total.disposals
Dropping feature: match.homeTeam.Total.marks
Dropping feature: match.homeTeam.Total.bounces
Dropping feature: match.homeTeam.Total.tackles
Dropping feature: match.homeTeam.Total.contestedPossessions
Dropping feature: match.homeTeam.Total.uncontestedPossessions
Dropping feature: match.homeTeam.Total.totalPossessions
Dropping feature: match.homeTeam.Total.inside50s
Dropping feature: match.homeTeam.Total.marksInside50
Dropping feature: match.homeTeam.Total.contestedMarks
Dropping feature: match.homeTeam.Total.hitouts
Dropping feature: match.homeTeam.Total.onePercenters
Dropping feature: match.homeTeam.Total.disposalEfficiency
Dropping feature: match.homeTeam.Total.clangers
Dropping feature: match.homeTeam.Total.freesAgainst
Dropping feature: match.homeTeam.Total.rebound50s
Dropping feature: match.homeTeam.Total.turnovers
Dropping feature: match.hom

In [9]:
X_t.shape

(1932, 110)

In [12]:
pd.DataFrame(X_t,columns = feature_names)


Unnamed: 0,match.homeTeam.Total.kicks,match.homeTeam.Total.handballs,match.homeTeam.Total.disposals,match.homeTeam.Total.marks,match.homeTeam.Total.bounces,match.homeTeam.Total.tackles,match.homeTeam.Total.contestedPossessions,match.homeTeam.Total.uncontestedPossessions,match.homeTeam.Total.totalPossessions,match.homeTeam.Total.inside50s,...,Away.Team.Venue.Win.Streak,Away.Team.Form,Home.Win.Streak,CLEAR_NIGHT,MOSTLY_SUNNY,OVERCAST,RAIN,SUNNY,THUNDERSTORMS,WINDY
0,0.008218,0.956199,0.674818,-0.106054,0.574860,0.419987,0.952502,0.261106,0.617220,-0.140363,...,0.294118,0.461538,0.48,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.118153,0.956199,2.466803,1.938499,0.188621,0.910007,0.893811,2.047490,2.197522,1.768899,...,0.411765,0.423077,0.48,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.649572,1.808578,2.217916,1.159622,0.574860,-0.000030,-0.632157,2.100030,1.631444,1.095042,...,0.382353,0.423077,0.48,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.267379,1.169294,0.973482,0.964902,0.381740,-0.210039,-0.045246,1.154298,1.018192,-0.926529,...,0.294118,0.423077,0.48,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.510105,-0.819590,-0.868280,-0.446813,1.926698,-0.070033,-0.338702,-0.894790,-0.939495,-0.814220,...,0.382353,0.461538,0.48,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927,0.872088,-0.357885,0.251711,1.403021,-0.583858,-1.820104,-0.514775,0.444998,0.192661,1.431971,...,0.411765,0.384615,0.56,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1928,-0.855653,-0.073759,-0.544727,-1.177010,-0.776978,0.069973,0.013445,-0.710898,-0.632870,-1.038839,...,0.323529,0.346154,0.32,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1929,0.396959,0.636557,0.674818,-0.008694,-0.004499,0.980009,0.248209,0.444998,0.499287,0.308875,...,0.441176,0.269231,0.44,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1930,-0.337331,1.204810,0.649930,-0.544173,-0.970097,-0.490050,1.069884,0.392457,0.782326,0.758114,...,0.382353,0.615385,0.40,0.0,0.0,0.0,1.0,0.0,0.0,0.0
