In [None]:
import os
import sys
# Absolute path of the directory the notebook is executed from
# (abspath of the empty string resolves to the current working directory).
NB_DIR = os.path.abspath('')

# Let `import` find modules that sit next to the notebook; the guard keeps
# repeated runs of this cell from appending the same entry again.
if sys.path.count(NB_DIR) == 0:
    sys.path.append(NB_DIR)

In [None]:
import json

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, \
    VotingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score

from prettytable import PrettyTable
from tqdm.notebook import tqdm

# --- Our own libraries ---

import feature_extraction as fe

# Reload automatically upon any change in feature_extraction.py
%load_ext autoreload
%autoreload 2

In [None]:
# Load the training data from file into the train_data list.
# The file is JSON Lines: one JSON object (one battle) per line.

train_file_path = os.path.join(NB_DIR, 'data', 'train.jsonl')
train_data = []

# Read and decode the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. extra newlines at the end of the file) are
            # not JSON documents and would make json.loads() raise.
            if not line.strip():
                continue
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))
    print(f'Successfully loaded {len(train_data)} battles.')

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print('Please make sure you have added the competition data to this notebook.')

except IOError:
    print(f"An error occurred while reading the file '{train_file_path}'.")

In [None]:
# OPTIONAL
# Preview the first loaded battle without flooding the cell output.

print("\n--- Structure of the first train battle: ---")
if train_data:
    first_battle = train_data[0]
    full_timeline = first_battle.get('battle_timeline', [])

    # Shallow copy whose timeline is cut to its first 5 entries;
    # first_battle itself is left untouched.
    battle_for_display = {**first_battle, 'battle_timeline': full_timeline[:5]}

    # json.dumps with an indent gives a readable, pretty-printed dump
    print(json.dumps(battle_for_display, indent=4))
    if len(full_timeline) > 5:
        for notice in ("    ...", "    [battle_timeline has been truncated for display]"):
            print(notice)

# Feature Engineering

At this point we have all the training data, decoded from JSON, in the `train_data` list (of nested structures), so it's time to work on the features, using techniques such as feature regularization and selection. All the relevant functions are in the `feature_extraction` library (file `feature_extraction.py`).

In [None]:
# Build `train_df` from the decoded battles via the feature_extraction module.
print('Processing training data...')
train_df = fe.extract_full_features(train_data)

In [None]:
# Optional: preview the first rows of the features dataframe.
train_df.head(n=5)

In [None]:
# Optional: delete bogus line
# https://classroom.google.com/c/MjM1MTYxMzEyMTda/p/ODE1OTEyMTU1OTM3/details?hl=it

# Position (0-based) of the bogus row in the freshly extracted dataframe.
BOGUS_ROW_POSITION = 4877

# The drop is positional and the index is rebuilt afterwards, so running this
# cell a second time would silently delete a different, valid row. A marker
# stored on the dataframe object itself makes the cell safe to re-run; a newly
# extracted dataframe carries no marker and is cleaned again as expected.
# NOTE(review): relies on train_df being a pandas DataFrame (`.attrs`) - confirm.
if train_df.attrs.get('bogus_row_dropped', False):
    print('Bogus row already removed; nothing to do.')
elif len(train_df) <= BOGUS_ROW_POSITION:
    # Fewer rows than expected: there is nothing at that position to remove.
    print(f'Dataframe has only {len(train_df)} rows; no row at position {BOGUS_ROW_POSITION} to remove.')
else:
    train_df.drop(train_df.index[[BOGUS_ROW_POSITION]], inplace=True)
    train_df.reset_index(drop=True, inplace=True)
    train_df.attrs['bogus_row_dropped'] = True

train_df.tail()

In [None]:
# Optional: preview the last rows of the text columns
# (columns whose name contains 'name' or 'type').
text_cols = [col for col in train_df.columns if any(tag in col for tag in ('name', 'type'))]
train_df[text_cols].tail()

# Models Training and Comparison

At this point we have all the selected features in the `train_df` dataframe, so it's time to train the various models on them.

In [None]:
# Encode all the text columns as integer codes.
#
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`
# under the column name. Re-fitting a single shared encoder would keep only
# the last column's mapping, making it impossible to apply the same encoding
# to other data or to map codes back to the original strings.
le = LabelEncoder()  # kept for backward compatibility: ends up as the last fitted encoder
label_encoders = {}

text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]
for col in text_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

In [None]:
# Define our features (X) and target (y)
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X_all = train_df[features]
y_all = train_df['player_won']

# Split FIRST, then fit the scaler on the training part only: fitting it on
# the whole dataset would leak the hold-out rows' mean/variance into training
# and make the hold-out score optimistic.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=0
)

# Any data scaling/regularization goes here
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset under the same fitted transform, kept available for cells
# that work on all rows at once.
train_df_scaled = scaler.transform(X_all)

In [None]:
# PCA (optional)
# The projection is fitted on the training split only and then applied to the
# test split. Both arrays are overwritten, so re-run the split cell before
# running this one again.
pca = PCA(n_components=30)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [None]:
# Models list: every entry is trained and scored by the evaluation cells.
# Uncomment a candidate (or add a new one with its parameters) to include it.
models = [
    HistGradientBoostingClassifier(max_iter=10_000),

    # --- Candidates currently switched off ---
    #LogisticRegression(max_iter=10_000),
    #LogisticRegressionCV(max_iter=10_000),
    #SGDClassifier(max_iter=10_000, tol=1e-3),
    #GaussianNB(),
    #DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5),
    #GradientBoostingClassifier(),
    #BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5),
    #ExtraTreesClassifier(n_estimators=100, random_state=0),
    #LinearSVC(random_state=0,dual=False),
    #RandomForestClassifier(n_estimators=100),
    #KNeighborsClassifier(n_neighbors=20),
    #GaussianProcessClassifier(),
    #AdaBoostClassifier(),
    #MLPClassifier(max_iter=10_000)
]

In [None]:
# Model testing with normal training:
# fit each model on the training split and record its score on the test split.
models_result = []

for model in models:
    model.fit(X_train, y_train)
    holdout_score = model.score(X_test, y_test)
    models_result.append([type(model).__name__, holdout_score])

In [None]:
# Model testing with cross-validation training - Alternative to the above!
models_result = []

# One splitter for all models, so every model is scored on identical folds
# (the fixed random_state makes the folds reproducible).
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

for model in models:
    # Scaling is part of the pipeline, so the scaler is re-fitted on the
    # training portion of each fold only. Scoring on data that was scaled
    # with statistics of the full dataset would leak validation rows into
    # training and inflate the scores.
    pipeline = make_pipeline(StandardScaler(), model)
    cv_results = cross_val_score(pipeline, train_df[features], train_df['player_won'], cv=cv)
    print(f'{model.__class__.__name__:<32} mean: {cv_results.mean():.3f}\tmin: {cv_results.min():.3f}\tmax: {cv_results.max():.3f}')
    models_result.append([model.__class__.__name__, cv_results.mean()])

In [None]:
# Summary table: accuracy as a percentage (3 decimals), lowest score first.
table_rows = [[name, round(score * 100, 3)] for name, score in models_result]
table_rows.sort(key=lambda row: row[1])

results_table = PrettyTable()
results_table.field_names = ['Model Name', 'Accuracy']
for column, alignment in (('Model Name', 'r'), ('Accuracy', 'l')):
    results_table.align[column] = alignment
results_table.add_rows(table_rows)
print(results_table)

In [None]:
# Base learners for the ensemble cells.
clf1, clf2, clf3, clf4, clf5 = (
    HistGradientBoostingClassifier(max_iter=10_000),
    LogisticRegressionCV(),
    AdaBoostClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
)

# (label, estimator) pairs in the form the ensemble constructors expect.
# NOTE(review): the labels are free-form identifiers; 'GBC' is attached to the
# LogisticRegressionCV instance - consider renaming if it is misleading.
estimators = list(zip(('HGBC', 'GBC', 'A', 'B', 'C'), (clf1, clf2, clf3, clf4, clf5)))

In [None]:
# ENSEMBLE - Voting
# Fit a hard-voting ensemble of the base estimators on the training split,
# then display its score on the test split.
eclf = VotingClassifier(voting='hard', estimators=estimators).fit(X_train, y_train)
eclf.score(X_test, y_test)

In [None]:
# ENSEMBLE - Stacking
# Fit a stacking ensemble (base estimators + a HistGradientBoostingClassifier
# as final estimator) on the training split, then display its test score.
eclf = StackingClassifier(
    final_estimator=HistGradientBoostingClassifier(),
    estimators=estimators,
).fit(X_train, y_train)
eclf.score(X_test, y_test)