In [None]:
import os
import sys
# Absolute path of the current working directory (os.path.abspath('')
# resolves the empty path against the CWD).
NB_DIR = os.path.abspath('')

# Make any module located in NB_DIR import-able. The membership check keeps
# a re-run of this cell from appending duplicates to sys.path.
if NB_DIR not in sys.path:
    sys.path.append(NB_DIR)

In [None]:
import json
import yaml
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, \
    VotingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score

from prettytable import PrettyTable
from tqdm.notebook import tqdm

# --- Our own libraries ---

import feature_extraction as fe
import buffer

# Reload automatically upon any change in feature_extraction.py
%load_ext autoreload
%autoreload 2

In [None]:
# Number of CPU cores sklearn may use in parallel; passed as `n_jobs`
# to the sklearn calls below that accept it.
N_JOBS = 8

In [None]:
# Load the training data from file into the train_data list.

train_file_path = os.path.join(NB_DIR, 'data', 'train.jsonl')
train_data = []

# Read and decode the file line by line (JSON Lines: one JSON object per line).
print(f"Loading data from '{train_file_path}'...")
try:
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file): json.loads() would raise on them.
            if not line.strip():
                continue
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))
    print(f'Successfully loaded {len(train_data)} battles.')

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print('Please make sure you have added the competition data to this notebook.')

except IOError:
    print(f"An error occurred while reading the file '{train_file_path}'.")

In [None]:
# OPTIONAL
# Quick look at the battle data that was just loaded.

MOVES_TO_DISPLAY = 30

print("\n--- Structure of the first train battle: ---")
if train_data:
    first_battle = train_data[0]

    # Shallow copy of the battle with a shortened timeline: keeps the output
    # readable and leaves the loaded battle itself untouched.
    battle_for_display = {
        **first_battle,
        'battle_timeline': first_battle.get('battle_timeline', [])[:MOVES_TO_DISPLAY],
    }

    # yaml.dump is used only as a pretty-printer for the dictionary.
    print(yaml.dump(battle_for_display, indent=4))
    if MOVES_TO_DISPLAY < len(first_battle.get('battle_timeline', [])):
        print("    ...")
        print("    [battle_timeline has been truncated for display]")

In [None]:
# EXPERIMENTS ON PLAYERS' STATUSES DURING A BATTLE!
import pandas as pd

STATUS = 'nostatus'
STATUSES = ['nostatus', 'frz', 'par', 'slp', 'fnt', 'tox', 'psn', 'brn']
STATUSES_affected = ['frz', 'par', 'slp', 'fnt', 'tox', 'psn', 'brn']

# One dict of counters per battle. Keys:
#   'p1_<status>' / 'p2_<status>' : timeline entries in which that player's
#                                   pokemon state reported <status>
#   '<status1>_<status2>'         : timeline entries with that (p1, p2) combination
#   'winner'                      : int(battle['player_won'])
battles_ = []
for battle in train_data:
    player_status_count = {}
    for move in battle['battle_timeline']:
        # Only count entries for which both players have move details.
        if not (move['p1_move_details'] and move['p2_move_details']):
            continue

        status_1 = move['p1_pokemon_state']['status']
        status_2 = move['p2_pokemon_state']['status']

        for key in (f'p1_{status_1}', f'p2_{status_2}', f'{status_1}_{status_2}'):
            player_status_count[key] = player_status_count.get(key, 0) + 1

    player_status_count['winner'] = int(battle['player_won'])
    battles_.append(player_status_count)

# A counter missing from a battle means "never happened": fill with 0.
statuses_df = pd.DataFrame(battles_).fillna(0)

# 'winner' first, then every counter column in alphabetical order. 'winner' is
# excluded by name rather than by assuming it is the last one after sorting.
reordered_col_names = ['winner'] + sorted(col for col in statuses_df.columns if col != 'winner')
stat_df = statuses_df[reordered_col_names].astype(int)

# A status that never occurs in the data produces no column at all: make sure
# the counters used below exist (as all zeros) instead of raising KeyError.
for col in ('p1_psn', 'p2_psn'):
    if col not in stat_df:
        stat_df[col] = 0

# Statistics!
# Number of battles with winner == 1 in which the p1_psn counter is larger
# than the p2_psn counter.
int(((stat_df['p1_psn'] > stat_df['p2_psn']) & (stat_df['winner'] == 1)).sum())

# Feature Engineering

At this point we have all the training data, decoded from JSON, in the `train_data` list (of nested structures), so it's time to work on the features, using techniques such as feature regularization and feature selection. All the relevant functions are in the `feature_extraction` library (file `feature_extraction.py`).

In [None]:
# Thin wrapper, defined so that feature extraction can be re-run in batch.
def extract_features(fun, train_data):
    """Apply the extractor `fun` to `train_data` and return its result.

    Progress messages and the shape of the result are printed along the way.
    """
    print('Processing training data...', end=' ')
    features_df = fun(train_data)
    print('Done!')
    print(features_df.shape)
    return features_df

# Build the features dataframe with the minimal extractor from feature_extraction.py.
train_df = extract_features(fe.extract_features_minimal, train_data)

In [None]:
# Optional: sneak a peek into the features dataframe.
# NOTE(review): a notebook cell only renders its last expression, so the result
# of the first head() below is computed and then discarded; wrap it in
# display() if both previews are wanted.
train_df.head()
train_df[['nostatus_fnt_diff']].head()

In [None]:
# Optional: delete bogus line
# https://classroom.google.com/c/MjM1MTYxMzEyMTda/p/ODE1OTEyMTU1OTM3/details?hl=it
def remove_bogus_line(train_df, line=4877):
    """Drop, in place, the row whose index label sits at position `line`.

    The index is renumbered afterwards (also in place) and the same, mutated
    dataframe is returned.
    """
    bogus_labels = train_df.index[[line]]
    train_df.drop(index=bogus_labels, inplace=True)
    train_df.reset_index(inplace=True, drop=True)
    return train_df

# NOTE(review): not idempotent. remove_bogus_line() mutates train_df in place,
# so every re-run of this cell drops whichever row is at position 4877 then.
remove_bogus_line(train_df, 4877)

In [None]:
# Print the raw battle stored at index 4877 of train_data (the same position
# that is passed to remove_bogus_line()).
# NOTE(review): presumably this is the battle behind the dropped feature row;
# that assumes one feature row per battle, in the same order — confirm.
# yaml is not used in this cell.
import yaml
print(json.dumps(train_data[4877], indent=4))

In [None]:
# Optional: preview the text columns, i.e. every column whose name
# contains 'name' or 'type'.
text_cols = [col for col in train_df.columns if any(tag in col for tag in ('name', 'type'))]
train_df[text_cols].tail()

# Models Training and Comparison

At this point we have all the selected features in the `train_df` dataframe, so it's time to train the various models on them.

In [None]:
# Encode all the text columns as integer labels.
# One LabelEncoder is kept per column (in `label_encoders`) so that every
# fitted mapping can later be inverted or re-applied to other data. A single
# shared encoder is refit on each column and only remembers the last one.
text_cols = [col for col in train_df.columns if 'name' in col or 'type' in col]

label_encoders = {}
for col in text_cols:
    label_encoders[col] = LabelEncoder()
    train_df[col] = label_encoders[col].fit_transform(train_df[col])

# `le` is kept for backward compatibility: as before, it ends up being the
# encoder fitted on the last text column (or an unfitted one if there is none).
le = label_encoders[text_cols[-1]] if text_cols else LabelEncoder()

In [None]:
def define_X_y(train_df):
    """Return the standardized feature matrix built from `train_df`.

    Every column except 'battle_id' and 'player_won' is treated as a feature
    and run through a StandardScaler fitted on those same columns. Only the
    features (X) are returned; the target is not.
    """
    non_feature_cols = ('battle_id', 'player_won')
    features = [col for col in train_df.columns if col not in non_feature_cols]

    # Any data scaling/regularization goes here
    feature_values = train_df[features]
    scaler = StandardScaler()
    scaler.fit(feature_values)

    # Copy of the scaled data (use train_df[features] instead to skip scaling).
    X_train = scaler.transform(feature_values).copy()
    return X_train

X_train = define_X_y(train_df)

In [None]:
# Alternative to the above (no scaling!)
# `features` only exists as a local variable inside define_X_y(), so the list
# of feature columns is rebuilt here; without it this cell raises NameError.
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X_train = train_df[features].copy()

In [None]:
# Optional: keep only the 40 best-scoring features.
# The selector is fitted against the 'player_won' target and then applied
# to the very same matrix.
X_train_reduced_features = (
    SelectKBest(k=40)
    .fit(X_train, train_df['player_won'])
    .transform(X_train)
)

X_train = X_train_reduced_features

In [None]:
# Split the data into train and test subsets.
def split(X_train, train_df, test_size=0.2, shuffle=True, random_state=None):
    """Split the features and the 'player_won' target into train/test subsets.

    Args:
        X_train: feature matrix, row-aligned with `train_df`.
        train_df: dataframe providing the target column 'player_won'.
        test_size: forwarded to train_test_split.
        shuffle: whether to shuffle before splitting. It used to be accepted
            but ignored; it is now forwarded to train_test_split.
        random_state: optional seed for a reproducible split. The default
            (None) keeps the previous, unseeded behaviour.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_train,
        train_df['player_won'],
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split(X_train, train_df, 0.2)

In [None]:
# PCA (optional)
# The projection is fitted on the training split only and then applied,
# unchanged, to the test split.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell
# feeds already-projected data to a new PCA.
pca = PCA(n_components=30)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
# Models list (just add models with their parameters!)
# Full catalogue of candidate classifiers. Note that `models` is reassigned
# by the set_model() cell that follows, so run that cell only when the
# shorter list is wanted.
models = [
    LogisticRegression(max_iter=10_000),
    LogisticRegressionCV(max_iter=10_000),
    SGDClassifier(max_iter=10_000, tol=1e-3),
    GaussianNB(),
    DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(max_iter=10_000),
    RidgeClassifierCV(),
    ExtraTreesClassifier(n_estimators=200, random_state=0),
    LinearSVC(random_state=0, dual=False),
    RandomForestClassifier(n_estimators=200),
    KNeighborsClassifier(n_neighbors=100),
    GaussianProcessClassifier(),
    AdaBoostClassifier(),
    MLPClassifier(max_iter=10_000)
]

In [None]:
def set_model():
    """Return the list of models to evaluate.

    Currently a single HistGradientBoostingClassifier.
    """
    return [HistGradientBoostingClassifier(max_iter=5_000)]

models = set_model()

In [None]:
# Model testing with normal training

def test_models(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training split and score it on the test split.

    The best model and its accuracy are printed.

    Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target used for scoring.
        models: iterable of unfitted sklearn-style classifiers (fitted in place).

    Returns:
        (best_model, models_result): the fitted model with the highest
        accuracy (None if `models` is empty or no model scores above 0) and a
        list of [class name, accuracy] pairs, one per model, in input order.
    """
    models_result = []
    best_score = 0
    best_model = None

    for model in models:
        model.fit(X_train, y_train)
        score = accuracy_score(y_test, model.predict(X_test))
        if score > best_score:
            best_score, best_model = score, model
        models_result.append([model.__class__.__name__, score])

    print(best_model, '\t', best_score)

    # Nothing used to be returned, which left `best_model` and `models_result`
    # undefined for the cells that read them afterwards.
    return best_model, models_result

best_model, models_result = test_models(X_train, y_train, X_test, y_test, models)

In [None]:
# Accuracy table (in percent), sorted from the worst to the best model.
results_table = PrettyTable()
results_table.field_names = ['Model Name', 'Accuracy']
results_table.align['Model Name'] = 'r'
results_table.align['Accuracy'] = 'l'
results_table.add_rows(
    sorted(
        ([name, round(score * 100, 3)] for name, score in models_result),
        key=lambda row: row[1],
    )
)
print(results_table)

In [None]:
# Model testing with cross-validation training - Alternative to the above!
models_result_cross = []

# The same seeded splitter is used for every model: 5 random 80/20 splits.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

for model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=N_JOBS)
    print(f'{model.__class__.__name__:<32} mean: {cv_results.mean():.3f}\tmin: {cv_results.min():.3f}\tmax: {cv_results.max():.3f}')
    # Keep [class name, mean score] for later comparison.
    models_result_cross.append([model.__class__.__name__, cv_results.mean()])

In [None]:
# Bar chart of the absolute model coefficients, one bar per feature.
# NOTE(review): this cell has preconditions that are easy to miss:
#  - it needs a notebook-level `best_model`; make sure the result of
#    test_models() is bound to that name before running it;
#  - it needs a model exposing `coef_`; presumably the
#    HistGradientBoostingClassifier from set_model() does not — confirm;
#  - `X_train` is by now the output of the scaling/selection/PCA steps, which
#    presumably is a plain array without a `feature_names` attribute — confirm.
importance = np.abs(best_model.coef_)
feature_names = np.array(X_train.feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()

In [None]:
# Fit a feature selector driven by a HistGradientBoostingClassifier.
# NOTE(review): SelectFromModel presumably needs the estimator to expose
# importances (coef_ / feature_importances_); confirm this one does, otherwise
# the later selector.get_support() call cannot work.
selector = SelectFromModel(estimator=HistGradientBoostingClassifier(max_iter=10_000)).fit(X_train, y_train)

In [None]:
# Reference model fitted on the training split; it is the model inspected by
# the permutation-importance cell that follows.
fitted_model = HistGradientBoostingClassifier(max_iter=10_000).fit(X_train, y_train)

In [None]:
# Permutation importance of every feature for the fitted model, measured on
# the training split (3 repeats per feature, fixed seed).
# The notebook-wide N_JOBS setting replaces the previously hard-coded core count.
result = permutation_importance(fitted_model, X_train, y_train, n_repeats=3, random_state=0, n_jobs=N_JOBS)

In [None]:
# Show the `importances_mean` array of the permutation-importance result
# (presumably one mean value per feature over the 3 repeats — confirm).
result.importances_mean

In [None]:
# These are the most important features, according to permutation_importance().
#('p1_hp_pct_shortage', 'p1_players_with_low_hp_pct')

In [None]:
# Print the model coefficients and the boolean mask of the features kept by
# the selector.
# NOTE(review): `fitted_model` is a HistGradientBoostingClassifier; presumably
# it has no `coef_` attribute, in which case the first line raises
# AttributeError — confirm.
importance = fitted_model.coef_
features_mask = selector.get_support()
print(importance)
print(features_mask)

In [None]:
# ENSEMBLE - Set up estimators (models) to use
# List of (name, estimator) pairs shared by the Voting and Stacking cells
# below. The commented-out entries are currently excluded from the ensembles.

estimators = [
    ('DTC', DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=6)),
    ('LR', LogisticRegression(warm_start=True)),
    #('RC', RidgeClassifierCV()),
    #('LSVC', LinearSVC(random_state=0, dual=False)),
    ('LRCV', LogisticRegressionCV()),
    ('ETC', ExtraTreesClassifier(warm_start=True, n_estimators=200, random_state=0)),
    ('RFC', RandomForestClassifier(warm_start=True, n_estimators=200)),
    ('ABC', AdaBoostClassifier()),
    ('GBC', GradientBoostingClassifier(warm_start=True)),
    ('HGBC', HistGradientBoostingClassifier(warm_start=True, max_iter=10_000))
]

In [None]:
# ENSEMBLE - Voting
# Soft-voting ensemble over `estimators`, fitted on the training split;
# the cell output is its score on the test split.
ev_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=N_JOBS,
).fit(X_train, y_train)
ev_clf.score(X_test, y_test)

In [None]:
# ENSEMBLE - Stacking
# Stacked ensemble over `estimators`, with a LogisticRegressionCV as the
# final estimator; the cell output is its score on the test split.
es_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegressionCV(),
    n_jobs=N_JOBS,
).fit(X_train, y_train)
es_clf.score(X_test, y_test)

In [None]:
# ENSEMBLE - Bagging
# Bagging ensemble built on LogisticRegressionCV, fitted on the training
# split; the cell output is its score on the test split.
eb_clf = BaggingClassifier(
    LogisticRegressionCV(),
    n_jobs=N_JOBS,
).fit(X_train, y_train)
eb_clf.score(X_test, y_test)

In [None]:
# Run the full pipeline at once!
# (Once the corresponding cells have been executed at least once).
# Steps: extract features -> drop the bogus row -> scale -> train/test split
# -> build the model list -> fit and score.
# NOTE(review): the label-encoding, SelectKBest and PCA cells are not part of
# this pipeline. If extract_features_minimal yields text columns, scaling
# them presumably fails — confirm.

train_df = extract_features(fe.extract_features_minimal, train_data)
remove_bogus_line(train_df, 4877)
X_train = define_X_y(train_df)
X_train, X_test, y_train, y_test = split(X_train, train_df, 0.2, shuffle=True)
models = set_model()
test_models(X_train, y_train, X_test, y_test, models)