In [None]:
"""TODO
- EDA
- error analysis
    - look at the type1 vs type2 errors
    - manually try to interpret what is it failing to see
- find ways to reduce dimensionality
    - unsupervised learning
    - feature engineering
        - PCA
        - umap
- mine more data
"""

In [None]:
%reload_ext autoreload
%autoreload 2
import sys  
print(sys.executable)
import numpy
import pandas
import seaborn
from platform import system

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


In [None]:
# Load Dataset
# Get file path
if system() == "Windows":
    filepath = "C:\\Users\\Beau\\Desktop\\ML\\faker\\merged9.csv"
# elif system() == "Linux":
# filepath = ""
else:
    # Raise instead of calling exit(): in a notebook exit() asks the kernel to
    # exit rather than reporting an error, and `filepath` would be left
    # undefined for the cells below.
    raise RuntimeError("Unfamiliar OS. Cannot set file path to csv file.")

# Load csv into `dataframe`, the raw table every later cell reads from.
print(f'Loading "{filepath}"')
dataframe = pandas.read_csv(filepath)
print("Finish loading.")
print(dataframe.shape)

In [None]:
# # IF YOU WANT TO BALANCE THE TARGET
# num_wins = dataframe['W/L'].value_counts()[0]
# num_losses = dataframe['W/L'].value_counts()[1]
# print(dataframe['W/L'].value_counts())

# X = dataframe[dataframe['W/L'] == "Win"].sample(num_losses, random_state=0)
# dataframe = pandas.concat([X, dataframe[dataframe['W/L'] == "Loss"]])
# print(dataframe.shape)

In [None]:
# cols_to_use = ['Side', 'tournament_curr_win_percentage', 'teams_region', 'teammate_top_champion','teammate_jungle_champion','teammate_mid_champion','teammate_adc_champion','teammate_support_champion','opponent_top_champion','opponent_jungle_champion','opponent_mid_champion','opponent_adc_champion','opponent_support_champion']
# Feature columns: three match-context columns followed by one column per
# lane role for the teammate side and then for the enemy side.
_context_cols = ['Side', 'tournament_curr_win_percentage', 'teams_region']
_lane_roles = ['top', 'jungle', 'mid', 'adc', 'support']
cols_to_use = (
    _context_cols
    + [f'teammate_role_{role}' for role in _lane_roles]
    + [f'enemy_role_{role}' for role in _lane_roles]
)

# Feature matrix restricted to the selected columns.
X = dataframe[cols_to_use]

# Raw target labels; encoded to 0/1 further down in this cell.
y = dataframe["W/L"]
def transform_game_result(game_result):
    """Encode a single "W/L" label as a binary target.

    Args:
        game_result: Raw label from the "W/L" column. Matched
            case-insensitively against "win" and "loss".

    Returns:
        1 for a win, 0 for a loss.

    Raises:
        ValueError: If the value is not a string (for example a NaN from an
            empty cell) or is any string other than "win"/"loss".
    """
    # Non-string values have no .lower(); report them with the same
    # ValueError as any other invalid label instead of an AttributeError.
    if not isinstance(game_result, str):
        raise ValueError(f'"W/L" column has invalid values: {game_result!r}')

    normalized = game_result.lower()
    if normalized == "win":
        return 1
    if normalized == "loss":
        return 0
    raise ValueError(f'"W/L" column has invalid values: {game_result!r}')
# Encode the target element-wise with Series.map. Series.transform retries a
# failing callable on the whole Series, which would replace the error raised
# for a bad label with an unrelated "'Series' object has no attribute 'lower'".
y = y.map(transform_game_result)

In [None]:

# Hold out 20% of the rows as a test set. No separate validation split is
# made here; cross-validation is done on the training portion by GridSearchCV.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Bucket the feature columns by dtype in a single pass.
_numeric_dtypes = ["int64", "float64"]
categorical_cols = []
numerical_cols = []
for _col in X_train.columns:
    _dtype = X_train[_col].dtype
    if _dtype == "object":
        categorical_cols.append(_col)
    if _dtype in _numeric_dtypes:
        numerical_cols.append(_col)

print(f"categorical_cols = {categorical_cols}\n")
print(f"numerical_cols = {numerical_cols}")

In [None]:
# Report the (rows, columns) of the raw training features, before any encoding.
print(f"Before the pipeline, the shape of X_train is {X_train.shape}")

In [None]:
# One-hot encode the categorical features. Categories that were not seen
# during fit are ignored at transform time (handle_unknown="ignore").
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown="ignore"))]
)

# NOTE(review): only `categorical_cols` are routed through the transformer and
# ColumnTransformer drops unlisted columns by default (remainder="drop"), so
# the columns in `numerical_cols` do not reach the models - confirm intended.
_column_transformers = [('cat', categorical_transformer, categorical_cols)]
preprocessor = ColumnTransformer(transformers=_column_transformers)

In [None]:
# Establish baseline performance
# NOTE(review): this constant is not referenced by any other cell visible in
# this notebook; presumably a score to beat (metric not stated here) - confirm
# against baseline_performance.txt.
BASELINE_PERFORMANCE = 0.65  # taken from baseline_performance.txt

In [None]:
# Base estimators with fixed seeds; the next cell wraps each one together with
# an under-sampler, and the grid search later overrides their hyperparameters.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
xgb_model = XGBClassifier(random_state=0)

In [None]:
# Undersampling
under_sampler = RandomUnderSampler(random_state=0)


def _with_undersampling(estimator):
    """Put `estimator` behind the under-sampler, wrapping at most once.

    Re-running this cell used to wrap an already-wrapped pipeline again,
    which changes the nested step names and invalidates the
    "model__randomforestclassifier__..." / "model__xgbclassifier__..." keys
    used in the parameter grids.

    Args:
        estimator: A bare classifier, or a pipeline produced by an earlier
            run of this cell.

    Returns:
        The estimator unchanged if it is already a pipeline, otherwise a
        two-step pipeline of (under_sampler, estimator).
    """
    # The object returned by imblearn's make_pipeline is expected to be an
    # instance of sklearn's Pipeline (imblearn's Pipeline subclasses it).
    if isinstance(estimator, Pipeline):
        return estimator
    return make_pipeline(under_sampler, estimator)


rf_model = _with_undersampling(rf_model)
xgb_model = _with_undersampling(xgb_model)

In [None]:
# def create_nn_model():
#     nn_model = keras.Sequential([
#         layers.Dense(512, activation='relu', input_shape=[559]),
#         layers.Dropout(rate=0.5),
#         layers.BatchNormalization(),
#         layers.Dense(512, activation='relu'),
#         layers.Dropout(rate=0.5),
#         layers.BatchNormalization(),
#         # layers.Dense(512, activation='relu'),
#         # layers.Dropout(rate=0.1),
#         # layers.BatchNormalization(),
#         # layers.Dense(512, activation='relu'),
#         # layers.Dropout(rate=0.1),
#         # layers.BatchNormalization(),
#         layers.Dense(1, activation='sigmoid'),
#     ])
#     nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
#     return nn_model
# early_stopping = keras.callbacks.EarlyStopping(
#     patience=30,
#     min_delta=0.001,
#     restore_best_weights=True,
# )

In [None]:
# Full pipelines: the shared preprocessing step followed by each model.
# The step names 'preprocessor' and 'model' are what the grid keys refer to.
rf_pipeline, xgb_pipeline = (
    Pipeline(steps=[('preprocessor', preprocessor), ('model', _estimator)])
    for _estimator in (rf_model, xgb_model)
)

In [None]:
# nn_model = KerasClassifier(build_fn=create_nn_model, verbose=0)
# nn_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', nn_model)
# ])

In [None]:
# vscode the window has crashed reason oom 536870904
# rf_params = [{
#     "model__n_estimators": [25, 50, 75],
#     "model__max_depth": [5, 10, 15, 20, 25, 30],
#     "model__max_samples": [0.2, 0.4, 0.6, 0.8],

#     # "max_leaf_nodes": []
# }]
# IF RF_MODEL IS A PIPELINE, NEED TO ADD PREFIXES
# Grid keys follow "<outer step>__<inner step>__<parameter>": 'model' is the
# outer pipeline step and the inner step name is the lower-cased class name
# that make_pipeline assigns.
_RF_PREFIX = "model__randomforestclassifier__"
rf_params = [{
    _RF_PREFIX + _name: _values
    for _name, _values in (
        ("n_estimators", [15, 25, 35]),
        ("max_depth", [3, 5, 8]),
        ("max_samples", [0.5, 0.6, 0.7]),
        # "max_leaf_nodes": []
    )
}]

# IF XGB_MODEL IS A PIPELINE, NEED TO ADD PREFIXES
_XGB_PREFIX = "model__xgbclassifier__"
xgb_params = [{
    _XGB_PREFIX + _name: _values
    for _name, _values in (
        ("alpha", [1]),
        ("lambda", [1]),
        ("learning_rate", [0.5]),
        ("max_delta_step", [3]),
        ("max_depth", [5]),
        ("min_child_weight", [10]),
        ("min_split_loss", [1]),
        ("subsample", [0.5]),
    )
}]
# xgb_params = [{
#     'model__alpha': [1],
#     'model__lambda': [1],
#     'model__learning_rate': [0.5],
#     'model__max_delta_step': [3],
#     'model__max_depth': [5],
#     'model__min_child_weight': [10],
#     'model__min_split_loss': [1],
#     'model__subsample': [0.5]
# }]
# ~10 mins
# xgb_params = [{
#     # Prevents overfitting
#     "model__learning_rate": [0.5, 0.7],
#     "model__max_depth": [5, 10,15],
#     "model__subsample": [0.5, 0.6],
#     # More conservative algorithm
#     "model__min_split_loss": [1, 10],
#     "model__min_child_weight": [1, 10],
#     "model__max_delta_step": [3, 9],
#     "model__lambda": [1, 10],
#     "model__alpha": [1, 10],
#     # Other
#     # "max_leaves": [],
# }]
# # This crashes my laptop
# xgb_params = [{
#     # Prevents overfitting
#     "model__learning_rate": [0.5, 0.7, 0.9],
#     "model__max_depth": [5, 10,15],
#     "model__subsample": [0.4, 0.6, 0.8],
#     # More conservative algorithm
#     "model__min_split_loss": [0.1, 1, 10],
#     "model__min_child_weight": [0.1, 1, 10],
#     "model__max_delta_step": [3, 6, 9],
#     "model__lambda": [0.1, 1, 10],
#     "model__alpha": [0.1, 1, 10],
#     # Other
#     # "max_leaves": [],
# }]
# # Overnight: ~6.25 hours
# xgb_params = [{
#     # Prevents overfitting
#     "model__learning_rate": [0.3, 0.5, 0.7, 0.8, 0.9],
#     "model__max_depth": [5, 10, 15, 20, 25, 30],
#     "model__subsample": [0.2, 0.4, 0.6, 0.8],
#     # More conservative algorithm
#     "model__min_split_loss": [0.01, 0.1, 1, 10, 100],
#     "model__min_child_weight": [0.01, 0.1, 1, 10, 100],
#     "model__max_delta_step": [3, 6, 9],
#     "model__lambda": [0.01, 0.1, 1, 10, 100],
#     "model__alpha": [0.01, 0.1, 1, 10, 100],
#     # Other
#     # "max_leaves": [],
# }]

def calculate_runtime(params, seconds_per_model=0.25, cv_folds=5):
    """Estimate the number of model fits in a grid search and its duration.

    For reference:
    Every 36,000 permutations => 1 hour at 0.1 secs/model
    5 hours at 0.1 secs/model is 180,000 total permutations
    6 hours at 0.1 secs/model is 216,000 total permutations
    7 hours at 0.1 secs/model is 252,000 total permutations
    8 hours at 0.1 secs/model is 288,000 total permutations

    Args:
        params: A parameter grid (dict mapping a parameter name to its list of
            candidate values) or a list of such grids. With several grids the
            candidate counts are added together, since each grid is searched
            separately.
        seconds_per_model: Assumed wall-clock seconds for one fit.
        cv_folds: Number of cross-validation folds each candidate is fit on.

    Returns:
        A tuple ``(total_permutations, minutes)``: the number of fits
        (candidates x folds) and the estimated duration in minutes.
    """
    # Accept a bare dict as well as the list-of-dicts form.
    grids = [params] if isinstance(params, dict) else params

    total_candidates = 0
    for grid in grids:
        combinations = 1
        for values in grid.values():
            combinations *= len(values)
        total_candidates += combinations

    total_permutations = total_candidates * cv_folds
    minutes = (total_permutations * seconds_per_model) / 60

    return total_permutations, minutes

# Estimate the search cost for both grids before launching them.
rf_permutations, rf_runtime = calculate_runtime(rf_params)
xgb_permutations, xgb_runtime = calculate_runtime(xgb_params)
# The RF message previously omitted the " hours" unit that the XGB message has.
print(f"RF - GridSearchCV has {rf_permutations} permutations and will take {rf_runtime:.2f} minutes (={rf_runtime/60:.2f} hours)")
print(f"XGB - GridSearchCV has {xgb_permutations} permutations and will take {xgb_runtime:.2f} minutes (={xgb_runtime/60:.2f} hours)")


In [None]:
# Both searches use the same settings: F1 scoring, 5-fold cross-validation
# and per-fit progress output.
_search_settings = dict(scoring='f1', cv=5, verbose=2)

rf = GridSearchCV(rf_pipeline, param_grid=rf_params, **_search_settings)
xgb = GridSearchCV(xgb_pipeline, param_grid=xgb_params, **_search_settings)

In [None]:
# rf_pipeline.fit(X_train, y_train)
# Run the random-forest grid search on the training split.
rf.fit(X_train, y_train)

In [None]:
# Best cross-validated score (scoring='f1') and the parameters that achieved it.
print(rf.best_score_)
print(rf.best_params_)

In [None]:
# xgb_pipeline.fit(X_train, y_train)
# Run the XGBoost grid search on the training split.
xgb.fit(X_train, y_train)

In [None]:
# X_temp = numpy.asarray(X_train).astype(numpy.float32)
# nn_pipeline.fit(
#     X_temp, y_train,
#     model__validation_data=(X_test, y_test),
#     model__batch_size=64,
#     model__epochs=1000,
#     model__callbacks=[early_stopping],
#     # model__verbose=0, # hide the output because we have so many epochs
# )

In [None]:
# rf_predict_train = rf_pipeline.predict(X_train)
# rf_predict_test = rf_pipeline.predict(X_test)
# Predict with the fitted RF search on both splits so train and test F1 can
# be compared.
rf_predict_train = rf.predict(X_train)
rf_predict_test = rf.predict(X_test)

In [None]:
# Binary F1 of the RF predictions on the training split, shown as a percentage.
rf_train_f1_score = f1_score(y_true=y_train, y_pred=rf_predict_train, average="binary")
_rf_train_pct = rf_train_f1_score * 100
print(f"The F1 Score for the RF model on the training set is: {_rf_train_pct:.2f}%")

# Same metric on the held-out test split.
rf_test_f1_score = f1_score(y_true=y_test, y_pred=rf_predict_test, average="binary")
_rf_test_pct = rf_test_f1_score * 100
print(f"The F1 Score for the RF model on the test set  is: {_rf_test_pct:.2f}%")

In [None]:
# xgb_predict_train = xgb_pipeline.predict(X_train)
# xgb_predict_test = xgb_pipeline.predict(X_test)
# Predict with the fitted XGB search on both splits, then show its best
# cross-validated score and the parameters that achieved it.
xgb_predict_train = xgb.predict(X_train)
xgb_predict_test = xgb.predict(X_test)
print(xgb.best_score_)
print(xgb.best_params_)

In [None]:
# Binary F1 of the XGB predictions on the training split, shown as a percentage.
xgb_train_f1_score = f1_score(y_true=y_train, y_pred=xgb_predict_train, average="binary")
_xgb_train_pct = xgb_train_f1_score * 100
print(f"The F1 Score for the XGB model on the training set is: {_xgb_train_pct:.2f}%")

# Same metric on the held-out test split.
xgb_test_f1_score = f1_score(y_true=y_test, y_pred=xgb_predict_test, average="binary")
_xgb_test_pct = xgb_test_f1_score * 100
print(f"The F1 Score for the XGB model on the test set  is: {_xgb_test_pct:.2f}%")

In [None]:
# import joblib
# rf_file = "rf.pkl"
# rf.estimator[0].transformers[0][1]
# rf = joblib.load(rf_file)
# rf_predict_train = rf.predict(X_train)
# rf_predict_test = rf.predict(X_test)
# rf_train_f1_score = f1_score(y_train, rf_predict_train, average="binary")
# rf_test_f1_score = f1_score(y_test, rf_predict_test, average="binary")

In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Balanced class weights for the encoded target, ordered as [win (1), loss (0)].
# `classes` is documented as an ndarray and newer scikit-learn releases
# validate it, so pass an array rather than a plain list.
foo = compute_class_weight('balanced', classes=numpy.array([1, 0]), y=y)

In [None]:
# Display the class weights computed in the previous cell.
foo

Save Model

In [None]:
# Import the local save_model module and force a reload so that edits made to
# it since the first import are picked up. (%autoreload 2 is also enabled in
# the first code cell.)
import importlib
import save_model
importlib.reload(save_model)

In [None]:
# To prove that undersampling is helping
# the std should be very small (comparing with and without undersampling)

# cv_results = rf.cv_results_["split0_test_score"].mean()
cv_results = rf.cv_results_
# Take the fold count from the fitted search instead of hard-coding 5, so this
# cell stays correct if the `cv` argument of the search is changed.
num_folds = rf.n_splits_
# NOTE(review): each "split{i}_test_score" array holds one score per
# hyperparameter candidate, so the mean/std below are taken across candidates
# within a fold, not across folds for one model - confirm this is the
# comparison intended.
for fold in range(num_folds):
    curr = f"split{fold}_test_score"
    fold_scores = cv_results[curr]
    print(
        f"F1 mean +/- std. dev.: for split={curr}: "
        f"{fold_scores.mean():.3f} +/- "
        f"{fold_scores.std():.3f}"
    )

In [None]:
# Hand the fitted RF search, its grid, test F1 and a description to the
# project's save_model helper. NOTE(review): what the helper does with
# X_train and where it writes "rf_4.pkl" is not visible here - see save_model.py.
save_model.save_model(rf, "rf_4.pkl", X_train, rf_params[0], rf_test_f1_score, description="[supposedly] using undersampling")

In [None]:
# Hand the fitted XGB search, its grid, test F1 and a description to the
# project's save_model helper. NOTE(review): the helper's behavior is not
# visible here - see save_model.py.
save_model.save_model(xgb, "xgb_3.pkl", X_train, xgb_params[0], xgb_test_f1_score, description="using undersampling")

In [None]:
# rf_pipeline[0] is the preprocessor object, which has the function get_feature_names_out
# encoded_features = rf_pipeline[0].get_feature_names_out()

In [None]:
# # This cell did not work initially. VSCode might need to reboot to recognize the newly downloaded graphviz
# estimator = rf_pipeline[1].estimators_[5]

# from sklearn.tree import export_graphviz
# # Export as dot file
# export_graphviz(estimator, out_file='tree.dot', 
#                 feature_names = encoded_features,
#                 class_names = ["Win", "Loss"],
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'C:\\Users\\Beau\\Desktop\\ML\\faker\\tree.dot', '-o', 'C:\\Users\\Beau\\Desktop\\ML\\faker\\tree.png', '-Gdpi=600'])

# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')

In [None]:
# # Generate a PNG file for one of the trees
# from sklearn import tree
# import matplotlib.pyplot as plt
# fn=encoded_features
# cn=["Win", "Loss"]
# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
# tree.plot_tree(rf.estimator[1].estimators_[0],
# # tree.plot_tree(rf_pipeline[1].estimators_[0],
#                feature_names = fn, 
#                class_names=cn,
#                filled = True);
# fig.savefig('rf_individualtree.png')
# print("Generated one tree")  # takes about 30 seconds

To reduce overfitting,
A. Add more examples
B. Reduce number of features
C. Increase regularization parameter
D. Trees - reduce depth and other parameters
    - early_stopping_rounds: can use high n_estimators, and then modify this parameter. note: this is a parameter for XGBClassifier.fit()
    - learning_rate: a small learning_rate combined with a large n_estimators generally leads to more accurate models, but takes longer to train

Error Analysis
    - manually examine 100 examples and see if there is a pattern

Iterative Loop of ML
    Choose Architecture
    Train
    Diagnostics (bias, variance, error analysis)

Baseline Performance    10.6%           10.6%       10.6%
                            +0.2%           +4.4%       +4.4%
Training Error          10.8%           15.0%       15.0%
                            +4.0%           +0.5%       +4.7%
CV Error                14.8%           15.5%       19.7%
                        (High Variance) (High Bias) (High Variance & Bias)

No Hyperparameter Tuning
RF
Baseline Performance    35.0%
                            -35.0%
Training Error          0.00%
                            +22.9%
CV Error                22.9%
                        (High Variance)
XGB
Baseline Performance    35.0%
                            -30.8%
Training Error          4.20%
                            +23.4%
CV Error                27.6%
                        (High Variance)

After Hyperparameter Tuning
RF
Baseline Performance    35.0%
                            -16.0%
Training Error          19.0%
                            +2.10%
CV Error                21.1%
                        (High Variance)


Random Forest Parameters

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

"Prevents overfitting" \
`n_estimators`. default=100. increase => overfitting \
`max_depth`. default=None. increase => overfitting \
`max_samples`. similar to `subsample` for XGB?

Other \
`max_leaf_nodes`. default=None

XGB Parameters

https://xgboost.readthedocs.io/en/stable/parameter.html

"Prevents overfitting" \
`eta` (aka `learning_rate`). default=0.3 \
`max_depth`. default=6. range=[0, infinity]. larger max_depth => overfitting. \
`subsample`. default=1. range=(0,1]

"More conservative" \
`gamma` (aka `min_split_loss`). default=0.0. range=[0, infinity]. larger gamma => conservative algorithm \
`min_child_weight`. default=1. range=[0, infinity]. increase => conservative \
`max_delta_step`. default=0. range=[0, infinity]. Set to 1-10. \
`lambda` (aka `reg_lambda`). default=1. increase => conservative \
`alpha` (aka `reg_alpha`). default=0. increase => conservative

Other \
`max_leaves`. default=0

Useful Code \
https://xgboost.readthedocs.io/en/stable/python/python_intro.html#setting-parameters
```
import xgboost as xgb
bst = xgb.train(param, dtrain, num_round, evallist)
bst.save_model('0001.model')
bst.dump_model('dump.raw.txt')
bst.dump_model('dump.raw.txt', 'featmap.txt')
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('model.bin')  # load data

# Plotting
xgb.plot_importance(bst)   
xgb.plot_tree(bst, num_trees=2)
xgb.to_graphviz(bst, num_trees=2)
```

```
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)
```

```
# Cross-Validation with XGB
# https://xgboost.readthedocs.io/en/stable/python/examples/cross_validation.html#sphx-glr-python-examples-cross-validation-py
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=0,
       callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])
```

EDA

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Display the numeric feature columns identified after the train/test split.
numerical_cols

In [None]:
# One histogram (with KDE overlay) per numeric feature. Each column gets its
# own figure; without that, every histogram is drawn onto the same axes.
for col in numerical_cols:
    fig, ax = plt.subplots()
    seaborn.histplot(data=X_train, x=col, kde=True, ax=ax)
    ax.set_title(col)
    ax.tick_params(axis="x", labelrotation=45)

In [None]:
# Display the categorical feature columns identified after the train/test split.
categorical_cols

In [None]:
# Mean of the encoded target (i.e. win rate) per category, one figure per
# categorical feature. The figure is created BEFORE plotting: creating it
# afterwards applied the size and tick rotation to the next, still-empty
# figure and left a blank figure at the end.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 10))
    seaborn.barplot(data=X_train, x=col, y=y_train, ax=ax)
    ax.tick_params(axis="x", labelrotation=45)

In [None]:
# Count of rows per (target value, category) for every categorical feature,
# printed without truncation. option_context is used instead of the helper
# functions defined in the last cell of the notebook: those are not defined
# yet on a top-to-bottom run, and the context manager also restores the
# display options if printing fails.
with pandas.option_context(
    'display.max_columns', None,
    'display.max_colwidth', None,
    'display.max_rows', None,
):
    for col in categorical_cols:
        print(X_train.pivot_table(index=y_train, columns=col, aggfunc="size", fill_value=0))

Error Analysis

In [None]:
# joblib was used here without ever being imported (its only import in the
# notebook is commented out), so this cell failed with a NameError.
import joblib
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# joblib.load unpickles the file, which can execute arbitrary code - only load
# model files you created yourself.
# NOTE(review): the file is named rf_1.pkl but is bound to `xgb`, replacing
# the fitted XGB search from above - confirm which model is meant here.
xgb = joblib.load("C:\\Users\\Beau\\Desktop\\ML\\faker\\models\\rf_1.pkl")

In [None]:
# Confusion matrix of the loaded model on the test split.
predictions = xgb.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)
# For a binary problem ravel() yields the counts in this order.
tn, fp, fn, tp = cm.ravel()

# Create the figure first and draw into it; calling plt.figure() after plot()
# only opened a second, empty figure. Labels are taken from the model that
# produced `predictions` (previously `rf`), and the variable no longer
# shadows IPython's display().
fig, ax = plt.subplots(figsize=(10, 10))
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xgb.classes_)
cm_display.plot(ax=ax)
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Text summary of per-class metrics for the predictions made in the previous cell.
print(classification_report(y_test, predictions))

In [None]:
# Print the first few misclassified test matches for manual inspection.
MAX_ERRORS_SHOWN = 15

# Predict the whole test split once instead of one row at a time, keeping the
# test index so predictions line up with y_test and the original dataframe.
rf_error_predictions = pandas.Series(rf.predict(X_test), index=X_test.index)
misclassified = rf_error_predictions[rf_error_predictions != y_test]

count = 0
# option_context restores the display options even if something below raises;
# the manual set_option/reset_option pair left them changed in that case.
with pandas.option_context(
    'display.max_columns', None,
    'display.max_colwidth', None,
    'display.max_rows', None,
):
    for i, prediction in misclassified.items():
        # Type 1 - False Positive (predicted a win, actual loss)
        if prediction == 1:
            print("TYPE 1 ERROR")
        # Type 2 - False Negative (predicted a loss, actual win)
        elif prediction == 0:
            print("TYPE 2 ERROR")
        # The model's input features, then the full source row for context.
        print(X_test.loc[[i]])
        print(dataframe.loc[i])
        count += 1
        if count == MAX_ERRORS_SHOWN:
            break

Helper Functions

In [None]:
def print_df_row(row):
    """Print `row` without pandas truncation, then restore display options.

    The column, column-width and row limits are lifted only for the duration
    of the print. Afterwards they return to the values they had before the
    call (not to pandas' defaults), and they are restored even if printing
    raises.

    Args:
        row: Any printable object; typically a pandas Series or DataFrame.
    """
    with pandas.option_context(
        'display.max_columns', None,
        'display.max_colwidth', None,
        'display.max_rows', None,
    ):
        print(row)

def print_pandas_no_truncate():
    """Lift pandas' display limits on columns, column width and rows."""
    for option_name in (
        'display.max_columns',
        'display.max_colwidth',
        'display.max_rows',
    ):
        pandas.set_option(option_name, None)

def print_pandas_reset():
    """Reset the column, column-width and row display limits to pandas' defaults."""
    for option_name in (
        'display.max_columns',
        'display.max_colwidth',
        'display.max_rows',
    ):
        pandas.reset_option(option_name)