# Training Machine Learning Models

In [101]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split
import pandas as pd
import json

In [102]:
df = pd.read_parquet('datasets/teams/battlelog_train.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129190 entries, 0 to 129189
Columns: 270 entries, battleTime to battle_power_diff
dtypes: Float64(19), category(32), datetime64[ns](7), float32(94), float64(36), int16(1), int64(25), int8(5), object(51)
memory usage: 189.5+ MB


In [103]:
# subset columns by feature importance

df = df[[
	'event_mode',
	'event_map',
	'battle_team1_player1_brawler_name',
	'battle_team1_player2_brawler_name',
	'battle_team1_player3_brawler_name',
	'battle_team2_player1_brawler_name',
	'battle_team2_player2_brawler_name',
	'battle_team2_player3_brawler_name',
	'avg_brawler_trophies_diff',
	# 'avg_highestTrophies_diff',
	# 'avg_trophies_diff',
	# 'avg_team_victories_diff',
	# 'avg_expPoints_diff',
	'max_brawler_trophies_diff',
	# 'max_highestTrophies_diff',
	# 'max_trophies_diff',
	# 'max_team_victories_diff',
	# 'max_expPoints_diff',
	'min_brawler_trophies_diff',
	# 'min_highestTrophies_diff',
	# 'min_trophies_diff',
	# 'min_team_victories_diff',
	# 'min_expPoints_diff',
	'battle_power_diff',
	'winner_team',                        
]]

In [104]:
# dummy brawler name
dft1 = pd.get_dummies(data=df['battle_team1_player1_brawler_name'], prefix='T1') + \
	pd.get_dummies(data=df['battle_team1_player2_brawler_name'], prefix='T1') + \
	pd.get_dummies(data=df['battle_team1_player3_brawler_name'], prefix='T1')

dft2 = pd.get_dummies(data=df['battle_team2_player1_brawler_name'], prefix='T2') + \
	pd.get_dummies(data=df['battle_team2_player2_brawler_name'], prefix='T2') + \
	pd.get_dummies(data=df['battle_team2_player3_brawler_name'], prefix='T2')

df = pd.concat([df, dft1, dft2], axis=1)
df = df.drop(columns=[
	'battle_team1_player1_brawler_name',
	'battle_team1_player2_brawler_name',
	'battle_team1_player3_brawler_name',
	'battle_team2_player1_brawler_name',
	'battle_team2_player2_brawler_name',
	'battle_team2_player3_brawler_name',
])
df

Unnamed: 0,event_mode,event_map,avg_brawler_trophies_diff,max_brawler_trophies_diff,min_brawler_trophies_diff,battle_power_diff,winner_team,T1_8-BIT,T1_AMBER,T1_ASH,...,T2_SANDY,T2_SHELLY,T2_SPIKE,T2_SPROUT,T2_SQUEAK,T2_STU,T2_SURGE,T2_TARA,T2_TICK,T2_WILLOW
0,brawlBall,Super Beach,0.147371,0.105546,0.186610,0.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,brawlBall,Super Beach,-0.078622,-0.157831,0.012148,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,brawlBall,Super Beach,0.030869,0.052697,-0.020846,0.0,2,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,brawlBall,Super Beach,-0.007010,-0.018387,0.028386,0.0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,brawlBall,Super Beach,0.068858,0.038340,0.106025,0.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129185,knockout,New Perspective,-0.034374,-0.031272,-0.016926,-1.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129186,heist,Snaked Assault,0.003576,0.010302,0.009662,-1.0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
129187,heist,Snaked Assault,0.008475,-0.028988,0.059325,-1.0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
129188,knockout,Riverside,0.011111,-0.005038,0.050325,-1.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# subset by gamemode
event_mode = 'brawlBall'
df = df[df['event_mode'] == event_mode]
df = df.drop(columns=['event_mode'])

In [106]:
def split_data(data, test_size, random_state):
	"""Funcion para dividir el dataset en train y test"""
	train, test = train_test_split(
			data,
			test_size=test_size,
			random_state=random_state
			)
		
	print('train: ', train.shape)
	print('test: ', test.shape)

	return train, test

seed=14697

train, test = split_data(df, test_size = 0.25, random_state=seed)

train:  (62905, 142)
test:  (20969, 142)


In [107]:
# setup model
session_1 = setup(
	data = train,
	target = 'winner_team',
	# fix_imbalance = True,
    # feature_selection= True,
	# remove_outliers=True,
	log_experiment = True,
    use_gpu=False,
    max_encoding_ohe=500,
    )

Unnamed: 0,Description,Value
0,Session id,2557
1,Target,winner_team
2,Target type,Binary
3,Target mapping,"1: 0, 2: 1"
4,Original data shape,"(62905, 142)"
5,Transformed data shape,"(62905, 146)"
6,Transformed train set shape,"(44033, 146)"
7,Transformed test set shape,"(18872, 146)"
8,Numeric features,140
9,Categorical features,1


In [108]:
#comparación de modelos
model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7035,0.7789,0.6875,0.701,0.6941,0.4066,0.4067,18.176
rf,Random Forest Classifier,0.7001,0.771,0.6739,0.7016,0.6874,0.3995,0.3998,24.608
gbc,Gradient Boosting Classifier,0.6957,0.7682,0.6759,0.6942,0.6849,0.3908,0.391,26.149
ada,Ada Boost Classifier,0.6906,0.7617,0.6771,0.6864,0.6817,0.3808,0.3809,24.343
lr,Logistic Regression,0.6904,0.7626,0.6768,0.6863,0.6815,0.3804,0.3804,19.695
lda,Linear Discriminant Analysis,0.69,0.7625,0.6764,0.6858,0.681,0.3795,0.3796,18.587
ridge,Ridge Classifier,0.6899,0.0,0.6764,0.6857,0.681,0.3794,0.3794,17.549
svm,SVM - Linear Kernel,0.6855,0.0,0.671,0.6828,0.6757,0.3706,0.3718,18.527
et,Extra Trees Classifier,0.6842,0.7511,0.6583,0.6844,0.6711,0.3677,0.368,29.067
knn,K Neighbors Classifier,0.6419,0.6876,0.635,0.6339,0.6344,0.2836,0.2836,20.208


In [109]:
# model = create_model('lightgbm')

In [110]:
# tunear el mejor modelo
model_tuned = tune_model(model, optimize = 'F1', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7128,0.7844,0.7039,0.7076,0.7057,0.4252,0.4252
1,0.6973,0.766,0.6896,0.6912,0.6904,0.3943,0.3943
2,0.7023,0.7819,0.6849,0.7002,0.6925,0.4041,0.4042
3,0.7057,0.7823,0.6854,0.705,0.6951,0.4107,0.4109
4,0.6979,0.7755,0.6914,0.6914,0.6914,0.3956,0.3956
5,0.6993,0.7696,0.691,0.6935,0.6922,0.3983,0.3983
6,0.6991,0.782,0.684,0.6959,0.6899,0.3977,0.3977
7,0.6982,0.7738,0.6724,0.6993,0.6856,0.3956,0.3959
8,0.7179,0.7881,0.7089,0.7129,0.7109,0.4355,0.4355
9,0.6998,0.7742,0.6755,0.7002,0.6876,0.3988,0.399


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [111]:
# finalizar modelo
model_finalized = finalize_model(model_tuned)

In [112]:
save_model(model_finalized, 'models/bs_predictor_' + event_mode)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\alniquia\AppData\Local\Temp\joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['avg_brawler_trophies_diff',
                                              'max_brawler_trophies_diff',
                                              'min_brawler_trophies_diff',
                                              'battle_power...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weight=0.001,
                                 min_split_gain=0.

In [113]:
predictions = predict_model(model_finalized, data = test)
predictions['winner_team'] = predictions['winner_team'] + 1
predictions.head()

Unnamed: 0,event_map,avg_brawler_trophies_diff,max_brawler_trophies_diff,min_brawler_trophies_diff,battle_power_diff,T1_8-BIT,T1_AMBER,T1_ASH,T1_BARLEY,T1_BEA,...,T2_SPROUT,T2_SQUEAK,T2_STU,T2_SURGE,T2_TARA,T2_TICK,T2_WILLOW,winner_team,prediction_label,prediction_score
57102,Pinhole Punt,-0.04107,-0.010917,-0.049714,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,0.5082
38482,Super Beach,-0.005692,0.007639,-0.03331,-3.0,0,0,0,0,0,...,0,0,0,0,1,0,0,2,2,0.6706
9568,Super Beach,-0.009444,-0.007335,-0.021158,4.0,0,0,0,0,0,...,0,0,0,1,0,0,0,2,1,0.801
43483,Super Beach,-0.150787,-0.165928,-0.178248,-2.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,2,0.6556
102972,Backyard Bowl,0.021368,-0.016099,0.105361,-1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0.6457


In [114]:
def metrics_capturing(df):
	"""Captures the metrics of a classification model"""
	# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
	from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

	accuracy = accuracy_score(df['winner_team'], df['prediction_label'])
	f1 = f1_score(df['winner_team'], df['prediction_label'])
	precision = precision_score(df['winner_team'], df['prediction_label'])
	recall = recall_score(df['winner_team'], df['prediction_label'])
	# report = classification_report(df['winner_team'], df['prediction_label'])
	# matrix = confusion_matrix(df['winner_team'], df['prediction_label'])

	# return accuracy, f1, precision, recall, report, matrix
	return accuracy, f1, precision, recall

accuracy, f1, precision, recall = metrics_capturing(predictions)

# save metrics
metrics = {
		'accuracy': accuracy,
		'f1': f1,
		'precision': precision,
		'recall': recall,
}

print(metrics)

with open('resources/bs_metrics.json', 'r') as f:
	data = json.load(f)
	data[event_mode] = metrics

with open('resources/bs_metrics.json', 'w') as f:
	json.dump(data, f, indent=4)

{'accuracy': 0.7042777433353998, 'f1': 0.7125840092699883, 'precision': 0.7140070592606353, 'recall': 0.7111666204089185}


In [115]:
# get feature importance
feature_importance = pd.DataFrame({
		'feature': model_finalized.booster_.feature_name(),
		'importance': model_finalized.feature_importances_
		})

feature_importance.sort_values(by='importance', ascending=False, inplace=True)

feature_importance.to_json('resources/bs_feature_importance.json', orient='records')

feature_importance

Unnamed: 0,feature,importance
6,max_brawler_trophies_diff,375
5,avg_brawler_trophies_diff,329
8,battle_power_diff,264
7,min_brawler_trophies_diff,263
110,T2_HANK,75
...,...,...
127,T2_PENNY,0
122,T2_MR._P,0
47,T1_LOLA,0
52,T1_MEG,0
