# Training Machine Learning Models

In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
df = pd.read_parquet('datasets/teams/battlelog_train.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129190 entries, 0 to 129189
Columns: 270 entries, battleTime to battle_power_diff
dtypes: Float64(19), category(32), datetime64[ns](7), float32(94), float64(36), int16(1), int64(25), int8(5), object(51)
memory usage: 189.5+ MB


In [3]:
# subset columns by feature importance

df = df[[
	'event_mode',
	'event_map',
	'battle_team1_player1_brawler_name',
	'battle_team1_player2_brawler_name',
	'battle_team1_player3_brawler_name',
	'battle_team2_player1_brawler_name',
	'battle_team2_player2_brawler_name',
	'battle_team2_player3_brawler_name',
	'avg_brawler_trophies_diff',
	# 'avg_highestTrophies_diff',
	# 'avg_trophies_diff',
	# 'avg_team_victories_diff',
	# 'avg_expPoints_diff',
	'max_brawler_trophies_diff',
	# 'max_highestTrophies_diff',
	# 'max_trophies_diff',
	# 'max_team_victories_diff',
	# 'max_expPoints_diff',
	'min_brawler_trophies_diff',
	# 'min_highestTrophies_diff',
	# 'min_trophies_diff',
	# 'min_team_victories_diff',
	# 'min_expPoints_diff',
	'battle_power_diff',
	'winner_team',                        
]]

In [4]:
# subset by gamemode
event_mode = 'gemGrab'
df = df[df['event_mode'] == event_mode]
df = df.drop(columns=['event_mode'])

In [5]:
def split_data(data, test_size, random_state):
	"""Funcion para dividir el dataset en train y test"""
	train, test = train_test_split(
			data,
			test_size=test_size,
			random_state=random_state
			)
		
	print('train: ', train.shape)
	print('test: ', test.shape)

	return train, test

seed=14697

train, test = split_data(df, test_size = 0.25, random_state=seed)

train:  (62905, 12)
test:  (20969, 12)


In [6]:
# setup model
session_1 = setup(
	data = train,
	target = 'winner_team',
	# fix_imbalance = True,
    # feature_selection= True,
	# remove_outliers=True,
	log_experiment = True,
    use_gpu=False,
    max_encoding_ohe=500,
    )

Unnamed: 0,Description,Value
0,Session id,6999
1,Target,winner_team
2,Target type,Binary
3,Target mapping,"1: 0, 2: 1"
4,Original data shape,"(62905, 12)"
5,Transformed data shape,"(62905, 418)"
6,Transformed train set shape,"(44033, 418)"
7,Transformed test set shape,"(18872, 418)"
8,Numeric features,4
9,Categorical features,7


In [7]:
#comparación de modelos
# model = compare_models()

In [8]:
model = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6926,0.7692,0.6835,0.6867,0.6851,0.3848,0.3848
1,0.7014,0.7682,0.6845,0.6991,0.6917,0.4023,0.4024
2,0.6826,0.7566,0.677,0.6752,0.6761,0.3649,0.3649
3,0.6818,0.7626,0.6807,0.6729,0.6768,0.3635,0.3635
4,0.695,0.7655,0.6715,0.695,0.683,0.3893,0.3895
5,0.6952,0.7703,0.6794,0.6922,0.6857,0.3899,0.39
6,0.7032,0.7756,0.6937,0.6979,0.6958,0.406,0.406
7,0.6988,0.7751,0.691,0.6929,0.6919,0.3974,0.3974
8,0.6913,0.7634,0.6894,0.6828,0.6861,0.3825,0.3826
9,0.6984,0.77,0.6797,0.6965,0.688,0.3962,0.3963


In [9]:
# tunear el mejor modelo
model_tuned = tune_model(model, optimize = 'F1', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6939,0.7701,0.6784,0.6906,0.6845,0.3873,0.3874
1,0.6948,0.766,0.6738,0.6937,0.6836,0.389,0.3892
2,0.683,0.7593,0.6659,0.6798,0.6728,0.3655,0.3655
3,0.6877,0.7652,0.6896,0.6779,0.6837,0.3754,0.3754
4,0.6936,0.7668,0.6756,0.6914,0.6834,0.3867,0.3868
5,0.6961,0.7701,0.684,0.6917,0.6878,0.3918,0.3918
6,0.6977,0.7752,0.6923,0.6907,0.6915,0.3952,0.3952
7,0.7007,0.774,0.6951,0.6938,0.6945,0.4011,0.4011
8,0.6968,0.7627,0.6885,0.6907,0.6896,0.3933,0.3933
9,0.6941,0.7693,0.6755,0.6919,0.6836,0.3876,0.3877


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [10]:
# finalizar modelo
model_finalized = finalize_model(model_tuned)

In [15]:
save_model(model_finalized, 'models/bs_predictor_brawlBall')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\alniquia\AppData\Local\Temp\joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['avg_brawler_trophies_diff',
                                              'max_brawler_trophies_diff',
                                              'min_brawler_trophies_diff',
                                              'battle_power...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weight=0.001,
                                 min_split_gain=0.

In [12]:
predictions = predict_model(model_finalized, data = test)
predictions['winner_team'] = predictions['winner_team'] + 1
predictions.head()

Unnamed: 0,event_map,battle_team1_player1_brawler_name,battle_team1_player2_brawler_name,battle_team1_player3_brawler_name,battle_team2_player1_brawler_name,battle_team2_player2_brawler_name,battle_team2_player3_brawler_name,avg_brawler_trophies_diff,max_brawler_trophies_diff,min_brawler_trophies_diff,battle_power_diff,winner_team,prediction_label,prediction_score
57102,Pinhole Punt,CROW,SPIKE,FANG,FRANK,BYRON,COLT,-0.04107,-0.010917,-0.049714,0.0,2,2,0.5193
38482,Super Beach,SURGE,HANK,SAM,DYNAMIKE,TARA,HANK,-0.005692,0.007639,-0.03331,-3.0,2,2,0.6162
9568,Super Beach,HANK,DARRYL,TICK,SURGE,BUSTER,LOU,-0.009444,-0.007335,-0.021158,4.0,2,1,0.765
43483,Super Beach,HANK,GUS,MAISIE,HANK,MAISIE,TICK,-0.150787,-0.165928,-0.178248,-2.0,1,2,0.6385
102972,Backyard Bowl,COLETTE,SAM,EL PRIMO,MORTIS,FRANK,BULL,0.021368,-0.016099,0.105361,-1.0,1,1,0.5839


In [13]:
def metrics_capturing(df):
	"""Captures the metrics of a classification model"""
	from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

	accuracy = accuracy_score(df['winner_team'], df['prediction_label'])
	f1 = f1_score(df['winner_team'], df['prediction_label'])
	precision = precision_score(df['winner_team'], df['prediction_label'])
	recall = recall_score(df['winner_team'], df['prediction_label'])
	# report = classification_report(df['winner_team'], df['prediction_label'])
	# matrix = confusion_matrix(df['winner_team'], df['prediction_label'])

	# return accuracy, f1, precision, recall, report, matrix
	return accuracy, f1, precision, recall

accuracy, f1, precision, recall, report, matrix = metrics_capturing(predictions)
print('accuracy: ', accuracy)
print('f1: ', f1)
print('precision: ', precision)
print('recall: ', recall)
# print('report: ', report)
# print('matrix: ', matrix)

# save metrics
metrics = pd.DataFrame({
		'event_mode': event_mode,
		'accuracy': accuracy,
		'f1': f1,
		'precision': precision,
		'recall': recall,
})

metrics.to_json('bs_metrics.json')

accuracy:  0.6988888359006152
f1:  0.7094607031106204
precision:  0.7057584912569806
recall:  0.7132019613285225
report:                precision    recall  f1-score   support

           1       0.71      0.71      0.71     10809
           2       0.69      0.68      0.69     10160

    accuracy                           0.70     20969
   macro avg       0.70      0.70      0.70     20969
weighted avg       0.70      0.70      0.70     20969

matrix:  [[7709 3100]
 [3214 6946]]


In [14]:
# get feature importance
feature_importance = pd.DataFrame({
		'feature': model_finalized.booster_.feature_name(),
		'importance': model_finalized.feature_importances_
		})

feature_importance.sort_values(by='importance', ascending=False, inplace=True)

feature_importance.to_json('resources/bs_feature_importance.json', orient='records')

feature_importance

Unnamed: 0,feature,importance
414,max_brawler_trophies_diff,306
416,battle_power_diff,254
413,avg_brawler_trophies_diff,249
415,min_brawler_trophies_diff,235
6,battle_team1_player1_brawler_name_HANK,72
...,...,...
262,battle_team2_player1_brawler_name_OTIS,0
64,battle_team1_player1_brawler_name_RUFFS,0
265,battle_team2_player1_brawler_name_CHESTER,0
266,battle_team2_player1_brawler_name_GRAY,0
