# Training Machine Learning Models

In [1]:
import os

In [2]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
df = pd.read_parquet('datasets/teams/battlelog_train.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129190 entries, 0 to 129189
Data columns (total 69 columns):
 #   Column                                       Non-Null Count   Dtype   
---  ------                                       --------------   -----   
 0   event_mode                                   129190 non-null  category
 1   event_map                                    129190 non-null  category
 2   battle_team1_player1_brawler_name            129190 non-null  category
 3   battle_team1_player2_brawler_name            129190 non-null  category
 4   battle_team1_player3_brawler_name            129190 non-null  category
 5   battle_team2_player1_brawler_name            129190 non-null  category
 6   battle_team2_player2_brawler_name            129190 non-null  category
 7   battle_team2_player3_brawler_name            129190 non-null  category
 8   battle_team1_player1_brawler_Class           107560 non-null  category
 9   battle_team1_player2_brawler_Class           105

In [4]:
# cambiar working directory
print(os.getcwd())
os.chdir('machine_learning')
print(os.getcwd())

c:\Users\alniquia\OneDrive - Telefonica\Documents\Projects\BrawlStars_Model
c:\Users\alniquia\OneDrive - Telefonica\Documents\Projects\BrawlStars_Model\machine_learning


In [5]:
def split_data(data, test_size, random_state):
	"""Funcion para dividir el dataset en train y test"""
	train, test = train_test_split(
			data,
			test_size=test_size,
			random_state=random_state
			)
		
	print('train: ', train.shape)
	print('test: ', test.shape)

	return train, test

seed=14697

train, test = split_data(df, test_size = 0.25, random_state=seed)

train:  (96892, 69)
test:  (32298, 69)


In [10]:
# setup model
session_1 = setup(
	data = train,
	target = 'winner_team',
	fix_imbalance = True,
    feature_selection= True,
	remove_outliers=True,
	log_experiment = True,
    use_gpu=False)

Unnamed: 0,Description,Value
0,Session id,4813
1,Target,winner_team
2,Target type,Binary
3,Target mapping,"1: 0, 2: 1"
4,Original data shape,"(96892, 69)"
5,Transformed data shape,"(94704, 14)"
6,Transformed train set shape,"(65636, 14)"
7,Transformed test set shape,"(29068, 14)"
8,Numeric features,36
9,Categorical features,32


2023/06/10 21:33:05 INFO mlflow.tracking.fluent: Experiment with name 'clf-default-name' does not exist. Creating a new experiment.


In [11]:
#comparación de modelos
model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7048,0.7858,0.7095,0.6954,0.7024,0.4096,0.4096,19.371
gbc,Gradient Boosting Classifier,0.7022,0.7823,0.7102,0.6917,0.7008,0.4046,0.4047,25.048
lr,Logistic Regression,0.6975,0.7765,0.7009,0.6887,0.6947,0.395,0.3951,35.648
ada,Ada Boost Classifier,0.6973,0.7747,0.7022,0.688,0.695,0.3947,0.3949,19.062
ridge,Ridge Classifier,0.6972,0.0,0.7012,0.688,0.6946,0.3943,0.3944,23.178
lda,Linear Discriminant Analysis,0.6972,0.7762,0.7012,0.6881,0.6946,0.3944,0.3944,16.963
et,Extra Trees Classifier,0.6956,0.7731,0.6905,0.6899,0.6902,0.391,0.391,22.583
rf,Random Forest Classifier,0.695,0.7728,0.6911,0.6888,0.6899,0.3898,0.3898,32.715
svm,SVM - Linear Kernel,0.6934,0.0,0.7163,0.6803,0.6952,0.3872,0.3912,24.05
qda,Quadratic Discriminant Analysis,0.6902,0.7647,0.716,0.6736,0.6941,0.3809,0.3816,17.208


In [None]:
#try model blender

# example:
# train a few models
# lr = create_model('lr')
# dt = create_model('dt')
# knn = create_model('knn')

# blend models
# blender = blend_models([lr, dt, knn])
# blender_tuned = tune_model(blender, optimize = 'F1')

In [12]:
# tunear el mejor modelo
model_tuned = tune_model(model, optimize = 'F1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7006,0.7838,0.7118,0.6888,0.7001,0.4013,0.4015
1,0.7046,0.779,0.7043,0.6972,0.7007,0.409,0.409
2,0.7029,0.7857,0.7046,0.6948,0.6997,0.4058,0.4059
3,0.7057,0.7834,0.7163,0.6942,0.7051,0.4116,0.4118
4,0.6971,0.7756,0.7018,0.6878,0.6947,0.3943,0.3944
5,0.7069,0.791,0.7132,0.6969,0.705,0.4138,0.4139
6,0.6957,0.7804,0.7006,0.6862,0.6933,0.3914,0.3914
7,0.7035,0.7879,0.7111,0.693,0.7019,0.407,0.4072
8,0.7103,0.7884,0.7282,0.6958,0.7117,0.4208,0.4213
9,0.7095,0.7845,0.7126,0.7008,0.7067,0.419,0.4191


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [15]:
# finalizar modelo
model_finalized = finalize_model(model_tuned)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7093,0.7916,0.7184,0.6982,0.7082,0.4187,0.4189


In [16]:
save_model(model_finalized, 'bs_predictor')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\alniquia\AppData\Local\Temp\joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['barriers', 'barriers_center',
                                              'bushes', 'bushes_center',
                                              'waterProp',
                                              'avg_brawler_Range_Num_diff',
                                              'a...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weight=0.001,
 

In [9]:
predictions = predict_model(model_finalized, data = test)
predictions['winner_team'] = predictions['winner_team'] + 1
predictions.head()

Unnamed: 0,event_mode,event_map,battle_team1_player1_brawler_name,battle_team1_player2_brawler_name,battle_team1_player3_brawler_name,battle_team2_player1_brawler_name,battle_team2_player2_brawler_name,battle_team2_player3_brawler_name,battle_team1_player1_brawler_Class,battle_team1_player2_brawler_Class,...,min_brawler_winrate_diff,min_brawler_winrate_overall_diff,min_highestTrophies_diff,min_trophies_diff,min_expPoints_diff,min_team_victories_diff,battle_power_diff,winner_team,prediction_label,prediction_score
36071,brawlBall,Super Beach,TICK,EL PRIMO,SHELLY,FANG,COLT,OTIS,Damage Dealer,Tank,...,-0.0775,-0.088469,-2.015099,-1.98292,-4.348138,-4.52691,0.0,1,2,0.8202
36425,brawlBall,Super Beach,MAISIE,SPIKE,CHESTER,HANK,EMZ,MAISIE,,Damage Dealer,...,,,0.489726,0.478965,0.960376,0.72232,8.0,1,1,0.8335
102249,brawlBall,Backyard Bowl,PAM,MORTIS,SURGE,BEA,LOLA,MEG,Support,Assassin,...,-0.071375,-0.084791,-0.339709,-0.335962,-0.533372,-0.596536,1.0,2,2,0.8085
38608,brawlBall,Super Beach,GROM,HANK,GENE,HANK,MORTIS,WILLOW,Damage Dealer,,...,,,-0.225481,-0.241171,-0.671387,-0.691845,1.0,2,1,0.6205
117920,bounty,Canal Grande,PIPER,TICK,EVE,HANK,PENNY,RUFFS,Damage Dealer,Damage Dealer,...,,,-1.232141,-1.23518,-1.868867,-1.949996,-1.0,2,2,0.9121


In [12]:
def metrics_capturing(df):
	"""Captures the metrics of a classification model"""
	from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

	accuracy = accuracy_score(df['winner_team'], df['prediction_label'])
	f1 = f1_score(df['winner_team'], df['prediction_label'])
	precision = precision_score(df['winner_team'], df['prediction_label'])
	recall = recall_score(df['winner_team'], df['prediction_label'])
	report = classification_report(df['winner_team'], df['prediction_label'])
	matrix = confusion_matrix(df['winner_team'], df['prediction_label'])

	return accuracy, f1, precision, recall, report, matrix

accuracy, f1, precision, recall, report, matrix = metrics_capturing(predictions)
print('accuracy: ', accuracy)
print('f1: ', f1)
print('precision: ', precision)
print('recall: ', recall)
print('report: ', report)
print('matrix: ', matrix)

# save metrics
metrics = pd.DataFrame({
		'accuracy': [accuracy],
		'f1': [f1],
		'precision': [precision],
		'recall': [recall],
})

metrics.to_json('bs_metrics.json')

accuracy:  0.7093008855037464
f1:  0.7104394757131842
precision:  0.7206406807232685
recall:  0.7005230507237562
report:                precision    recall  f1-score   support

           1       0.72      0.70      0.71     16442
           2       0.70      0.72      0.71     15856

    accuracy                           0.71     32298
   macro avg       0.71      0.71      0.71     32298
weighted avg       0.71      0.71      0.71     32298

matrix:  [[11518  4924]
 [ 4465 11391]]
