# Notebook Summary
In this notebook I test a baseline XGB Boost model

In [1]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

In [2]:
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from imblearn.over_sampling import ADASYN
import xgboost as xgb
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Pickle in Data

In [3]:
# Pickle in data

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Hit_Predictor\Pickle\supervised_factorized.pkl"

df = pickle.load(open(path,'rb'))
df.head(2)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,spotify_artists,success
0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,0.459,120.038,2020.0,0,1.0
1,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,0.357,133.073,2018.0,1,1.0


# XG Boost - Directly altered weighting

In [4]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year','spotify_artists']]

y = df['success']

In [5]:
# split training data into training and validate
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,stratify=y_train)

In [6]:
# calculating small_pos_weight
# count examples in each class
counter = Counter(y_train)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

Estimate: 7.093


In [7]:
gbm = xgb.XGBClassifier( 
                        n_estimators=30000,
                        max_depth=8,
                        objective='binary:logistic', 
                        learning_rate=.05, 
                        subsample=.8,
                        min_child_weight=3,
                        colsample_bytree=.8,
                        scale_pos_weight = 7.728,
                        n_jobs = -1
                       )
eval_set=[(X_train,y_train),(X_val,y_val)]

gbm.fit( 
                    X_train, y_train, 
                    eval_set=eval_set,
                    eval_metric='error', #new evaluation metric: classification error (could also use AUC, e.g.)
                    early_stopping_rounds=50,
                    verbose=False
)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=8,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=30000, n_jobs=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=7.728,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [8]:
#scores
print("Scores for the XGB model test data")

print("\nAccuracy score: {:6.4f}%".format(100*accuracy_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit))))
print("F1 score: {:6.4f}%".format(100*f1_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit))))
print("F2 score: {:6.4f}%".format(100*fbeta_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit),beta=2.0)))
print("Precision score: {:6.4f}%".format(100*precision_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit))))
print("Recall score: {:6.4f}%".format(100*recall_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit))))
print("AUC: {:6.4f}%".format(100*roc_auc_score(y_test, gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit))))

Scores for the XGB model test data

Accuracy score: 94.3322%
F1 score: 81.1280%
F2 score: 90.6887%
Precision score: 69.0037%
Recall score: 98.4211%
AUC: 96.0878%
