In [1]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, classification_report,confusion_matrix, f1_score
import glob
import os
import matplotlib.pyplot as plt

In [2]:
# train_path = max(glob.glob('./data/twitter_data_train/*.csv'), key=os.path.getctime) 
# test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv('./data/twitter_data_train_multiclass.csv')
test_df = pd.read_csv('./data/twitter_data_test_multiclass.csv')

print(train_df.shape)
print(test_df.shape)

(9445, 235)
(1667, 235)


In [3]:
to_drop = ['screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type','account_type_multi', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified']

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']

In [4]:
X_train.columns[:30]

Index(['Unnamed: 0', 'statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_use_background_image',
       'profile_background_tile', 'tweet_frequency', 'number_of_tags',
       'number_of_mentions', 'tweet_weekend_frequency',
       'tweet_weekday_frequency', 'following_to_followers_ratio',
       'username_length', 'screen_name_length', 'username_spec_char_count',
       'screen_name_spec_char_count', 'has_url', 'tweets_0', 'tweets_1',
       'tweets_2', 'tweets_3', 'tweets_4', 'tweets_5', 'tweets_6', 'tweets_7'],
      dtype='object')

In [5]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [6]:
# Check if dataset is balanced
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9445, 222)
(9445,)
(1667, 222)
(1667,)


In [7]:
# Make sure all data are of integer of float type
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [8]:
#Default model
model_xgboost_default = xgb.XGBClassifier(objective='multi:softprob')

model_xgboost_default.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=[(X_test, y_test)],
                  verbose=True)

xgb_train_default_predict = model_xgboost_default.predict(X_train)
xgb_test_default_predict = model_xgboost_default.predict(X_test)




[0]	validation_0-mlogloss:1.03431
[1]	validation_0-mlogloss:0.72202
[2]	validation_0-mlogloss:0.53013
[3]	validation_0-mlogloss:0.39909
[4]	validation_0-mlogloss:0.30820
[5]	validation_0-mlogloss:0.24296
[6]	validation_0-mlogloss:0.19545
[7]	validation_0-mlogloss:0.16075
[8]	validation_0-mlogloss:0.13401
[9]	validation_0-mlogloss:0.11343
[10]	validation_0-mlogloss:0.09742
[11]	validation_0-mlogloss:0.08565
[12]	validation_0-mlogloss:0.07629
[13]	validation_0-mlogloss:0.06953
[14]	validation_0-mlogloss:0.06424
[15]	validation_0-mlogloss:0.06047
[16]	validation_0-mlogloss:0.05760
[17]	validation_0-mlogloss:0.05436
[18]	validation_0-mlogloss:0.05272
[19]	validation_0-mlogloss:0.05056
[20]	validation_0-mlogloss:0.04923
[21]	validation_0-mlogloss:0.04810
[22]	validation_0-mlogloss:0.04779
[23]	validation_0-mlogloss:0.04754
[24]	validation_0-mlogloss:0.04695
[25]	validation_0-mlogloss:0.04621
[26]	validation_0-mlogloss:0.04589
[27]	validation_0-mlogloss:0.04577
[28]	validation_0-mlogloss:0.0

In [9]:
print(model_xgboost_default.get_params)

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)>


In [10]:
# model_xgboost_complex = xgb.XGBClassifier(learning_rate=0.1,
#                                       max_depth=5,
#                                       n_estimators=5000,
#                                       subsample=0.5,
#                                       colsample_bytree=0.5,
#                                       eval_metric='aucpr',
#                                       verbosity=1)

# model_xgboost_complex.fit(X_train,
#                   y_train,
#                   early_stopping_rounds=10,
#                   eval_set=[(X_test, y_test)],
#                   verbose=True)

**Evaluate Model Performance**

In [11]:
print(confusion_matrix(y_train, xgb_train_default_predict))
print(classification_report(y_train, xgb_train_default_predict))

[[2343    0    0    0    0    0]
 [   0 2701    0    0    0    0]
 [   0    0  300    0    0    0]
 [   0    0    0 3706    0    0]
 [   0    0    0    0  316    0]
 [   0    0    0    0    0   79]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2343
           1       1.00      1.00      1.00      2701
           2       1.00      1.00      1.00       300
           3       1.00      1.00      1.00      3706
           4       1.00      1.00      1.00       316
           5       1.00      1.00      1.00        79

    accuracy                           1.00      9445
   macro avg       1.00      1.00      1.00      9445
weighted avg       1.00      1.00      1.00      9445



In [12]:
print(confusion_matrix(y_test, xgb_test_default_predict))
print(classification_report(y_test, xgb_test_default_predict))

[[409   1   1   2   0   0]
 [  2 459   0   0   2   0]
 [  4   0  55   0   0   0]
 [  5   0   0 657   1   0]
 [  1   0   0   0  50   0]
 [  0   0   0   0   0  18]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       413
           1       1.00      0.99      0.99       463
           2       0.98      0.93      0.96        59
           3       1.00      0.99      0.99       663
           4       0.94      0.98      0.96        51
           5       1.00      1.00      1.00        18

    accuracy                           0.99      1667
   macro avg       0.98      0.98      0.98      1667
weighted avg       0.99      0.99      0.99      1667



In [13]:
# y_train_pred = model_xgboost_default.predict_proba(X_train)[:,1]
# y_test_pred = model_xgboost_default.predict_proba(X_test)[:,1] # Slicing to obtain prob of observation being 1

# print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(f1_score(y_train, y_train_pred, multi_class = 'ovo'),
#                                                     roc_auc_score(y_test, y_test_pred, multi_class = 'ovo')))

In [14]:
# y_train_pred = model_xgboost_complex.predict_proba(X_train)[:,1]
# y_test_pred = model_xgboost_complex.predict_proba(X_test)[:,1] # Slicing to obtain prob of observation being 1

# print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
#                                                     roc_auc_score(y_test, y_test_pred)))

**Hyperparameter Tuning**

In [24]:
learning_rate_list = [0.05, 0.1]
max_depth_list = [2, 5]
n_estimators_list = [1000, 3000]
# reg_lambda = [10, 20, 100]

params_dict = {"learning_rate": learning_rate_list,
               "max_depth": max_depth_list,
               "n_estimators": n_estimators_list}

num_combinations = 1
for v in params_dict.values(): num_combinations *= len(v) 

print(num_combinations)
params_dict

8


{'learning_rate': [0.05, 0.1],
 'max_depth': [2, 5],
 'n_estimators': [1000, 3000]}

In [25]:
# Wrapper for auc auc score
# def auc_score(model, X, y): 
#     return recall_score(y, model.predict_proba(X)[:,1])


model_xgboost_hp = RandomizedSearchCV(estimator=model_xgboost_default,
                                param_distributions = params_dict,
                                cv=3, # 1/3 of data is used for validation every iteration
                                scoring='f1_weighted',
                                return_train_score=True,
                                verbose=4)

model_xgboost_hp.fit(X, y)



Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END learning_rate=0.05, max_depth=2, n_estimators=1000;, score=(train=1.000, test=0.990) total time=  36.0s
[CV 2/3] END learning_rate=0.05, max_depth=2, n_estimators=1000;, score=(train=1.000, test=0.991) total time=  45.5s
[CV 3/3] END learning_rate=0.05, max_depth=2, n_estimators=1000;, score=(train=1.000, test=0.991) total time=  34.0s
[CV 1/3] END learning_rate=0.05, max_depth=2, n_estimators=3000;, score=(train=1.000, test=0.991) total time= 1.4min
[CV 2/3] END learning_rate=0.05, max_depth=2, n_estimators=3000;, score=(train=1.000, test=0.991) total time= 1.7min
[CV 3/3] END learning_rate=0.05, max_depth=2, n_estimators=3000;, score=(train=1.000, test=0.990) total time= 1.6min
[CV 1/3] END learning_rate=0.05, max_depth=5, n_estimators=1000;, score=(train=1.000, test=0.990) total time=  48.5s
[CV 2/3] END learning_rate=0.05, max_depth=5, n_estimators=1000;, score=(train=1.000, test=0.991) total time=  51.8s
[CV 

In [26]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                               'param_learning_rate', 'param_max_depth', 'param_n_estimators']]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
0,1,0.990818,1.0,0.05,2,1000
4,2,0.990815,1.0,0.1,2,1000
1,3,0.990726,1.0,0.05,2,3000
5,4,0.990637,1.0,0.1,2,3000
3,5,0.990631,1.0,0.05,5,3000
7,6,0.990539,1.0,0.1,5,3000
6,7,0.990537,1.0,0.1,5,1000
2,8,0.990361,1.0,0.05,5,1000


## Final Model
Using best parameters from above step

In [27]:
model_xgboost_fin = xgb.XGBClassifier(learning_rate=0.05,
                                          max_depth=2,
                                          n_estimators=1000,
                                          verbosity=1)

# Passing both training and validation dataset as we want to plot AUC for both
eval_set = [(X_train, y_train),(X_test, y_test)]

model_xgboost_fin.fit(X_train,
                  y_train,
                  early_stopping_rounds=20,
                  eval_set=eval_set,
                  verbose=True)



[0]	validation_0-mlogloss:1.66399	validation_1-mlogloss:1.66630
[1]	validation_0-mlogloss:1.55356	validation_1-mlogloss:1.55719
[2]	validation_0-mlogloss:1.45522	validation_1-mlogloss:1.45991
[3]	validation_0-mlogloss:1.36774	validation_1-mlogloss:1.37375
[4]	validation_0-mlogloss:1.28897	validation_1-mlogloss:1.29568
[5]	validation_0-mlogloss:1.21761	validation_1-mlogloss:1.22517
[6]	validation_0-mlogloss:1.15249	validation_1-mlogloss:1.16082
[7]	validation_0-mlogloss:1.09272	validation_1-mlogloss:1.10173
[8]	validation_0-mlogloss:1.03864	validation_1-mlogloss:1.04813
[9]	validation_0-mlogloss:0.98854	validation_1-mlogloss:0.99817
[10]	validation_0-mlogloss:0.94132	validation_1-mlogloss:0.95144
[11]	validation_0-mlogloss:0.89789	validation_1-mlogloss:0.90844
[12]	validation_0-mlogloss:0.85733	validation_1-mlogloss:0.86803
[13]	validation_0-mlogloss:0.81941	validation_1-mlogloss:0.83028
[14]	validation_0-mlogloss:0.78390	validation_1-mlogloss:0.79494
[15]	validation_0-mlogloss:0.74938	

In [28]:
var_colums = [c for c in X_train.columns if c not in ['state']]

df_var_imp = pd.DataFrame({"Variable": var_colums,
                           "Importance": model_xgboost_fin.feature_importances_})\
                        .sort_values(by='Importance', ascending=False)
df_var_imp[:10]

Unnamed: 0,Variable,Importance
36,tweets_14,0.123859
4,favourites_count,0.083913
168,description_46,0.081448
45,tweets_23,0.072082
6,default_profile,0.071394
16,following_to_followers_ratio,0.065341
41,tweets_19,0.059756
81,tweets_59,0.048839
53,tweets_31,0.048396
9,profile_use_background_image,0.029556
