In [9]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, recall_score, classification_report,confusion_matrix
import glob
import os
import matplotlib.pyplot as plt

In [2]:
# train_path = max(glob.glob('./data/twitter_data_train/*.csv'), key=os.path.getctime) 
# test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv('./data/twitter_data_train_multiclass.csv')
test_df = pd.read_csv('./data/twitter_data_test_multiclass.csv')

print(train_df.shape)
print(test_df.shape)

(9445, 235)
(1667, 235)


In [3]:
to_drop = ['screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type','account_type_multi', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified']

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']

In [4]:
# print(train_df.columns[:20])
# test_df.columns == train_df.columns

In [5]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [6]:
# Check if dataset is balanced
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9445, 222)
(9445,)
(1667, 222)
(1667,)


In [7]:
# Make sure all data are of integer of float type
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [12]:
#Default model
model_xgboost_default = xgb.XGBClassifier(objective = 'multi:softprob', eval_metric='auc')

model_xgboost_default.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=[(X_test, y_test)],
                  verbose=True)

xgb_train_default_predict = model_xgboost_default.predict(X_train)
xgb_test_default_predict = model_xgboost_default.predict(X_test)




[0]	validation_0-auc:0.99737
[1]	validation_0-auc:0.99799
[2]	validation_0-auc:0.99780
[3]	validation_0-auc:0.99792
[4]	validation_0-auc:0.99795
[5]	validation_0-auc:0.99830
[6]	validation_0-auc:0.99865
[7]	validation_0-auc:0.99881
[8]	validation_0-auc:0.99881
[9]	validation_0-auc:0.99898
[10]	validation_0-auc:0.99925
[11]	validation_0-auc:0.99931
[12]	validation_0-auc:0.99936
[13]	validation_0-auc:0.99936
[14]	validation_0-auc:0.99938
[15]	validation_0-auc:0.99941
[16]	validation_0-auc:0.99941
[17]	validation_0-auc:0.99943
[18]	validation_0-auc:0.99942
[19]	validation_0-auc:0.99942
[20]	validation_0-auc:0.99941
[21]	validation_0-auc:0.99938
[22]	validation_0-auc:0.99936
[23]	validation_0-auc:0.99935
[24]	validation_0-auc:0.99938
[25]	validation_0-auc:0.99936
[26]	validation_0-auc:0.99934


In [13]:
# model_xgboost_complex = xgb.XGBClassifier(learning_rate=0.1,
#                                       max_depth=5,
#                                       n_estimators=5000,
#                                       subsample=0.5,
#                                       colsample_bytree=0.5,
#                                       eval_metric='aucpr',
#                                       verbosity=1)

# model_xgboost_complex.fit(X_train,
#                   y_train,
#                   early_stopping_rounds=10,
#                   eval_set=[(X_test, y_test)],
#                   verbose=True)

[0]	validation_0-aucpr:0.94957
[1]	validation_0-aucpr:0.98175
[2]	validation_0-aucpr:0.98275
[3]	validation_0-aucpr:0.98359
[4]	validation_0-aucpr:0.98419
[5]	validation_0-aucpr:0.98579
[6]	validation_0-aucpr:0.98590
[7]	validation_0-aucpr:0.98644
[8]	validation_0-aucpr:0.98832
[9]	validation_0-aucpr:0.98863
[10]	validation_0-aucpr:0.98783
[11]	validation_0-aucpr:0.98825
[12]	validation_0-aucpr:0.98912
[13]	validation_0-aucpr:0.98913
[14]	validation_0-aucpr:0.98972
[15]	validation_0-aucpr:0.98949
[16]	validation_0-aucpr:0.98930
[17]	validation_0-aucpr:0.98987
[18]	validation_0-aucpr:0.99020
[19]	validation_0-aucpr:0.99127
[20]	validation_0-aucpr:0.99129
[21]	validation_0-aucpr:0.99158
[22]	validation_0-aucpr:0.99172
[23]	validation_0-aucpr:0.99167
[24]	validation_0-aucpr:0.99178
[25]	validation_0-aucpr:0.99161
[26]	validation_0-aucpr:0.99150
[27]	validation_0-aucpr:0.99105
[28]	validation_0-aucpr:0.99169
[29]	validation_0-aucpr:0.99123
[30]	validation_0-aucpr:0.99175
[31]	validation_0-

**Evaluate Model Performance**

In [13]:
print(confusion_matrix(y_train, xgb_train_default_predict))
print(classification_report(y_train, xgb_train_default_predict))

[[2342    1    0    0    0    0]
 [   0 2701    0    0    0    0]
 [   0    0  300    0    0    0]
 [   1    1    0 3704    0    0]
 [   0    0    0    0  316    0]
 [   0    0    0    3    0   76]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2343
           1       1.00      1.00      1.00      2701
           2       1.00      1.00      1.00       300
           3       1.00      1.00      1.00      3706
           4       1.00      1.00      1.00       316
           5       1.00      0.96      0.98        79

    accuracy                           1.00      9445
   macro avg       1.00      0.99      1.00      9445
weighted avg       1.00      1.00      1.00      9445



In [14]:
print(confusion_matrix(y_test, xgb_test_default_predict))
print(classification_report(y_test, xgb_test_default_predict))

[[406   2   1   4   0   0]
 [  4 456   0   1   2   0]
 [  5   0  54   0   0   0]
 [  6   0   0 656   1   0]
 [  1   0   0   0  50   0]
 [  0   0   0   1   0  17]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       413
           1       1.00      0.98      0.99       463
           2       0.98      0.92      0.95        59
           3       0.99      0.99      0.99       663
           4       0.94      0.98      0.96        51
           5       1.00      0.94      0.97        18

    accuracy                           0.98      1667
   macro avg       0.98      0.97      0.97      1667
weighted avg       0.98      0.98      0.98      1667



In [21]:
min(y_train_pred)

0.00077874504

In [18]:
y_train_pred = model_xgboost_default.predict_proba(X_train)[:,1]
y_test_pred = model_xgboost_default.predict_proba(X_test)[:,1] # Slicing to obtain prob of observation being 1

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred, multi_class = 'ovo'),
                                                    roc_auc_score(y_test, y_test_pred, multi_class = 'ovo')))

AxisError: axis 1 is out of bounds for array of dimension 1

In [12]:
# y_train_pred = model_xgboost_complex.predict_proba(X_train)[:,1]
# y_test_pred = model_xgboost_complex.predict_proba(X_test)[:,1] # Slicing to obtain prob of observation being 1

# print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
#                                                     roc_auc_score(y_test, y_test_pred)))

AUC Train: 0.9999
AUC Valid: 0.9989


**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [13]:
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

params_dict = {"learning_rate": learning_rate_list,
               "max_depth": max_depth_list,
               "n_estimators": n_estimators_list}

num_combinations = 1
for v in params_dict.values(): num_combinations *= len(v) 

print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [17]:
# Wrapper for auc auc score
def auc_score(model, X, y): 
    return recall_score(y, model.predict_proba(X)[:,1])

model_xgboost_hp = GridSearchCV(estimator=xgb.XGBClassifier(subsample=0.5, # Params that we are not tuning
                                                                colsample_bytree=0.25,
                                                                eval_metric='aucpr',
                                                                use_label_encoder=False),
                                param_grid=params_dict,
                                cv=2, # Half of data are used for validation every iteration
                                scoring='recall',
                                return_train_score=True,
                                verbose=4)

model_xgboost_hp.fit(X, y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.987, test=0.985) total time=   3.4s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.991, test=0.984) total time=   3.3s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.998, test=0.986) total time=   6.7s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.997, test=0.985) total time=   7.0s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.999, test=0.987) total time=  11.3s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.999, test=0.989) total time=  10.2s
[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.998, test=0.986) total time=   3.9s
[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.997, test=0.986) total time=   4.2s
[CV

In [18]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                               'param_learning_rate', 'param_max_depth', 'param_n_estimators']]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
14,1,0.988518,1.0,0.05,3,3000
6,1,0.988518,0.999304,0.02,5,1000
13,3,0.98817,1.0,0.05,3,2000
18,3,0.98817,1.0,0.1,2,1000
12,3,0.98817,0.999652,0.05,3,1000
2,6,0.987822,0.998956,0.02,2,3000
15,6,0.987822,1.0,0.05,5,1000
22,8,0.987474,1.0,0.1,3,2000
10,8,0.987474,1.0,0.05,2,2000
23,8,0.987474,1.0,0.1,3,3000


## Final Model
Using best parameters from above step

In [20]:
model_xgboost_fin = xgb.XGBClassifier(learning_rate=0.05,
                                          max_depth=3,
                                          n_estimators=3000,
                                          subsample=0.5,
                                          colsample_bytree=0.25,
                                          eval_metric='aucpr',
                                          verbosity=1,
                                          use_label_encoder=False)

# Passing both training and validation dataset as we want to plot AUC for both
eval_set = [(X_train, y_train),(X_test, y_test)]

model_xgboost_fin.fit(X_train,
                  y_train,
                  early_stopping_rounds=20,
                  eval_set=eval_set,
                  verbose=True)

[0]	validation_0-aucpr:0.88215	validation_1-aucpr:0.86296
[1]	validation_0-aucpr:0.94790	validation_1-aucpr:0.92140
[2]	validation_0-aucpr:0.96183	validation_1-aucpr:0.93057
[3]	validation_0-aucpr:0.97130	validation_1-aucpr:0.94553
[4]	validation_0-aucpr:0.98034	validation_1-aucpr:0.96197
[5]	validation_0-aucpr:0.98420	validation_1-aucpr:0.98355
[6]	validation_0-aucpr:0.98531	validation_1-aucpr:0.98203
[7]	validation_0-aucpr:0.98327	validation_1-aucpr:0.97882
[8]	validation_0-aucpr:0.98290	validation_1-aucpr:0.97719
[9]	validation_0-aucpr:0.98472	validation_1-aucpr:0.97928
[10]	validation_0-aucpr:0.98507	validation_1-aucpr:0.98313
[11]	validation_0-aucpr:0.98412	validation_1-aucpr:0.98237
[12]	validation_0-aucpr:0.98556	validation_1-aucpr:0.97802
[13]	validation_0-aucpr:0.98689	validation_1-aucpr:0.98223
[14]	validation_0-aucpr:0.98747	validation_1-aucpr:0.98200
[15]	validation_0-aucpr:0.98799	validation_1-aucpr:0.98135
[16]	validation_0-aucpr:0.98735	validation_1-aucpr:0.98107
[17]	va

In [21]:
var_colums = [c for c in X_train.columns if c not in ['state']]

df_var_imp = pd.DataFrame({"Variable": var_colums,
                           "Importance": model_xgboost_fin.feature_importances_})\
                        .sort_values(by='Importance', ascending=False)
df_var_imp[:10]

Unnamed: 0,Variable,Importance
161,description_40,0.072634
14,tweet_weekday_frequency,0.071494
0,statuses_count,0.070476
3,favourites_count,0.069373
1,followers_count,0.062043
124,description_3,0.042818
167,description_46,0.036456
7,geo_enabled,0.035228
113,tweets_92,0.032998
15,following_to_followers_ratio,0.02799
