In [2]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, recall_score
import glob
import os
import matplotlib.pyplot as plt

In [3]:
# The train/test CSVs are read from fixed paths (a previous, disabled variant
# picked the most recently created file in each folder via glob).
train_df = pd.read_csv('./data/twitter_data_train.csv')
test_df = pd.read_csv('./data/twitter_data_test.csv')

# Report (rows, columns) of each split, train first
for split_df in (train_df, test_df):
    print(split_df.shape)

(9445, 233)
(1667, 233)


In [4]:
# Columns removed from the feature matrix. 'account_type' is the label, so it
# is dropped from X and kept separately as y.
to_drop = [
    'screen_name', 'url', 'profile_image_url', 'description',
    'id', 'name', 'account_type', 'tweets_list', 'tweets_list_processed',
    'description_processed', 'protected', 'verified',
]

# Labels first, then the feature frames with the excluded columns removed
y_train = train_df['account_type']
y_test = test_df['account_type']
X_train = train_df.drop(columns=to_drop)
X_test = test_df.drop(columns=to_drop)

In [5]:
# print(train_df.columns[:20])
# test_df.columns == train_df.columns

In [6]:
# Stack the train and test splits together (rows of train first, then test);
# the original row indices of each split are kept as-is.
X = pd.concat((X_train, X_test), axis=0)
y = pd.concat((y_train, y_test), axis=0)

In [7]:
# Check if dataset is balanced: shape of the test rows labelled 1, then 0
for label in (1, 0):
    print(test_df[y_test == label].shape)

# Shapes of the feature/label objects, in train-then-test order
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(433, 233)
(1234, 233)
(9445, 221)
(9445,)
(1667, 221)
(1667,)


In [8]:
# Make sure all features are of integer or float type:
# list the distinct column dtypes present in X_train.
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [9]:
# Baseline model: XGBoost with default hyperparameters, evaluated on PR-AUC
model_xgboost_default = xgb.XGBClassifier(eval_metric='aucpr')

# (X_test, y_test) is the only evaluation set, so it is the one monitored
# for early stopping (patience: 10 rounds); per-round scores are printed.
model_xgboost_default.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=True,
)


[0]	validation_0-aucpr:0.95511
[1]	validation_0-aucpr:0.96717
[2]	validation_0-aucpr:0.97703




[3]	validation_0-aucpr:0.97924
[4]	validation_0-aucpr:0.98570
[5]	validation_0-aucpr:0.98582
[6]	validation_0-aucpr:0.98145
[7]	validation_0-aucpr:0.99220
[8]	validation_0-aucpr:0.99299
[9]	validation_0-aucpr:0.99368
[10]	validation_0-aucpr:0.99372
[11]	validation_0-aucpr:0.99374
[12]	validation_0-aucpr:0.99368
[13]	validation_0-aucpr:0.99338
[14]	validation_0-aucpr:0.99398
[15]	validation_0-aucpr:0.99441
[16]	validation_0-aucpr:0.99489
[17]	validation_0-aucpr:0.99501
[18]	validation_0-aucpr:0.99480
[19]	validation_0-aucpr:0.99481
[20]	validation_0-aucpr:0.99514
[21]	validation_0-aucpr:0.99507
[22]	validation_0-aucpr:0.99570
[23]	validation_0-aucpr:0.99561
[24]	validation_0-aucpr:0.99532
[25]	validation_0-aucpr:0.99538
[26]	validation_0-aucpr:0.99520
[27]	validation_0-aucpr:0.99524
[28]	validation_0-aucpr:0.99503
[29]	validation_0-aucpr:0.99505
[30]	validation_0-aucpr:0.99510
[31]	validation_0-aucpr:0.99498


In [10]:
# Hand-configured model: up to 5000 trees of depth 5, each built on a 50%
# sample of the rows and a 50% sample of the columns, evaluated on PR-AUC.
complex_params = dict(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=5000,
    subsample=0.5,
    colsample_bytree=0.5,
    eval_metric='aucpr',
    verbosity=1,
)
model_xgboost_complex = xgb.XGBClassifier(**complex_params)

# (X_test, y_test) is the only evaluation set, so it is the one monitored
# for early stopping (patience: 10 rounds); per-round scores are printed.
model_xgboost_complex.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=True,
)

[0]	validation_0-aucpr:0.95634
[1]	validation_0-aucpr:0.97933
[2]	validation_0-aucpr:0.98335
[3]	validation_0-aucpr:0.98374
[4]	validation_0-aucpr:0.97694
[5]	validation_0-aucpr:0.98209
[6]	validation_0-aucpr:0.98406
[7]	validation_0-aucpr:0.98432
[8]	validation_0-aucpr:0.98459
[9]	validation_0-aucpr:0.98928
[10]	validation_0-aucpr:0.98920
[11]	validation_0-aucpr:0.98949
[12]	validation_0-aucpr:0.98971
[13]	validation_0-aucpr:0.99035
[14]	validation_0-aucpr:0.99033
[15]	validation_0-aucpr:0.99343
[16]	validation_0-aucpr:0.99305
[17]	validation_0-aucpr:0.99323
[18]	validation_0-aucpr:0.99351
[19]	validation_0-aucpr:0.99350
[20]	validation_0-aucpr:0.99432
[21]	validation_0-aucpr:0.99438
[22]	validation_0-aucpr:0.99350
[23]	validation_0-aucpr:0.99357
[24]	validation_0-aucpr:0.99351
[25]	validation_0-aucpr:0.99368
[26]	validation_0-aucpr:0.99466
[27]	validation_0-aucpr:0.99465
[28]	validation_0-aucpr:0.99455
[29]	validation_0-aucpr:0.99453
[30]	validation_0-aucpr:0.99510
[31]	validation_0-

**Evaluate Model Performance**

In [11]:
# Default model: column 1 of predict_proba is the probability of class 1
y_train_pred = model_xgboost_default.predict_proba(X_train)[:, 1]
y_test_pred = model_xgboost_default.predict_proba(X_test)[:, 1]

# ROC AUC on the training split and on the held-out split
auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_test, y_test_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

AUC Train: 1.0000
AUC Valid: 0.9986


In [12]:
# Complex model: column 1 of predict_proba is the probability of class 1.
# Note that y_train_pred / y_test_pred are reassigned here.
y_train_pred = model_xgboost_complex.predict_proba(X_train)[:, 1]
y_test_pred = model_xgboost_complex.predict_proba(X_test)[:, 1]

# ROC AUC on the training split and on the held-out split
auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_test, y_test_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

AUC Train: 0.9999
AUC Valid: 0.9989


**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [13]:
# Candidate values for each hyperparameter explored by the grid search
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

# Grid in the {parameter name: candidate list} form expected by GridSearchCV
params_dict = dict(
    learning_rate=learning_rate_list,
    max_depth=max_depth_list,
    n_estimators=n_estimators_list,
)

# Size of the full grid = product of the candidate-list lengths
num_combinations = 1
for candidates in params_dict.values():
    num_combinations = num_combinations * len(candidates)

print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [17]:
# Scorer wrapper: ROC AUC of a fitted classifier on (X, y)
def auc_score(model, X, y):
    """Return the ROC AUC of ``model`` on the given data.

    The (estimator, X, y) argument order matches what scikit-learn expects
    from a callable passed as ``scoring``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    X : feature matrix to score.
    y : true binary labels aligned with the rows of ``X``.

    Returns
    -------
    float
        Area under the ROC curve, computed from the predicted probability
        of the positive class (column 1 of ``predict_proba``).
    """
    # AUC is a ranking metric and must be fed continuous scores. The previous
    # implementation passed these probabilities to recall_score, which
    # requires hard class labels and is not an AUC.
    positive_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

# Params that we are not tuning: held constant for every grid candidate
fixed_xgb_params = dict(
    subsample=0.5,
    colsample_bytree=0.25,
    eval_metric='aucpr',
    use_label_encoder=False,
)

# Exhaustive search over params_dict, ranking candidates by recall
model_xgboost_hp = GridSearchCV(
    estimator=xgb.XGBClassifier(**fixed_xgb_params),
    param_grid=params_dict,
    scoring='recall',
    cv=2,  # Half of data are used for validation every iteration
    return_train_score=True,
    verbose=4,
)

# NOTE(review): X, y are the train+test concatenation, so the test rows also
# take part in the hyperparameter search — confirm this is intended.
model_xgboost_hp.fit(X, y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.987, test=0.985) total time=   3.4s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.991, test=0.984) total time=   3.3s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.998, test=0.986) total time=   6.7s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.997, test=0.985) total time=   7.0s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.999, test=0.987) total time=  11.3s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.999, test=0.989) total time=  10.2s
[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.998, test=0.986) total time=   3.9s
[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.997, test=0.986) total time=   4.2s
[CV

In [18]:
# Columns of interest from the grid-search report: ranking, mean scores,
# and the three tuned hyperparameters
cv_report_columns = [
    'rank_test_score', 'mean_test_score', 'mean_train_score',
    'param_learning_rate', 'param_max_depth', 'param_n_estimators',
]

# Build the report in one chain, best-ranked candidates first
df_cv_results = (
    pd.DataFrame(model_xgboost_hp.cv_results_)
    .loc[:, cv_report_columns]
    .sort_values(by='rank_test_score')
)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
14,1,0.988518,1.0,0.05,3,3000
6,1,0.988518,0.999304,0.02,5,1000
13,3,0.98817,1.0,0.05,3,2000
18,3,0.98817,1.0,0.1,2,1000
12,3,0.98817,0.999652,0.05,3,1000
2,6,0.987822,0.998956,0.02,2,3000
15,6,0.987822,1.0,0.05,5,1000
22,8,0.987474,1.0,0.1,3,2000
10,8,0.987474,1.0,0.05,2,2000
23,8,0.987474,1.0,0.1,3,3000


## Final Model
Using the best parameters found by the grid search in the previous step.

In [20]:
# Final model configuration (the tuned values plus the fixed sampling settings)
final_params = dict(
    learning_rate=0.05,
    max_depth=3,
    n_estimators=3000,
    subsample=0.5,
    colsample_bytree=0.25,
    eval_metric='aucpr',
    verbosity=1,
    use_label_encoder=False,
)
model_xgboost_fin = xgb.XGBClassifier(**final_params)

# Both splits are evaluated each round so their curves can be plotted:
# validation_0 is the training data, validation_1 is the test data.
eval_set = [(X_train, y_train), (X_test, y_test)]

# Early stopping uses a patience of 20 rounds; per-round scores are printed.
model_xgboost_fin.fit(
    X=X_train,
    y=y_train,
    eval_set=eval_set,
    early_stopping_rounds=20,
    verbose=True,
)

[0]	validation_0-aucpr:0.88215	validation_1-aucpr:0.86296
[1]	validation_0-aucpr:0.94790	validation_1-aucpr:0.92140
[2]	validation_0-aucpr:0.96183	validation_1-aucpr:0.93057
[3]	validation_0-aucpr:0.97130	validation_1-aucpr:0.94553
[4]	validation_0-aucpr:0.98034	validation_1-aucpr:0.96197
[5]	validation_0-aucpr:0.98420	validation_1-aucpr:0.98355
[6]	validation_0-aucpr:0.98531	validation_1-aucpr:0.98203
[7]	validation_0-aucpr:0.98327	validation_1-aucpr:0.97882
[8]	validation_0-aucpr:0.98290	validation_1-aucpr:0.97719
[9]	validation_0-aucpr:0.98472	validation_1-aucpr:0.97928
[10]	validation_0-aucpr:0.98507	validation_1-aucpr:0.98313
[11]	validation_0-aucpr:0.98412	validation_1-aucpr:0.98237
[12]	validation_0-aucpr:0.98556	validation_1-aucpr:0.97802
[13]	validation_0-aucpr:0.98689	validation_1-aucpr:0.98223
[14]	validation_0-aucpr:0.98747	validation_1-aucpr:0.98200
[15]	validation_0-aucpr:0.98799	validation_1-aucpr:0.98135
[16]	validation_0-aucpr:0.98735	validation_1-aucpr:0.98107
[17]	va

In [21]:
# feature_importances_ holds one value per column the model was fitted on,
# i.e. every column of X_train. The names therefore have to be the full column
# list: filtering any column out (the old code skipped 'state') would make the
# two sequences differ in length and break the DataFrame construction.
var_colums = list(X_train.columns)

# Pair each feature with its importance, most important first
df_var_imp = (
    pd.DataFrame({"Variable": var_colums,
                  "Importance": model_xgboost_fin.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Show the ten most important features
df_var_imp.head(10)

Unnamed: 0,Variable,Importance
161,description_40,0.072634
14,tweet_weekday_frequency,0.071494
0,statuses_count,0.070476
3,favourites_count,0.069373
1,followers_count,0.062043
124,description_3,0.042818
167,description_46,0.036456
7,geo_enabled,0.035228
113,tweets_92,0.032998
15,following_to_followers_ratio,0.02799
