In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [2]:
pd.set_option('mode.use_inf_as_na', True)

In [3]:
train_df = pd.read_csv("data/twitter_data_train.csv")
test_df = pd.read_csv("data/twitter_data_test.csv")

print(train_df.shape)
print(test_df.shape)

(9445, 233)
(1667, 233)


In [4]:
to_drop = [
    'account_type', 
    'id', 
    'name', 
    'screen_name',
    'profile_image_url',
    'protected',
    'verified',
    'description',
    'description_processed',
    'tweets_list',
    'tweets_list_processed',
    'url'
    ]

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type']

X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)


In [5]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [6]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

**Random Forest model training and testing**

In [7]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)
regressor.fit(X_train, y_train)
# testing
y_pred_test = regressor.predict(X_test)

**Evaluate Model Performance**

In [8]:
# test accuracy
accuracy_score(y_test, y_pred_test)

0.9868026394721056

In [9]:
# test auroc 
roc_auc_score(y_test, y_pred_test)

0.9847326328147025

**Hyperparameter Tuning**

We will use RandomizedSearchCV to narrow down the range for grid search hyperparameter tuning.


In [10]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [12]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)

rf_randomcv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid,
    n_iter=100, 
    cv=5, # k-fold cv
    verbose=2,
    random_state=2022,
    n_jobs=-1 # use all processors
)

rf_randomcv.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=2022, verbose=2)

In [13]:
rf_randomcv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [14]:
rf_randomcv.best_score_

0.9917416622551615

**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [17]:

n_estimators = [1500, 2000, 2500]
max_features = ['sqrt']
max_depth = [50,60,70,80,90]
min_samples_split = [1,2,3]
min_samples_leaf = [1, 2, 3]
bootstrap = [False]

narrow_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(narrow_grid)

{'n_estimators': [1500, 2000, 2500], 'max_features': ['sqrt'], 'max_depth': [50, 60, 70, 80, 90], 'min_samples_split': [1, 2, 3], 'min_samples_leaf': [1, 2, 3], 'bootstrap': [False]}


In [18]:
regressor = RandomForestClassifier()
rf_gridcv = GridSearchCV(
    estimator=regressor,
    param_grid=narrow_grid,
    cv=5, # k-fold cv
    verbose=2,
    n_jobs=-1 # use all processors
)

rf_gridcv.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


225 fits failed out of a total of 675.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\valen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\valen\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\valen\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\valen\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\valen\ana

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False],
                         'max_depth': [50, 60, 70, 80, 90],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [1, 2, 3],
                         'n_estimators': [1500, 2000, 2500]},
             verbose=2)

In [19]:
rf_gridcv.best_params_

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1500}

In [20]:
rf_gridcv.best_score_

0.9918475383800953

**Use tuned classifier on test data**

In [21]:
best_clf = rf_gridcv.best_estimator_

y_pred_test = best_clf.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.9886022795440912

In [22]:
roc_auc_score(y_test, y_pred_test)

0.9883645940737824