In [6]:
import pandas as pd 
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
import glob
import os
import time

In [7]:
pd.set_option('mode.use_inf_as_na', True)

In [8]:
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

print(train_df.shape)
print(test_df.shape)

(9446, 238)
(1667, 238)


In [9]:
for x in train_df.columns:
    print(x)

Unnamed: 0
id
name
screen_name
statuses_count
followers_count
friends_count
favourites_count
listed_count
url
default_profile
default_profile_image
geo_enabled
profile_image_url
profile_use_background_image
profile_background_tile
protected
verified
description
account_type
tweets_list
tweet_frequency
number_of_tags
number_of_mentions
tweet_weekend_frequency
tweet_weekday_frequency
following_to_followers_ratio
username_length
screen_name_length
username_spec_char_count
screen_name_spec_char_count
has_url
has_desc
tweets_list_processed
description_processed
tweets_0
tweets_1
tweets_2
tweets_3
tweets_4
tweets_5
tweets_6
tweets_7
tweets_8
tweets_9
tweets_10
tweets_11
tweets_12
tweets_13
tweets_14
tweets_15
tweets_16
tweets_17
tweets_18
tweets_19
tweets_20
tweets_21
tweets_22
tweets_23
tweets_24
tweets_25
tweets_26
tweets_27
tweets_28
tweets_29
tweets_30
tweets_31
tweets_32
tweets_33
tweets_34
tweets_35
tweets_36
tweets_37
tweets_38
tweets_39
tweets_40
tweets_41
tweets_42
tweets_43
tweets_

In [10]:
scaler = MinMaxScaler() 
train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.fit_transform(train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])
test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.transform(test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])

In [11]:
to_drop = ['Unnamed: 0', 'screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified', 'account_type_multi', 
          'profile_use_background_image', 'profile_background_tile', 'geo_enabled']

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']



In [12]:
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9446, 221)
(9446,)
(1667, 221)
(1667,)


#### Logistic Regression model Training and Hyper parameters tuning

In [14]:
# uses lbfgs solver as default
LRModel = LogisticRegression(multi_class = 'ovr', max_iter=10000)
LRModel.fit(X_train, y_train)
# Train set
LRModel_train_score = LRModel.score(X_train, y_train)
print('Train Set Accuracy Score: ', LRModel_train_score)
pred = LRModel.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred))
print("Train Set Classification Report \n",classification_report(y_train, pred))

Train Set Accuracy Score:  0.9726868515773872
Train Set Confusion Matrix 
 [[2280   21   19   12    6    0]
 [  68 2625    0    5    8    0]
 [  20    1  281    0    0    0]
 [  33    6    0 3666    0    1]
 [  30    8    0   17  259    0]
 [   3    0    0    0    0   77]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      2338
           1       0.99      0.97      0.98      2706
           2       0.94      0.93      0.93       302
           3       0.99      0.99      0.99      3706
           4       0.95      0.82      0.88       314
           5       0.99      0.96      0.97        80

    accuracy                           0.97      9446
   macro avg       0.96      0.94      0.95      9446
weighted avg       0.97      0.97      0.97      9446



In [15]:
#Test Set
LRModel_test_score = LRModel.score(X_test, y_test)
print('Test Set Accuracy Score: ', LRModel_test_score)
test_pred = LRModel.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred))
print("Test Set Classification Report \n",classification_report(y_test, test_pred))

Test Set Accuracy Score:  0.9634073185362927
Test Set Confusion Matrix 
 [[396   6   5   5   1   0]
 [  9 446   0   1   4   0]
 [  7   0  51   0   0   0]
 [ 12   0   0 653   1   0]
 [  7   1   0   1  44   0]
 [  1   0   0   0   0  16]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       413
           1       0.98      0.97      0.98       460
           2       0.91      0.88      0.89        58
           3       0.99      0.98      0.98       666
           4       0.88      0.83      0.85        53
           5       1.00      0.94      0.97        17

    accuracy                           0.96      1667
   macro avg       0.95      0.93      0.94      1667
weighted avg       0.96      0.96      0.96      1667



### Hyper parameter tuning with RandomizedSearchCV

In [16]:
#using repeatedStratifiedKfold 
model = LogisticRegression(multi_class='ovr', max_iter=10000)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
space['penalty'] = ['l1', 'l2']
space['C'] = loguniform(1e-5, 100)

#define search
search = RandomizedSearchCV(
    model, 
    space, 
    n_iter=10, 
    scoring='f1_weighted', 
    n_jobs=-1, 
    cv=cv, 
    verbose=2,
    random_state=999)

# execute search
result = search.fit(X_train, y_train)

print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Fitting 15 folds for each of 10 candidates, totalling 150 fits


30 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\radellng\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\radellng\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\radellng\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

------------------

Best Score: 0.9710773044543977
Best Hyperparameters: {'C': 21.786162232558315, 'penalty': 'l2', 'solver': 'liblinear'}


Best Score: 0.9710773044543977
Best Hyperparameters: {'C': 21.786162232558315, 'penalty': 'l2', 'solver': 'liblinear'}

#### Testing 

In [17]:
print(X_test.shape)
print(y_test.shape)

(1667, 221)
(1667,)


In [18]:
best_model = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
best_model.fit(X_train, y_train)
# Train set
best_model_train_score = best_model.score(X_train, y_train)
print('Train Set Accuracy Score after tuning: ', best_model_train_score)
pred1 = best_model.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred1))
print("Train Set Classification Report \n",classification_report(y_train, pred1))


Train Set Accuracy Score after tuning:  0.9802032606394241
Train Set Confusion Matrix 
 [[2292   20   11   10    5    0]
 [  54 2642    0    6    4    0]
 [  17    1  284    0    0    0]
 [  26    5    0 3675    0    0]
 [  13    6    0    6  289    0]
 [   3    0    0    0    0   77]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.95      0.98      0.97      2338
           1       0.99      0.98      0.98      2706
           2       0.96      0.94      0.95       302
           3       0.99      0.99      0.99      3706
           4       0.97      0.92      0.94       314
           5       1.00      0.96      0.98        80

    accuracy                           0.98      9446
   macro avg       0.98      0.96      0.97      9446
weighted avg       0.98      0.98      0.98      9446



In [21]:
#Test Set
best_model_test_score = best_model.score(X_test, y_test)
print('Test Set Accuracy Score: ', best_model_test_score)
test_pred1 = best_model.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred1))
print("Test Set Classification Report \n",classification_report(y_test, test_pred1,digits=5))

Test Set Accuracy Score:  0.9706058788242351
Test Set Confusion Matrix 
 [[397   7   4   3   2   0]
 [  8 449   0   2   1   0]
 [  6   0  52   0   0   0]
 [ 11   0   0 654   1   0]
 [  2   1   0   0  50   0]
 [  1   0   0   0   0  16]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0    0.93412   0.96126   0.94749       413
           1    0.98249   0.97609   0.97928       460
           2    0.92857   0.89655   0.91228        58
           3    0.99241   0.98198   0.98717       666
           4    0.92593   0.94340   0.93458        53
           5    1.00000   0.94118   0.96970        17

    accuracy                        0.97061      1667
   macro avg    0.96059   0.95008   0.95508      1667
weighted avg    0.97098   0.97061   0.97071      1667



In [23]:
%%time
start_time = time.time()
model_time = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
model_time.fit(X_train, y_train)
test_pred1 = model_time.predict(X_test)
print("Execution time: " + str((time.time() - start_time)) + ' s')

Execution time: 4.819108247756958 s
Wall time: 4.82 s


In [24]:
imps = permutation_importance(model_time, X_test, y_test)
imps_sorted = imps.importances_mean.argsort()[::-1]

print(np.take(X_train.columns, imps_sorted[: 20]))

Index(['following_to_followers_ratio', 'tweet_weekday_frequency',
       'tweet_frequency', 'default_profile', 'tweet_weekend_frequency',
       'has_desc', 'tweets_35', 'number_of_mentions', 'description_46',
       'reciprocity', 'tweets_7', 'tweets_16', 'description_59',
       'description_14', 'tweets_82', 'description_42', 'tweets_2',
       'tweets_88', 'tweets_5', 'has_url'],
      dtype='object')
