In [2]:
import pandas as pd 
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
import glob
import os
import time

In [4]:
pd.set_option('mode.use_inf_as_na', True)

In [5]:
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

print(train_df.shape)
print(test_df.shape)

(9446, 236)
(1667, 236)


In [7]:
scaler = MinMaxScaler() 
train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.fit_transform(train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])
test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.transform(test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])

In [8]:
to_drop = ['Unnamed: 0', 'screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified', 'account_type_multi']

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']



In [9]:
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9446, 222)
(9446,)
(1667, 222)
(1667,)


#### Logistic Regression model Training and Hyper parameters tuning

In [11]:
# uses lbfgs solver as default
LRModel = LogisticRegression(multi_class = 'ovr', max_iter=10000)
LRModel.fit(X_train, y_train)
# Train set
LRModel_train_score = LRModel.score(X_train, y_train)
print('Train Set Accuracy Score: ', LRModel_train_score)
pred = LRModel.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred))
print("Train Set Classification Report \n",classification_report(y_train, pred))

Train Set Accuracy Score:  0.9767097183993224
Train Set Confusion Matrix 
 [[2261   23   20   17   17    0]
 [  62 2636    0    6    2    0]
 [  20    1  281    0    0    0]
 [  29    7    0 3668    1    1]
 [   9    0    0    2  303    0]
 [   3    0    0    0    0   77]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      2338
           1       0.99      0.97      0.98      2706
           2       0.93      0.93      0.93       302
           3       0.99      0.99      0.99      3706
           4       0.94      0.96      0.95       314
           5       0.99      0.96      0.97        80

    accuracy                           0.98      9446
   macro avg       0.96      0.96      0.96      9446
weighted avg       0.98      0.98      0.98      9446



In [12]:
#Test Set
LRModel_test_score = LRModel.score(X_test, y_test)
print('Test Set Accuracy Score: ', LRModel_test_score)
test_pred = LRModel.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred))
print("Test Set Classification Report \n",classification_report(y_test, test_pred))

Test Set Accuracy Score:  0.9676064787042592
Test Set Confusion Matrix 
 [[394   5   5   3   6   0]
 [ 10 447   0   2   1   0]
 [  6   0  52   0   0   0]
 [ 12   1   0 653   0   0]
 [  2   0   0   0  51   0]
 [  1   0   0   0   0  16]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       413
           1       0.99      0.97      0.98       460
           2       0.91      0.90      0.90        58
           3       0.99      0.98      0.99       666
           4       0.88      0.96      0.92        53
           5       1.00      0.94      0.97        17

    accuracy                           0.97      1667
   macro avg       0.95      0.95      0.95      1667
weighted avg       0.97      0.97      0.97      1667



### Hyper parameter tuning with RandomizedSearchCV

In [13]:
#using repeatedStratifiedKfold 
model = LogisticRegression(multi_class='ovr', max_iter=10000)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
space['penalty'] = ['l1', 'l2']
space['C'] = loguniform(1e-5, 100)

#define search
search = RandomizedSearchCV(
    model, 
    space, 
    n_iter=10, 
    scoring='f1_weighted', 
    n_jobs=-1, 
    cv=cv, 
    verbose=2,
    random_state=999)

# execute search
result = search.fit(X_train, y_train)

print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Fitting 15 folds for each of 10 candidates, totalling 150 fits
Best Score: 0.9749791808854282
Best Hyperparameters: {'C': 21.786162232558315, 'penalty': 'l2', 'solver': 'liblinear'}


Best Score: 0.9749791808854282
Best Hyperparameters: {'C': 21.786162232558315, 'penalty': 'l2', 'solver': 'liblinear'}

#### Testing 

In [15]:
print(X_test.shape)
print(y_test.shape)

(1667, 222)
(1667,)


In [16]:
best_model = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
best_model.fit(X_train, y_train)
# Train set
best_model_train_score = best_model.score(X_train, y_train)
print('Train Set Accuracy Score after tuning: ', best_model_train_score)
pred1 = best_model.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred1))
print("Train Set Classification Report \n",classification_report(y_train, pred1))


Train Set Accuracy Score after tuning:  0.9833792081304256
Train Set Confusion Matrix 
 [[2291   21   12    9    5    0]
 [  46 2652    0    8    0    0]
 [  18    1  283    0    0    0]
 [  25    7    0 3674    0    0]
 [   2    0    0    0  312    0]
 [   3    0    0    0    0   77]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      2338
           1       0.99      0.98      0.98      2706
           2       0.96      0.94      0.95       302
           3       1.00      0.99      0.99      3706
           4       0.98      0.99      0.99       314
           5       1.00      0.96      0.98        80

    accuracy                           0.98      9446
   macro avg       0.98      0.97      0.98      9446
weighted avg       0.98      0.98      0.98      9446



In [17]:
#Test Set
best_model_test_score = best_model.score(X_test, y_test)
print('Test Set Accuracy Score: ', best_model_test_score)
test_pred1 = best_model.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred1))
print("Test Set Classification Report \n",classification_report(y_test, test_pred1))

Test Set Accuracy Score:  0.9742051589682064
Test Set Confusion Matrix 
 [[399   6   3   1   4   0]
 [  8 451   0   0   1   0]
 [  5   0  53   0   0   0]
 [ 11   2   1 652   0   0]
 [  0   0   0   0  53   0]
 [  1   0   0   0   0  16]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       413
           1       0.98      0.98      0.98       460
           2       0.93      0.91      0.92        58
           3       1.00      0.98      0.99       666
           4       0.91      1.00      0.95        53
           5       1.00      0.94      0.97        17

    accuracy                           0.97      1667
   macro avg       0.96      0.96      0.96      1667
weighted avg       0.97      0.97      0.97      1667



In [25]:
%%time
start_time = time.time()
model_time = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
model_time.fit(X_train, y_train)
test_pred1 = model_time.predict(X_test)
print("Execution time: " + str((time.time() - start_time)) + ' s')

Execution time: 5.5188727378845215 s
Wall time: 5.52 s


In [22]:
imps = permutation_importance(model_time, X_test, y_test)
imps_sorted = imps.importances_mean.argsort()[::-1]

print(np.take(X_train.columns, imps_sorted[: 20]))

Index(['following_to_followers_ratio', 'tweet_weekday_frequency',
       'tweet_frequency', 'profile_use_background_image', 'geo_enabled',
       'tweet_weekend_frequency', 'profile_background_tile',
       'number_of_mentions', 'default_profile', 'tweets_35', 'tweets_7',
       'description_14', 'tweets_16', 'tweets_59', 'tweets_91',
       'description_42', 'screen_name_length', 'tweets_43', 'tweets_46',
       'tweets_13'],
      dtype='object')
