In [1]:
import pandas as pd 
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
import glob
import os
import time

In [2]:
pd.set_option('mode.use_inf_as_na', True)

In [3]:
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

print(train_df.shape)
print(test_df.shape)

(9445, 235)
(1667, 235)


In [4]:
scaler = MinMaxScaler() 
train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions']] = scaler.fit_transform(train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions']])
test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions']] = scaler.transform(test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions']])

In [5]:
to_drop = [
    'account_type', 
    'account_type_multi',
    'id', 
    'name', 
    'screen_name',
    'profile_image_url',
    'protected',
    'verified',
    'description',
    'description_processed',
    'tweets_list',
    'tweets_list_processed',
    'url'
    ]

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']



In [6]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9445, 222)
(9445,)
(1667, 222)
(1667,)


#### Logistic Regression model Training and Hyper parameters tuning

In [8]:
# uses lbfgs solver as default
LRModel = LogisticRegression(multi_class = 'ovr', max_iter=10000)
LRModel.fit(X_train, y_train)
# Train set
LRModel_train_score = LRModel.score(X_train, y_train)
print('Train Set Accuracy Score: ', LRModel_train_score)
pred = LRModel.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred))
print("Train Set Classification Report \n",classification_report(y_train, pred))

Train Set Accuracy Score:  0.9766013763896242
Train Set Confusion Matrix 
 [[2262   20   22   20   19    0]
 [  56 2638    0    5    2    0]
 [  26    1  273    0    0    0]
 [  34    6    0 3665    0    1]
 [   5    0    0    1  310    0]
 [   3    0    0    0    0   76]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      2343
           1       0.99      0.98      0.98      2701
           2       0.93      0.91      0.92       300
           3       0.99      0.99      0.99      3706
           4       0.94      0.98      0.96       316
           5       0.99      0.96      0.97        79

    accuracy                           0.98      9445
   macro avg       0.96      0.96      0.96      9445
weighted avg       0.98      0.98      0.98      9445



In [9]:
#Test Set
LRModel_test_score = LRModel.score(X_test, y_test)
print('Test Set Accuracy Score: ', LRModel_test_score)
test_pred = LRModel.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred))
print("Test Set Classification Report \n",classification_report(y_test, test_pred))

Test Set Accuracy Score:  0.9610077984403119
Test Set Confusion Matrix 
 [[381   9   7   9   7   0]
 [ 12 450   0   1   0   0]
 [  6   0  52   1   0   0]
 [  7   2   0 654   0   0]
 [  1   0   0   2  48   0]
 [  1   0   0   0   0  17]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       413
           1       0.98      0.97      0.97       463
           2       0.88      0.88      0.88        59
           3       0.98      0.99      0.98       663
           4       0.87      0.94      0.91        51
           5       1.00      0.94      0.97        18

    accuracy                           0.96      1667
   macro avg       0.94      0.94      0.94      1667
weighted avg       0.96      0.96      0.96      1667



### Hyper parameter tuning with RandomizedSearchCV

In [10]:
import warnings
warnings.filterwarnings('ignore')
#using repeatedStratifiedKfold 
model = LogisticRegression(multi_class='ovr', max_iter=10000)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
space['penalty'] = ['l1', 'l2']
space['C'] = loguniform(1e-5, 100)

#define search
search = RandomizedSearchCV(
    model, 
    space, 
    n_iter=10, 
    scoring='f1_weighted', 
    n_jobs=-1, 
    cv=cv, 
    verbose=2,
    random_state=999)

# execute search
result = search.fit(X_train, y_train)

print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Fitting 15 folds for each of 10 candidates, totalling 150 fits
Best Score: 0.9747205162725633
Best Hyperparameters: {'C': 4.207230251572931, 'penalty': 'l2', 'solver': 'lbfgs'}


#### Testing 

In [11]:
print(X_test.shape)
print(y_test.shape)

(1667, 222)
(1667,)


In [12]:
best_model = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
best_model.fit(X_train, y_train)
# Train set
best_model_train_score = best_model.score(X_train, y_train)
print('Train Set Accuracy Score after tuning: ', best_model_train_score)
pred1 = best_model.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred1))
print("Train Set Classification Report \n",classification_report(y_train, pred1))


Train Set Accuracy Score after tuning:  0.9803070407623081
Train Set Confusion Matrix 
 [[2275   23   21   12   12    0]
 [  49 2646    0    5    1    0]
 [  20    1  279    0    0    0]
 [  32    5    0 3669    0    0]
 [   2    0    0    0  314    0]
 [   3    0    0    0    0   76]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.96      0.97      0.96      2343
           1       0.99      0.98      0.98      2701
           2       0.93      0.93      0.93       300
           3       1.00      0.99      0.99      3706
           4       0.96      0.99      0.98       316
           5       1.00      0.96      0.98        79

    accuracy                           0.98      9445
   macro avg       0.97      0.97      0.97      9445
weighted avg       0.98      0.98      0.98      9445



In [13]:
#Test Set
best_model_test_score = best_model.score(X_test, y_test)
print('Test Set Accuracy Score: ', best_model_test_score)
test_pred1 = best_model.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred1))
print("Test Set Classification Report \n",classification_report(y_test, test_pred1))

Test Set Accuracy Score:  0.9640071985602879
Test Set Confusion Matrix 
 [[382   8   8   7   8   0]
 [ 12 451   0   0   0   0]
 [  6   0  53   0   0   0]
 [  7   0   0 656   0   0]
 [  1   0   0   2  48   0]
 [  1   0   0   0   0  17]]
Test Set Classification Report 
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       413
           1       0.98      0.97      0.98       463
           2       0.87      0.90      0.88        59
           3       0.99      0.99      0.99       663
           4       0.86      0.94      0.90        51
           5       1.00      0.94      0.97        18

    accuracy                           0.96      1667
   macro avg       0.94      0.95      0.94      1667
weighted avg       0.96      0.96      0.96      1667



In [17]:
%%time
start_time = time.time()
model_time = LogisticRegression(multi_class='ovr', max_iter=10000, **result.best_params_)
model_time.fit(X_train, y_train)
test_pred1 = model_time.predict(X_test)
print("Execution time: " + str((time.time() - start_time)) + ' s')

Execution time: 156.5509090423584 s
Wall time: 2min 36s


In [16]:
imps = permutation_importance(model_time, X_test, y_test)
imps_sorted = imps.importances_mean.argsort()[::-1]

print(np.take(X_train.columns, imps_sorted[: 20]))

Index(['following_to_followers_ratio', 'tweet_weekday_frequency',
       'tweet_frequency', 'tweet_weekend_frequency',
       'profile_use_background_image', 'number_of_mentions', 'tweets_8',
       'geo_enabled', 'listed_count', 'tweets_17', 'description_14',
       'tweets_6', 'tweets_0', 'default_profile', 'profile_background_tile',
       'tweets_59', 'tweets_20', 'description_46', 'tweets_58',
       'screen_name_length'],
      dtype='object')
