In [20]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance
import glob
import os
import time

In [2]:
pd.set_option('mode.use_inf_as_na', True)

In [84]:
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

print(train_df.shape)
print(test_df.shape)

(9445, 235)
(1667, 235)


In [85]:
to_drop = [
    'account_type_multi',
    'account_type',
    'id', 
    'name', 
    'screen_name',
    'profile_image_url',
    'protected',
    'verified',
    'description',
    'description_processed',
    'tweets_list',
    'tweets_list_processed',
    'url'
    ]

X_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
X_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']



(9445, 222)
(9445,)
(1667, 222)
(1667,)


In [74]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [92]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9445, 222)
(9445,)
(1667, 222)
(1667,)


#### Logistic Regression model Training and Hyper parameters tuning

In [87]:
# uses lbfgs solver as default
LRModel = LogisticRegression(multi_class = 'multinomial')
LRModel.fit(X_train, y_train)
# Train set
LRModel_train_score = LRModel.score(X_train, y_train)
print('Train Set Accuracy Score: ', LRModel_train_score)
pred = LRModel.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred))
print("Train Set Classification Report \n",classification_report(y_train, pred))

#Test Set
LRModel_test_score = LRModel.score(X_test, y_test)
print('Test Set Accuracy Score: ', LRModel_test_score)
test_pred = LRModel.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred))
print("Test Set Classification Report \n",classification_report(y_test, test_pred))

Train Set Accuracy Score:  0.8344097406034939
Train Set Confusion Matrix 
 [[1812   65   49  388   29    0]
 [  10 2410    0  130  151    0]
 [  22    6  107  160    5    0]
 [  27  221   23 3407   28    0]
 [   1  135    8   27  145    0]
 [   0    0    0   79    0    0]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.97      0.77      0.86      2343
           1       0.85      0.89      0.87      2701
           2       0.57      0.36      0.44       300
           3       0.81      0.92      0.86      3706
           4       0.41      0.46      0.43       316
           5       0.00      0.00      0.00        79

    accuracy                           0.83      9445
   macro avg       0.60      0.57      0.58      9445
weighted avg       0.83      0.83      0.83      9445

Test Set Accuracy Score:  0.728254349130174
Test Set Confusion Matrix 
 [[329  13  36  22  13   0]
 [  3 319   1   4 136   0]
 [ 12   0  39   7   1  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
#define models and parameters for Logistic Regression
#estimate 1 hour to do tuning
model = LogisticRegression(multi_class = 'multinomial')
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1','l2']
c_values = 10. ** np.arange(-3,8)

#define grid seach
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

#results 
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
#Best: 0.982107 using {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}

Best: 0.982107 using {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.975613 (0.004766) with: {'C': 0.001, 'penalty': 'none', 'solver': 'newton-cg'}
0.911064 (0.007096) with: {'C': 0.001, 'penalty': 'none', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'none', 'solver': 'liblinear'}
0.704080 (0.070334) with: {'C': 0.001, 'penalty': 'none', 'solver': 'sag'}
0.685834 (0.071493) with: {'C': 0.001, 'penalty': 'none', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'lbfgs'}
0.913605 (0.006372) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'sag'}
0.685870 (0.071486) with: {'C': 0.001, 'penalty': 'l1', 'solver': 'saga'}
0.939827 (0.006217) with: {'C': 0.001, 'penalty': 'l2', 'solver': 'newton-cg'}
0.910922 (0.006754) with: {'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}
0.91

#### Testing 

In [7]:
print(X_test.shape)
print(y_test.shape)

(1667, 221)
(1667,)


In [88]:
best_model = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
best_model.fit(X_train, y_train)
# Train set
best_model_train_score = best_model.score(X_train, y_train)
print('Train Set Accuracy Score after tuning: ', best_model_train_score)
pred1 = best_model.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred1))
print("Train Set Classification Report \n",classification_report(y_train, pred1))

#Test Set
best_model_test_score = best_model.score(X_test, y_test)
print('Test Set Accuracy Score: ', best_model_test_score)
test_pred1 = best_model.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred1))
print("Test Set Classification Report \n",classification_report(y_test, test_pred1))

Train Set Accuracy Score after tuning:  0.9832715722604553
Train Set Confusion Matrix 
 [[2282   26   12   14    9    0]
 [  33 2660    0    7    1    0]
 [  17    1  282    0    0    0]
 [  29    5    0 3672    0    0]
 [   1    0    0    0  315    0]
 [   3    0    0    0    0   76]]
Train Set Classification Report 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      2343
           1       0.99      0.98      0.99      2701
           2       0.96      0.94      0.95       300
           3       0.99      0.99      0.99      3706
           4       0.97      1.00      0.98       316
           5       1.00      0.96      0.98        79

    accuracy                           0.98      9445
   macro avg       0.98      0.97      0.98      9445
weighted avg       0.98      0.98      0.98      9445

Test Set Accuracy Score:  0.973005398920216
Test Set Confusion Matrix 
 [[395   5   5   6   2   0]
 [  8 454   0   0   1   0]
 [  5   0 



In [16]:
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_pred)
plt.plot(fpr, tpr, label = "Without Tuning")
fpr1, tpr1, thresholds1 = metrics.roc_curve(y_test, test_pred1)
plt.plot(fpr1, tpr1, label = "With Tuning")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 14
plt.title('ROC curve of Logistic Regression')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.legend()
plt.grid(True)

ValueError: multiclass format is not supported

In [93]:
%%time
start_time = time.time()
model_time = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
model_time.fit(X_train, y_train)
test_pred1 = model_time.predict(X_test)
print("Execution time: " + str((time.time() - start_time)) + ' s')

Execution time: 3.7118613719940186 s
Wall time: 3.71 s




In [94]:
imps = permutation_importance(model_time, X_test, y_test)
imps_sorted = imps.importances_mean.argsort()[::-1]

print(np.take(X_train.columns, imps_sorted[: 20]))

Index(['statuses_count', 'favourites_count', 'following_to_followers_ratio',
       'profile_use_background_image', 'tweet_frequency', 'followers_count',
       'friends_count', 'number_of_mentions', 'default_profile',
       'tweet_weekday_frequency', 'geo_enabled', 'description_87',
       'tweet_weekend_frequency', 'description_14', 'tweets_59', 'tweets_35',
       'tweets_6', 'description_46', 'description_59', 'tweets_8'],
      dtype='object')
