In [53]:
import pandas as pd, numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, roc_curve, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import time
import eli5
from lime import lime_tabular
from eli5.sklearn import PermutationImportance

In [15]:
# Read the pre-split multiclass Twitter account datasets from disk.
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

# Report (rows, columns) for the train frame first, then the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

(9446, 237)
(1667, 237)


In [16]:
# Columns removed from the feature matrix. The list includes the target
# column ('account_type_multi') so it cannot leak into the inputs.
to_drop = [
    'Unnamed: 0',
    'account_type', 
    'account_type_multi',
    'id', 
    'name', 
    'screen_name',
    'profile_image_url',
    'protected',
    'verified',
    'description',
    'description_processed',
    'tweets_list',
    'tweets_list_processed',
    'url',
    'profile_use_background_image', 
    'profile_background_tile'
    ]

# Columns rescaled with MinMaxScaler. Declared once so the train and test
# transforms are guaranteed to operate on exactly the same columns.
scaled_cols = [
    'statuses_count',
    'favourites_count',
    'followers_count',
    'friends_count',
    'number_of_mentions',
    'listed_count',
    'number_of_tags',
]

scaler = MinMaxScaler()
# Fit on the training frame only; the test frame reuses the fitted ranges so
# no information from the test set influences the scaling.
train_df[scaled_cols] = scaler.fit_transform(train_df[scaled_cols])
test_df[scaled_cols] = scaler.transform(test_df[scaled_cols])

X_train, y_train = train_df.drop(columns=to_drop), train_df['account_type_multi']
X_test, y_test = test_df.drop(columns=to_drop), test_df['account_type_multi']

# validation split
# random_state makes the split (and every metric computed downstream)
# reproducible across kernel restarts; stratify keeps the class proportions
# of the multiclass target the same in the train and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=42,
    stratify=y_train,
)

In [17]:
from keras.utils import to_categorical

# One-hot encode the integer labels for the categorical cross-entropy loss.
# The width is computed once from all three splits: if each call inferred it
# on its own, a split that happens to lack the highest label would yield a
# narrower matrix than the others.
num_classes = int(max(y_train.max(), y_val.max(), y_test.max())) + 1

y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_val_cat = to_categorical(y_val, num_classes=num_classes)
y_test_cat  = to_categorical(y_test, num_classes=num_classes)

In [19]:
start_time = time.time()

# Size the input and output layers from the data instead of hard-coding them,
# so the network still matches if the dropped-column list or the label set
# changes.
n_features = X_train.shape[1]
n_classes = y_train_cat.shape[1]

# Feed-forward classifier: 64 -> dropout(0.1) -> 128 -> softmax over classes.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=n_features))
model.add(Dropout(.1))
model.add(Dense(128, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

# categorical_crossentropy expects the one-hot targets (y_*_cat).
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.fit(X_train, y_train_cat, epochs=20, validation_data=(X_val, y_val_cat))

# Elapsed wall-clock time covers model construction, summary and training.
time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 64)                14208     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 128)               8320      
                                                                 
 dense_17 (Dense)            (None, 4)                 516       
                                                                 
Total params: 23,044
Trainable params: 23,044
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

Evaluate on the training set

In [20]:
# Score the fitted network on the training split.
best_clf = model

start_time = time.time()
# predict() returns one row of class scores per sample; argmax over axis 1
# converts each row to the index of the highest-scoring class.
y_pred_train = best_clf.predict(X_train).argmax(axis=1)
time_taken = time.time() - start_time  # seconds, prediction only

print("Total time taken for the program execution", time_taken)

train_report = classification_report(y_train, y_pred_train, digits=5)
print(train_report)

train_f1_weighted = f1_score(y_train, y_pred_train, average='weighted')
print("F1 weighted: ", train_f1_weighted)

Total time taken for the program execution 0.3873770236968994
              precision    recall  f1-score   support

           0    0.96537   0.99597   0.98043      1987
           1    0.99779   0.98042   0.98903      2298
           2    0.98101   0.98569   0.98335      1258
           3    0.99959   0.98793   0.99373      2486

    accuracy                        0.98742      8029
   macro avg    0.98594   0.98750   0.98663      8029
weighted avg    0.98769   0.98742   0.98746      8029

F1 weighted:  0.9874649341108309


Evaluate on the test set

In [21]:
# Score the fitted network on the held-out test split.
start_time = time.time()
# predict() returns one row of class scores per sample; argmax over axis 1
# converts each row to the index of the highest-scoring class.
y_pred_test = best_clf.predict(X_test).argmax(axis=1)
time_taken = time.time() - start_time  # seconds, prediction only

print("Total time taken for the program execution", time_taken)

test_report = classification_report(y_test, y_pred_test, digits=5)
print(test_report)

test_f1_weighted = f1_score(y_test, y_pred_test, average='weighted')
print("F1 weighted: ", test_f1_weighted)

Total time taken for the program execution 0.15932583808898926
              precision    recall  f1-score   support

           0    0.93968   0.98063   0.95972       413
           1    0.99119   0.97826   0.98468       460
           2    0.97436   0.96377   0.96903       276
           3    0.99018   0.97297   0.98150       518

    accuracy                        0.97481      1667
   macro avg    0.97385   0.97391   0.97373      1667
weighted avg    0.97533   0.97481   0.97492      1667

F1 weighted:  0.9749171608299166


### Compute feature importance (LIME explainer and permutation importance)

In [35]:
# LIME explainer built over the training features.
# class_names must hold one entry per class, ordered the way the classifier
# orders its outputs; passing the raw per-sample label array would supply
# thousands of "names". np.unique returns the distinct labels in ascending
# order, which matches the column order of the one-hot/softmax outputs.
explainer = lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode="classification",
    class_names=[str(label) for label in np.unique(y_train)],
    feature_names=list(X_train.columns),
)

In [49]:
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_dim=221, n_classes=4):
    """Build and compile the feed-forward classifier used by the sklearn wrapper.

    Architecture: Dense(64, relu) -> Dropout(0.1) -> Dense(128, relu)
    -> Dense(n_classes, softmax).

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 221, the value previously
        hard-coded, so calling ``create_model()`` behaves as before.
    n_classes : int, optional
        Number of output classes (width of the softmax layer). Defaults to 4.

    Returns
    -------
    A compiled Keras ``Sequential`` model. It uses
    ``sparse_categorical_crossentropy``, so it is trained on integer labels
    rather than one-hot targets.
    """
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=input_dim))
    model.add(Dropout(.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    return model

# Wrap the Keras network in a scikit-learn compatible estimator so that
# sklearn-style tools can call fit/predict/score on it.
# Integer labels (y_train / y_val) are used here, not the one-hot versions.
wrapper_options = dict(
    epochs=20,
    batch_size=50,
    validation_data=(X_val, y_val),
)
my_model = KerasClassifier(build_fn=create_model, **wrapper_options)
my_model.fit(X_train, y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb9ccd74cd0>

In [54]:
# Permutation importance on the validation split: each feature is shuffled
# and the resulting change in accuracy is recorded.
perm = PermutationImportance(my_model, scoring="accuracy", random_state=1)
perm = perm.fit(X_val, y_val)

# Render the ranked weights, labelled with the validation frame's column names.
val_feature_names = X_val.columns.tolist()
eli5.show_weights(perm, feature_names=val_feature_names)

Weight,Feature
0.1362  ± 0.0115,following_to_followers_ratio
0.0885  ± 0.0042,tweet_weekday_frequency
0.0591  ± 0.0031,tweet_weekend_frequency
0.0570  ± 0.0103,tweet_frequency
0.0126  ± 0.0052,has_desc
0.0096  ± 0.0060,geo_enabled
0.0027  ± 0.0038,screen_name_spec_char_count
0.0020  ± 0.0011,tweets_72
0.0018  ± 0.0037,username_length
0.0018  ± 0.0007,tweets_8
