In [175]:
import pandas as pd, numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [176]:
# Read the pre-split train and test tables from the local data/ directory.
train_df = pd.read_csv("data/twitter_data_train_multiclass.csv")
test_df = pd.read_csv("data/twitter_data_test_multiclass.csv")

# Show (rows, columns) of each split, train first, one tuple per line.
print(train_df.shape, test_df.shape, sep="\n")

(9446, 236)
(1667, 236)


In [177]:
# Columns removed from the feature matrix: the target column plus every other
# column that is not fed to the model.
to_drop = [
    'Unnamed: 0', 'account_type', 'account_type_multi',
    'id', 'name', 'screen_name',
    'profile_image_url', 'protected', 'verified',
    'description', 'description_processed',
    'tweets_list', 'tweets_list_processed',
    'url',
    'profile_use_background_image', 'profile_background_tile',
]

# Columns rescaled to [0, 1] by the MinMaxScaler below.
scaled_columns = [
    'statuses_count',
    'favourites_count',
    'followers_count',
    'friends_count',
    'number_of_mentions',
    'listed_count',
    'number_of_tags',
]

# Fit the scaler on the training split only, then reuse the fitted ranges on
# the test split so no test-set statistics leak into preprocessing.
scaler = MinMaxScaler()
train_df[scaled_columns] = scaler.fit_transform(train_df[scaled_columns])
test_df[scaled_columns] = scaler.transform(test_df[scaled_columns])

# Features are everything not listed in to_drop; the target is the
# multiclass account-type column.
X_train = train_df.drop(columns=to_drop)
y_train = train_df['account_type_multi']
X_test = test_df.drop(columns=to_drop)
y_test = test_df['account_type_multi']

In [178]:
from keras.utils import to_categorical

# One-hot encode the training targets for the softmax / categorical_crossentropy
# model. The encoding is taken from train_df rather than from y_train itself so
# that re-running this cell always yields the same (n_samples, 6) matrix instead
# of re-encoding an already one-hot array.
# num_classes pins the width to the six account classes, so it does not depend
# on which labels happen to be present in the training split.
y_train = to_categorical(train_df['account_type_multi'], num_classes=6)

# y_test is intentionally left as integer class ids: the evaluation cell
# compares it against argmax-decoded predictions.

In [179]:
# Inspect the feature-matrix shape; its column count is the input width the
# network's first layer has to accept.
X_train.shape

(9446, 220)

In [180]:
# Display the one-hot encoded training targets (one row per sample, one column
# per class) as a quick sanity check of the encoding.
y_train

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [181]:
# Derive the layer widths from the data instead of hard-coding them, so the
# network stays consistent if the feature set or the label set changes.
n_features = X_train.shape[1]  # number of feature columns (220 for the current data)
n_classes = y_train.shape[1]   # one-hot width; y_train must be the encoded matrix here

# Fully connected classifier: 100 -> dropout -> 500 -> 500 -> softmax.
model = Sequential()

model.add(Dense(100, activation='relu', input_dim=n_features))
model.add(Dropout(.2))  # drop 20% of the first hidden layer's activations during training
model.add(Dense(500, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))  # one probability per class

model.summary()

# categorical_crossentropy expects one-hot targets, which is what y_train holds.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

# NOTE(review): no random seed is set in the visible cells, so weights and
# metrics will differ between runs - consider seeding for reproducibility.
model.fit(X_train, y_train, epochs=100)


Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_150 (Dense)           (None, 100)               22100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_151 (Dense)           (None, 500)               50500     
                                                                 
 dense_152 (Dense)           (None, 500)               250500    
                                                                 
 dense_153 (Dense)           (None, 6)                 3006      
                                                                 
Total params: 326,106
Trainable params: 326,106
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

<keras.callbacks.History at 0x18537481520>

In [182]:
import time

from sklearn.metrics import f1_score

best_clf = model

# Decode the one-hot targets into a separate array. Overwriting y_train here
# would leave it 1-D, so re-running this cell (argmax over axis=1) or re-running
# model.fit with the same targets would fail.
y_true_train = np.argmax(y_train, axis=1)

# Time only the inference step: prediction plus argmax decoding of the
# per-class probabilities into class ids.
start_time = time.time()

y_pred_train = best_clf.predict(X_train)
y_pred_train = np.argmax(y_pred_train, axis=1)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

# Per-class precision / recall / F1 on the training split, followed by the
# support-weighted F1 as a single summary number.
print(classification_report(y_true_train, y_pred_train, digits=5))
print("F1 weighted: ", f1_score(y_true_train, y_pred_train, average='weighted'))

Total time taken for the program execution 0.7825047969818115
              precision    recall  f1-score   support

           0    0.98649   0.99957   0.99299      2338
           1    0.99963   0.99667   0.99815      2706
           2    1.00000   0.96026   0.97973       302
           3    1.00000   0.99865   0.99932      3706
           4    1.00000   0.98726   0.99359       314
           5    1.00000   0.97500   0.98734        80

    accuracy                        0.99651      9446
   macro avg    0.99769   0.98624   0.99185      9446
weighted avg    0.99655   0.99651   0.99650      9446

F1 weighted:  0.9965014268538805


In [183]:
# Time inference on the held-out split: prediction plus decoding of the
# per-class probabilities into class ids.
start_time = time.time()
test_probabilities = best_clf.predict(X_test)
y_pred_test = test_probabilities.argmax(axis=1)
time_taken = time.time() - start_time

print("Total time taken for the program execution", time_taken) # seconds

# y_test holds integer class ids, so it is directly comparable to the decoded
# predictions.
test_report = classification_report(y_test, y_pred_test, digits=5)
print(test_report)

# ROC-AUC is not computed here: the multiclass form needs per-class scores,
# not the hard labels produced by the argmax above.
test_f1_weighted = f1_score(y_test, y_pred_test, average='weighted')
print("F1 weighted: ", test_f1_weighted)

Total time taken for the program execution 0.11875677108764648
              precision    recall  f1-score   support

           0    0.95962   0.97821   0.96882       413
           1    0.98069   0.99348   0.98704       460
           2    0.98113   0.89655   0.93694        58
           3    0.99545   0.98498   0.99019       666
           4    0.94231   0.92453   0.93333        53
           5    1.00000   0.94118   0.96970        17

    accuracy                        0.98020      1667
   macro avg    0.97653   0.95315   0.96434      1667
weighted avg    0.98036   0.98020   0.98016      1667

F1 weighted:  0.9801578254354367
