## ANN + CNN + LSTM
train a neural network that combines categorical/numerical attributes with images and text data

### Data preprocessing

In [23]:
import pandas as pd 
import numpy as np 
import re
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
import cv2
import time
import shap

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from keras.utils import pad_sequences
from tensorflow.keras.layers import concatenate
from keras.layers import Conv2D, MaxPooling2D, Input, Flatten, Dense, Dropout, BatchNormalization, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential, Model
from keras.metrics import CategoricalAccuracy
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from tensorflow.keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

pd.set_option('display.max_columns', None)

In [2]:
train_df = pd.read_csv('data/twitter_data_train_multiclass.csv')
test_df = pd.read_csv('data/twitter_data_test_multiclass.csv')

In [3]:
scaler = MinMaxScaler() 
train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.fit_transform(train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])
test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.transform(test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])

In [4]:
faces_base = "data/new batch profile pics/"

In [5]:
# get all image pathnames from base
# list to store files
res = []
res2 = []

# Iterate directory
for path in os.listdir(faces_base):
    # check if current path is a file
    if os.path.isfile(os.path.join(faces_base, path)):
        res.append(faces_base + path)
        res2.append(path)

In [6]:
train_img_dict2 = {}
test_img_dict2 = {}
train_img2 = []
test_img2 = []
for i in range(len(res)):
    pic = res[i]
    id_name = re.match(r"[^\/\\]+(?=\.png|\.jpg)", res2[i]).group(0)
    try:
        img = cv2.imread(pic)
        if img is None:
            print("none")
            continue
        img = cv2.resize(img, (75, 75))
        if int(id_name) in list(train_df['id']):
            train_img_dict2[int(id_name)] = img 
            train_img2.append(img)
        elif int(id_name) in list(test_df['id']):
            test_img_dict2[int(id_name)] = img
            test_img2.append(img)
        #img.close()
    except Exception as e:
        print(e)

none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none
none


In [7]:
train_df['description_processed'] = train_df['description_processed'].apply(str)
test_df['description_processed'] = test_df['description_processed'].apply(str)

In [8]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 128

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df['description_processed'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

tokenizer_test = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_test.fit_on_texts(test_df['description_processed'].values)
word_index_test = tokenizer_test.word_index
print('Found %s unique tokens.' % len(word_index_test))

Found 16349 unique tokens.
Found 4366 unique tokens.


In [10]:
train_img_df2 = pd.DataFrame(train_img_dict2.items(), columns = ['id', 'img'])  
test_img_df2 = pd.DataFrame(test_img_dict2.items(), columns = ['id', 'img'])  
train_df_with_img2 = pd.merge(train_img_df2, train_df, on='id')
test_df_with_img2 = pd.merge(test_img_df2, test_df, on='id')

In [26]:
LSTM_X = tokenizer.texts_to_sequences(train_df_with_img2['description_processed'].values)
LSTM_X_train = pad_sequences(LSTM_X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', LSTM_X_train.shape)

LSTM_x = tokenizer_test.texts_to_sequences(test_df_with_img2['description_processed'].values)
LSTM_X_test = pad_sequences(LSTM_x, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', LSTM_X_test.shape)

Shape of data tensor: (9430, 250)
Shape of data tensor: (1662, 250)


### ANN + CNN Model training

In [12]:
to_drop = ['Unnamed: 0', 'screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified', 'account_type_multi', 
          'profile_use_background_image', 'profile_background_tile']

x_train, y_train = train_df_with_img2.drop(to_drop, axis=1), train_df_with_img2['account_type_multi']
x_test, y_test = test_df_with_img2.drop(to_drop, axis=1), test_df_with_img2['account_type_multi']

In [27]:
x_train_img, x_train_attr, x_train_text = np.stack(x_train['img']) / 255.0, x_train.drop('img', axis=1), LSTM_X_train
x_test_img, x_test_attr, x_test_text = np.stack(x_test['img']) / 255.0, x_test.drop('img', axis=1), LSTM_X_test

In [14]:
def create_ann():
    ann_model = Sequential()
    ann_model.add(Dense(64, activation = 'relu', input_dim = 221))
    ann_model.add(Dropout(.1))
    ann_model.add(Dense(128, activation='relu'))
    return ann_model

In [15]:
def create_cnn():    
    base_model = InceptionV3(input_shape = (75, 75, 3), include_top = False, weights = 'imagenet')

    for layer in base_model.layers:
        layer.trainable = False
    x = Flatten()(base_model.output)
    x = Dense(70, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(50, activation='softmax')(x)
    cnn_model = Model(base_model.input, x)
    return cnn_model

In [16]:
ann_model = create_ann()
cnn_model = create_cnn()
combined_input = concatenate([ann_model.output, cnn_model.output])
x = Dense(50, activation="relu")(combined_input)
x = Dense(4, activation="softmax")(x)
combined_model = Model(inputs=[ann_model.input, cnn_model.input], outputs=x)
combined_model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

2022-11-16 11:28:08.571282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
start_time = time.time()
combined_model.fit(
	x=[x_train_attr, x_train_img], y=y_train,
	validation_split=0.15,
	epochs=20, batch_size=50)
time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total time taken for the program execution 559.6284792423248


In [18]:
score = combined_model.evaluate([x_train_attr, x_train_img], y_train, verbose=0)
print(f'Train loss: {score[0]} / Train accuracy: {score[1]}')
score = combined_model.evaluate([x_test_attr, x_test_img], y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Train loss: 0.03053157776594162 / Train accuracy: 0.9915164113044739
Test loss: 0.12443032115697861 / Test accuracy: 0.9741275310516357


In [19]:
pred = combined_model.predict([x_test_attr, x_test_img])
pred = np.argmax(pred,axis=1)
print(classification_report(y_test, pred, digits=5))

              precision    recall  f1-score   support

           0    0.94976   0.96359   0.95663       412
           1    0.97430   0.98913   0.98166       460
           2    0.97059   0.96350   0.96703       274
           3    0.99604   0.97481   0.98531       516

    accuracy                        0.97413      1662
   macro avg    0.97267   0.97276   0.97266      1662
weighted avg    0.97436   0.97413   0.97418      1662



### ANN + CNN + LSTM Model training

In [20]:
def create_lstm():    
    lstm_model = Sequential()
    lstm_model.add(Embedding(50000, 128, input_length=250))
    lstm_model.add(SpatialDropout1D(0.7))
    lstm_model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
    return lstm_model

In [28]:
ann_model = create_ann()
cnn_model = create_cnn()
lstm_model = create_lstm()
combined_input = concatenate([ann_model.output, cnn_model.output, lstm_model.output])
x = Dense(50, activation="relu")(combined_input)
x = Dense(4, activation="softmax")(x)
combined_model = Model(inputs=[ann_model.input, cnn_model.input, lstm_model.input], outputs=x)
combined_model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

In [30]:
start_time = time.time()
combined_model.fit(
	x=[x_train_attr, x_train_img, x_train_text], y=y_train,
	validation_split=0.15,
	epochs=20, batch_size=50)
time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total time taken for the program execution 1779.1880779266357


In [31]:
score = combined_model.evaluate([x_train_attr, x_train_img, x_train_text], y_train, verbose=0)
print(f'Train loss: {score[0]} / Train accuracy: {score[1]}')
score = combined_model.evaluate([x_test_attr, x_test_img, x_test_text], y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Train loss: 0.03258036822080612 / Train accuracy: 0.9908801913261414
Test loss: 0.29367831349372864 / Test accuracy: 0.9368231296539307


In [32]:
pred = combined_model.predict([x_test_attr, x_test_img, x_test_text])
pred = np.argmax(pred,axis=1)
print(classification_report(y_test, pred, digits=5))

              precision    recall  f1-score   support

           0    0.87617   0.91019   0.89286       412
           1    0.96781   0.98043   0.97408       460
           2    0.91575   0.91241   0.91408       274
           3    0.97172   0.93217   0.95153       516

    accuracy                        0.93682      1662
   macro avg    0.93286   0.93380   0.93314      1662
weighted avg    0.93772   0.93682   0.93705      1662

