## ANN + CNN + LSTM
train a neural network that combines categorical/numerical attributes with images and text data

### Data preprocessing

In [26]:
import pandas as pd 
import numpy as np 
import re
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
import cv2
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report

from keras.utils import pad_sequences
from tensorflow.keras.layers import concatenate
from keras.layers import Conv2D, MaxPooling2D, Input, Flatten, Dense, Dropout, BatchNormalization, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential, Model
from keras.metrics import CategoricalAccuracy
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from tensorflow.keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

pd.set_option('display.max_columns', None)

In [27]:
train_df = pd.read_csv('data/twitter_data_train_multiclass.csv')
test_df = pd.read_csv('data/twitter_data_test_multiclass.csv')

In [28]:
scaler = MinMaxScaler() 
train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.fit_transform(train_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])
test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']] = scaler.transform(test_df[['statuses_count', 'favourites_count', 'followers_count', 'friends_count', 'number_of_mentions', 'listed_count', 'number_of_tags']])

In [10]:
faces_base = "../new batch profile pics/"

In [None]:
# get all image pathnames from base
# list to store files
res = []
res2 = []

# Iterate directory
for path in os.listdir(faces_base):
    # check if current path is a file
    if os.path.isfile(os.path.join(faces_base, path)):
        res.append(faces_base + path)
        res2.append(path)

In [None]:
train_img_dict2 = {}
test_img_dict2 = {}
train_img2 = []
test_img2 = []
for i in range(len(res)):
    pic = res[i]
    id_name = re.match(r"[^\/\\]+(?=\.png|\.jpg)", res2[i]).group(0)
    try:
        img = cv2.imread(pic)
        if img is None:
            print("none")
            continue
        img = cv2.resize(img, (75, 75))
        if int(id_name) in list(train_df['id']):
            train_img_dict2[int(id_name)] = img 
            train_img2.append(img)
        elif int(id_name) in list(test_df['id']):
            test_img_dict2[int(id_name)] = img
            test_img2.append(img)
        #img.close()
    except Exception as e:
        print(e)

In [29]:
train_LSTM = train_df[['description_processed', 'account_type_multi']]
test_LSTM = test_df[['description_processed','account_type_multi']]

train_LSTM['description_processed'] = train_LSTM['description_processed'].apply(str)
test_LSTM['description_processed'] = test_LSTM['description_processed'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_LSTM['description_processed'] = train_LSTM['description_processed'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_LSTM['description_processed'] = test_LSTM['description_processed'].apply(str)


In [None]:
train_img_df2 = pd.DataFrame(train_img_dict2.items(), columns = ['id', 'img'])  
test_img_df2 = pd.DataFrame(test_img_dict2.items(), columns = ['id', 'img'])  
train_df_with_img2 = pd.merge(train_img_df2, train_df, on='id')
test_df_with_img2 = pd.merge(test_img_df2, test_df, on='id')

In [30]:
LSTM_X = tokenizer.texts_to_sequences(train_LSTM['description_processed'].values)
LSTM_X_train = pad_sequences(LSTM_X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', LSTM_X_train.shape)

LSTM_x = tokenizer_test.texts_to_sequences(test_LSTM['description_processed'].values)
LSTM_X_test = pad_sequences(LSTM_x, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', LSTM_X_test.shape)

Shape of data tensor: (9446, 250)
Shape of data tensor: (1667, 250)


In [None]:
to_drop = ['Unnamed: 0', 'screen_name', 'url', 'profile_image_url', 'description',
           'id', 'name', 'account_type', 'tweets_list', 'tweets_list_processed',
          'description_processed', 'protected', 'verified', 'account_type_multi', 
          'profile_use_background_image', 'profile_background_tile']

x_train, y_train = train_df.drop(to_drop, axis=1), train_df['account_type_multi']
x_test, y_test = test_df.drop(to_drop, axis=1), test_df['account_type_multi']

### Model training

In [None]:
x_train, y_train = train_df_with_img2.drop(to_drop, axis=1), train_df_with_img2['account_type_multi']
x_test, y_test = test_df_with_img2.drop(to_drop, axis=1), test_df_with_img2['account_type_multi']
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15)

In [None]:
x_train_img, x_train_attr = np.stack(x_train['img']) / 255.0, x_train.drop('img', axis=1)
x_val_img, x_val_attr = np.stack(x_val['img']) / 255.0, x_val.drop('img', axis=1)
x_test_img, x_test_attr = np.stack(x_test['img']) / 255.0, x_test.drop('img', axis=1)

In [31]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 128

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_LSTM['description_processed'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

tokenizer_test = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_test.fit_on_texts(test_LSTM['description_processed'].values)
word_index_test = tokenizer_test.word_index
print('Found %s unique tokens.' % len(word_index_test))

Found 16349 unique tokens.
Found 4366 unique tokens.


In [None]:
def create_ann():
    ann_model = Sequential()
    ann_model.add(Dense(64, activation = 'relu', input_dim = 222))
    ann_model.add(Dropout(.1))
    ann_model.add(Dense(128, activation='relu'))
    return ann_model

In [None]:
def create_cnn():    
    base_model = InceptionV3(input_shape = (75, 75, 3), include_top = False, weights = 'imagenet')

    for layer in base_model.layers:
        layer.trainable = False
    x = Flatten()(base_model.output)
    x = Dense(70, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(50, activation='softmax')(x)
    cnn_model = Model(base_model.input, x)
    return cnn_model

In [32]:
def create_lstm():    
    lstm_model = Sequential()
    lstm_model.add(Embedding(50000, 128, input_length=250))
    lstm_model.add(SpatialDropout1D(0.7))
    lstm_model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
    return lstm_model

In [None]:
ann_model = create_ann()
cnn_model = create_cnn()
combined_input = concatenate([ann_model.output, cnn_model.output])
x = Dense(50, activation="relu")(combined_input)
x = Dense(6, activation="softmax")(x)
combined_model = Model(inputs=[ann_model.input, cnn_model.input], outputs=x)
combined_model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

In [None]:
start_time = time.time()
combined_model.fit(
	x=[x_train_attr, x_train_img], y=y_train,
	validation_data=([x_val_attr, x_val_img], y_val),
	epochs=20, batch_size=50)
time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

In [None]:
score = combined_model.evaluate([x_train_attr, x_train_img], y_train, verbose=0)
print(f'Train loss: {score[0]} / Train accuracy: {score[1]}')
score = combined_model.evaluate([x_test_attr, x_test_img], y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

In [None]:
pred = combined_model.predict([x_test_attr, x_test_img])
pred = np.argmax(pred,axis=1)
print(classification_report(y_test, pred, digits=5))

### LSTM on Account's description Text