In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from PIL import Image
import os
import pickle
import json
import re
import gc
import transformers

In [2]:
# Sanity check that TensorFlow can see a GPU before building/training the model.
# NOTE(review): the TensorFlow docs mark `tf.test.is_gpu_available` as deprecated in favour of
# `tf.config.list_physical_devices('GPU')` -- consider switching once the installed TF version is confirmed.
tf.test.is_gpu_available()

True

## Loading Data

In [3]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
# data_dir: root of the challenge data; the *.jsonl split files are read from here and the
#           generator prefixes each record's 'img' path with it.
# model_dir: directory prefix used for the saved model weights.
data_dir = '../facebook_challenge_data/'
model_dir = 'models/'

# load data and print sizes
def get_dict(path):
    """Load a JSON-lines file into a dict keyed by each record's ``'id'`` field.

    Args:
        path: Path to a ``.jsonl`` file with one JSON object per line. Each
            object must contain an ``'id'`` key.

    Returns:
        dict mapping ``record['id']`` to the full decoded record.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``'id'`` key.
    """
    # `with` guarantees the handle is closed; blank lines (e.g. the one produced
    # by a trailing newline) are skipped instead of crashing json.loads('').
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        records = [json.loads(line) for line in jsonl_file if line.strip()]
    return {record['id']: record for record in records}


# Read the three challenge splits (train / dev / test) into id-keyed dicts.
train_dict, val_dict, test_dict = (
    get_dict(data_dir + split_file)
    for split_file in ('train.jsonl', 'dev.jsonl', 'test.jsonl')
)

# Report how many examples each split holds, one count per line.
print(len(train_dict), len(val_dict), len(test_dict), sep='\n')

8500
500
1000


## Load Model

In [4]:
# # load pretrained roberta
# from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, TFRobertaModel

# tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# class ROBERTA(transformers.TFRobertaModel):

#     def __init__(self, config, *inputs, **kwargs):
#         super(ROBERTA, self).__init__(config, *inputs, **kwargs)
#         self.roberta.call = tf.function(self.roberta.call)
# #         self.roberta.trainable = False # freeze pretrained roberta

In [5]:
# # load pretrained resnet
# image_encoder = tf.keras.applications.ResNet152(include_top=False, 
#                                weights='imagenet', 
#                                input_shape=(299, 299, 3)
# )

# # load inception bc of memory constraints
# # image_encoder = tf.keras.applications.InceptionV3(include_top=False, 
# #                                weights='imagenet', 
# #                                input_shape=(299, 299, 3)
# # )

# for layer in image_encoder.layers[:-1]: layer.trainable = False # freeze pretrained layers

In [6]:
# # https://keras.io/examples/nlp/text_extraction_with_bert/
# # https://github.com/huggingface/transformers/issues/1350

# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input
# from tensorflow.keras import layers

# max_len = 50

# # handle text
# input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
# roberta = ROBERTA.from_pretrained('roberta-base')
# roberta_encodings = roberta([input_ids])[0]
# text_embed = tf.squeeze(roberta_encodings[:, 0:1, :], axis=1) # Keep [CLS] token encoding
# text_embed = layers.Dropout(0.1)(text_embed)
# # text_embed = layers.Dense(512)(text_embed)


# # handle image
# # input_img = Input((224, 224, 3))
# # img_embed = image_encoder(input_img)
# # img_embed = layers.GlobalAveragePooling2D()(img_embed)
# # img_embed = layers.Dense(1)(img_embed)

# # concat
# # pred = layers.Concatenate()([text_embed, img_embed])
# pred = text_embed
# # pred = layers.Dense(512, activation='relu')(pred)
# # pred = layers.Dense(256, activation='relu')(pred)
# pred = layers.Dense(1, activation='sigmoid')(pred)

# model = Model(
#     inputs=[input_ids],# , input_img],
#     outputs=pred,
# )

# optimizer = tf.keras.optimizers.Adam(lr=5e-5)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()

In [9]:
# load pretrained roberta
import transformers
from tensorflow.keras import layers
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, TFRobertaModel

# Tokenizer for the same 'roberta-base' checkpoint that the model below is loaded from.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# Number of token ids per example; this fixes the width of the model's Input layer.
MAX_SEQ_LEN = 50

class ROBERTA(transformers.TFRobertaModel):
    """``TFRobertaModel`` whose inner ``roberta`` layer has its ``call`` wrapped in ``tf.function``."""

    def __init__(self, config, *inputs, **kwargs):
        """Build the parent model, then swap the main layer's ``call`` for a ``tf.function`` version."""
        super().__init__(config, *inputs, **kwargs)
        main_layer = self.roberta
        main_layer.call = tf.function(main_layer.call)

# Text-only classifier: token ids -> RoBERTa -> first-position encoding -> dropout -> sigmoid.
input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
roberta = ROBERTA.from_pretrained('roberta-base')
roberta_encodings = roberta([input_ids])[0]  # first output: per-token encodings
# Keep only the encoding at position 0 (RoBERTa's `<s>` start token, the analogue of BERT's [CLS]).
doc_encoding = tf.squeeze(roberta_encodings[:, 0:1, :], axis=1)
doc_encoding = layers.Dropout(0.1)(doc_encoding) # Apply dropout
outputs = layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)
model = tf.keras.models.Model(inputs=[input_ids], outputs=[outputs])

# compile
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras releases reject.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
roberta (ROBERTA)            ((None, 50, 768), (None,  124645632 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1, 768)]          0         
_________________________________________________________________
tf_op_layer_Squeeze (TensorF [(None, 768)]             0         
_________________________________________________________________
dropout_38 (Dropout)         (None, 768)               0         
_________________________________________________________________
outputs (Dense)              (None, 1)                 769       
Total params: 124,646,401
Trainable params: 124,646,401
Non-trainable params: 0
_______________________________________________

## Create Data Generator

In [10]:
from random import randint # for random cropping
from tensorflow.keras.preprocessing.sequence import pad_sequences

class FBMMDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of tokenised text and labels.

    Each record in ``data_dict`` must provide ``'label'``, ``'img'`` and
    ``'text'`` keys. Text is tokenised once at construction time; images are
    decoded, rescaled, randomly cropped and randomly mirrored on every batch.

    NOTE(review): ``__getitem__`` currently returns only ``(X_txt, y)``; the
    image batch is still built but discarded (see the commented-out
    ``(X_txt, X_img), y`` return). Image paths are resolved against the
    module-level ``data_dir``.

    Args:
        data_dict: mapping of example id -> record dict.
        tokenizer: object exposing ``encode(text)`` (and, optionally,
            ``pad_token_id``) used to turn text into token ids.
        pad_len: fixed length every token sequence is padded/truncated to.
        batch_size: number of examples per batch.
        dim: (height, width) of the cropped image.
        n_channels: number of image channels in the batch array (images are
            always converted to 3-channel RGB, so this should be 3).
        shuffle: whether to reshuffle example order at the end of each epoch.
    """
    def __init__(self, data_dict, tokenizer, pad_len, batch_size=32, dim=(299, 299), n_channels=3, shuffle=True):
        'Initialization'
        super().__init__()
        self.dim = dim
        self.data_dict = data_dict
        self.batch_size = batch_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.pad_len = pad_len
        self.tokenizer = tokenizer

        # build labels list and id list
        self.id_list = list(self.data_dict.keys())
        self.labels = {ID: self.data_dict[ID]['label'] for ID in self.id_list}
        self.img_list = {ID: self.data_dict[ID]['img'] for ID in self.id_list}

        # get text dictionary (id -> padded token id sequence)
        self.text_dict = self.process_text(self.id_list)

        # initialise (and optionally shuffle) the example order
        self.on_epoch_end()

    @property
    def classes(self):
        """Labels in the *current* iteration order.

        Computed on access so that it follows every reshuffle done by
        ``on_epoch_end`` instead of being frozen at construction time.
        """
        return [self.labels[self.id_list[i]] for i in self.indexes]

    def __len__(self):
        'Denotes the number of batches per epoch'
        # ceil keeps the final partial batch without adding an empty extra
        # batch when the dataset size is an exact multiple of batch_size.
        return int(np.ceil(len(self.id_list) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data: returns (token id matrix, labels)'
        # Generate indexes of the batch
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        # Find list of IDs
        id_list_temp = [self.id_list[k] for k in indexes]

        # Generate data
        X_txt, X_img, y = self.__data_generation(id_list_temp)

        return X_txt, y #(X_txt, X_img), y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.id_list))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, id_list_temp):
        'Generates data containing batch_size samples' # X_img : (n_samples, *dim, n_channels)
        # Initialization
        n_samples = len(id_list_temp)
        X_img = np.empty((n_samples, *self.dim, self.n_channels))
        X_txt = np.empty((n_samples, self.pad_len), dtype=int)
        y = np.empty(n_samples, dtype=int)

        # Generate data
        for i, ID in enumerate(id_list_temp):
            # Store sample (image path is relative to the module-level data_dir)
            X_img[i,] = self.process_img(data_dir + self.img_list[ID])
            X_txt[i,] = self.text_dict[ID]

            # Store class
            y[i] = self.labels[ID]

        return X_txt, X_img, y

    def process_img(self, path): # method for getting image
        """Load an image, rescale it, and return a randomly cropped/mirrored RGB array.

        The shorter side is scaled to ``1.5 * dim[0]`` (aspect ratio kept) so
        that the random crop in ``augment`` has room to move.
        """
        img = Image.open(path)
        img.load()
        scale_size = int(1.5 * self.dim[0]) # want cropping
        width, height = img.size  # PIL reports (width, height)
        # LANCZOS is the filter ANTIALIAS was an alias of; ANTIALIAS was removed in Pillow 10.
        if width < height: # portrait: width is the shorter side
            wpercent = (scale_size/float(width))
            hsize = int((float(height)*float(wpercent)))
            img = img.resize((scale_size, hsize), Image.LANCZOS)
        else: # landscape or square: height is the shorter side
            hpercent = (scale_size/float(height))
            wsize = int((float(width)*float(hpercent)))
            img = img.resize((wsize, scale_size), Image.LANCZOS)

        # Normalise every mode (grayscale, RGBA, palette, ...) to 3-channel RGB.
        # Done after resizing so grayscale/RGBA inputs give the same pixels as
        # the previous stack / drop-alpha handling.
        img = img.convert('RGB')

        data = np.asarray(img, dtype='uint8')
        im = self.augment(data) # apply transformation

        return im

    def augment(self, im): # random crop and random mirror
        """Return a random ``dim``-sized crop of ``im``, mirrored left-right half of the time."""
        # random crop (array axes are rows, columns)
        x_max, y_max = im.shape[0], im.shape[1]
        x_start, y_start = randint(0, x_max - self.dim[0]), randint(0, y_max - self.dim[1])
        im = im[x_start:x_start + self.dim[0], y_start:y_start + self.dim[1]]

        # random mirror
        if randint(0,1): im = np.flip(im, axis=1)

        return im

    def process_text(self, id_list):
        """Tokenise the text of every id and return ``{id: token id array of length pad_len}``.

        Sequences are padded and truncated on the right so that position 0
        always holds the first token produced by the tokenizer; the model
        pools the encoding at that position. Padding uses the tokenizer's pad
        id when it exposes one (falling back to 0).

        NOTE(review): weights trained with the previous left-padded layout
        were fitted to a different input layout -- retrain rather than reuse.
        """
        # matrix for texts
        texts = [self.data_dict[ID]['text'] for ID in id_list]
        sequences = [self.tokenizer.encode(text) for text in texts] # make this more efficient...

        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        text_seqs = pad_sequences(sequences,
                                  maxlen=self.pad_len,
                                  padding='post',
                                  truncating='post',
                                  value=pad_id)

        id_to_seq = dict(zip(id_list, text_seqs)) # map id to text seq

        return id_to_seq

In [11]:
# # test tokenizer

# print(type(tokenizer))
# print(tokenizer.encode('hi my name is bob and i like to go swimming'))
# print(tokenizer.encode(['hi my name is bob and i like to go swimming', 'hello']))

In [13]:
# create data generators
max_len = MAX_SEQ_LEN  # padded text length must match the width of the model's Input layer
train_gen = FBMMDataGenerator(data_dict=train_dict,
                          tokenizer=tokenizer,
                          pad_len=max_len,
                          batch_size=16,
                          dim=(224, 224),
                          n_channels=3,
                          shuffle=True)

# Validation data is not shuffled, so `val_gen.classes` stays in the same order
# as the rows produced when predicting on `val_gen`.
val_gen = FBMMDataGenerator(data_dict=val_dict,
                          tokenizer=tokenizer,
                          pad_len=max_len,
                          batch_size=16,
                          dim=(224, 224),
                          n_channels=3,
                          shuffle=False)

In [None]:
# train model
from tensorflow.keras.callbacks import ModelCheckpoint

# Make sure the checkpoint directory exists before the first save.
os.makedirs(model_dir, exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
mcp_save = ModelCheckpoint(model_dir + 'best_roberta_fusion.h5', 
                           save_weights_only=True, 
                           save_best_only=True, 
                           verbose=1,
                           monitor='val_loss', 
                           mode='min')

# `Model.fit` accepts keras Sequence objects directly; `fit_generator` is deprecated.
history = model.fit(train_gen,
                    validation_data=val_gen,
                    shuffle=True,
                    epochs=5,
                    callbacks=[mcp_save])

Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.73164, saving model to models/best_roberta_fusion.h5
Epoch 2/5
113/532 [=====>........................] - ETA: 4:30 - loss: 0.6545 - accuracy: 0.6460

In [None]:
# Restore the checkpoint written by the ModelCheckpoint callback during training.
best_weights_path = model_dir + 'best_roberta_fusion.h5'
model.load_weights(best_weights_path)

In [None]:
# test
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import math

# Collect labels and predictions batch by batch from the same generator pass,
# so row i of `preds` always belongs to label i of `y_val` -- regardless of
# whether (or when) the generator reshuffles its example order.
label_batches, pred_batches = [], []
for batch_idx in range(len(val_gen)):
    X_batch, y_batch = val_gen[batch_idx]
    if len(y_batch) == 0:  # skip an empty trailing batch, if the generator yields one
        continue
    label_batches.append(y_batch)
    pred_batches.append(np.asarray(model.predict_on_batch(X_batch)))

y_val = np.concatenate(label_batches)
preds = np.concatenate(pred_batches)  # one sigmoid score per example, shape (n_samples, 1)

# get AUROC
print('Val AUROC:', roc_auc_score(y_val, preds.ravel()))

# get accuracy at a 0.5 decision threshold
preds_bin = (preds > 0.5).astype(preds.dtype)
print('Val Accuracy:', accuracy_score(y_val, preds_bin.ravel()))

In [None]:
# Peek at the first training batch and turn its token ids back into readable text.
text, y = train_gen[0]

print(list(map(tokenizer.decode, text)))

In [None]:
# Dump the raw validation predictions computed in the evaluation cell above.
# NOTE(review): this prints the whole array; a slice such as preds[:10] would keep the output short.
print(preds)