In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from PIL import Image
import os
import pickle
import json
import cv2
import re

In [2]:
tf.test.is_gpu_available()

True

In [3]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
data_dir = '../facebook_challenge_data/'
model_dir = 'models/'

# load data and print sizes
def get_dict(path):
    jsonl_content = open(path, 'r').read()
    data = [json.loads(jline) for jline in jsonl_content.split('\n')]
    return {datum['id'] : datum for datum in data}


train_dict = get_dict(data_dir + 'train.jsonl')
val_dict = get_dict(data_dir + 'dev.jsonl')
test_dict = get_dict(data_dir + 'test.jsonl')

print(len(train_dict))
print(len(val_dict))
print(len(test_dict))

8500
500
1000


In [6]:
# custom data generator to handle cropping
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
from random import randint # for random cropping

class FBDataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, data_dict, batch_size=32, dim=(299, 299), n_channels=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.n_channels = n_channels
        self.data_dict = data_dict
        self.shuffle = shuffle
        
        # build labels list and id list
        self.id_list = list(self.data_dict.keys())
        self.labels = {ID: self.data_dict[ID]['label'] for ID in self.id_list}
        self.img_list = {ID: self.data_dict[ID]['img'] for ID in self.id_list}
            
        self.on_epoch_end()
        self.classes = [self.labels[self.id_list[i]] for i in self.indexes]

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.id_list) / self.batch_size)) + 1 # last batch is partial

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:index*self.batch_size + self.batch_size]
        
        
        # Find list of IDs
        id_list_temp = [self.id_list[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(id_list_temp)
        
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.id_list))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, id_list_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((len(id_list_temp), *self.dim, self.n_channels))
        y = np.empty(len(id_list_temp), dtype=int)

        # Generate data
        for i, ID in enumerate(id_list_temp):
            # Store sample
            X[i,] = self.process_img(data_dir + self.img_list[ID])

            # Store class
            y[i] = self.labels[ID]

        return X, y
    
    def process_img(self, path): # method for getting image
        img = Image.open(path)
        img.load()
        img = img.resize(self.dim, Image.ANTIALIAS)
        data = np.asarray(img, dtype='uint8')
        im = self.augment(data)
        
        
        if im.shape==(self.dim[0], self.dim[1]): im = np.stack((im,)*3, axis=-1) # handle grayscale
        if im.shape == (*self.dim, 4): im = im[:,:,:3] # handle weird case
        
        return im
    
    def augment(self, im): # random crop and random mirror
        
        # random crop
        x_max, y_max = im.shape[0], im.shape[1]
        x_start, y_start = randint(0, x_max - self.dim[0]), randint(0, y_max - self.dim[1])
        im = im[x_start:x_start + self.dim[0], y_start:y_start + self.dim[1]]
        
        # random mirror
        if randint(0,1): im = np.flip(im, axis=1)
        
        return im

In [7]:
# create data generators
train_gen = FBDataGenerator(data_dict=train_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=True)

val_gen = FBDataGenerator(data_dict=val_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=True)

In [8]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten

conv_base = tf.keras.applications.inception_v3.InceptionV3(include_top=False, 
                                                        weights='imagenet', 
                                                        input_shape=(299, 299, 3))
# for layer in conv_base.layers[:-1]: layer.trainable = False # freeze pretrained layers

model = Sequential()
model.add(conv_base)
model.add(Flatten())
# model.add(Dense(2048, activation='relu'))
# model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

optimizer = Adam(lr = 1e-6)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_v3 (Model)         (None, 8, 8, 2048)        21802784  
_________________________________________________________________
flatten (Flatten)            (None, 131072)            0         
_________________________________________________________________
dense (Dense)                (None, 512)               67109376  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
Total params: 88,912,673
Trainable params: 88,878,241
Non-trainable params: 34,432
_________________________________________________________________
None


In [9]:
# train model

# from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint

# model = load_model(model_dir + 'cnn_weighted.h5')

mcp_save = ModelCheckpoint(model_dir + 'best_fb_inc_cnn.h5', save_best_only=True, monitor='val_loss', mode='min')

history = model.fit_generator(train_gen,
                    validation_data=val_gen,
                    shuffle=True,
                    epochs=25,
                    callbacks=[mcp_save])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [10]:
val_gen = FBDataGenerator(data_dict=val_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=True)

In [11]:
# test
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import math

y_val = val_gen.classes

# get AUROC
preds = model.predict_generator(val_gen)
print('Test AUROC:', roc_auc_score(y_val, preds))

# get loss and acc
preds_bin = np.array(preds)
preds_bin[preds>0.5] = 1
preds_bin[preds<=0.5] = 0
print('Test Accuracy:', accuracy_score(y_val, preds_bin))

# get F1
print('Test F1:', f1_score(y_val, preds_bin, zero_division=1))
print('Test Precision:', precision_score(y_val, preds_bin, zero_division=1))
print('Test Recall:', recall_score(y_val, preds_bin, zero_division=1))

Test AUROC: 0.4501919999999999
Test Accuracy: 0.484
Test F1: 0.19375
Test Precision: 0.44285714285714284
Test Recall: 0.124


In [None]:
# model.save(model_dir + 'cnn_weighted.h5')

In [None]:
# get loss and acc with lower threshold
preds_bin = np.array(preds)
preds_bin[preds>0.4] = 1
preds_bin[preds<=0.4] = 0
print('Test Accuracy:', accuracy_score(y_test, preds_bin))

# get F1
print('Test F1:', f1_score(y_test, preds_bin, zero_division=1))
print('Test Precision:', precision_score(y_test, preds_bin, zero_division=1))
print('Test Recall:', recall_score(y_test, preds_bin, zero_division=1))

In [None]:
from tensorflow.keras.models import load_model

model = load_model(model_dir + 'cnn_weighted.h5')

In [None]:
preds = model.predict_generator(test_gen)
y_test = np.array(test_gen.get_labels())

pos_preds = preds[y_test == 1]
neg_preds = preds[y_test == 0]

In [None]:
plt.title('Distribution of Prediction Values (Image Only) (Full CNN with 1:1/0.28 weighting)\n 8 epochs 1e-6 lr')
plt.hist(pos_preds, bins=100, alpha=0.5, label='positive')
plt.hist(neg_preds, bins=100, alpha=0.5, label= 'negative')
plt.legend()

In [12]:
for im in os.listdir('../facebook_challenge_data/img/'):
    img = Image.open('../facebook_challenge_data/img/' + im)
    print(img.size)

(240, 400)
(550, 366)
(550, 366)
(800, 800)
(550, 398)
(550, 404)
(550, 366)
(313, 399)
(825, 549)
(297, 400)
(266, 400)
(550, 366)
(550, 412)
(825, 549)
(800, 533)
(550, 366)
(825, 549)
(267, 400)
(550, 367)
(825, 543)
(825, 551)
(825, 538)
(672, 800)
(550, 366)
(825, 549)
(800, 533)
(282, 400)
(305, 400)
(550, 366)
(550, 350)
(531, 800)
(538, 800)
(550, 366)
(550, 366)
(240, 400)
(825, 549)
(825, 549)
(550, 374)
(800, 533)
(543, 800)
(565, 800)
(310, 400)
(800, 532)
(800, 533)
(393, 800)
(825, 575)
(550, 412)
(533, 800)
(550, 366)
(533, 800)
(550, 366)
(800, 533)
(550, 378)
(573, 800)
(550, 366)
(266, 400)
(825, 549)
(550, 365)
(800, 533)
(277, 399)
(266, 400)
(550, 366)
(550, 366)
(550, 366)
(550, 422)
(800, 533)
(294, 400)
(550, 366)
(550, 412)
(800, 523)
(550, 410)
(825, 549)
(696, 800)
(825, 631)
(724, 800)
(533, 800)
(825, 634)
(550, 366)
(550, 366)
(417, 800)
(800, 533)
(550, 366)
(480, 800)
(550, 435)
(300, 400)
(550, 365)
(225, 400)
(533, 800)
(533, 800)
(750, 800)
(800, 531)

(800, 800)
(800, 533)
(550, 366)
(323, 400)
(312, 800)
(825, 549)
(550, 366)
(550, 366)
(294, 400)
(268, 400)
(696, 800)
(825, 549)
(550, 366)
(300, 400)
(295, 399)
(245, 400)
(825, 549)
(550, 412)
(291, 400)
(825, 549)
(550, 373)
(550, 366)
(702, 800)
(800, 515)
(800, 533)
(550, 366)
(550, 365)
(550, 365)
(825, 549)
(800, 533)
(800, 533)
(825, 576)
(286, 400)
(600, 800)
(285, 400)
(800, 800)
(550, 366)
(800, 533)
(533, 800)
(825, 549)
(825, 546)
(550, 366)
(825, 539)
(825, 549)
(550, 388)
(266, 400)
(825, 407)
(550, 366)
(825, 549)
(825, 549)
(533, 800)
(550, 480)
(825, 547)
(550, 366)
(550, 366)
(825, 549)
(533, 800)
(321, 400)
(267, 400)
(825, 502)
(637, 800)
(268, 400)
(550, 350)
(277, 399)
(642, 800)
(550, 366)
(559, 800)
(267, 400)
(825, 636)
(825, 549)
(825, 549)
(550, 366)
(800, 533)
(550, 366)
(550, 366)
(400, 400)
(266, 400)
(550, 342)
(550, 516)
(550, 365)
(594, 800)
(550, 358)
(825, 549)
(531, 800)
(599, 800)
(550, 485)
(800, 800)
(536, 800)
(825, 539)
(550, 366)
(599, 800)

(331, 400)
(550, 355)
(825, 549)
(561, 800)
(825, 549)
(825, 549)
(188, 400)
(550, 358)
(550, 366)
(623, 800)
(550, 397)
(800, 533)
(534, 800)
(825, 544)
(266, 400)
(800, 800)
(615, 800)
(550, 366)
(616, 800)
(400, 400)
(550, 340)
(641, 800)
(532, 800)
(550, 366)
(302, 400)
(825, 549)
(326, 800)
(550, 366)
(266, 400)
(591, 799)
(533, 800)
(550, 412)
(296, 400)
(825, 621)
(589, 800)
(800, 525)
(347, 400)
(550, 524)
(825, 546)
(269, 400)
(550, 366)
(280, 400)
(266, 400)
(825, 549)
(666, 800)
(247, 400)
(550, 366)
(533, 800)
(240, 400)
(575, 800)
(550, 384)
(825, 549)
(800, 800)
(825, 547)
(519, 800)
(825, 549)
(800, 533)
(395, 400)
(599, 799)
(550, 366)
(825, 573)
(550, 449)
(550, 466)
(467, 800)
(550, 366)
(825, 549)
(298, 399)
(550, 382)
(550, 403)
(825, 549)
(550, 366)
(160, 400)
(550, 366)
(611, 799)
(550, 366)
(550, 366)
(300, 400)
(589, 800)
(550, 366)
(554, 800)
(200, 400)
(550, 359)
(267, 400)
(240, 400)
(550, 366)
(400, 400)
(550, 412)
(550, 366)
(600, 800)
(800, 800)
(550, 384)

(550, 366)
(550, 426)
(550, 373)
(200, 400)
(299, 400)
(825, 547)
(547, 800)
(550, 388)
(661, 800)
(680, 800)
(825, 618)
(390, 400)
(800, 800)
(266, 400)
(550, 388)
(267, 400)
(550, 309)
(825, 549)
(712, 800)
(550, 366)
(825, 515)
(307, 400)
(800, 533)
(550, 365)
(800, 533)
(630, 800)
(550, 366)
(550, 317)
(259, 400)
(572, 800)
(800, 533)
(550, 366)
(299, 400)
(550, 359)
(533, 800)
(800, 537)
(550, 417)
(550, 366)
(550, 366)
(825, 549)
(400, 400)
(599, 800)
(800, 533)
(550, 341)
(550, 366)
(266, 400)
(550, 366)
(533, 800)
(409, 800)
(153, 400)
(550, 366)
(300, 400)
(550, 366)
(283, 400)
(400, 400)
(800, 533)
(800, 546)
(825, 549)
(825, 549)
(533, 800)
(312, 800)
(550, 366)
(550, 366)
(550, 366)
(533, 800)
(825, 549)
(800, 594)
(550, 365)
(800, 628)
(550, 366)
(550, 383)
(550, 366)
(800, 533)
(417, 800)
(825, 549)
(825, 549)
(800, 533)
(550, 365)
(339, 400)
(825, 592)
(800, 533)
(541, 800)
(800, 553)
(800, 533)
(267, 400)
(825, 549)
(550, 439)
(825, 549)
(304, 400)
(550, 368)
(550, 366)

KeyboardInterrupt: 

In [None]:
a = np.zeros((299,299,4))
print(a.shape)
a = a[:,:,:3]
print(a.shape)