In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
from PIL import Image
import os
import pickle
import json
import cv2
import re
import keras

Using TensorFlow backend.


In [2]:
from keras import backend as K
# Ask the TensorFlow backend which GPUs it can see; being the last expression,
# the returned list is what the cell displays.
# NOTE(review): `_get_available_gpus` is underscore-prefixed (private) and is
# reached through `K.tensorflow_backend`, so it presumably only exists on
# Keras builds that ship that backend module - confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
data_dir = 'mmhs150k/'
model_dir = 'models/'
# Use a context manager so the ground-truth file handle is closed as soon as
# the JSON has been parsed instead of lingering until garbage collection.
with open(data_dir + 'MMHS150K_GT.json', 'r') as gt_file:
    tweet_dict = json.load(gt_file)

# # read split id's and return data generator
# def get_data_dict(path):
    
#     # build dictionary mapping id's to labels
#     data = {'id': [], 'label': []}
#     for id in open(data_dir + path, 'r').read().splitlines():

#         # get majority vote label
#         binary_labels = [1 if n > 0 else 0 for n in tweet_dict[id]['labels']]
#         label = 1 if sum(binary_labels)/len(tweet_dict[id]['labels']) > 0.5 else 0

#         # save to data dict
#         data['id'].append(id + '.jpg')
#         data['label'].append(str(label))
        
#     data_df = pd.DataFrame.from_dict(data) # get dataframe to flow from
    
#     datagen = ImageDataGenerator(rescale=1./255,
#                                  samplewise_center=True,
#                                  samplewise_std_normalization=True,
#                                  width_shift_range=0.3,
#                                  height_shift_range=0.3,
#                                  shear_range=10,
#                                  horizontal_flip=True,
#                                  vertical_flip=True)
#     datagen = ImageDataGenerator(rescale=1./255)
#     generator = datagen.flow_from_dataframe(
#         dataframe=data_df,
#         directory=data_dir + 'img_resized',
#         x_col='id',
#         y_col='label',
#         target_size=(299, 299),
#         batch_size=16,
#         class_mode='binary')
    
#     return generator

# train_generator = get_data_dict('splits/train_ids.txt')
# val_generator = get_data_dict('splits/val_ids.txt')
# test_generator = get_data_dict('splits/test_ids.txt')

In [19]:
# custom data generator to handle cropping
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving (image batch, binary label batch) for one split.

    Ids are read from a split file (one id per line). Each id gets a binary
    label derived from ``tweet_dict[id]['labels']``: 1 when strictly more than
    half of the entries are ``> 0``, otherwise 0. Images are loaded from
    ``<img_dir>/<id>.jpg`` and cropped to ``dim``.

    Parameters
    ----------
    splits_path : str
        Path of the text file listing the ids of this split.
    tweet_dict : dict
        Mapping ``id -> {'labels': [...]}`` used to derive the labels.
    batch_size : int
        Number of samples per batch (the last batch may be smaller).
    dim : tuple of int
        ``(height, width)`` of every returned image.
    n_channels : int
        Channels per returned image; 1, 3 and 4 are supported.
    shuffle : bool
        Re-shuffle the sample order at the end of every epoch.
    augment : bool or None
        Apply random crop + random horizontal mirror. ``None`` (default)
        follows ``shuffle``, so generators built with ``shuffle=False`` are
        fully deterministic (centre crop, no mirroring).
    img_dir : str or None
        Directory prefix holding the images. ``None`` (default) falls back to
        the notebook-level ``data_dir + 'img_resized/'``.
    """

    # PIL mode every image is converted to, keyed by requested channel count.
    _PIL_MODES = {1: 'L', 3: 'RGB', 4: 'RGBA'}

    def __init__(self, splits_path, tweet_dict, batch_size=32, dim=(299, 299), n_channels=3, shuffle=True,
                 augment=None, img_dir=None):
        """Read the split ids, compute their labels and set the initial order."""
        self.dim = dim
        self.batch_size = batch_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        # Stored under a different name so it does not shadow the augment() method.
        self.random_augment = shuffle if augment is None else bool(augment)
        self.img_dir = img_dir

        # build labels list and id list (file handle is closed right after reading)
        with open(splits_path, 'r') as split_file:
            self.id_list = split_file.read().splitlines()

        self.labels = dict()
        for tweet_id in self.id_list:
            votes = tweet_dict[tweet_id]['labels']
            positive_votes = sum(1 for n in votes if n > 0)
            # Same rule as positive/len > 0.5, written without a division so an
            # empty vote list yields label 0 instead of ZeroDivisionError.
            self.labels[tweet_id] = 1 if 2 * positive_votes > len(votes) else 0

        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (a partial last batch counts)."""
        # ceil, not floor + 1: the latter adds an empty batch whenever the
        # number of ids is an exact multiple of batch_size.
        return int(np.ceil(len(self.id_list) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        # Translate positions in the (possibly shuffled) order into ids.
        id_list_temp = [self.id_list[k] for k in indexes]

        X, y = self.__data_generation(id_list_temp)

        return X, y

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.id_list))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, id_list_temp):
        """Load the images and labels for the given ids.

        Returns ``X`` of shape ``(n_samples, *dim, n_channels)`` and ``y`` of
        shape ``(n_samples,)``.
        """
        # data_dir is looked up at call time so the notebook-level default
        # keeps working exactly as before when img_dir is not given.
        img_dir = self.img_dir if self.img_dir is not None else data_dir + 'img_resized/'

        # NOTE(review): pixel values are returned unscaled (0..255), as before.
        # Pretrained backbones usually expect a specific input scaling -
        # confirm whether a preprocessing step is needed for the model in use.
        X = np.empty((len(id_list_temp), *self.dim, self.n_channels), dtype='float32')
        y = np.empty(len(id_list_temp), dtype=int)

        for i, ID in enumerate(id_list_temp):
            X[i,] = self.process_img(img_dir + ID + '.jpg')
            y[i] = self.labels[ID]

        return X, y

    def process_img(self, path): # method for getting image
        """Load one image as a uint8 array of shape ``(*dim, n_channels)``."""
        try:
            mode = self._PIL_MODES[self.n_channels]
        except KeyError:
            raise ValueError('unsupported n_channels: %r' % (self.n_channels,))

        # convert() normalises grayscale, palette and alpha images to the
        # requested channel layout; the context manager closes the file.
        with Image.open(path) as img:
            data = np.asarray(img.convert(mode), dtype='uint8')

        if data.ndim == 2: # single-channel images come back without a channel axis
            data = data[:, :, np.newaxis]

        return self.augment(data)

    def get_labels(self): # get list of labels for calculating AUROC
        """Return the labels in ``id_list`` order (the unshuffled order)."""
        return [self.labels[ID] for ID in self.id_list]

    def augment(self, im): # random crop and random mirror
        """Crop ``im`` to ``dim``; randomly when augmentation is enabled.

        With augmentation on, the crop offset is drawn uniformly and the crop
        is mirrored left-right with probability 0.5. With augmentation off, a
        centre crop is returned. Images smaller than ``dim`` are zero-padded
        at the bottom/right first so the result always has the target size.
        """
        target_h, target_w = self.dim

        pad_h = max(target_h - im.shape[0], 0)
        pad_w = max(target_w - im.shape[1], 0)
        if pad_h or pad_w:
            pad_spec = [(0, pad_h), (0, pad_w)] + [(0, 0)] * (im.ndim - 2)
            im = np.pad(im, pad_spec, mode='constant')

        max_top = im.shape[0] - target_h
        max_left = im.shape[1] - target_w
        if self.random_augment:
            # randint's upper bound is exclusive, hence the + 1
            top = np.random.randint(0, max_top + 1)
            left = np.random.randint(0, max_left + 1)
        else:
            top = max_top // 2
            left = max_left // 2

        crop = im[top:top + target_h, left:left + target_w]

        if self.random_augment and np.random.rand() < 0.5:
            crop = crop[:, ::-1] # horizontal mirror

        return crop
        
        
        
        
# One generator per split, built in train -> val -> test order. Only the split
# file and the shuffle flag differ; every other setting is shared.
train_gen, val_gen, test_gen = [
    DataGenerator(
        splits_path=data_dir + split_file,
        tweet_dict=tweet_dict,
        batch_size=32,
        dim=(299, 299),
        n_channels=3,
        shuffle=shuffle_flag,
    )
    for split_file, shuffle_flag in (
        ('splits/train_ids.txt', True),
        ('splits/val_ids.txt', True),
        ('splits/test_ids.txt', False),  # fixed order so predictions line up with get_labels()
    )
]

In [21]:
from keras.applications.inception_v3 import InceptionV3
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers.core import Dense, Flatten

# InceptionV3 convolutional base without its classification head, loaded with
# ImageNet weights (uses the InceptionV3 name imported above).
conv_base = InceptionV3(include_top=False,
                        weights='imagenet',
                        input_shape=(299, 299, 3))

# freeze pretrained layers - all of them; the previous [:-1] slice skipped the
# last layer of the base
for layer in conv_base.layers:
    layer.trainable = False

# Classification head: flatten the feature map, two hidden dense layers, and a
# single sigmoid unit for the binary label.
model = Sequential()
model.add(conv_base)
model.add(Flatten())
# model.add(Dense(2048, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

optimizer = Adam(lr = 0.001)
model.compile(loss="binary_crossentropy",optimizer=optimizer, metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_v3 (Model)         (None, 8, 8, 2048)        21802784  
_________________________________________________________________
flatten_2 (Flatten)          (None, 131072)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              134218752 
_________________________________________________________________
dense_5 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 513       
Total params: 156,546,849
Trainable params: 134,744,065
Non-trainable params: 21,802,784
_________________________________________________________________
None


In [22]:
# train model
# Runs a single epoch over train_gen and evaluates on val_gen at the end of
# the epoch; whatever fit_generator returns is kept in `history`.
history = model.fit_generator(train_gen, 
                    validation_data=val_gen,
                    shuffle=True,
                    epochs=1)


Epoch 1/1


In [21]:
# test
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import math

# Ground-truth labels in id-list order. They line up with the predictions
# below only because test_gen was built with shuffle=False.
y_test = test_gen.get_labels()

# get AUROC
batch_preds = []
for idx in range(len(test_gen)):
    batch_images = test_gen[idx][0]  # keep the images, drop the labels
    batch_preds.append(model.predict(batch_images))
preds = np.concatenate(batch_preds)
print('Test AUROC:', roc_auc_score(y_test, preds))

# get loss and acc
print('Test acc:', model.evaluate(test_gen)[1])

# get F1
# Threshold at 0.5 (strictly greater -> 1), keeping the dtype of the scores.
preds_bin = (preds > 0.5).astype(preds.dtype)
for caption, metric in (('Test F1:', f1_score),
                        ('Test Precision:', precision_score),
                        ('Test Recall:', recall_score)):
    print(caption, metric(y_test, preds_bin, zero_division=1))

Test AUROC: 0.50019996
Test acc: 0.5001000165939331
Test F1: 0.0007995202878273037
Test Precision: 0.6666666666666666
Test Recall: 0.0004


In [None]:
# Make sure the target directory exists first: saving into a missing directory
# fails, which would lose the trained weights after a full training run.
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model.save(model_dir + 'CNN.h5')

In [None]:
# see fraction of positive examples (train, val, test - one line each)
# Uses the DataGenerator instances built above; the previous
# train_generator/val_generator/test_generator names only existed in
# commented-out or later debug code, so this cell failed on a fresh kernel.
for split_gen in (train_gen, val_gen, test_gen):
    split_labels = split_gen.get_labels()
    print(sum(split_labels)/len(split_labels))

In [7]:
from keras.models import load_model
# Rebind `model` to whatever is stored at model_dir + 'CNN.h5' - the same path
# the save cell writes to - replacing any model object currently in memory.
model = load_model(model_dir + 'CNN.h5')

In [64]:
# Number of batches test_gen yields per epoch (DataGenerator.__len__).
print(len(test_gen))

313


In [None]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
data_dir = 'mmhs150k/'
# Context manager closes the ground-truth file once the JSON is parsed.
with open(data_dir + 'MMHS150K_GT.json', 'r') as gt_file:
    tweet_dict = json.load(gt_file)

# read split id's and return data generator
def get_data_dict(path, n_samples=2, batch_size=2):
    """Build a tiny ``flow_from_dataframe`` generator for eyeballing images.

    Debugging helper: takes the first ``n_samples`` ids of a split file,
    displays each source image, and returns a generator over those files.

    Parameters
    ----------
    path : str
        Split file path relative to ``data_dir`` (one id per line).
    n_samples : int or None
        How many ids to take from the top of the file; ``None`` takes all.
    batch_size : int
        Batch size of the returned generator.

    Returns
    -------
    The iterator returned by ``ImageDataGenerator().flow_from_dataframe``.
    """
    # Read the ids inside a context manager so the handle is not leaked.
    with open(data_dir + path, 'r') as split_file:
        sample_ids = split_file.read().splitlines()[:n_samples]

    # build dictionary mapping id's to labels
    data = {'id': [], 'label': []}
    for tweet_id in sample_ids:
        data['id'].append(tweet_id + '.jpg')
        # NOTE(review): these are placeholder labels ("0" for the first
        # sample, "1" for every later one), not labels taken from tweet_dict -
        # presumably so that both classes are present in the tiny sample;
        # confirm. The previously computed majority-vote label was never used.
        data['label'].append("0" if not data['label'] else "1")

        # Show the untouched source image for comparison with generator output.
        with Image.open(data_dir + 'img_resized/' + tweet_id + '.jpg') as im:
            display(im)

    data_df = pd.DataFrame.from_dict(data) # get dataframe to flow from

    datagen = ImageDataGenerator()#rescale=1./255)
    generator = datagen.flow_from_dataframe(
        dataframe=data_df,
        directory=data_dir + 'img_resized',
        x_col='id',
        y_col='label',
        target_size=(299, 299),
        batch_size=batch_size,
        class_mode='binary')

    return generator

train_generator = get_data_dict('splits/train_ids.txt')

# pull a single batch to inspect what the generator actually yields
x, y = next(train_generator)

# test image augmentation
for img in x:
    # Cast to uint8 before handing the array to PIL. Passing the raw batch
    # array with an explicit 'RGB' mode makes PIL reinterpret the underlying
    # buffer and renders noise, so that second display was dropped.
    display(Image.fromarray(np.uint8(img), 'RGB'))