In [9]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
from PIL import Image
import os
import pickle
import json
import cv2
import re
import keras

In [10]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [11]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
data_dir = 'mmhs150k/'
model_dir = 'models/'
tweet_dict = json.load(open(data_dir + 'MMHS150K_GT.json', 'r'))

# # read split id's and return data generator
# def get_data_dict(path):
    
#     # build dictionary mapping id's to labels
#     data = {'id': [], 'label': []}
#     for id in open(data_dir + path, 'r').read().splitlines():

#         # get majority vote label
#         binary_labels = [1 if n > 0 else 0 for n in tweet_dict[id]['labels']]
#         label = 1 if sum(binary_labels)/len(tweet_dict[id]['labels']) > 0.5 else 0

#         # save to data dict
#         data['id'].append(id + '.jpg')
#         data['label'].append(str(label))
        
#     data_df = pd.DataFrame.from_dict(data) # get dataframe to flow from
    
#     datagen = ImageDataGenerator(rescale=1./255,
#                                  samplewise_center=True,
#                                  samplewise_std_normalization=True,
#                                  width_shift_range=0.3,
#                                  height_shift_range=0.3,
#                                  shear_range=10,
#                                  horizontal_flip=True,
#                                  vertical_flip=True)
#     datagen = ImageDataGenerator(rescale=1./255)
#     generator = datagen.flow_from_dataframe(
#         dataframe=data_df,
#         directory=data_dir + 'img_resized',
#         x_col='id',
#         y_col='label',
#         target_size=(299, 299),
#         batch_size=16,
#         class_mode='binary')
    
#     return generator

# train_generator = get_data_dict('splits/train_ids.txt')
# val_generator = get_data_dict('splits/val_ids.txt')
# test_generator = get_data_dict('splits/test_ids.txt')

In [55]:
# custom data generator to handle cropping
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, splits_path, tweet_dict, batch_size=32, dim=(299, 299), n_channels=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        
        # build labels list and id list
        self.id_list = open(splits_path, 'r').read().splitlines()
        self.labels = dict()
        for id in self.id_list:
            binary_labels = [1 if n > 0 else 0 for n in tweet_dict[id]['labels']]
            label = 1 if sum(binary_labels)/len(tweet_dict[id]['labels']) > 0.5 else 0
            self.labels[id] = label
            
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.id_list) / self.batch_size)) + 1 # last batch is partial

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
#         indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        idx_min = index*self.batch_size
        idx_max = min(idx_min + self.batch_size, len(self.id_list))
        indexes = self.indexes[idx_min: idx_max]
#         print(len(indexes), self.__len__())
        print(index, indexes)
        
        # Find list of IDs
        id_list_temp = [self.id_list[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(id_list_temp)
        
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.id_list))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, id_list_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(id_list_temp):
            # Store sample
            X[i,] = self.process_img(data_dir + 'img_resized/' + ID + '.jpg')

            # Store class
            y[i] = self.labels[ID]

        return X, y
    
    def process_img(self, path): # method for getting image, augmentation
        img = Image.open(path)
        img.load()
        data = np.asarray(img, dtype='uint8')
        im = data[:299, :299]
        
        if im.shape==(299, 299): im = np.stack((im,)*3, axis=-1) # handle grayscale
        
        return im
    
    def get_labels(self): # get list of labels for calculating AUROC
        return [self.labels[ID] for ID in self.id_list]
        
        
################
train_gen = DataGenerator(splits_path=data_dir + 'splits/train_ids.txt',
                          tweet_dict=tweet_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=True)

val_gen = DataGenerator(splits_path=data_dir + 'splits/val_ids.txt',
                          tweet_dict=tweet_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=True)

test_gen = DataGenerator(splits_path=data_dir + 'splits/test_ids.txt',
                          tweet_dict=tweet_dict,
                          batch_size=32,
                          dim=(299, 299),
                          n_channels=3,
                          shuffle=False)

In [21]:
from keras.applications.inception_v3 import InceptionV3
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers.core import Dense, Flatten

conv_base = keras.applications.inception_v3.InceptionV3(include_top=False, 
                                                        weights='imagenet', 
                                                        input_shape=(299, 299, 3))
for layer in conv_base.layers[:-1]: layer.trainable = False # freeze pretrained layers

model = Sequential()
model.add(conv_base)
model.add(Flatten())
# model.add(Dense(2048, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

optimizer = Adam(lr = 0.001)
model.compile(loss="binary_crossentropy",optimizer=optimizer, metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_v3 (Model)         (None, 8, 8, 2048)        21802784  
_________________________________________________________________
flatten_2 (Flatten)          (None, 131072)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              134218752 
_________________________________________________________________
dense_5 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 513       
Total params: 156,546,849
Trainable params: 134,744,065
Non-trainable params: 21,802,784
_________________________________________________________________
None


In [22]:
# train model
history = model.fit_generator(train_gen, 
                    validation_data=val_gen,
                    shuffle=True,
                    epochs=1)


Epoch 1/1


In [68]:
# test
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import math

y_test = test_gen.get_labels()

# get AUROC
preds = np.concatenate([model.predict(test_gen.__getitem__(idx)[0]) for idx in range(len(test_gen))])
print(preds.shape)
print('Test AUROC:', roc_auc_score(y_test, preds))

# get loss and acc
# print('Test acc:', model.evaluate(test_gen)[1])

# # get F1
# preds[preds>0.5] = 1
# preds[preds<=0.5] = 0
# preds_bin = preds
# print('Test F1:', f1_score(y_test, preds_bin, zero_division=1))
# print('Test Precision:', precision_score(y_test, preds_bin, zero_division=1))
# print('Test Recall:', recall_score(y_test, preds_bin, zero_division=1))

0 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
1 [32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
 56 57 58 59 60 61 62 63]
2 [64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
 88 89 90 91 92 93 94 95]
3 [ 96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119 120 121 122 123 124 125 126 127]
4 [128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149 150 151 152 153 154 155 156 157 158 159]
5 [160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
 178 179 180 181 182 183 184 185 186 187 188 189 190 191]
6 [192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
 210 211 212 213 214 215 216 217 218 219 220 221 222 223]
7 [224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
 242 243 244 245 246 247 248 249 250 251 252 253 254 255]
8 [256 257 258 259 260 261 262 2

56 [1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805
 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819
 1820 1821 1822 1823]
57 [1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837
 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851
 1852 1853 1854 1855]
58 [1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869
 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883
 1884 1885 1886 1887]
59 [1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901
 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915
 1916 1917 1918 1919]
60 [1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933
 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947
 1948 1949 1950 1951]
61 [1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965
 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
 1980 1981 1982 1983

106 [3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405
 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419
 3420 3421 3422 3423]
107 [3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437
 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451
 3452 3453 3454 3455]
108 [3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469
 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483
 3484 3485 3486 3487]
109 [3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501
 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515
 3516 3517 3518 3519]
110 [3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533
 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547
 3548 3549 3550 3551]
111 [3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565
 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579
 3580 3581 358

155 [4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973
 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987
 4988 4989 4990 4991]
156 [4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005
 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019
 5020 5021 5022 5023]
157 [5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037
 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051
 5052 5053 5054 5055]
158 [5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069
 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083
 5084 5085 5086 5087]
159 [5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101
 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115
 5116 5117 5118 5119]
160 [5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133
 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147
 5148 5149 515

204 [6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541
 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555
 6556 6557 6558 6559]
205 [6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573
 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587
 6588 6589 6590 6591]
206 [6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605
 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619
 6620 6621 6622 6623]
207 [6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637
 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651
 6652 6653 6654 6655]
208 [6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669
 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683
 6684 6685 6686 6687]
209 [6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701
 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715
 6716 6717 671

253 [8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109
 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123
 8124 8125 8126 8127]
254 [8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141
 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155
 8156 8157 8158 8159]
255 [8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173
 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187
 8188 8189 8190 8191]
256 [8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205
 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219
 8220 8221 8222 8223]
257 [8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237
 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251
 8252 8253 8254 8255]
258 [8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269
 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283
 8284 8285 828

302 [9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677
 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691
 9692 9693 9694 9695]
303 [9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709
 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723
 9724 9725 9726 9727]
304 [9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741
 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755
 9756 9757 9758 9759]
305 [9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773
 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787
 9788 9789 9790 9791]
306 [9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805
 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819
 9820 9821 9822 9823]
307 [9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837
 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851
 9852 9853 985

ValueError: Found input variables with inconsistent numbers of samples: [10000, 10016]

In [None]:
model.save(model_dir + 'CNN.h5')

In [None]:
# see fraction of positive examples
print(sum(train_generator.classes)/len(train_generator.classes))
print(sum(val_generator.classes)/len(val_generator.classes))
print(sum(test_generator.classes)/len(test_generator.classes))

In [60]:
print(list([n.item() for n in preds]))

[0.0, 0.0, 0.0, 8.940696716308594e-08, 0.0, 3.3676624298095703e-06, 0.0, 0.0, 0.0, 2.562999725341797e-06, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005840063095092773, 0.0011669397354125977, 0.1251588761806488, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.866455078125e-05, 0.0, 0.0, 0.0, 0.0, 4.2766332626342773e-05, 0.0, 0.0006448030471801758, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5497207641601562e-06, 0.0, 8.940696716308594e-08, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7881393432617188e-07, 0.0, 0.0, 0.0, 0.0, 0.0025686323642730713, 0.0, 0.0, 2.1010637283325195e-05, 0.0, 0.0, 0.0, 2.086162567138672e-07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.031614065170288086, 0.0, 0.0, 6.258487701416016e-07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.516674041748047e-06, 0.0, 0.0, 0.016470342874526978, 0.0, 0.0, 0.0, 0.0, 7.140636444091797e-05, 0.0, 0.0, 0.00017443299293518066, 0.0, 0.0

In [64]:
print(len(test_gen))

313


In [None]:
# make image dataloader using flow_from_dataframe
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

# load data to extract labels
data_dir = 'mmhs150k/'
tweet_dict = json.load(open(data_dir + 'MMHS150K_GT.json', 'r'))

# read split id's and return data generator
def get_data_dict(path):
    
    # build dictionary mapping id's to labels
    data = {'id': [], 'label': []}
    for id in open(data_dir + path, 'r').read().splitlines()[:2]: # test for two

        # get majority vote label
        binary_labels = [1 if n > 0 else 0 for n in tweet_dict[id]['labels']]
        label = 1 if sum(binary_labels)/len(tweet_dict[id]['labels']) > 0.5 else 0

        # save to data dict
        data['id'].append(id + '.jpg')
        data['label'].append("0" if not data['label'] else "1")
        
        im = Image.open(data_dir + 'img_resized/' + id + '.jpg')
        display(im)
        
        
    data_df = pd.DataFrame.from_dict(data) # get dataframe to flow from
    
    datagen = ImageDataGenerator()#rescale=1./255)
    generator = datagen.flow_from_dataframe(
        dataframe=data_df,
        directory=data_dir + 'img_resized',
        x_col='id',
        y_col='label',
        target_size=(299, 299),
        batch_size=2,
        class_mode='binary')
    
    return generator

train_generator = get_data_dict('splits/train_ids.txt')

# test image augmentation
x, y = train_generator.next()

# test image augmentation
for img, label in zip(x,y):
    im = Image.fromarray(np.uint8(img), 'RGB')
    display(im)
    im = Image.fromarray(img, 'RGB')
    display(im)