In [2]:
import pandas as pd
import os
import re
import tensorflow as tf
from tensorflow import keras

# Load data files

Assumes you have unzipped the images.zip file into the folder "Data Instagram" which also contains the Excel sheet with the data labels.

## Get data labels

In [3]:
ig_post_data = pd.read_excel("../Data Instagram/Labeled_instagram_posts_related_to_covid.xlsx",
                             usecols="A, N", true_values=[1], false_values=[2, 3, 99])

## Get Image Sets

### Load Instagram image data

Find each corresponding image for each label we just loaded. Because the labels don't correspond to the image file names, first we strip off the leading information for all of the images' file names, then search directly for the remainder. This should be equal to imagename.jpg. You should only have to run rename_files() once. This way the images can be found quickly. The image resizing is done inside of get_labeled_images().

In [None]:
def rename_files():
    # Rename all image files in "Data Instagram", removing the leading integer and underscore
    for filename in os.listdir("../Data Instagram"):
        if filename.endswith(".jpg"):
            new_name = re.sub(r"\d*_", "", filename)
            
            if not os.path.exists("../Data Instagram/" + new_name):
                os.rename("../Data Instagram/" + filename, "../Data Instagram/" + new_name)

rename_files()

In [4]:
def get_labeled_images(image_data):
    images = []
    labels = []
    
    # Get image file name and label
    for post in image_data.itertuples():
        _, imagename, label = post
        
        # Change label to binary
        if label != 1:
            label = 0
        
        # Load and resize image
        file_name = imagename + ".jpg"
        image_file_path = "../Data Instagram/" + file_name
        if os.path.exists(image_file_path):
            picture = tf.io.read_file(image_file_path)
            picture = tf.image.decode_jpeg(picture, channels=3)
            picture = tf.image.resize_with_pad(picture, 480, 480)
            images.append(picture)
            labels.append(label)
    
    return images, labels

In [5]:
ig_images, ig_labels = get_labeled_images(ig_post_data)  # Will store the images we learn with

In [5]:
import imageio

def move_labeled_images(image_data):
    if not os.path.exists("../Data Instagram/Matching/"):
        os.mkdir("../Data Instagram/Matching")
    
    # Get image file name and label
    for post in image_data.itertuples():
        _, imagename, label = post
        
        # Change label to binary
        if label != 1:
            label = 0
        
        # Load and resize image
        file_name = imagename + ".jpg"
        image_file_path = "../Data Instagram/" + file_name
        if os.path.exists(image_file_path):
            picture = imageio.imread(image_file_path)
            out_file_path = "../Data Instagram/Matching/" + file_name
            imageio.imwrite(out_file_path, picture)
            
move_labeled_images(ig_post_data)

In [None]:
import matplotlib.pyplot as plt

sum_l = 0
for label in ig_labels:
    sum_l = sum_l + label
print(sum_l)
print(len(ig_labels))
print(ig_labels[5:5])
test_list = ig_labels[5:5] + ig_labels[:3]
print(test_list)

#print(type(ig_images[0]))
#i = 0
#j = 0
#while i < 9:
#    lbl = ig_labels[j]
#    if lbl == 1:
#        ax = plt.subplot(3, 3, i + 1)
#        plt.imshow(ig_images[j])
#        print(ig_images[j].shape)
#        plt.title(int(lbl))
#        plt.axis("off")
#        i += 1
#    j += 1

### Split data

Split the data into training, validation, and test data sets.

In [None]:
validation_split = int(len(ig_images) / 10) # 10% for validation, 10% for test
train_images = ig_images[:-validation_split]
train_labels = ig_labels[:-validation_split]
pos_cls_wgt = (len(train_labels) - sum(train_labels)) / sum(train_labels)
neg_cls_wgt = (len(train_labels) - sum(train_labels)) / len(train_labels)
cls_wgts_dic = {0: neg_cls_wgt, 1: pos_cls_wgt}

# Split data
training_ds = tf.data.Dataset.from_tensor_slices((train_images[:-validation_split],
                                                 train_labels[:-validation_split]))
validation_ds = tf.data.Dataset.from_tensor_slices((train_images[-validation_split:],
                                                   train_labels[-validation_split:]))
test_ds = tf.data.Dataset.from_tensor_slices((ig_images[-validation_split:],
                                             ig_labels[-validation_split:]))

In [None]:
print(pos_cls_wgt, neg_cls_wgt)
print(training_ds.element_spec)

### Apply ResNet preprocessing

In [None]:
training_ds = training_ds.map(lambda d, l:
                              (tf.keras.applications.resnet_v2.preprocess_input(d), l))
validation_ds = validation_ds.map(lambda d, l:
                                 (tf.keras.applications.resnet_v2.preprocess_input(d), l))
test_ds = test_ds.map(lambda d, l:
                      (tf.keras.applications.resnet_v2.preprocess_input(d), l))

batch_size = 32
training_ds = training_ds.cache().batch(batch_size).prefetch(buffer_size=10)
validation_ds = validation_ds.cache().batch(batch_size).prefetch(buffer_size=10)
test_ds = test_ds.cache().batch(batch_size).prefetch(buffer_size=10)

# Training on Instagram Images (Proof of Concept)

## Instantiate pre-trained ResNet

In [None]:
base = keras.applications.ResNet50V2(include_top=False,
                                     weights="imagenet",
                                     input_shape=(480, 480, 3))
base.trainable = False

## Create classification layer for 'East Asia' classification

In [None]:
inputs = keras.Input(shape=(480, 480, 3))

a = base(inputs, training=False)
a = keras.layers.GlobalAveragePooling2D()(a)

outputs = keras.layers.Dense(1, activation="sigmoid")(a)

classifier = keras.Model(inputs, outputs)
classifier.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[keras.metrics.BinaryAccuracy(),
                       keras.metrics.Recall(),
                       keras.metrics.Precision(),
                       keras.metrics.FalseNegatives(),
                       keras.metrics.FalsePositives()])

classifier.summary()

## Train

In [None]:
classifier.fit(training_ds, epochs=1, class_weight=cls_wgts_dic, validation_data=validation_ds)

In [None]:
classifier.history.history

# 5-Fold Cross-Validation on Instagram Images

## Set up

### Define functions

In [6]:
def build_model():
    # Instantiate pre-trained ResNet
    base = keras.applications.ResNet50V2(include_top=False,
                                     weights="imagenet",
                                     input_shape=(480, 480, 3))
    base.trainable = False
    
    # Create classifier
    inputs = keras.Input(shape=(480, 480, 3))
    a = base(inputs, training=False)
    a = keras.layers.GlobalAveragePooling2D()(a)
    outputs = keras.layers.Dense(1, activation="sigmoid")(a)
    classifier = keras.Model(inputs, outputs)
    
    # Compile model
    classifier.compile(optimizer=keras.optimizers.Adam(),
                       loss=keras.losses.BinaryCrossentropy(from_logits=False),
                       metrics=[keras.metrics.BinaryAccuracy(),
                                keras.metrics.Recall(),
                                keras.metrics.Precision()])
    
    return classifier

In [7]:
def split_data(cv_round):
    vs = int(len(ig_images) / 5) # 20% for validation, test set already withheld
    valid_start = (cv_round) * vs
    valid_end = (cv_round+1) * vs if cv_round != 4 else (len(ig_images)-1)

    # Split data
    training_ds = tf.data.Dataset.from_tensor_slices((ig_images[:valid_start] + ig_images[valid_end:],
                                                      ig_labels[:valid_start] + ig_labels[valid_end:]))
    validation_ds = tf.data.Dataset.from_tensor_slices((ig_images[valid_start:valid_end],
                                                        ig_labels[valid_start:valid_end]))
    
    return training_ds, validation_ds

In [8]:
def preprocess_data(tds, vds):
    # ResNet preprocessing
    training_ds = tds.map(lambda d, l:
                                  (tf.keras.applications.resnet_v2.preprocess_input(d), l))
    validation_ds = vds.map(lambda d, l:
                                      (tf.keras.applications.resnet_v2.preprocess_input(d), l))
    
    # Batch setup
    batch_size = 32
    training_ds = training_ds.cache().batch(batch_size).prefetch(buffer_size=10)
    validation_ds = validation_ds.cache().batch(batch_size).prefetch(buffer_size=10)
    
    return training_ds, validation_ds

### One-time set up

In [10]:
# Withhold test set
#test_split = int(len(ig_images) / 10) # take 10%
#test_ds = tf.data.Dataset.from_tensor_slices((ig_images[-validation_split:],
#                                             ig_labels[-validation_split:]))
#ig_images = ig_images[:-validation_split]
#ig_labels = ig_labels[:-validation_split]

# Prepare test set
#test_ds = test_ds.map(lambda d, l:
#                      (tf.keras.applications.resnet_v2.preprocess_input(d), l))
#test_ds = test_ds.cache().batch(32).prefetch(buffer_size=10)

# Create weight dictionary to offset imbalanced data
pos_cls_wgt = (len(ig_labels) - sum(ig_labels)) / sum(ig_labels)
neg_cls_wgt = (len(ig_labels) - sum(ig_labels)) / len(ig_labels)
cls_wgts_dic = {0: neg_cls_wgt, 1: pos_cls_wgt}

## Cross-Validation Loop

In [11]:
model_results = []
for i in range(0, 5):
    # Create the ith data split
    trainset, validset = split_data(i)
    
    # Prepare data
    trainset, validset = preprocess_data(trainset, validset)
    
    # Get model
    model = build_model()
    
    # Fit model
    result = model.fit(trainset,
                       epochs=5,
                       class_weight=cls_wgts_dic,
                       validation_data=validset)
    model_results.append(result)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Results

In [24]:
print(model_results[0].history['val_precision' + ''][-1])

0.4285714328289032


In [29]:
i = 0
print("Results on validation sets:")
for i in range(5):
    result = model_results[i]
    suffix = ''
    if i != 0:
        suffix = '_' + str(i)
    
    print("Split " + str(i+1))
    print("Accuracy: {0}\nPrecision: {1}\nRecall: {2}".format(
        result.history['val_binary_accuracy'][-1],
        result.history['val_precision' + suffix][-1],
        result.history['val_recall' + suffix][-1]))
    print("--------------------")

Results on validation sets:
Split 1
Accuracy: 0.8243451714515686
Precision: 0.4285714328289032
Recall: 0.8426966071128845
--------------------
Split 2
Accuracy: 0.7996918559074402
Precision: 0.3693181872367859
Recall: 0.773809552192688
--------------------
Split 3
Accuracy: 0.8289676308631897
Precision: 0.44155845046043396
Recall: 0.7311828136444092
--------------------
Split 4
Accuracy: 0.8567026257514954
Precision: 0.4748201370239258
Recall: 0.7674418687820435
--------------------
Split 5
Accuracy: 0.7923076748847961
Precision: 0.46464645862579346
Recall: 0.7603305578231812
--------------------
