In [33]:
import pandas as pd
import os
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Modify Data Files

Creates a new file "images.tfrecords" which contains all of the images that had labels in the label sheet. This will only have to be done if you haven't done it already. First unzip "images.zip" into the "../Data Instagram" directory and copy "Labeled_instagram_posts_related_to_covid.xlsx" into the same location. Then run rename_files() to prepare the files for the next step. Run get_labeled_images_raw() to get the matching images and their associated labels. Finally run the cells under "Create TFRecord File" to create the "images.tfrecords" file used by the rest of the notebook. If the "images.tfrecords" file already exists, skip all of this and go to the "5-Fold Cross-Validation" section.

## Get Data Labels

In [11]:
ig_post_data = pd.read_excel("../Data Instagram/Labeled_instagram_posts_related_to_covid.xlsx",
                             usecols="A, N")

## Get Instagram Image Set

Find the corresponding images for the labels we just loaded. Because the file names have extra, unhelpful information at the front, first we remove that information. This allows us to search directly for the image as imagename.jpg. Run rename_files() once if you have not changed the file names already. The images are resized to fit the model inside of get_labeled_images().

In [None]:
def rename_files():
    # Rename all image files in "Data Instagram", removing the leading integer and underscore
    for filename in os.listdir("../Data Instagram"):
        if filename.endswith(".jpg"):
            new_name = re.sub(r"\d*_", "", filename)
            
            if not os.path.exists("../Data Instagram/" + new_name):
                os.rename("../Data Instagram/" + filename, "../Data Instagram/" + new_name)

rename_files()

In [16]:
def get_labeled_images_raw(image_data):
    images = []
    labels = []
    
    # Get image file name and label
    for post in image_data.itertuples():
        _, imagename, label = post
        
        # Change label to binary
        if label != 1:
            label = 0
        
        # Load and resize image
        file_name = imagename + ".jpg"
        image_file_path = "../Data Instagram/Matching/" + file_name
        if os.path.exists(image_file_path):
            picture = tf.io.read_file(image_file_path)
            #picture = tf.image.decode_jpeg(picture, channels=3)
            #picture = tf.image.resize_with_pad(picture, 480, 480)
            images.append(picture)
            labels.append(label)
    
    return images, labels

In [17]:
ig_images_raw, ig_labels = get_labeled_images_raw(ig_post_data)  # Will store the images we learn with

In [18]:
print(type(ig_images_raw[0]))
print(type(ig_labels[0]))

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'int'>


## Create TFRecord File

Will only need the file "images.tfrecords" from now on.

In [23]:
def create_example(raw_image, label):
    # Create Feature objects from image and label data
    lbl = tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
    if isinstance(raw_image, type(tf.constant(0))):
        raw_image = raw_image.numpy()  # Change from EagerTensor
    image_string = tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_image]))
    
    # Make dictionary for example
    features = {"label": lbl,
                "image_raw": image_string}
    
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example

In [25]:
output_filename = "../Data Instagram/images.tfrecords"
with tf.io.TFRecordWriter(output_filename) as writer:
    for i in range(len(ig_images_raw)):
        ex = create_example(ig_images_raw[i], ig_labels[i])
        writer.write(ex.SerializeToString())

# 5-Fold Cross-Validation on Instagram Images with Various Models

Run the cross-validation loop once per each model. The results are stored in the Excel sheet "Model_Results", where the averages are calculated across the splits.

## Set Up

### Define Functions

In [59]:
def split_data(cv_round, images, num_images):
    vs = int(num_images / 5) # 20% for validation, 5-fold validation
    valid_start = (cv_round) * vs
    valid_end = (cv_round+1) * vs if cv_round != 4 else -1

    # Split data
    training_ds = images.take(valid_start).concatenate(images.skip(valid_end))
    validation_ds = images.skip(valid_start).take(vs)
    
    return training_ds, validation_ds

In [77]:
def prepare_batch(tds, vds):
    # Batch setup
    batch_size = 32
    training_ds = tds.cache().batch(batch_size).prefetch(buffer_size=10)
    validation_ds = vds.cache().batch(batch_size).prefetch(buffer_size=10)
    
    return training_ds, validation_ds

### Models

In [2]:
def build_model1():
    """
    A basic model with one classifier layer built on top of ResNet50 v2.
    The classification layer uses an average pooling layer followed by a sigmoid activation layer
    for binary classification.
    """
    # Instantiate pre-trained ResNet
    base = keras.applications.ResNet50V2(include_top=False,
                                     weights="imagenet",
                                     input_shape=(480, 480, 3))
    base.trainable = False
    
    # Create classifier
    inputs = keras.Input(shape=(480, 480, 3))
    a = base(inputs, training=False)
    a = keras.layers.GlobalAveragePooling2D()(a)
    outputs = keras.layers.Dense(1, activation="sigmoid")(a)
    classifier = keras.Model(inputs, outputs)
    
    # Compile model
    classifier.compile(optimizer=keras.optimizers.Adam(),
                       loss=keras.losses.BinaryCrossentropy(from_logits=False),
                       metrics=[keras.metrics.BinaryAccuracy(),
                                keras.metrics.Recall(),
                                keras.metrics.Precision()])
    
    return classifier

In [75]:
def build_model2():
    """
    A model with one classifier layer built on top of ResNet50 v2.
    The classification layer uses an average pooling layer followed by a sigmoid activation layer
    for binary classification.
    Adds a data augmentation layer to the input.
    """
    # Instantiate pre-trained ResNet
    base = keras.applications.ResNet50V2(include_top=False,
                                     weights="imagenet",
                                     input_shape=(480, 480, 3))
    base.trainable = False
    
    # Create classifier
    inputs = keras.Input(shape=(480, 480, 3))
    augmentation = keras.layers.RandomFlip("horizontal")(inputs)
    augmentation = keras.layers.RandomRotation(0.1)(augmentation)
    a = base(augmentation, training=False)
    a = keras.layers.GlobalAveragePooling2D()(a)
    outputs = keras.layers.Dense(1, activation="sigmoid")(a)
    classifier = keras.Model(inputs, outputs)
    
    # Compile model
    classifier.compile(optimizer=keras.optimizers.Adam(),
                       loss=keras.losses.BinaryCrossentropy(from_logits=False),
                       metrics=[keras.metrics.BinaryAccuracy(),
                                keras.metrics.Recall(),
                                keras.metrics.Precision()])
    
    return classifier

In [1]:
def build_model3():
    """
    A model with one classifier layer built on top of ResNet50 v2.
    The classification layer uses an average pooling layer followed by a sigmoid activation layer
    for binary classification.
    Adds a data augmentation layer to the input.
    """
    # Instantiate pre-trained ResNet
    base = keras.applications.ResNet50V2(include_top=False,
                                     weights="imagenet",
                                     input_shape=(480, 480, 3))
    base.trainable = False
    
    # Create classifier
    inputs = keras.Input(shape=(480, 480, 3))
    augmentation = keras.layers.RandomFlip("horizontal")(inputs)
    augmentation = keras.layers.RandomRotation(0.1)(augmentation)
    a = base(augmentation, training=False)
    a = keras.layers.GlobalAveragePooling2D()(a)
    a = keras.layers.Dense(2048, activation="relu")(a)
    outputs = keras.layers.Dense(1, activation="sigmoid")(a)
    classifier = keras.Model(inputs, outputs)
    
    # Compile model
    classifier.compile(optimizer=keras.optimizers.Adam(),
                       loss=keras.losses.BinaryCrossentropy(from_logits=False),
                       metrics=[keras.metrics.BinaryAccuracy(),
                                keras.metrics.Recall(),
                                keras.metrics.Precision()])
    
    return classifier

### One-time Set Up

In [72]:
def count_positive_labels(old_state, input_element):
    lbl = input_element["label"]
    new_state = old_state + lbl
    return new_state

def process_image(ds_elem):
    print(ds_elem)
    image = ds_elem["image_raw"]
    label = ds_elem["label"]
    
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize_with_pad(image, 480, 480)
    image = keras.applications.resnet_v2.preprocess_input(image)
    return (image, label)

image_features = {"label": tf.io.FixedLenFeature([], tf.int64),
                  "image_raw": tf.io.FixedLenFeature([], tf.string)}
images_ds = tf.data.TFRecordDataset("../Data Instagram/images.tfrecords")
images_ds = images_ds.map(lambda x: tf.io.parse_single_example(x, image_features))

# Create weight dictionary to offset imbalanced data
num_images = images_ds.reduce(np.int64(0), lambda x, _: x + 1).numpy()
num_pos = images_ds.reduce(np.int64(0), count_positive_labels).numpy()

pos_cls_wgt = (num_images - num_pos) / num_pos
neg_cls_wgt = (num_images - num_pos) / num_images

cls_wgts_dic = {0: neg_cls_wgt, 1: pos_cls_wgt}
print(cls_wgts_dic)

# Prepare images
images_ds = images_ds.map(process_image)

{0: 0.8543270711425932, 1: 5.864693446088795}
{'image_raw': <tf.Tensor 'args_0:0' shape=() dtype=string>, 'label': <tf.Tensor 'args_1:0' shape=() dtype=int64>}


## Cross-Validation Loop

The loop records the individual splits' performances in model_results, which is a list of tf.keras.callback.History objects.

In [78]:
model_results = []
for i in range(0, 5):
    # Create the ith data split
    trainset, validset = split_data(i, images_ds, num_images)
    
    # Prepare data
    trainset, validset = prepare_batch(trainset, validset)
    
    # Get model, change the number to get a different model
    model = build_model2()
    
    # Fit model
    result = model.fit(trainset,
                       epochs=10,
                       class_weight=cls_wgts_dic,
                       validation_data=validset)
    model_results.append(result)

Epoch 1/10


KeyboardInterrupt: 

## Results

Prints the results from model_results. Only shows the last validation metrics for each split.

In [None]:
i = 0
print("Results on validation sets:")
for i in range(5):
    result = model_results[i]
    suffix = ''
    if i != 0:
        suffix = '_' + str(i)
    
    print("Split " + str(i+1))
    print("Accuracy: {0}\nPrecision: {1}\nRecall: {2}".format(
        result.history['val_binary_accuracy'][-1],
        result.history['val_precision' + suffix][-1],
        result.history['val_recall' + suffix][-1]))
    print("--------------------")