In [None]:
pip install kaggle

In [None]:
# Upload kaggle.json from the local machine; the next cell copies it from the working directory into ~/.kaggle/.
from google.colab import files
files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the Flickr image dataset archive (extracted from the working directory by the next cell).
!kaggle datasets download -d hsankesara/flickr-image-dataset

In [None]:
from zipfile import *

file_name = 'flickr-image-dataset.zip' #the file is your dataset exact name
# Bind the archive to `archive` rather than `zip`, so the built-in zip() is not shadowed for later cells.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print('Files Extracted')

In [None]:
pip install contractions

In [None]:
# Import here as well: this cell runs before the main import cell, so `nltk` would otherwise be undefined on a fresh kernel.
import nltk

nltk.download('all')

In [None]:
pip install tensorflow-text 

In [None]:
pip install tensorflow-hub

In [None]:
pip install pandas

In [None]:
pip install seaborn

In [None]:
pip install scikit-learn

In [None]:
pip install matplotlib

In [None]:
pip install nltk

In [None]:
pip install numpy

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import contractions
import string
from collections import Counter
from sklearn.model_selection import train_test_split

## 1.0 Get Data and Greet Data

A dataset should be broken up into multiple datasets:
- data_raw = first read in of our data, but will not be touched
- data_copy = data_raw copied, which will be used to clean, feature engineer, and wrangle our data

In [None]:
# Read the '|'-delimited results file once. data_raw stays untouched; all cleaning happens on an independent copy.
data_raw = pd.read_csv('/content/flickr30k_images/results.csv', delimiter='|')
data_copy = data_raw.copy()

In [None]:
# Summarise the column dtypes and non-null counts of the untouched frame.
data_raw.info()

We notice that we have a missing value in the comment section. This will be important to us as we clean our data later. The other two columns look like they do not have any missing data.

In [None]:
# Fixed seed so the same ten rows are shown on every run.
data_raw.sample(10, random_state=101)

Our data is structured with three columns:
- image_name: which is the unique tag of the image
- comment_number: which is the specific comment number about the image
- comment: this is the actual comment left about the image <br><br>

We expect to see 5 comments per unique image_name, so let's make sure that is the case:

In [None]:
# Compare the number of distinct image names (and five times that) with the total row count.
image_name_unique_vals = data_raw['image_name'].unique().tolist()
unique_image_total = len(image_name_unique_vals)
total_rows = len(data_raw)
print("Total length of unique values: ", unique_image_total)
print("Total length of data frame: ",  total_rows)
print("Total length of unique values * 5: ", unique_image_total * 5)

Based off our findings above, we can see that there are in fact 31,783 unique images in the dataset, corresponding to 158,915 in the total dataset, so in fact we do have 5 comments per image.

### 1.1 The 4 C's of Data Cleaning: Correction, Completing, Creating, and Converting <br><br>

In this stage, we will clean our data by:
1. correcting values and outliers 
2. completing missing information
3. creating new features for analysis
4. converting fields to the correct format for calculations and presentation.

#### 1.1.1 Correcting data

In [None]:
# Per-column count of missing values in the working copy.
null_counts = data_copy.isnull().sum()
print('Columns with null values:', null_counts, sep='\n')

In [None]:
# Summary statistics for every column, shown with one row per column.
data_copy.describe(include='all').T

First, let's make sure that our columns are formatted correctly so we can pull data from them:

In [None]:
# Show the column labels exactly as they were read from the file.
data_copy.columns

In [None]:
# Remove every space character from the column labels, then show the result.
data_copy.columns = data_copy.columns.str.replace(' ', '')
data_copy.columns

Let's take a look at the comment_number issue because we should have 5 unique comment_numbers not 6:

In [None]:
# List the distinct comment_number values to spot malformed entries.
comment_number_unique_vals = data_copy['comment_number'].unique().tolist()
print(comment_number_unique_vals)

Let's see if this last item in the list above correlates with our missing value in the comment column because it seems to have a number and a sentence combined into one:

In [None]:
# Show the rows whose comment is missing.
data_copy[data_copy['comment'].isnull()]

In [None]:
#Repair rows whose caption text was fused into comment_number, leaving comment null.
#Selecting by the null mask instead of a hard-coded position keeps this correct if the
#row order changes, and makes the cell a no-op when it is re-run.
missing_mask = data_copy['comment'].isnull()

if missing_mask.any():
    #Split "<number><caption>" into its leading digits (group 0) and the remaining text (group 1);
    #only the leading number is removed, so digits inside the caption itself survive
    fused_parts = data_copy.loc[missing_mask, 'comment_number'].str.extract(r'^\s*(\d+)\s*(.*)$')

    #Write both pieces back in place (assignment aligns on the row index)
    data_copy.loc[missing_mask, 'comment_number'] = fused_parts[0]
    data_copy.loc[missing_mask, 'comment'] = fused_parts[1]

#Show the repaired row(s) to confirm the change
print(data_copy.loc[missing_mask])

In [None]:
# Re-count the missing values per column after the repair.
updated_null_counts = data_copy.isnull().sum()
print('Updated Columns with null values:', updated_null_counts, sep='\n')

In [None]:
# Re-list the distinct comment_number values after the repair.
comment_number_unique_vals = data_copy['comment_number'].unique().tolist()
print(comment_number_unique_vals)

In [None]:
# Drop leading whitespace from the comment_number strings.
data_copy['comment_number'] = data_copy['comment_number'].str.lstrip()

In [None]:
# List the distinct comment_number values once more, now that leading whitespace is stripped.
comment_number_unique_vals = data_copy['comment_number'].unique().tolist()
print(comment_number_unique_vals)

We should also clean our sentences:
1. Remove punctuations
    - Ex:
        - i am happy. -> i am happy
2. Lower case all the data
    - Ex:
        - I AM HAPPY -> i am happy
3. Remove emojis
    - Ex:
        - 🫠 -> ''
4. Expand contractions
    - Ex:
        - I didn't do well -> I did not do well
5. Remove extra whitespace
    - Ex:
        - ' i am happy ' -> 'i am happy'
6. Deal with stopwords
    - In this case I am going to remove them. Stopwords are words that are automatically omitted from a computer-generated index.
    - Ex:
        - "I", "a", "the"
    - We may want to consider adding our own stopwords we see a lot of to the mix



In [None]:
def expand_contractions(comment):
    """
    Expand the contractions found in a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        str: The value returned by contractions.fix for that comment
    """
    return contractions.fix(comment)
data_copy['comment'] = data_copy['comment'].apply(expand_contractions)

In [None]:
def remove_punctuations(comment):
    """
    Delete every character listed in string.punctuation from a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        str: The comment with all punctuation characters removed
    """
    # A translation table mapping each punctuation character to None deletes them all in one pass
    deletion_table = str.maketrans('', '', string.punctuation)
    return comment.translate(deletion_table)
#apply remove_punctuations to every value in the comment column and store the result back
data_copy['comment'] = data_copy['comment'].apply(remove_punctuations)

In [None]:
#convert every comment to lowercase using the vectorised string accessor
data_copy['comment'] = data_copy['comment'].str.lower()

In [None]:
#remove all emojis (and any other non-ASCII characters) from the comment column
#the encode/decode round-trip returns a new Series, so it must be assigned back to take effect
data_copy['comment'] = data_copy['comment'].str.encode('ascii', 'ignore').str.decode('ascii')

In [None]:
#strip leading and trailing whitespace from each comment (whitespace inside the comment is left as-is)
data_copy['comment'] = data_copy['comment'].str.strip()

Now let's find out what the most common words are from each sentence:

In [None]:
# Join all comments into one string, split on whitespace, and show the 100 most frequent tokens.
Counter(" ".join(data_copy["comment"]).split()).most_common(100)

In [None]:
stop = stopwords.words('english') #assign all stopwords to variable stop
stop_set = set(stop) #set membership is constant-time; scanning the list for every word is needlessly slow
data_copy['comment'] = data_copy['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set])) #remove stopwords

With stopwords removed, here are the most common words:

In [None]:
# Same frequency count as before, now computed on the comments after stopword removal.
Counter(" ".join(data_copy["comment"]).split()).most_common(100)

#### 1.1.2 Creating

Here we will be using feature engineering:
- Count number of characters
- Count number of words
- Count number of capital characters
- Count number of capital words
- Count number of punctuations
- Count number of words in quotes
- Count number of sentences
- Count number of unique words
- Count special characters
- Count stopwords
- Calculate average word length
- Calculate average sentence length
- Unique word count vs word count
- Stopword count vs word count

In [None]:
def count_chars(comment): 
    """
    Count the characters in a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        int: Length of the comment, whitespace included
    """
    total_characters = len(comment)
    return total_characters



def count_words(comment):
    """
    Count the whitespace-separated words in a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        int: Number of tokens produced by splitting the comment on whitespace
    """
    tokens = comment.split() #no separator given, so any run of whitespace is one delimiter
    return len(tokens)


#nltk.sent_tokenize (called in count_sentences below) needs the Punkt tokenizer data, so fetch it before first use
nltk.download('punkt')
def count_sentences(comment):
    """
    Count the sentences nltk detects in a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        int: Number of items returned by nltk.sent_tokenize for the comment
    """
    sentences = nltk.sent_tokenize(comment)
    return len(sentences)


def count_unique_words(comment):
    """
    Count the distinct whitespace-separated words in a single comment.

    Args:
        comment (str): One comment string taken from the dataframe's comment column

    Returns:
        int: Number of different tokens in the comment (exact, case-sensitive match)
    """
    distinct_words = {word for word in comment.split()}
    return len(distinct_words)




def calc_avg_word_length(num_chars, num_words):
    """
    This function will calculate the average word length of a comment

    Args:
        num_chars (int): number of characters in the comment
        num_words (int): number of words in the comment

    Returns:
        Float: num_chars divided by num_words, or NaN when num_words is 0
    """
    try:
        return num_chars/num_words
    except ZeroDivisionError:
        #an empty comment has no words; report "undefined" instead of crashing
        return float('nan')


def calc_avg_sentence_length(num_words, num_sentences):
    """
    This function will calculate the average sentence length per comment

    Args:
        num_words (int): number of words per comment
        num_sentences (int): number of sentences per comment

    Returns:
        Float: num_words divided by num_sentences, or NaN when num_sentences is 0
    """
    try:
        return num_words/num_sentences
    except ZeroDivisionError:
        #a comment with no sentences has no average; report "undefined" instead of crashing
        return float('nan')

def count_unique_words_vs_word_count(num_words, num_unique):
    """
    This function will calculate the ratio of unique words to total words in a comment

    Args:
        num_words (int): number of words in the comment
        num_unique (int): number of unique words in the comment

    Returns:
        Float: num_unique divided by num_words, or NaN when num_words is 0
    """
    try:
        return num_unique/num_words
    except ZeroDivisionError:
        #an empty comment has no ratio; report "undefined" instead of crashing
        return float('nan')

In [None]:
# Per-comment counts first, then the ratios derived from those counts.
comment_series = data_copy['comment']
data_copy['char_count'] = comment_series.apply(count_chars)
data_copy['word_count'] = comment_series.apply(count_words)
data_copy['sentence_count'] = comment_series.apply(count_sentences)
data_copy['unique_word_count'] = comment_series.apply(count_unique_words)

data_copy['average_word_length'] = data_copy['char_count'] / data_copy['word_count']
data_copy['average_sentence_length'] = data_copy['word_count'] / data_copy['sentence_count']
data_copy['unique_words_vs_words'] = data_copy['unique_word_count'] / data_copy['word_count']

In [None]:
# Confirm the engineered feature columns were added.
data_copy.columns

## 2.0 EDA

### 2.1.1 Analyzing Character Counts

In [None]:
def plot_hist(x_size, y_size, facecolor, x_axis, edgecolor, bar_color, x_label, y_label, title):
    """
    Draw a histogram of one column, with the number of bins chosen by Sturges' rule.

    Note: this sets the global matplotlib rcParams for the axes and figure
    face colour, so later figures inherit the same background.

    Args:
        x_size (int): x_axis size for figsize
        y_size (int): y_axis size for figsize
        facecolor (string): Hex value for the facecolor of graph
        x_axis (object): Column of the dataframe to graph
        edgecolor (string): Hex value for the edge color of histogram bins
        bar_color (string): Hex value for the bar color of the histogram
        x_label (string): x_axis title
        y_label (string): y_axis title
        title (string): The title of the graph
    """
    #set the colours before the figure is created so this figure picks them up
    plt.rcParams['axes.facecolor'] = facecolor
    plt.rcParams['figure.facecolor'] = facecolor
    plt.figure(figsize=(x_size, y_size))
    #Sturges' rule: ceil(log2(n)) + 1 bins, with n taken from the data being plotted
    #(max(..., 1) avoids log2(0) when the column is empty)
    n_observations = max(len(x_axis), 1)
    bin_count = int(np.ceil(np.log2(n_observations)) + 1)
    plt.hist(x_axis, bin_count, edgecolor=edgecolor, facecolor=bar_color, alpha=1)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.margins(x=0.01, y=0.1)
    plt.grid(False)
    print("Displaying histogram with " + str(bin_count) + " bins.")

In [None]:
# Distribution of the number of characters per comment.
plot_hist(
    x_size=15,
    y_size=10,
    facecolor='#DEEBF7',
    x_axis=data_copy['char_count'],
    edgecolor='white',
    bar_color='#3182BD',
    x_label='Char Count',
    y_label='Count',
    title='Char Count Distribution',
)

Based on our histogram distribution, we can see that the majority of our data lies in bin 3. We used Sturges' rule to calculate the number of bins, using the formula $$\lceil \log_{2} n \rceil + 1$$ <br><br> This formula uses **n**, the total number of observations in the dataset: it takes the base-2 logarithm of **n**, rounds it up to the next integer, and adds 1. <br><br> Since our data peaks in bin 3, we can say that most of our data will range from roughly 29-44 characters. In addition, our data is skewed right, as there is a tail on the right side of the graph. This means that the mean is greater than the median.

### 2.1.2 Analyzing Word Counts

In [None]:
# Distribution of the number of words per comment.
plot_hist(
    x_size=15,
    y_size=10,
    facecolor='#DEEBF7',
    x_axis=data_copy['word_count'],
    edgecolor='white',
    bar_color='#3182BD',
    x_label='Word Count',
    y_label='Count',
    title='Word Count Distribution',
)

## 3.0 Model 1 (Baseline)
1. Create a temporary dataframe in order to shrink our dataset down because it takes forever to train
2. remove .jpg from image name column
3. label each image
4. split into validation and train datasets (must do this on every run in order to avoid overfitting)
5. encode each sentence (give each character a token)
6. build and run model

How to improve?
1. run more epochs
2. train on more data
3. clean data differently
4. add more layers
5. change activation function
6. change hyperparameters

Creating a temporary data frame for our model to run our testing on:

In [None]:
# Take a real copy: a plain assignment would only alias data_copy, so the modelling edits below would modify the cleaned frame too.
tmp = data_copy.copy()

In [None]:
#Remove '.jpg' from image
#regex=False treats '.jpg' as literal text; as a regex the '.' would match any character
tmp['image_name'] = tmp['image_name'].str.replace('.jpg', '', regex=False)
tmp['image_name'].head()

In [None]:
#temporary df until final model is built
temp33 = tmp


def classify_images(df):
    '''
    Add a numeric `label` column identifying the image each row belongs to.

    Labels are consecutive integers starting at 0, assigned in the order in
    which each image_name first appears in the dataframe.

    Args:
        df: a dataframe with an `image_name` column; it is modified in place

    Returns:
        dict: mapping of image_name -> assigned label
    '''
    #pd.factorize numbers the values in order of first appearance,
    #which is the same numbering a running counter would produce
    codes, unique_names = pd.factorize(df['image_name'])
    df['label'] = codes
    return {name: code for code, name in enumerate(unique_names)}


#image_name -> label lookup and the per-row label list, rebuilt from scratch on every run
dic = classify_images(temp33)
label_list = temp33['label'].tolist()

#make sure output is correct
temp33.tail()

Shrink dataset to test our baseline model:

In [None]:
# Keep only the first 500 rows.
# NOTE(review): with 5 comments per image this presumably covers 100 images, matching the 100 classes used for the model — confirm if the slice size changes.
temp33 = temp33[:500]

In [None]:
# One-hot encode the labels into 100 classes, then hold out 30% of the comments using a fixed seed.
# NOTE(review): num_classes=100 presumably relies on the 500-row slice containing labels 0-99 — confirm if the slice changes.
y = tf.keras.utils.to_categorical(temp33["label"].values, num_classes=100)
x_train, x_test, y_train, y_test = train_test_split(temp33['comment'], y, test_size=0.30, random_state=101)

In [None]:
#Load the pretrained text preprocessor and sentence encoder from TensorFlow Hub as Keras layers
preprocessor = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2")
encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1")

def get_embeddings(sentences):
  '''Embed text by running it through the hub preprocessor and then the encoder.
  Args:
    - sentences: list of strings
  Output:
    - the encoder's 'pooled_output' tensor (presumably shape (len(sentences), 768) - TODO confirm)
  '''
  return encoder(preprocessor(sentences))['pooled_output']

In [None]:
# Smoke test: embed a single example sentence and display the resulting tensor.
get_embeddings([
    "Two men in green shirts are standing in a yard ."]
)

In [None]:
# Classifier: raw string -> hub preprocessor -> sentence encoder -> dropout -> softmax.
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = preprocessor(i)
x = encoder(x)
x = tf.keras.layers.Dropout(0.2, name="dropout")(x['pooled_output'])
# Size the output layer from the one-hot targets so it always matches the label encoding
x = tf.keras.layers.Dense(y_train.shape[1], activation='softmax', name="output")(x)
model = tf.keras.Model(i, x)

In [None]:
n_epochs = 20

# Track categorical accuracy alongside the loss
METRICS = [tf.keras.metrics.CategoricalAccuracy(name="accuracy")]

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=METRICS,
)

# Stop once val_loss has gone 3 epochs without improving, and restore the best weights seen
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model_fit = model.fit(
    x_train,
    y_train,
    epochs=n_epochs,
    validation_data=(x_test, y_test),
    callbacks=[earlystop_callback],
    shuffle=True,
)

In [None]:
# Plot every training metric against its validation counterpart, one panel per metric.
history = model_fit.history
metric_list = list(history.keys())
# Pair metrics by name ("loss" with "val_loss") rather than by position in the key list
train_metrics = [name for name in metric_list if not name.startswith("val_")]
num_metrics = len(train_metrics)

# Early stopping can end training before n_epochs, so size the x-axis from what was actually recorded
epochs_run = len(history["loss"])
x = list(range(1, epochs_run + 1))

# squeeze=False always returns a 2-D grid of axes, so indexing also works with a single metric
fig, ax = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5), squeeze=False)
ax = ax[0]

for i, metric_name in enumerate(train_metrics):
  ax[i].plot(x, history[metric_name], marker="o", label=metric_name.replace("_", " "))
  val_name = "val_" + metric_name
  if val_name in history:
    ax[i].plot(x, history[val_name], marker="o", label=val_name.replace("_", " "))
  ax[i].set_xlabel("epochs",fontsize=14)
  ax[i].set_title(metric_name.replace("_", " "),fontsize=20)
  ax[i].legend(loc="lower left")

In [None]:
# Show the first 30 rows of the modelling subset, including their labels.
temp33.head(30)

In [None]:
# Hand-written example captions to classify
reviews = [
    "two men cooking a meal",
    "two men stove preparing food",
    "two guys",
    "two men"
]

def predict_class(reviews):
  '''Return the most probable class for each input text
  Args:
    - reviews (list of strings)
  Output:
    - class (list of int): index of the highest model output per text
  '''
  probabilities = model.predict(reviews)
  predicted = []
  for row in probabilities:
    predicted.append(np.argmax(row))
  return predicted


predict_class(reviews)

In [None]:
# Show every row whose label is 77 (presumably a class predicted above — confirm against the prediction output).
tmp[tmp['label'] == 77]

Our model predicts two sentences correctly. The two that are correct are the sentences that match word for word. However, on the two it mislabels, we see that it still predicts a label with two men present. How can we improve?