## Import Libraries
**Note: Please include a `requirements.txt` file in the future**

In [2]:
import pandas as pd
import numpy as np
import os
import re
import string
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
from transformers import BertTokenizer
from torchvision import transforms
import torchvision.transforms.functional as F 
from torchvision.models import vgg16

from PIL import Image
import matplotlib.pyplot as plt

from transformers import BertModel, BertTokenizer

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore') 

In [3]:
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    print("No GPU found")
device

device(type='mps')

## Part 1.

### Read in the data
**Note: move the `results.csv` file into the cloned repository and the below cell will run**

1. We want to have two versions `data_raw` will be the untouched dataset and `data_copy` will be the one we do manipulations on 
    

In [4]:
data_raw = pd.read_csv(f"{os.getcwd()}/results.csv",
                       delimiter="|")
data_copy = data_raw

### Data cleaning and info
1. Call .info() to generate the non-null counts, the column name, and the data type of each column
    - There is whitespace in `comment_number` and `comment` column names
    - There is a null value in the `comment` column
2. Verify there is 5 comments for each image
3. Call .describe() to generate basic statistics about the data
    - There is one more unique comment number in the `comment_number` column

In [5]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158915 entries, 0 to 158914
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   image_name       158915 non-null  object
 1    comment_number  158915 non-null  object
 2    comment         158914 non-null  object
dtypes: object(3)
memory usage: 3.6+ MB


In [6]:
print("Before cleaning: ", data_copy.columns)
data_copy.columns = data_copy.columns.str.replace(' ', '')
print("After cleaning: ", data_copy.columns)

Before cleaning:  Index(['image_name', ' comment_number', ' comment'], dtype='object')
After cleaning:  Index(['image_name', 'comment_number', 'comment'], dtype='object')


The data is structured into three columns:
1. **image_name**: represents the image that the comments are attached to
2. **comment_number**: the comment number associated with the image
3. **comment**: the actual comment of the image

We should expect to see 5 comments for each image

In [7]:
image_name_unique_vals = data_copy['image_name'].unique().tolist()
print("Total length of unique values: ", len(image_name_unique_vals))
print("Total length of data frame: ",  len(data_raw))
print("Total length of unique values with comments:", len(image_name_unique_vals) * 5)

Total length of unique values:  31783
Total length of data frame:  158915
Total length of unique values with comments: 158915


In [8]:
data_copy.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq
image_name,158915,31783,1000092795.jpg,5
comment_number,158915,6,0,31783
comment,158914,158438,Two dogs playing in the snow .,7


In [9]:
comment_number_unique_vals = data_copy['comment_number'].unique().tolist()
comment_number_unique_vals

[' 0', ' 1', ' 2', ' 3', ' 4', ' 4   A dog runs across the grass .']

In [10]:
data_copy['comment_number'] = data_copy['comment_number'].str.replace('\s+', '', regex=True)
comment_number_unique_vals = data_copy['comment_number'].unique().tolist()
comment_number_unique_vals

['0', '1', '2', '3', '4', '4Adogrunsacrossthegrass.']

In [11]:
data_copy[data_copy.isnull().any(axis=1)]

Unnamed: 0,image_name,comment_number,comment
19999,2199200615.jpg,4Adogrunsacrossthegrass.,


In [12]:
data_copy.loc[19999, 'image_name'] = '2199200615.jpg'
data_copy.loc[19999, 'comment_number'] = '4'
data_copy.loc[19999, 'comment'] = 'A dog runs across the grass.'
data_copy.iloc[19999]

image_name                      2199200615.jpg
comment_number                               4
comment           A dog runs across the grass.
Name: 19999, dtype: object

### Feature Engineering
1. Preprocess text and make a new column with cleaned text
    - Remove whitespace
    - Remove punctuation
    - Expand contractions
    - Make everything lowercase
    - Remove stopwords
    - Lemmatize words that are needed
2. Encode text for BERT
    - Get the max length of the comments in the dataset
    - Generate the input_ids and the attention masks for each comment in the dataset
        - Add these as columns into the dataset
3. Preprocess images
    - Turn each image path into a column in the dataset
    - Resize each image to (224 x 224)
    - Use a standard mean and std 
    - Turn each image to a tensor
4. Get VGG16 embeddings
    - Extract the classifiaction layer from vgg16
    - Take the first 500 rows -> 100 images
    - Put the embeddings in a new column
5. Get BERT embeddings

In [13]:
data_copy = data_copy[:20]

#### Preprocess Text
1. Remove whitespace from each comment
2. Remove all punctuations from each comment
3. Expand contractions 
4. Turn all comments to lower case
5. Tokenize each word -> i.e. turn comment into a list of substrings for each word
    - comment: "I like dogs" -> ['I', 'like', 'dogs']
6. Use the tokenizing of the words to remove `stopwords`
    - A `stopword` is a common occuring word. Words such as 'a' and 'is' are examples of a `stopword`
    - Get a set of English stopwords to remove common words like 'the', 'is', etc.
7. Lemmatize the words
    - A `lemmatizer` takes a word and turns it into its lemma
    - For example, if we have the words 'walking', 'walked' and 'walks' the words would now become 'walk'

8. Return the comment
    - Each comment is turned into its own string and returned

In [14]:
def process_comment(comment):
    '''
    Preprocess the ith row and jth comment
    
    @comment: the comment that is getting preprocessed
    '''
    comment = comment.strip()
    comment = comment.translate(str.maketrans('', '', string.punctuation))
    comment = contractions.fix(comment)
    comment = comment.lower()
    words = word_tokenize(comment)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    processed_comment = ' '.join(words)
    return processed_comment

In [15]:
data_copy['cleaned_comment'] = data_copy['comment'].apply(process_comment)
data_copy.sample(1)

Unnamed: 0,image_name,comment_number,comment,cleaned_comment
11,1000268201.jpg,1,A little girl in a pink dress going into a wo...,little girl pink dress going wooden cabin


#### Encode Text for BERT
1. Getting the max length of the comments
    1. Grab all the comments and store them into a numpy array
        - We can do this by using `.values` on a column of a pandas df
            - array[('A dog', 'A cat')]
    2. `Tokenize` each word
        - The `NLTK tokenizer` turns each comment into a sublist of words of the entire comment
        - The `BERT tokenizer` turns each word inside of a comment to its own unique integer
            - There are special tokens called `cls`, `sep`, `unk`, `pad`, `mask`
                - `cls`: tells us when the sentence begins
                - `sep`: tells us when the sentence ends
                - `unk`: tells us when a word is not in BERT's vocabulary
                - `pad`: tells us when a sentence is too short, so we add this token to make it longer
                - `mask`: masks a word in the sentence and tries to get the model to predict the word during training
   3. Loop through the comments, tokenize each comment, and find the max length of all the sentences because we need to pad the other sentences to this length as an input for the model

In [16]:
sentences = data_copy['cleaned_comment'].values

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [18]:
max_len = 0
for i in sentences:
    input_ids = tokenizer.encode(i, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
max_len

11

In [19]:
def gen_ids_masks(text, max_len=64):
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    return input_ids, attention_mask

In [20]:
data_copy['input_ids'], data_copy['attention_mask'] = zip(*data_copy['cleaned_comment'].apply(gen_ids_masks))
data_copy.head()

Unnamed: 0,image_name,comment_number,comment,cleaned_comment,input_ids,attention_mask
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...,two young guy shaggy hair look hand hanging yard,"[[tensor(101), tensor(2048), tensor(2402), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
1,1000092795.jpg,1,"Two young , White males are outside near many...",two young white male outside near many bush,"[[tensor(101), tensor(2048), tensor(2402), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .,two men green shirt standing yard,"[[tensor(101), tensor(2048), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .,man blue shirt standing garden,"[[tensor(101), tensor(2158), tensor(2630), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
4,1000092795.jpg,4,Two friends enjoy time spent together .,two friend enjoy time spent together,"[[tensor(101), tensor(2048), tensor(2767), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


#### Vgg16 Embeddings

In [21]:
image_root_path = f"{os.getcwd()}/flickr30k_images/"
data_copy['image_path'] = image_root_path + data_copy['image_name']
data_copy.sample(1)

Unnamed: 0,image_name,comment_number,comment,cleaned_comment,input_ids,attention_mask,image_path
19,1000344755.jpg,4,a man on a ladder cleans a window,man ladder clean window,"[[tensor(101), tensor(2158), tensor(10535), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...


In [None]:
image_path = data_copy.at[0, 'image_path']
img = Image.open(image_path)
display(img)

# this cell crashes for me investigate later

In [22]:
data_copy['vgg16_embeddings'] = None

# Create vgg16 model from torchvision and put on GPU
vgg16 = vgg16(pretrained=True).to(device)
# Get the embeddings by removing the classification layer
vgg16 = torch.nn.Sequential(*(list(vgg16.children())[:-1]))
# Put in train mode
vgg16.eval()

# Preprocess images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Loop over dataset and train on model
with torch.no_grad():
    for idx in range(len(data_copy)):
        image_path = data_copy.at[idx, 'image_path']
        img = Image.open(image_path).convert('RGB')
        # Add a batch to the image -> [C, H, W] -> [1, C, H, W]
        img_tensor = transform(img).unsqueeze(dim=0).to(device)
        embeddings = vgg16(img_tensor)
        data_copy.at[idx, 'vgg16_embeddings'] = embeddings.cpu().numpy()

In [23]:
data_copy.head(1)

Unnamed: 0,image_name,comment_number,comment,cleaned_comment,input_ids,attention_mask,image_path,vgg16_embeddings
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...,two young guy shaggy hair look hand hanging yard,"[[tensor(101), tensor(2048), tensor(2402), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....


#### BERT Embeddings
1. Instantiate the pre-trained BERT model and set the training to the GPU using `.to(device)`
2. Fine-tune the BERT pre-trained model over the comments
    1. Make a new column for the BERT embeddings and set all values to null
    2. Loop over the dataset
        - Use `torch.no_grad()` because we do not need gradient calculations because we do not need any backpropogation since we do not need a loss function, rather just the embeddings
        - Get the input ids at the $i^{th}$ and $j^{th}$ row and column, respectively
        - Get the attention mask at the $i^{th}$ and $j^{th}$ row and column, respectively
        - Get the output of the BERT model using the inputs `input ids` and `attention mask`
        - Get the embeddings, which is the last state of the output

In [24]:
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [25]:
input_ids_column = 'input_ids'
attention_mask_column = 'attention_mask'
data_copy['bert_embeddings'] = None


with torch.no_grad():
    for idx in range(len(data_copy)):
        input_ids = data_copy.at[idx, input_ids_column].to(device)
        attention_mask = data_copy.at[idx, attention_mask_column].to(device)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        data_copy.at[idx, 'bert_embeddings'] = embeddings.cpu().numpy()

In [26]:
data_copy.head(15)

Unnamed: 0,image_name,comment_number,comment,cleaned_comment,input_ids,attention_mask,image_path,vgg16_embeddings,bert_embeddings
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...,two young guy shaggy hair look hand hanging yard,"[[tensor(101), tensor(2048), tensor(2402), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....,"[[[-0.2685596, 0.21052477, -0.08599222, -0.265..."
1,1000092795.jpg,1,"Two young , White males are outside near many...",two young white male outside near many bush,"[[tensor(101), tensor(2048), tensor(2402), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....,"[[[-0.2770078, -0.39967567, -0.2484164, -0.304..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .,two men green shirt standing yard,"[[tensor(101), tensor(2048), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....,"[[[-0.18692452, 0.15539032, -0.3427699, 0.0620..."
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .,man blue shirt standing garden,"[[tensor(101), tensor(2158), tensor(2630), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....,"[[[-0.17712925, -0.0005741473, -0.13430652, -0..."
4,1000092795.jpg,4,Two friends enjoy time spent together .,two friend enjoy time spent together,"[[tensor(101), tensor(2048), tensor(2767), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....,"[[[0.055349417, 0.10062032, 0.15722284, 0.1162..."
5,10002456.jpg,0,Several men in hard hats are operating a gian...,several men hard hat operating giant pulley sy...,"[[tensor(101), tensor(2195), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. 0. 0. ...","[[[-0.16020359, 0.27652726, 0.051281925, 0.135..."
6,10002456.jpg,1,Workers look down from up above on a piece of...,worker look piece equipment,"[[tensor(101), tensor(7309), tensor(2298), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. 0. 0. ...","[[[-0.045069344, 0.39052594, -0.107470155, -0...."
7,10002456.jpg,2,Two men working on a machine wearing hard hats .,two men working machine wearing hard hat,"[[tensor(101), tensor(2048), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. 0. 0. ...","[[[-0.11012821, 0.21814653, -0.651759, 0.11333..."
8,10002456.jpg,3,Four men on top of a tall structure .,four men top tall structure,"[[tensor(101), tensor(2176), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. 0. 0. ...","[[[-0.2949568, 0.3480261, -0.21604586, -0.0413..."
9,10002456.jpg,4,Three men on a large rig .,three men large rig,"[[tensor(101), tensor(2093), tensor(2273), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...",/Users/blakedickerson/image-text-retrieval/fli...,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. 0. 0. ...","[[[-0.35205492, 0.36165425, -0.3777878, -0.112..."


## Part 2. 
In this part we will build two graphs, x & y, respectively, where each node is an image and each edge is determined by its similarity with the node.

<br><br>
1000 images -> 1000x1000 i,j element is a scalar what is it? The similarity/closeness between i,j -> cosine similarity -> (I*j)/2 for every I,j scalar which gives us a graph with only the vgg information do the same thing for text 

Match means we have an order of images , If they are not matching the comment and image are not a pair this is what the algorithm solves

Phi means they are not connected
Straight line/curve line is the same 

Node attribute -> embedding

n_1/e_1 -> diagonal elements represent different types of nodes node attributes over all nodes are the same along with the edges 

Node -> ordered number of node
Edge -> yes if connected, no if not
Try non-binary first, then covert to binary

### Step 2.1 
Build graph x

In [27]:
graph_x_initial = data_copy[['image_name', 'vgg16_embeddings']]
graph_x_initial.sample(1)

Unnamed: 0,image_name,vgg16_embeddings
17,1000344755.jpg,"[[[[0. 0. 0. 0. 0. 0. 0.], [0. 0. ..."


In [28]:
matrices = []
for idx, row in graph_x_initial.iterrows():
    new_matrix = np.array([row['image_name']])
    remaining_rows = graph_x_initial.drop(idx)
    transposed = remaining_rows['image_name'].values  # Extract values for transpose
    new_matrix = np.concatenate((new_matrix, transposed), axis=0)
    matrices.append(new_matrix)

In [29]:
matrices[0]

array(['1000092795.jpg', '1000092795.jpg', '1000092795.jpg',
       '1000092795.jpg', '1000092795.jpg', '10002456.jpg', '10002456.jpg',
       '10002456.jpg', '10002456.jpg', '10002456.jpg', '1000268201.jpg',
       '1000268201.jpg', '1000268201.jpg', '1000268201.jpg',
       '1000268201.jpg', '1000344755.jpg', '1000344755.jpg',
       '1000344755.jpg', '1000344755.jpg', '1000344755.jpg'], dtype=object)

In [30]:
def build_graph_x(matrix: np.array,
                  df: pd.DataFrame,
                  ith_row: int,
                  jth_col: int):
    """
    Builds the graph of the images
    
    matrix (np.array): a 2d array of image names
    df (pd.DataFrame): an initial graph of image names and their corresponding embeddings
    ith_row (int): number of rows in the dataframe
    jth_col (int): number of columns in the dataframe
    """
    graph = pd.DataFrame(index=range(len(matrix)), columns=range(len(matrix)))
    
    for i, row in enumerate(matrix):
        row_embedding = df.loc[i, 'vgg16_embeddings'].reshape(1, -1)
        
        for j, col in enumerate(row):
            column_embedding = df.loc[j, 'vgg16_embeddings'].reshape(1, -1)
            similarity = cosine_similarity(row_embedding, column_embedding)[0][0]
            graph.loc[i, j] = similarity
    
    graph.index = [df.loc[idx, 'image_name'] for idx in range(len(matrix))]
    graph.columns = [df.loc[idx, 'image_name'] for idx in range(len(matrix))]
    
    graph = graph.mul((ith_row * jth_col) / 2)
    
    return graph

In [31]:
graph_x = build_graph_x(matrix=matrices,
                        df=graph_x_initial,
                        ith_row=20,
                        jth_col=20)
graph_x.sample(2)

Unnamed: 0,1000092795.jpg,1000092795.jpg.1,1000092795.jpg.2,1000092795.jpg.3,1000092795.jpg.4,10002456.jpg,10002456.jpg.1,10002456.jpg.2,10002456.jpg.3,10002456.jpg.4,1000268201.jpg,1000268201.jpg.1,1000268201.jpg.2,1000268201.jpg.3,1000268201.jpg.4,1000344755.jpg,1000344755.jpg.1,1000344755.jpg.2,1000344755.jpg.3,1000344755.jpg.4
10002456.jpg,14.951783,14.951783,14.951783,14.951783,14.951783,200.0,200.0,200.0,200.0,200.0,19.110151,19.110151,19.110151,19.110151,19.110151,27.236593,27.236593,27.236593,27.236593,27.236593
10002456.jpg,14.951783,14.951783,14.951783,14.951783,14.951783,200.0,200.0,200.0,200.0,200.0,19.110151,19.110151,19.110151,19.110151,19.110151,27.236593,27.236593,27.236593,27.236593,27.236593


## Step 2.2
Build graph y

In [32]:
graph_y_initial = data_copy[['image_name', 'bert_embeddings']]
graph_y_initial.sample(1)

Unnamed: 0,image_name,bert_embeddings
0,1000092795.jpg,"[[[-0.2685596, 0.21052477, -0.08599222, -0.265..."


In [33]:
matrices = []
for idx, row in graph_y_initial.iterrows():
    new_matrix = np.array([row['image_name']])
    remaining_rows = graph_y_initial.drop(idx)
    transposed = remaining_rows['image_name'].values  
    new_matrix = np.concatenate((new_matrix, transposed), axis=0)
    matrices.append(new_matrix)

In [34]:
def build_graph_y(matrix: np.array,
                  df: pd.DataFrame,
                  ith_row: int,
                  jth_col: int):
    """
    
    """
    graph = pd.DataFrame(index=range(len(matrix)), columns=range(len(matrix)))
    
    for i, row in enumerate(matrix):
        row_embedding = df.loc[df['image_name'] == row[0], 'bert_embeddings'].values[0].reshape(1, -1)
        for j, col in enumerate(row):
            column_embedding = df.loc[df['image_name'] == col, 'bert_embeddings'].values[0].reshape(1, -1)
            similarity = cosine_similarity(row_embedding, column_embedding)[0][0]
            graph.loc[i, j] = similarity
            
    graph.index = [df.loc[idx, 'image_name'] for idx in range(len(matrix))]
    graph.columns = [df.loc[idx, 'image_name'] for idx in range(len(matrix))]
    
    graph = graph.mul((ith_row * jth_col) / 2)
    
    return graph

In [35]:
graph_y = build_graph_y(matrix=matrices,
                        df=graph_y_initial,
                        ith_row=20,
                        jth_col=20)
graph_y.sample(2)

Unnamed: 0,1000092795.jpg,1000092795.jpg.1,1000092795.jpg.2,1000092795.jpg.3,1000092795.jpg.4,10002456.jpg,10002456.jpg.1,10002456.jpg.2,10002456.jpg.3,10002456.jpg.4,1000268201.jpg,1000268201.jpg.1,1000268201.jpg.2,1000268201.jpg.3,1000268201.jpg.4,1000344755.jpg,1000344755.jpg.1,1000344755.jpg.2,1000344755.jpg.3,1000344755.jpg.4
1000092795.jpg,200.000048,200.000048,200.000048,200.000048,200.000048,136.523414,136.523414,136.523414,136.523414,136.523414,121.723104,121.723104,121.723104,121.723104,121.723104,147.071195,147.071195,147.071195,147.071195,147.071195
1000268201.jpg,199.999821,121.723104,121.723104,121.723104,121.723104,121.723104,135.257769,135.257769,135.257769,135.257769,135.257769,199.999821,199.999821,199.999821,199.999821,149.226141,149.226141,149.226141,149.226141,149.226141


## Step 2.3
Build adjacency matrix 
<br>
Use a threshold of within 1% of max value of 200

In [48]:
A1 = pd.DataFrame(np.zeros((len(graph_x), len(graph_y))), index=graph_x.index, columns=graph_y.index)

for x_index, x_row in enumerate(graph_x.iterrows()):
    largest_indices = np.argsort(x_row[1])[-5:]
    A1.iloc[x_index, largest_indices] = 1

In [49]:
A2 = pd.DataFrame(np.zeros((len(graph_x), len(graph_y))), index=graph_y.index, columns=graph_x.index)

for y_index, y_row in enumerate(graph_y.iterrows()):
    largest_indices = np.argsort(y_row[1])[-5:]
    A2.iloc[y_index, largest_indices] = 1

In [56]:
N1 = data_copy[['image_name', 'vgg16_embeddings']]
N1 = N1.set_index('image_name')
N1.sample(1)

Unnamed: 0_level_0,vgg16_embeddings
image_name,Unnamed: 1_level_1
1000092795.jpg,[[[[1.3359613 1.5481076 1.9946228 1.3696046 0....


In [57]:
N2 = data_copy[['image_name', 'bert_embeddings']]
N2 = N2.set_index('image_name')
N2

Unnamed: 0_level_0,bert_embeddings
image_name,Unnamed: 1_level_1
1000092795.jpg,"[[[-0.2685596, 0.21052477, -0.08599222, -0.265..."
1000092795.jpg,"[[[-0.2770078, -0.39967567, -0.2484164, -0.304..."
1000092795.jpg,"[[[-0.18692452, 0.15539032, -0.3427699, 0.0620..."
1000092795.jpg,"[[[-0.17712925, -0.0005741473, -0.13430652, -0..."
1000092795.jpg,"[[[0.055349417, 0.10062032, 0.15722284, 0.1162..."
10002456.jpg,"[[[-0.16020359, 0.27652726, 0.051281925, 0.135..."
10002456.jpg,"[[[-0.045069344, 0.39052594, -0.107470155, -0...."
10002456.jpg,"[[[-0.11012821, 0.21814653, -0.651759, 0.11333..."
10002456.jpg,"[[[-0.2949568, 0.3480261, -0.21604586, -0.0413..."
10002456.jpg,"[[[-0.35205492, 0.36165425, -0.3777878, -0.112..."


Note in the above cell how even though they are the same image, they have different embeddings...