# Language Processing

In [1]:
import re
import pandas as pd
from io import StringIO
import torch.nn as nn
import torch.optim as optim
import torch

import numpy as np
import os
import csv

import random

In [2]:
TRAIN_FILENAME = '../dataset/train.csv'
TEST_FILENAME = '../dataset/test.csv'
IMAGE_DIRECTORY = "../dataset/images"
DATA_BASEPATH = "../dataset"

def append_none_to_caption(df: pd.DataFrame) -> pd.DataFrame:
    caption_col = "Caption"
    extra_col = None
    sep = " "

    if extra_col is None:
        if None in df.columns:
            extra_col = None
        elif 'None' in df.columns:
            extra_col = 'None'
        else:
            raise KeyError("Could not find a column named None or 'None' in your DataFrame.")

    # make sure captions are strings
    df[caption_col] = df[caption_col].astype(str)

    # build a Series of the extra text, safely capturing sep in the lambda’s default
    extras = df[extra_col].apply(
        lambda val, sep=sep: (
            '' if pd.isna(val)
            else sep.join(str(item).strip() for item in (val if isinstance(val, (list, tuple)) else [val]))
        )
    )

    # only append where there actually is some extra text
    mask = extras.ne('')
    df.loc[mask, caption_col] = df.loc[mask, caption_col] + sep + extras[mask]

    return df.drop(columns=[extra_col])

def extract_df(filename):
    path = os.path.join(DATA_BASEPATH, filename)
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",", quotechar='"', escapechar="\\")
        df = pd.DataFrame(reader)

    return append_none_to_caption(df)

train_data = extract_df(TRAIN_FILENAME)
test_data = extract_df(TEST_FILENAME)
train_data

Unnamed: 0,ImageID,Labels,Caption
0,0.jpg,1,Woman in swim suit holding parasol on sunny day.
1,1.jpg,1 19,A couple of men riding horses on top of a gree...
2,2.jpg,1,They are brave for riding in the jungle on tho...
3,3.jpg,8 3 13,a black and silver clock tower at an intersect...
4,4.jpg,8 3 7,A train coming to a stop on the tracks out side.
...,...,...,...
29995,29995.jpg,8 1 2,A picture of a truck that is in the middle of ...
29996,29996.jpg,1,A plate topped with a pizza being cut with a s...
29997,29997.jpg,1,A man riding a snowboard on top of snow.
29998,29998.jpg,1,This photo shows people skiing in the mountains.


In [3]:
words = []
for caption in train_data['Caption']:
    strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
    words.extend(strippedCaption.split())

for caption in test_data['Caption']:
    strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
    words.extend(strippedCaption.split())

wordSet = set(words)
wordDict = {}
i = 1
for word in wordSet:
    wordDict[word] = i
    i+=1

In [4]:
def AddTokenColumn(dataframe):
    tokenColumn = []
    for caption in dataframe['Caption']:
        tokens = []
        strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
        for word in strippedCaption.split():
            tokens.append(wordDict[word])
        
        tokenColumn.append(tokens)
        #length = len(tokens)
        #tokenColumn.append(np.pad(tokens, (0, 50-length)))
        
    dataframe['tokenized-caption'] = tokenColumn

def AddOneHotLabelColumn(dataframe):
    labelColumn = []
    for labels in dataframe['Labels']:
        oneHotLabels = [0] * 19
        splitLabels = labels.split()
        for label in splitLabels:
            oneHotLabels[int(label)-1] = 1.0
        labelColumn.append(list(oneHotLabels))
    dataframe['one-hot-labels'] = list(labelColumn)
    
AddTokenColumn(train_data)
AddTokenColumn(test_data)
AddOneHotLabelColumn(train_data)


#np.array(GetOneHotLabelColumn(train_data)).astype(np.float32)
#np.array(vectorizedTrainingText)[12]
#train_data['tokenized-caption']

In [5]:
def GetCaptionTensors(data):
    tensors = []
    for datapoint in data:
        tensors.append(torch.tensor(datapoint))
        
    return tensors

# Batching

In [6]:
def GetBatches(data, labels, batch_size=128):
    indexList = list(range(0, len(data)))
    random.shuffle(indexList)
    
    batchedData = []
    batchedLabels = []
    
    currentDataBatch = []
    currentLabelBatch = []
    
    currentDataBatch.append(torch.tensor(data[indexList[0]]))
    currentLabelBatch.append(labels[indexList[0]])
    i = 1
    while i < len(data):
        if i % batch_size == 0:
            batchedData.append(currentDataBatch)
            batchedLabels.append(currentLabelBatch)
            currentDataBatch = []
            currentLabelBatch = []
        currentDataBatch.append(torch.tensor(data[indexList[i]]))
        currentLabelBatch.append(labels[indexList[i]])
    
        
        i+=1
        
    return batchedData, batchedLabels

# Model

In [7]:
vocabSize = len(wordDict.keys())+1
LEARNING_RATE = .001
BATCH_SIZE = 128
EPOCHS = 15
EMBEDDING_DIM = 20

In [8]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# PyTorch models inherit from torch.nn.Module
class LSTMNetwork(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, numLayers):
        super(LSTMNetwork, self).__init__()

        self.num_layers = numLayers
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, numLayers, dropout=.2, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        self.sigmoid = nn.Sigmoid()
        
    def forward(self, sentences):
        padded_sequences = nn.utils.rnn.pad_sequence(sentences, batch_first=True)
        sequence_lengths = torch.tensor([len(seq) for seq in sentences])
        
        embeds = self.word_embeddings(padded_sequences)
        packedEmbedding = pack_padded_sequence(embeds, sequence_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packedOutput, (hidden, _cell) = self.lstm(packedEmbedding)
        #hidden = torch.squeeze(hidden, 1)
        tag_space = self.hidden2tag(hidden[-1])
        return self.sigmoid(tag_space)

model = LSTMNetwork(EMBEDDING_DIM, 128, vocabSize, 19, 2)

# Training

In [9]:
def train_loop(model):
    loss_fn = nn.BCELoss()
    optimizer = optim.RMSprop(model.parameters(), LEARNING_RATE)
    model.train()

    i = 0
    while i < EPOCHS:
        batchedData, batchedLabels = GetBatches(train_data['tokenized-caption'],train_data['one-hot-labels'])
        
        total_loss = 0

        j = 0 
        while j < len(batchedData):
            y_pred = model(batchedData[j])
            #print(y_pred)
            #print(batchedLabels[j])
            loss = loss_fn(y_pred, torch.tensor(batchedLabels[j]))
            optimizer.zero_grad()
            loss.backward()
            
            optimizer.step()
            j+=1
            total_loss += loss.item()

        avg_loss = total_loss / len(batchedData)
        print(f"Epoch {i+1}/{EPOCHS} - Loss: {avg_loss:.4f}")
        i+=1
        
        print("epoch finished")
    
train_loop(model)

Epoch 1/15 - Loss: 0.1910
epoch finished
Epoch 2/15 - Loss: 0.1539
epoch finished
Epoch 3/15 - Loss: 0.1302
epoch finished
Epoch 4/15 - Loss: 0.1162
epoch finished
Epoch 5/15 - Loss: 0.1059
epoch finished
Epoch 6/15 - Loss: 0.0990
epoch finished
Epoch 7/15 - Loss: 0.0942
epoch finished
Epoch 8/15 - Loss: 0.0903
epoch finished
Epoch 9/15 - Loss: 0.0871
epoch finished
Epoch 10/15 - Loss: 0.0845
epoch finished
Epoch 11/15 - Loss: 0.0821
epoch finished
Epoch 12/15 - Loss: 0.0795
epoch finished
Epoch 13/15 - Loss: 0.0773
epoch finished
Epoch 14/15 - Loss: 0.0752
epoch finished
Epoch 15/15 - Loss: 0.0731
epoch finished


# Evaluation

In [10]:
def GetMatch(prediction, trueLabel):
    match = True
    i = 0
    while i < len(trueLabel):
        if trueLabel[i] == 1.0:
            if prediction[i] < .5:
                match = False
        else:
            if prediction[i] > .5:
                match = False
        i+=1
    
    return match

In [None]:
def GetPredictions(model, data):
    model.eval()
    predictions = []
    currentIndex = 0
    while currentIndex + BATCH_SIZE < len(data):
        predictions.extend(model(GetCaptionTensors(data[currentIndex: currentIndex+BATCH_SIZE])))
        currentIndex += BATCH_SIZE

    predictions.extend(model(GetCaptionTensors(data[currentIndex: len(data)])))
    return predictions

In [12]:
predictions = GetPredictions(model, train_data['tokenized-caption'])


0
128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816
2944
3072
3200
3328
3456
3584
3712
3840
3968
4096
4224
4352
4480
4608
4736
4864
4992
5120
5248
5376
5504
5632
5760
5888
6016
6144
6272
6400
6528
6656
6784
6912
7040
7168
7296
7424
7552
7680
7808
7936
8064
8192
8320
8448
8576
8704
8832
8960
9088
9216
9344
9472
9600
9728
9856
9984
10112
10240
10368
10496
10624
10752
10880
11008
11136
11264
11392
11520
11648
11776
11904
12032
12160
12288
12416
12544
12672
12800
12928
13056
13184
13312
13440
13568
13696
13824
13952
14080
14208
14336
14464
14592
14720
14848
14976
15104
15232
15360
15488
15616
15744
15872
16000
16128
16256
16384
16512
16640
16768
16896
17024
17152
17280
17408
17536
17664
17792
17920
18048
18176
18304
18432
18560
18688
18816
18944
19072
19200
19328
19456
19584
19712
19840
19968
20096
20224
20352
20480
20608
20736
20864
20992
21120
21248
21376
21504
21632
21760
21888
22016
22144
22272
22400
22528
22656
22784
22912
23040
231

In [None]:
def GetAccuracy(predictions, labels):
    j=0
    total = 0
    truePredictions = 0
    while j < len(predictions):
        isMatch = GetMatch(predictions[j], labels[j])
        if(isMatch):
            truePredictions +=1
        total +=1
        j+=1
    return truePredictions/ total

print(GetAccuracy(predictions, train_data['one-hot-labels']))

In [18]:
predictions = GetPredictions(model, test_data['tokenized-caption'])

0
128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816
2944
3072
3200
3328
3456
3584
3712
3840
3968
4096
4224
4352
4480
4608
4736
4864
4992
5120
5248
5376
5504
5632
5760
5888
6016
6144
6272
6400
6528
6656
6784
6912
7040
7168
7296
7424
7552
7680
7808
7936
8064
8192
8320
8448
8576
8704
8832
8960
9088
9216
9344
9472
9600
9728
9856


In [27]:
def FormatPredictions(predictions):
    formattedPredictions = []
    for prediction in predictions:
        formattedPrediction = []
        i = 0
        while i < len(prediction):
            if(prediction[i] > .5):
                formattedPrediction.append(i+1)
                
            i+=1
            
        formattedPredictions.append(formattedPrediction)
    return formattedPredictions

print(len(predictions))
formattedPredictions = FormatPredictions(predictions)

10000


In [28]:
with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(formattedPredictions)

In [29]:
predictions[10]

tensor([9.9007e-01, 7.0894e-01, 7.6322e-01, 1.1818e-01, 7.5285e-04, 1.3402e-01,
        6.1380e-03, 3.7183e-01, 7.3816e-03, 1.2808e-01, 1.6653e-01, 4.2499e-08,
        2.2735e-02, 3.8260e-02, 6.9601e-02, 3.0276e-03, 5.5213e-04, 1.3501e-02,
        8.3952e-03], grad_fn=<UnbindBackward0>)

In [14]:
torch.save(model.state_dict(), './model')

In [None]:
train_data['tokenized-caption'][90]

array([4560, 2748, 5973,  242,  467, 8953, 3170,  375, 8953, 7069, 8811,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [None]:
model(torch.tensor(train_data['tokenized-caption'][0:2]))


torch.Size([2, 50, 20])


tensor([[ 5.7228e-02, -7.1909e-02,  2.3115e-02, -1.8948e-02, -2.2713e-02,
          5.9312e-02,  3.5925e-02, -4.8522e-02,  7.2089e-02, -4.2516e-02,
          1.5893e-02,  7.9602e-02, -8.6030e-02,  3.9239e-02,  2.9563e-02,
         -1.0318e-02,  2.4958e-02, -7.4344e-02,  3.8100e-03],
        [ 5.8625e-02, -6.5759e-02,  3.1757e-02, -3.0277e-02, -2.6234e-02,
          6.3241e-02,  4.2818e-02, -5.6413e-02,  7.5931e-02, -4.9185e-02,
          5.3247e-03,  7.7980e-02, -9.1814e-02,  4.3932e-02,  2.2189e-02,
         -4.3803e-03,  2.3647e-02, -7.0063e-02,  1.9952e-03],
        [ 5.6376e-02, -5.4926e-02,  3.1898e-02, -3.4264e-02, -3.3227e-02,
          6.7233e-02,  4.3479e-02, -6.1595e-02,  8.2853e-02, -5.2923e-02,
          3.0341e-03,  7.7936e-02, -9.3092e-02,  4.5347e-02,  1.7790e-02,
          2.4739e-03,  1.9212e-02, -7.9280e-02,  3.8704e-03],
        [ 4.8207e-02, -5.2075e-02,  3.2028e-02, -3.2043e-02, -3.1373e-02,
          7.3633e-02,  4.3741e-02, -6.0603e-02,  8.6191e-02, -4.8718e-02,


# keras implementation

Used to test my configuration while i debugged the pytorch model

In [125]:
def GetOneHotLabelColumn(dataframe):
    labelColumn = []
    for labels in dataframe['Labels']:
        oneHotLabels = [0] * 19
        splitLabels = labels.split()
        for label in splitLabels:
            oneHotLabels[int(label)-1] = 1.0
        labelColumn.append(list(oneHotLabels))
    return list(labelColumn)

def GetTokenColumn(dataframe):
    tokenColumn = []
    for caption in dataframe['Caption']:
        tokens = []
        strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
        for word in strippedCaption.split():
            tokens.append(wordDict[word])
        length = len(tokens)
        #tokenColumn.append(tokens)
        tokenColumn.append(np.pad(tokens, (0, 50-length)))
        
    return np.array(tokenColumn)

In [91]:
import keras
from keras import layers

def GetLSTMModel(lstmNeurons=128, lstmLayers=2, dropout=.2):
    numClasses = 19

    # Model definition
    inputs = keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(vocabSize+1, EMBEDDING_DIM, mask_zero=True)(inputs)
    
    i = 1
    while i < lstmLayers:
        x = layers.LSTM(lstmNeurons, return_sequences=True, dropout=dropout)(x)
        i+=1
        
    lstm_out, state_h, _state_c = layers.LSTM(lstmNeurons, return_sequences=True, return_state=True, dropout=dropout)(x)
    
    template_output = layers.Dense(numClasses, activation='sigmoid', name="dense")(state_h)
    
    model = keras.Model(inputs=inputs, outputs=template_output)
    # Compile the model
    model.compile(
        optimizer="adam",
        loss={
        "dense": "binary_crossentropy"
        },  # adjust as needed,
        metrics=["accuracy"],
    )
    return model

In [None]:
def GetVectorizationLayer():
    vectorizeLayer = keras.layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        max_tokens=20000,
        output_mode="int",
        output_sequence_length=50
    )
    
    vectorizeLayer.adapt(train_data['Caption'])
    return vectorizeLayer

def vectorizeAllText(data):
    return list(data.map(vectorizeText))

def vectorizeText(text):
    return list(VectorizationLayer(text))

    
VectorizationLayer = GetVectorizationLayer()
vectorizedTrainingText = vectorizeAllText(train_data['Caption'])

In [108]:
np.array(train_data['one-hot-labels'])[12]

[1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0]

In [115]:
LSTMModel = GetLSTMModel()
print(np.array(train_data['tokenized-caption']))
history = LSTMModel.fit(x=GetTokenColumn(train_data).astype(np.int32), y=np.array(GetOneHotLabelColumn(train_data)).astype(np.float32), epochs=20, batch_size=128)

[list([8974, 5490, 733, 2850, 6014, 7691, 3634, 8634, 6607, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([4048, 7959, 1727, 7315, 4413, 460, 3634, 3075, 1727, 1609, 4127, 7765, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([5119, 5309, 9100, 3955, 4413, 5490, 3322, 4522, 3634, 5327, 4192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 ...
 list([4048, 3180, 4413, 1609, 3010, 3634, 3075, 1727, 6327, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([117, 3779, 7940, 8601, 9413, 5490, 3322, 4799, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([8320, 3290, 7315, 9459, 3682, 6031, 907, 3955, 3322, 3

KeyboardInterrupt: 

In [117]:
predictions = LSTMModel.predict(GetTokenColumn(train_data).astype(np.int32))

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step


In [120]:
numCorrect = 0
i = 0
j = 0
while i < len(train_data['one-hot-labels']):
    isMatch = GetMatch(predictions[i], train_data['one-hot-labels'][i])
    if isMatch:
        numCorrect+=1
    elif j < 5:
        print(i)
        print(predictions[i])  
        print(train_data['one-hot-labels'][i])  
    i+=1
print(numCorrect/i)

3
[2.3046581e-01 5.7402514e-02 5.3015774e-01 1.4738512e-02 3.8661074e-04
 8.7275632e-02 1.0859405e-02 1.8171683e-01 2.5513226e-03 2.7225643e-01
 1.3130166e-01 8.2507169e-05 7.9963706e-02 1.6110054e-01 3.7530404e-02
 1.8785892e-02 4.1709831e-03 6.4451545e-03 4.3759044e-04]
[0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0]
4
[3.0394137e-01 1.0342024e-02 8.9649402e-02 1.9108061e-03 1.8709174e-03
 2.2157612e-03 9.8899668e-01 4.1937374e-02 6.0471380e-03 1.3384952e-01
 1.4759509e-02 1.9089998e-04 1.6750984e-02 2.2723542e-03 6.2003486e-02
 3.7952291e-03 4.3806937e-04 4.6402928e-03 2.7887916e-04]
[0, 0, 1.0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8
[4.9446210e-01 1.9634022e-01 5.5605102e-01 1.1386940e-01 3.2300854e-04
 1.5322232e-01 1.3184642e-02 2.1888307e-01 7.0952657e-03 4.1411468e-01
 5.8839396e-02 9.3996088e-05 6.7873895e-02 3.3918932e-02 4.5652356e-02
 8.5308403e-03 7.7287329e-04 9.7125620e-03 2.5046389e-03]
[0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
import tensorflow as tf

model = ...  # Your Tensorflow model
model.save("tf_model.h5")

loaded_model = tf.keras.models.load_model("tf_model.h5")


In [None]:
import tf2onnx

# Convert the model to ONNX format
onnx_model, _ = tf2onnx.convert.from_keras(loaded_model)


In [None]:
import onnx
from onnx2pytorch import ConvertModel

# Load ONNX model
onnx_model = onnx.load_model("tf_model.onnx")

# Convert ONNX model to PyTorch
pytorch_model = ConvertModel(onnx_model)
