# Language Processing

In [1]:
import re
import pandas as pd
from io import StringIO
import torch.nn as nn
import torch.optim as optim
import torch

import numpy as np

In [2]:
TRAIN_FILENAME = '../dataset/train.csv'
TEST_FILENAME = '../dataset/test.csv'
IMAGE_DIRECTORY = "../dataset/images"

def extract_df(filename):
    """
    Uses a filename to extract and return the information in dataframe format
    """
    with open(filename) as file:
        lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
        df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")
    
    return df

train_data = extract_df(TRAIN_FILENAME)
test_data = extract_df(TEST_FILENAME)
train_data

Unnamed: 0,ImageID,Labels,Caption
0,0.jpg,1,Woman in swim suit holding parasol on sunny day.
1,1.jpg,1 19,A couple of men riding horses on top of a gree...
2,2.jpg,1,They are brave for riding in the jungle on tho...
3,3.jpg,8 3 13,a black and silver clock tower at an intersect...
4,4.jpg,8 3 7,A train coming to a stop on the tracks out side.
...,...,...,...
29991,29995.jpg,8 1 2,A picture of a truck that is in the middle of ...
29992,29996.jpg,1,A plate topped with a pizza being cut with a s...
29993,29997.jpg,1,A man riding a snowboard on top of snow.
29994,29998.jpg,1,This photo shows people skiing in the mountains.


In [3]:
words = []
for caption in train_data['Caption']:
    strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
    words.extend(strippedCaption.split())

for caption in test_data['Caption']:
    strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
    words.extend(strippedCaption.split())

wordSet = set(words)
wordDict = {}
i = 1
for word in wordSet:
    wordDict[word] = i
    i+=1

In [84]:
def AddPaddedTokenColumn(dataframe):
    tokenColumn = []
    for caption in dataframe['Caption']:
        tokens = []
        strippedCaption = re.sub('[^A-Za-z0-9 ]+', '', caption)
        for word in strippedCaption.split():
            tokens.append(wordDict[word])
        length = len(tokens)
        tokenColumn.append(np.pad(tokens, (0, 50-length)))
    dataframe['tokenized-caption'] = tokenColumn

def AddOneHotLabelColumn(dataframe):
    labelColumn = []
    for labels in dataframe['Labels']:
        oneHotLabels = [0] * 19
        splitLabels = labels.split()
        for label in splitLabels:
            oneHotLabels[int(label)-1] = 1.0
        labelColumn.append(list(oneHotLabels))
    dataframe['one-hot-labels'] = labelColumn
    
AddPaddedTokenColumn(train_data)
AddPaddedTokenColumn(test_data)
AddOneHotLabelColumn(train_data)


#np.array(GetOneHotLabelColumn(train_data)).astype(np.float32)
#np.array(vectorizedTrainingText)[12]
#train_data['tokenized-caption']

In [4]:
vocabSize = len(wordDict.keys())+1
embeddingSize = 20

In [11]:
"""
def prepare_sequence(sequence, wordDictionary):
    tokens = [wordDictionary[w] for w in sequence]
    return torch.tensor(tokens, dtype=torch.long)
    
model = nn.Sequential(
          nn.Embedding(vocabSize, embeddingSize),
          nn.LSTM(embeddingSize, hidden_size=64, num_layers=2, dropout=.2),
          nn.Linear(64, 19),
          nn.Sigmoid()
        )
"""

'\ndef prepare_sequence(sequence, wordDictionary):\n    tokens = [wordDictionary[w] for w in sequence]\n    return torch.tensor(tokens, dtype=torch.long)\n    \nmodel = nn.Sequential(\n          nn.Embedding(vocabSize, embeddingSize),\n          nn.LSTM(embeddingSize, hidden_size=64, num_layers=2, dropout=.2),\n          nn.Linear(64, 19),\n          nn.Sigmoid()\n        )\n'

In [12]:
"""
def train_loop(dataset, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
"""

'\ndef train_loop(dataset, model, loss_fn, optimizer):\n    size = len(dataloader.dataset)\n    # Set the model to training mode - important for batch normalization and dropout layers\n    # Unnecessary in this situation but added for best practices\n    model.train()\n    for batch, (X, y) in enumerate(dataloader):\n        # Compute prediction and loss\n        pred = model(X)\n        loss = loss_fn(pred, y)\n\n        # Backpropagation\n        loss.backward()\n        optimizer.step()\n        optimizer.zero_grad()\n\n        if batch % 100 == 0:\n            loss, current = loss.item(), batch * batch_size + len(X)\n            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")\n'

In [5]:
LEARNING_RATE = .1
BATCH_SIZE = 128
EPOCHS = 10
EMBEDDING_DIM = 20

In [85]:
import torch.utils.data as data
loader = data.DataLoader(data.TensorDataset(torch.tensor(train_data['tokenized-caption']), torch.tensor(train_data['one-hot-labels'])), shuffle=True, batch_size=BATCH_SIZE)

In [48]:
torch.tensor(train_data['tokenized-caption']).size()

torch.Size([29996, 50])

# attempt 1

following online walkthroughs, I made this class. It does not seem to work however, and seems to be ignoring the input and always giving high-max probability on the first value and low-min on the others

In [136]:

# PyTorch models inherit from torch.nn.Module
class LSTMNetwork(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, numLayers):
        super(LSTMNetwork, self).__init__()

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, numLayers, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds: torch.nn.Embedding = self.word_embeddings(sentence)
        #print(embeds[0:1])
        _output, (hidden, _cell) = self.lstm(embeds)
        print(hidden[-1][0:1])
        tag_space = self.hidden2tag(hidden[-1])
        return tag_space

model = LSTMNetwork(EMBEDDING_DIM, 128, vocabSize, 19, 2)

# Attempt 2

using the sequential architecture, I thought I might be able to simplify and streamline, thus removing possibilities of error. Same issue

In [88]:
class extract_tensor(nn.Module):
    def forward(self,x):
        # Output shape (batch, features, hidden)
        tensor, _ = x
        # Reshape shape (batch, hidden)
        return tensor[:, -1, :]

torchModel = nn.Sequential(
        nn.Embedding(vocabSize, EMBEDDING_DIM),
        nn.LSTM(EMBEDDING_DIM, 128, 2, batch_first=True),
        extract_tensor(),
        nn.Linear(128, 19),
        nn.Sigmoid()
        )


# training

In [None]:
def GetMatch(prediction, trueLabel):
    match = True
    i = 0
    while i < len(trueLabel):
        if trueLabel[i] == 1.0:
            if prediction[i] < .5:
                match = False
        else:
            if prediction[i] > .5:
                match = False
        i+=1
    
    return match

In [137]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(torchModel.parameters(), LEARNING_RATE)

def train_loop(loader, model, loss_fn, optimizer):
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    
    
    i = 0
    while i < 3:
        model.train()
        # Compute prediction and loss
        for X_batch, y_batch in loader:
            #model.zero_grad()
            y_pred = model(X_batch)
            print(y_pred)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        i+=1
        
        '''
        model.eval()
        with torch.no_grad():
            
            truePredictions = 0
            #total = 0
            j = 0
            while j < len(vectorizedTrainingText):
                prediction = model(torch.tensor(np.array(vectorizedTrainingText[j])))
                isMatch = GetMatch(prediction, oneHotColumn[j])
                if isMatch:
                    truePredictions+=1
                #total+=1
                j+=1
                
            print("totalTrue: %d" % (truePredictions))  

            #print("Epoch %d: train Accuracy %.4f, test RMSE %.4f" % (i, train_rmse, test_rmse))
            '''
    
train_loop(loader, model, loss_fn, optimizer)

tensor([[ 1.6825e-03, -4.2742e-02,  4.6964e-02,  2.1657e-02, -5.5646e-02,
         -2.2137e-02,  9.5978e-03, -6.3160e-02, -4.4037e-02,  2.9831e-02,
          3.0418e-02,  3.5529e-03, -1.4785e-02, -1.7647e-02, -2.0819e-02,
         -3.2226e-02, -6.0497e-02, -2.9062e-02,  1.6728e-02, -1.8488e-02,
         -3.8057e-02,  1.6738e-02,  4.0157e-03, -9.1769e-05,  1.1234e-02,
         -6.5854e-03,  1.2670e-02,  6.3743e-03, -1.5040e-03,  4.9300e-02,
          1.9595e-02,  5.8918e-02,  3.9992e-02, -1.2450e-02, -1.1147e-02,
          4.5521e-02,  1.4328e-03, -1.1624e-02, -4.5163e-02, -2.7906e-02,
          1.4940e-02, -5.7310e-02,  3.5780e-02,  6.3751e-03,  4.0420e-02,
          5.4789e-03, -3.0871e-02, -1.0677e-02, -1.1740e-02,  1.1694e-02,
          1.9418e-02,  1.1783e-02, -4.8463e-02, -5.9706e-02,  2.1357e-02,
          6.5190e-02, -2.5996e-03,  5.7128e-02, -5.1089e-02, -7.2023e-02,
         -7.1613e-02,  5.8651e-03, -9.5301e-03, -7.6297e-03,  8.0521e-02,
          5.4549e-02, -2.7766e-02,  2.

KeyboardInterrupt: 

In [97]:
print(torchModel(torch.tensor(train_data['tokenized-caption'][0:3])))

tensor([[8.6111e-01, 1.2725e-01, 1.0356e-01, 3.2755e-02, 3.6669e-03, 4.6316e-02,
         2.3735e-02, 2.4199e-02, 3.2705e-02, 4.8910e-03, 1.7214e-02, 1.3270e-19,
         5.7112e-02, 7.8847e-04, 8.2414e-02, 8.3506e-02, 3.4948e-02, 1.2192e-01,
         9.9441e-03],
        [8.6111e-01, 1.2725e-01, 1.0356e-01, 3.2755e-02, 3.6669e-03, 4.6316e-02,
         2.3735e-02, 2.4199e-02, 3.2705e-02, 4.8910e-03, 1.7214e-02, 1.3270e-19,
         5.7112e-02, 7.8847e-04, 8.2414e-02, 8.3506e-02, 3.4948e-02, 1.2192e-01,
         9.9441e-03],
        [8.6111e-01, 1.2725e-01, 1.0356e-01, 3.2755e-02, 3.6669e-03, 4.6316e-02,
         2.3735e-02, 2.4199e-02, 3.2705e-02, 4.8910e-03, 1.7214e-02, 1.3270e-19,
         5.7112e-02, 7.8847e-04, 8.2414e-02, 8.3506e-02, 3.4948e-02, 1.2192e-01,
         9.9441e-03]], grad_fn=<SigmoidBackward0>)


In [155]:
truePredictions = 0
#total = 0
j = 0
while j < len(train_data['tokenized-caption']):
    prediction = nn.functional.sigmoid(model(torch.tensor(train_data['tokenized-caption'][j])))
    isMatch = GetMatch(prediction, train_data['one-hot-labels'][j])
    if isMatch:
        truePredictions+=1
    #total+=1
    j+=1
    
print("totalTrue: %d" % (truePredictions))  

totalTrue: 0


In [164]:
train_data['tokenized-caption'][90]

array([4560, 2748, 5973,  242,  467, 8953, 3170,  375, 8953, 7069, 8811,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [None]:
model(torch.tensor(train_data['tokenized-caption'][9]))


tensor([ -0.8839,  -5.8008,  -6.9705, -19.2188,  -3.7581,  -6.6175,  -8.9487,
         -2.7512,  -3.6085,  -7.1201,  -3.4604, -58.9693,  -4.1763, -28.4777,
        -19.3310, -21.0089,  -4.1190, -26.3215,  -0.6509],
       grad_fn=<ViewBackward0>)

# keras implementation

utilizing keras and the implementation I did for Assignment 4 NLP

In [None]:
def GetOneHotLabelColumn(dataframe):
    labelColumn = []
    for labels in dataframe['Labels']:
        oneHotLabels = [0] * 19
        splitLabels = labels.split()
        for label in splitLabels:
            oneHotLabels[int(label)-1] = 1
        labelColumn.append(list(oneHotLabels))
    return list(labelColumn)

In [6]:
import keras
from keras import layers

def GetLSTMModel(lstmNeurons=128, lstmLayers=2, dropout=.2):
    numClasses = 19

    # Model definition
    inputs = keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(vocabSize+1, EMBEDDING_DIM, mask_zero=True)(inputs)
    
    i = 1
    while i < lstmLayers:
        x = layers.LSTM(lstmNeurons, return_sequences=True, dropout=dropout)(x)
        i+=1
        
    lstm_out, state_h, _state_c = layers.LSTM(lstmNeurons, return_sequences=True, return_state=True, dropout=dropout)(x)
    
    template_output = layers.Dense(numClasses, activation='sigmoid', name="dense")(state_h)
    
    model = keras.Model(inputs=inputs, outputs=template_output)
    # Compile the model
    model.compile(
        optimizer="adam",
        loss={
        "dense": "binary_crossentropy"
        },  # adjust as needed,
        metrics=["accuracy"],
    )
    return model

In [7]:
def GetVectorizationLayer():
    vectorizeLayer = keras.layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        max_tokens=20000,
        output_mode="int",
        output_sequence_length=50
    )
    
    vectorizeLayer.adapt(train_data['Caption'])
    return vectorizeLayer

def vectorizeAllText(data):
    return list(data.map(vectorizeText))

def vectorizeText(text):
    return list(VectorizationLayer(text))

    
VectorizationLayer = GetVectorizationLayer()
vectorizedTrainingText = vectorizeAllText(train_data['Caption'])

In [27]:
np.array(train_data['one-hot-labels'])[12]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [41]:
LSTMModel = GetLSTMModel()
history = LSTMModel.fit(x=np.array(vectorizedTrainingText), y=np.array(GetOneHotLabelColumn(train_data)).astype(np.float32), epochs=20, batch_size=128)

Epoch 1/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 109ms/step - accuracy: 0.7393 - loss: 0.2938
Epoch 2/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 110ms/step - accuracy: 0.7586 - loss: 0.1852
Epoch 3/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.7422 - loss: 0.1581
Epoch 4/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 107ms/step - accuracy: 0.7797 - loss: 0.1379
Epoch 5/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 115ms/step - accuracy: 0.7996 - loss: 0.1239
Epoch 6/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.8083 - loss: 0.1101
Epoch 7/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.8093 - loss: 0.1031
Epoch 8/20
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.8109 - loss: 0.0978
Epoch 9/20
[1m2

In [42]:
predictions = LSTMModel.predict(np.array(vectorizedTrainingText))

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step


In [51]:
numCorrect = 0
i = 0
j = 0
while i < len(train_data['one-hot-labels']):
    isMatch = GetMatch(predictions[i], train_data['one-hot-labels'][i])
    if isMatch:
        numCorrect+=1
    elif j < 5:
        print(i)
        print(predictions[i])  
        print(train_data['one-hot-labels'][i])  
    i+=1
print(numCorrect/i)

3
[3.1694105e-01 3.7145887e-02 5.4427820e-01 3.7759445e-03 1.8368529e-03
 2.8257024e-02 8.9629013e-03 4.5273006e-02 3.8708120e-03 1.8145217e-01
 4.9085047e-02 3.4407527e-05 1.1595217e-01 2.0572187e-02 3.3660762e-02
 6.8845443e-02 1.1073303e-02 1.2731047e-02 6.5393251e-04]
[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
4
[3.7099150e-01 1.5011821e-02 1.4747591e-01 2.6131000e-03 3.3478189e-04
 8.9361249e-03 9.9399084e-01 4.9531050e-02 3.2035732e-03 1.3801594e-01
 1.4611239e-03 2.4750245e-05 8.8741221e-03 3.5770651e-04 7.1394660e-02
 2.8157684e-03 1.0577444e-03 8.8727544e-04 1.7515459e-03]
[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8
[4.8571834e-01 1.7296770e-01 8.2035679e-01 8.7736502e-02 1.2336288e-03
 1.9539288e-01 1.7689645e-02 3.8126528e-01 1.4360735e-02 3.6531731e-01
 5.1164135e-02 1.0625952e-04 1.6934788e-02 7.2426423e-02 7.1952187e-02
 7.6438813e-03 1.2411571e-03 9.5043862e-03 2.1353408e-03]
[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9
[9.40