In [14]:
%load_ext autoreload
%autoreload 2
import sys
import pandas as pd
import numpy as np
import transformers
import os
import re
import torch
import torch.nn as nn
import sys
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


# PREPROCESSING 

##  1) Custom Dataset Class
#### Implement a map-style dataset from .csv files; needed because data files train_titles.csv and test_titles.csv cannot be directly passed to the dataloader

In [16]:
import os
import pandas as pd
from torchvision.io import read_image
import torch
from torch.utils.data import Dataset

class customTextDataset(Dataset):
    def __init__(self, path, colnames, maxLen, tokenizer=None): 
        self.data =  pd.read_csv(path, names=colnames, header=None, sep = ',', index_col=False)
        self.tokenizer = tokenizer
        self.maxLen = maxLen

    def __getitem__(self, idx):
        sentence = self.data.loc[idx].text
        out = self.tokenizer(sentence, padding = 'max_length', truncation = True, max_length = self.maxLen)
        ids = out['input_ids']
        mask = torch.tensor(out['attention_mask'])
        ids = torch.tensor(ids)
        # label
        label = self.data.loc[idx].food
        return ids, mask, label
        #return torch.tensor(ids),label # torch.tensor(sentence), mask, label


    def __len__(self):
        return len(self.data)

    def getHead(self):
        print(self.data.head())



## 2) DistilBert Tokenizer from Huggingface

In [17]:
from torch.utils.data import DataLoader
maxLen = 32
batchSize = 512
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
vocabSize = len(tokenizer)
print(f"Vocabulary Size = {vocabSize}")

Vocabulary Size = 30522


## 3) Create Dataloaders from Custom Text Dataset

In [18]:
trainData = customTextDataset(path = 'train_titles.csv', colnames=['image_path', 'text', 'food'], tokenizer = tokenizer,maxLen=maxLen)
trainLoader = DataLoader(trainData,batch_size=batchSize,shuffle=True)
print(f'Number of train data points  = {trainData.__len__()}')
#trainData.getHead()
testData = customTextDataset(path = 'test_titles.csv', colnames=['image_path', 'text', 'food'], tokenizer = tokenizer,maxLen=maxLen)
testLoader = DataLoader(testData,batch_size=testData.__len__(),shuffle=True)
print(f"Number of test data points = {testData.__len__()}")

Number of train data points  = 67972
Number of test data points = 22716


#### Print an example data point (tokens/tokenIds) and label

In [19]:
data = next(iter(trainLoader))
print(f"Encoded Text = {data[0][0]}, {data[0][0].type()}, {data[0][0].shape}")
print(f"Mask = {data[1][0]}, {data[1][0].type()}, {data[1][0].shape}")
print(f"Encoded Label = {data[2][0]}")
print(f"Decoded tokens from encoded ids: \n'{tokenizer.decode(data[0][0])}'")

Encoded Text = tensor([  101, 20130,  1998, 24842,  2015, 17974,  2013, 10346,  2480,  4825,
         1998, 18651,  2012,  5472,  4355,  2378,  5439,  1998,  3509,  7001,
         2006,  6819, 26247,   102,     0,     0,     0,     0,     0,     0,
            0,     0]), torch.LongTensor, torch.Size([32])
Mask = tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0]), torch.LongTensor, torch.Size([32])
Encoded Label = shrimp_and_grits
Decoded tokens from encoded ids: 
'[CLS] shrimp and grits recipe from finz restaurant and grill at sandestin golf and beach resort on vimeo [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


## 4) Convert categorical labels (recipe names) to numerical labels ({0,1,2,....101})

In [21]:
labelsTrain_ = []
for data in trainLoader:
    labelsTrain_ = labelsTrain_ + (list(data[2]))
  
from collections import Counter
labelsDict = Counter(labelsTrain_)
keys = labelsDict.keys()
lblMap = {x:i for i,x in enumerate(keys)}
print(f"Number of classes = {len(list(keys))}") # This should be 101 

Number of classes = 101


# DISTILBERT + FEEDFORWARD MODEL

In [23]:
import numpy
from transformers import GPT2Config
import torch.optim as optim

hidden_size = 128
dModel = 128
numClasses = 101
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels = numClasses)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
lossVals = []
model = model.to(device)
print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## 1) Train Transformer + Feedforward Model

In [24]:
numEpochs = 3
for epoch in range(numEpochs):  # loop over the dataset multiple times
    for data in trainLoader:
        ids,masks,labels_ = data # masks is of dimension (batchsize,  maxLen, maxLen)
        labels = torch.tensor([lblMap[x] for x in labels_]) 
        optimizer.zero_grad()
        outputs = model(input_ids = ids, attention_mask = masks,labels = labels)
        loss = outputs[0]
        loss.backward()
        lossVals.append(loss.detach().cpu().clone().numpy())
        optimizer.step()
    if epoch%1 == 0:
        print(f"Epoch = {epoch}. Loss = {loss}")
print('Finished Training')

Epoch = 0. Loss = 0.662666380405426
Epoch = 1. Loss = 0.6682378053665161
Epoch = 2. Loss = 0.360770046710968
Epoch = 3. Loss = 0.29378339648246765
Epoch = 4. Loss = 0.15021219849586487
Epoch = 5. Loss = 0.11752953380346298
Epoch = 6. Loss = 0.1310907006263733
Epoch = 7. Loss = 0.16684754192829132
Epoch = 8. Loss = 0.08448788523674011
Epoch = 9. Loss = 0.09400229901075363
Epoch = 10. Loss = 0.07695019245147705
Epoch = 11. Loss = 0.07765358686447144
Epoch = 12. Loss = 0.11752285063266754
Epoch = 13. Loss = 0.11939051002264023
Epoch = 14. Loss = 0.13364431262016296
Finished Training


## 2) Test Transformer + Feedforward Model

In [25]:
model.eval() # again no gradients needed - so set mode.eval()
correct_pred = 0
numTst = 0
with torch.no_grad():
    for data in testLoader:
        ids,masks,labels_ = data
        masks = masks.repeat(8,1,1) 
        numTst = numTst + ids.shape[0]
        labels = []
        labels.append([lblMap[x] for x in labels_])
        labels = torch.tensor(labels[0])
        outputs = model(ids,masks)
        _, predictions = torch.max(outputs[1], 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred += 1

accuracy = 100 * float(correct_pred)/ numTst
print(f"Classification Accuracy = {accuracy:.3f}%")

ValueError: too many values to unpack (expected 2)

In [None]:
import numpy as np
fig, ax = plt.subplots(nrows = 1, ncols=1)
ax.plot(range(len(lossVals)),lossVals)
ax.set_xlabel('Iterations',fontsize = 15)
ax.set_ylabel('Cross Entropy Loss', fontsize = 15)
ax.set_title('Classification Accuracy = {:.2f}%'.format(accuracy),fontsize = 15)
path = 'dim_{}_accry_{:.2f}len_{}_hidden_{}'.format(dModel,accuracy, maxLen,hidden_size)
plt.savefig(path+'.pdf')

# SVM With LSTM 
### Instead of the 2 fully connected layers and 1 soft-max layer in the LSTM model above, we use multi-class SVM for classification below.

## 1) Declare LSTM model and load trained weights.

In [None]:
class w2nModelSVM(torch.nn.Module):
    def __init__(self,vocab_size, embedding_dim, hidden_size, nClasses):
        super(w2nModelSVM, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.FC0 = nn.FC0(input_size = embedding_dim, hidden_size = hidden_size,batch_first=True)

    def forward(self, x):
        x = self.word_embeddings(x)     # output dimensions is batch size = N x sequence length x feature size
        (x,_) = self.LSTM(x)        
        x = x[:, -1, :]                 # gives two dimensional output, not three dimensional output
        return x

modelSVM = w2nModelSVM(vocab_size = vocabSize, 
                 embedding_dim = dModel,
                 hidden_size=hidden_size, nClasses = numClasses
                )
modelSVM.word_embeddings.weight.data.copy_((model.word_embeddings.weight))
modelSVM.FC0.load_state_dict(model.FC0.state_dict())

## 2) Get LSTM Embeddings for Train and Test Dataset

In [14]:
numTrn = trainData.__len__()
trnEmbdngs = np.zeros((numTrn,hidden_size))
trnLbls  = []
modelSVM.eval()
for i, data in enumerate(trainLoader):
    inputs,labels_ = data
    outputs = modelSVM(inputs)
    trnEmbdngs[i*batchSize: (i+1)*batchSize,:] = outputs.detach().clone().numpy()
    trnLbls = trnLbls + [lblMap[x] for x in labels_]

numTst = testData.__len__()
tstEmbdngs = np.zeros((numTst,hidden_size))
tstLbls = [] 
for i, data in enumerate(testLoader):
    inputs,labels_ = data
    labels =  []
    outputs = modelSVM(inputs)
    tstEmbdngs = outputs.detach().clone().numpy()
    tstLbls = tstLbls + [lblMap[x] for x in labels_]

## 3) Accuracy using Linear SVM


In [None]:
from sklearn import svm
clf = svm.SVC(decision_function_shape='ovo', kernel = 'linear')
clf.fit(trnEmbdngs, np.asarray(trnLbls))
TrnAccrcyLnr = clf.score(trnEmbdngs, np.asarray(trnLbls))
TstAccrcyLnr = clf.score(tstEmbdngs,np.asarray(tstLbls))
print(r'Train Accuracy of Linear SVM =', 100*TrnAccrcyLnr)
print(r'Test Accuracy of Linear SVM =', 100*TstAccrcyLnr)

## 4) Accuracy using RBF Kernel SVM

In [None]:
clf = svm.SVC(decision_function_shape='ovo', kernel='rbf')
clf.fit(trnEmbdngs, trnLbls)
TrnAccrcyKrnl = clf.score(trnEmbdngs, trnLbls)
TstAccrcyKrnl = clf.score(tstEmbdngs,tstLbls)
print(r'Train Accuracy of Kernel SVM =', 100*TrnAccrcyKrnl)
print(r'Test Accuracy of Kernel SVM =', 100*TstAccrcyKrnl)