## Name: Aqsa Rahman
## Roll no: i191908
## Section DS-N

## Importing Libraries

In [52]:
import pandas as pd 
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import torch.optim as optim
from tqdm import tqdm
import torch.nn.functional as F
from torchvision.io import read_image

## Reading files and preprocessing

In [53]:
labels= pd.read_csv("labels.csv")

In [54]:
labels.head()
#dropping the NaN values from our dataset.
labels=labels.dropna()

In [55]:
labels.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [56]:
labels=labels[['humour','sarcasm','offensive','motivational','overall_sentiment','image_name','text_corrected']]

In [57]:
reference=pd.read_csv("reference.csv")
#dropping the NaN values from our dataset.
reference=reference.dropna()

## Bert Model-(for Embeddings)

In [58]:
batch_size, max_len = 512, 64
train_iter, vocab = d2l.load_data_wiki(batch_size, max_len)

In [59]:
net = d2l.BERTModel(len(vocab), num_hiddens=128, norm_shape=[128],
                    ffn_num_input=128, ffn_num_hiddens=256, num_heads=2,
                    num_layers=2, dropout=0.2, key_size=128, query_size=128,
                    value_size=128, hid_in_features=128, mlm_in_features=128,
                    nsp_in_features=128)
devices = d2l.try_all_gpus()
loss = nn.CrossEntropyLoss()

In [60]:
def _get_batch_loss_bert(net, loss, vocab_size, tokens_X,
                         segments_X, valid_lens_x,
                         pred_positions_X, mlm_weights_X,
                         mlm_Y, nsp_y):
    # Forward pass
    _, mlm_Y_hat, nsp_Y_hat = net(tokens_X, segments_X,
                                  valid_lens_x.reshape(-1),
                                  pred_positions_X)
    # Compute masked language model loss
    mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_Y.reshape(-1)) *\
    mlm_weights_X.reshape(-1, 1)
    mlm_l = mlm_l.sum() / (mlm_weights_X.sum() + 1e-8)
    # Compute next sentence prediction loss
    nsp_l = loss(nsp_Y_hat, nsp_y)
    l = mlm_l + nsp_l
    return mlm_l, nsp_l, l

In [61]:
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.Adam(net.parameters(), lr=0.01)
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss',
                            xlim=[1, num_steps], legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for tokens_X, segments_X, valid_lens_x, pred_positions_X,\
            mlm_weights_X, mlm_Y, nsp_y in train_iter:
            tokens_X = tokens_X.to(devices[0])
            segments_X = segments_X.to(devices[0])
            valid_lens_x = valid_lens_x.to(devices[0])
            pred_positions_X = pred_positions_X.to(devices[0])
            mlm_weights_X = mlm_weights_X.to(devices[0])
            mlm_Y, nsp_y = mlm_Y.to(devices[0]), nsp_y.to(devices[0])
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(
                net, loss, vocab_size, tokens_X, segments_X, valid_lens_x,
                pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            metric.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')

In [62]:
#train_bert(train_iter, net, loss, len(vocab), devices, 50)

In [63]:
def get_bert_encoding(net, tokens_a, tokens_b=None):
    tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
    token_ids = torch.tensor(vocab[tokens], device=devices[0]).unsqueeze(0)
    segments = torch.tensor(segments, device=devices[0]).unsqueeze(0)
    valid_len = torch.tensor(len(tokens), device=devices[0]).unsqueeze(0)
    encoded_X, _, _ = net(token_ids, segments, valid_len)
    return encoded_X

In [64]:
tokens_a = ['a', 'crane', 'is', 'flying']
encoded_text = get_bert_encoding(net, tokens_a)
# Tokens: '<cls>', 'a', 'crane', 'is', 'flying', '<sep>'
encoded_text_cls = encoded_text[:, 0, :]
encoded_text_crane = encoded_text[:, 2, :]
encoded_text.shape, encoded_text_cls.shape, encoded_text_crane[0][:3]

(torch.Size([1, 6, 128]),
 torch.Size([1, 128]),
 tensor([ 0.7555, -2.6361, -0.0336], grad_fn=<SliceBackward0>))

## Image processing


In [12]:
from PIL import Image
import os, sys

In [None]:
#adding image names to a list from the dataframe
image_new= labels['image_name']
image_new=image_new.tolist()

In [None]:
for i in image_new:
    image = Image.open("images/"+i)
    #resizing the image to 600x600
    image = image.resize((120,120),Image.ANTIALIAS)
    #converting to RGB AND PNG(image-type)
    image = image.convert('RGB')
    new_name = i.split('.')[0]+".png"
    #saving the processed images in a new file
    image.save(fp="new_images/"+new_name)

## Splitting into Train, Test, and Validate

####  label encoding

In [73]:
#label encoding the image name data to provide specific labels to different classes.
def label_encoding(labels):
    labels = labels[['image_name','text_corrected','humour','sarcasm','offensive','motivational','overall_sentiment']]
    labels['image_name'] = [i.split(".")[0] for i in labels['image_name'] ]
    labels['overall_sentiment'] = labels['overall_sentiment'].replace("very_positive", 1)
    labels['overall_sentiment'] = labels['overall_sentiment'].replace("positive", 1)
    labels['overall_sentiment'] = labels['overall_sentiment'].replace("very_negative", 2)
    labels['overall_sentiment'] = labels['overall_sentiment'].replace("negative", 2)
    labels['overall_sentiment'] = labels['overall_sentiment'].replace("neutral", 0)
    labels['sarcasm'] = labels['sarcasm'].replace("not_sarcastic", 0)
    labels['sarcasm'] = labels['sarcasm'].replace("general", 1)
    labels['sarcasm'] = labels['sarcasm'].replace("twisted_meaning", 1)
    labels['sarcasm'] = labels['sarcasm'].replace("very_twisted", 1)
    labels['offensive'] = labels['offensive'].replace("not_offensive", 0)
    labels['offensive'] = labels['offensive'].replace("very_offensive", 1)
    labels['offensive'] = labels['offensive'].replace("slight", 1)
    labels['offensive'] = labels['offensive'].replace("hateful_offensive", 1)
    labels['humour'] = labels['humour'].replace("not_funny", 0)
    labels['humour'] = labels['humour'].replace("very_funny", 1)
    labels['humour'] = labels['humour'].replace("hilarious", 1)
    labels['humour'] = labels['humour'].replace("funny", 1)
    labels['motivational'] = labels['motivational'].replace("not_motivational", 0)
    labels['motivational'] = labels['motivational'].replace("motivational", 1)
    
    return labels

In [74]:
labels=label_encoding(labels)

In [75]:
#train=60,test=20,validate=20
train, validate, test = np.split(labels.sample(frac=1, random_state=30), [int(.6*len(labels)), int(.8*len(labels))])

In [76]:
#Saving into new csv files.
train.to_csv("train.csv",index=False)
validate.to_csv("validate.csv",index=False)
test.to_csv("test.csv",index=False)

## Data loader for neural network

In [77]:
""" Neural network (ANN) for images data to classify in 5 labels"""
from torch.utils.data import Dataset
class Dataset_loader(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        train_labels = pd.read_csv(annotations_file)
        labels_images = [i for i in train_labels['overall_sentiment']]

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]+'.png')
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 2]
        two=self.img_labels.iloc[idx, 3]
        three=self.img_labels.iloc[idx, 4]
        four=self.img_labels.iloc[idx, 5]
        five=self.img_labels.iloc[idx, 6]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        text = self.img_labels.iloc[idx, 1]
        text = text.split()
        padding = 187 - len(text)
        if(padding>0):
            for i in range(padding):
                text.append('')
        else:
            text = text[0:187]
        encoded_text = get_bert_encoding(net,text)
        array = torch.as_tensor(label)
        array1 = torch.as_tensor(two)
        array2 = torch.as_tensor(three)
        array3 = torch.as_tensor(four)
        array4 = torch.as_tensor(five)
        x=image.float()
        return x,encoded_text,array,array1,array2,array3,array4

In [78]:
def loader():
    train = Dataset_loader(annotations_file="train.csv",img_dir="./new_images")
    test = Dataset_loader(annotations_file="test.csv",img_dir="./new_images")
    validate = Dataset_loader(annotations_file="validate.csv",img_dir="./new_images")
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=64,num_workers=0,pin_memory=True)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=64)
    validate_dataloader = torch.utils.data.DataLoader(validate, batch_size=64)
    return train_dataloader,test_dataloader,validate_dataloader 


In [79]:
train_dataloader,test_dataloader,validate_dataloader = loader()

In [80]:
test = Dataset_loader(annotations_file="test.csv",img_dir="./new_images")
test.__getitem__(0)

(tensor([[[110., 105., 114.,  ...,  56.,  57.,  59.],
          [103., 103., 103.,  ...,  57.,  59.,  59.],
          [112.,  96.,  64.,  ...,  69.,  62.,  60.],
          ...,
          [ 37.,  38.,  43.,  ..., 102., 170., 197.],
          [ 56.,  57.,  55.,  ..., 181., 197., 213.],
          [ 58.,  59.,  60.,  ..., 224., 224., 225.]],
 
         [[ 61.,  61.,  76.,  ...,  39.,  40.,  43.],
          [ 62.,  69.,  66.,  ...,  43.,  42.,  41.],
          [ 68.,  70.,  48.,  ...,  61.,  53.,  53.],
          ...,
          [ 40.,  44.,  53.,  ..., 104., 170., 198.],
          [ 57.,  62.,  62.,  ..., 179., 195., 213.],
          [ 61.,  63.,  65.,  ..., 227., 224., 225.]],
 
         [[ 50.,  43.,  56.,  ...,  35.,  38.,  41.],
          [ 47.,  50.,  53.,  ...,  38.,  39.,  37.],
          [ 53.,  55.,  38.,  ...,  46.,  40.,  39.],
          ...,
          [ 37.,  41.,  54.,  ..., 112., 173., 197.],
          [ 60.,  64.,  68.,  ..., 183., 197., 213.],
          [ 63.,  63.,  68.,  .

## Image Neural Network 


In [81]:

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(43200,2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.fc3 = nn.Linear(1000, 500)
        self.fc4 = nn.Linear(500, 100)
        self.fc5 = nn.Linear(100, 5)
        
    
    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))

        x =F.log_softmax(self.fc5(x))
        return x
image_net = Net()

In [82]:
Net()

Net(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=43200, out_features=2000, bias=True)
  (fc2): Linear(in_features=2000, out_features=1000, bias=True)
  (fc3): Linear(in_features=1000, out_features=500, bias=True)
  (fc4): Linear(in_features=500, out_features=100, bias=True)
  (fc5): Linear(in_features=100, out_features=5, bias=True)
)

##  Text Neural Network

In [83]:
class TextNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(24192,5000)
        self.fc2 = nn.Linear(5000, 3000)
        self.fc3 = nn.Linear(3000, 1500)
        self.fc4 = nn.Linear(1500, 500)
        self.fc5 = nn.Linear(500, 100)
        self.fc6 = nn.Linear(100, 5)

    def forward(self, x):
        x = torch.flatten(x, 1) 
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))


        x =self.fc6(x)
        return x
nettext = TextNet()

## Combining two models

In [94]:
class Joining_models(nn.Module):
    def __init__(self, model1, model2):
        super(Joining_models, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.classifier = nn.Linear(10,12)
        
        self.output1 = nn.Linear(12, 2)
        self.output2 = nn.Linear(12, 2)
        self.output3 = nn.Linear(12, 2)
        self.output4 = nn.Linear(12, 2)
        self.output5 = nn.Linear(12, 3)
        
    def forward(self, x1, x2):
        x1 = self.model1(x1)
        x2 = self.model2(x2)
        x = torch.cat((x1, x2), dim=1)
        x = self.classifier(x)
        
        output1 = self.output1(x)
        output2 = self.output2(x)
        output3 = self.output3(x)
        output4 = self.output4(x)
        output5 = self.output5(x)
        
        return output1,output2,output3,output4,output5
    

In [95]:
finalmodel=Joining_models(image_net,nettext)

## Loss function

In [96]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(finalmodel.parameters(), lr=0.001)

## Training

In [97]:
for epoch in range(3):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs,text, labels,two,three,four,five = data
        optimizer.zero_grad()
        outputs,output2,output3,output4,output5= finalmodel(inputs,text)
        loss = criterion(outputs, labels)
        loss += criterion(output2, two)
        loss += criterion(output3, three)
        loss += criterion(output4, four)
        loss += criterion(output5, five)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2 == 0:   
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2:.3f}')
            running_loss = 0.0

  x =F.log_softmax(self.fc5(x))


[1,     1] loss: 2.352
[1,     3] loss: 962.234
[1,     5] loss: 328.815
[1,     7] loss: 161.163
[1,     9] loss: 50.567
[1,    11] loss: 104.275
[1,    13] loss: 47.805
[1,    15] loss: 77.046
[1,    17] loss: 46.591
[1,    19] loss: 49.413
[1,    21] loss: 19.407
[1,    23] loss: 27.702
[1,    25] loss: 22.350
[1,    27] loss: 14.961
[1,    29] loss: 10.526
[1,    31] loss: 11.677
[1,    33] loss: 10.104
[1,    35] loss: 7.505
[1,    37] loss: 6.296
[1,    39] loss: 5.690
[1,    41] loss: 4.494
[1,    43] loss: 4.888
[1,    45] loss: 4.104
[1,    47] loss: 4.332
[1,    49] loss: 3.936
[1,    51] loss: 3.732
[1,    53] loss: 3.872
[1,    55] loss: 3.773
[1,    57] loss: 3.690
[1,    59] loss: 3.616
[1,    61] loss: 3.629
[1,    63] loss: 3.659
[1,    65] loss: 3.582
[2,     1] loss: 2.103
[2,     3] loss: 3.595
[2,     5] loss: 3.573
[2,     7] loss: 3.603
[2,     9] loss: 3.603
[2,    11] loss: 3.772
[2,    13] loss: 3.363
[2,    15] loss: 3.597
[2,    17] loss: 3.546
[2,    19] los

KeyboardInterrupt: 

## Accuracy

In [102]:
correct1 = 0
correct2 = 0
correct3 = 0
correct4 = 0
correct5 = 0
total = 0
with torch.no_grad():
    for data in test_dataloader:
        inputs,text, labels,two,three,four,five= data
        outputs,output2,output3,output4,output5 = finalmodel(inputs,text)
        _, predicted1 = torch.max(outputs.data, 1)
        _, predicted2 = torch.max(output2.data, 1)
        _, predicted3 = torch.max(output3.data, 1)
        _, predicted4 = torch.max(output4.data, 1)
        _, predicted5 = torch.max(output5.data, 1)
        total += labels.size(0)
        correct1 += (predicted1 == labels).sum().item()
        correct2 += (predicted2 == two).sum().item()
        correct3 += (predicted3 == three).sum().item()
        correct4 += (predicted4 == four).sum().item()
        correct5 += (predicted5 == five).sum().item()

print(f'Accuracy  = {100 * correct1 // total } percent')
print(f'Accuracy  = {100 * correct2 // total } percent')
print(f'Accuracy  = {100 * correct3 // total } percent')
print(f'Accuracy  = {100 * correct4 // total } percent')
print(f'Accuracy  = {100 * correct5 // total } percent')


  x =F.log_softmax(self.fc5(x))


Accuracy  = 77 percent
Accuracy  = 73 percent
Accuracy  = 47 percent
Accuracy  = 65 percent
Accuracy  = 53 percent


## Saving the model

In [131]:
PATH = './Ai_project.pth'
torch.save(net.state_dict(), PATH)

In [None]:
from torchviz import make_dot
for X, y, z in validate_dataloader:
        yhat = finalmodel(X,y)
        break
make_dot(yhat, params=dict(list(finalmodel.named_parameters()))).render("rnn_torchviz", format="png")