<a href="https://colab.research.google.com/github/aishoo1612/HateAndAggressionDetection/blob/main/MachineTranslationHateSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#To filter the warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing the basic modules and libraries for data exploration and processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#mounting the drive to the colaboratory
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#reading the excel data file from Drive
maindf = pd.read_csv('/content/drive/MyDrive/agr_en_train.csv')

In [6]:
maindf.head()

Unnamed: 0,Comments,level
0,Well said sonu..you have courage to stand agai...,2
1,"Most of Private Banks ATM's Like HDFC, ICICI e...",0
2,"Now question is, Pakistan will adhere to this?",2
3,Pakistan is comprised of fake muslims who does...,2
4,"??we r against cow slaughter,so of course it w...",0


In [8]:
maindf.rename(columns = {'Comments':'description', 'level':'category'}, inplace = True)

In [39]:
maindf.category = maindf.category.astype(str)

In [40]:
countdf = maindf.category.value_counts()
countdf.reset_index().head()

Unnamed: 0,index,category
0,0,5052
1,1,4240
2,2,2708


In [41]:
countdf = countdf.reset_index()
lessdf = countdf.loc[countdf['category']>=10]
l = lessdf['index'].tolist()
l

['0', '1', '2']

In [42]:
maindf['category'] = maindf['category'].apply(lambda x: "Others" if x not in l else x)
maindf.category.value_counts()

0    5052
1    4240
2    2708
Name: category, dtype: int64

# **Method 2:** Using Seq2Seq Attention Neural Machine Translation

To implement the Seq2Seq Attention Network, I've used the code from the official Pytorch Implementation. I've modified parts of the code according to the data requirements. 
https://github.com/pytorch/tutorials/blob/master/intermediate_source/seq2seq_translation_tutorial.py

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import spacy
import numpy as np

import random
import math
import time
spacy_en = spacy.load('en')

In [44]:
#Set the random seeds for reproducability.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [45]:
desc_list = maindf['description'].tolist()
print(desc_list[0])

Well said sonu..you have courage to stand against dadagiri of Muslims


In [46]:
cat_list = maindf['category'].tolist()
cat_list[0]

'2'

In [47]:
descstring = ' '.join([str(elem) for elem in set(desc_list)])
catstring =  ' '.join([str(elem) for elem in set(cat_list)])

In [48]:
descstring



In [49]:
catstring

'2 1 0'

In [50]:
#define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [51]:
maindf['description'][1]

"Most of Private Banks ATM's Like HDFC, ICICI etc are out of cash. Only Public sector bank's ATM working"

In [52]:
maxL = maindf['description'].apply(len)
print(maxL.max())

6978


In [53]:
#defining a value greater than the max value of a description to avoid Index Out of Bounds Error 
MAX_LENGTH = 7000

Defining the Seq2Seq Encoder, Decoder and the Attention Layer Decoder classes 

In [54]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [55]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [56]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length = MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

#### Preparing Training Data

To train, for each pair we will need an input tensor (indexes of the words in the input sentence) and target tensor (indexes of the words in the target sentence). While creating these vectors we will append the EOS token to both sequences.



In [57]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [58]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    print(lang,sentence)
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [59]:
def prepare_data(lang1,lang2,tuples):
  input_lang,output_lang = Lang(lang1),Lang(lang2)
  for x,y in tuples:
    input_lang.addSentence(x)
    output_lang.addSentence(y)
  
  return input_lang,output_lang

In [60]:
input_lang, output_lang = Lang(descstring), Lang(catstring)

In [61]:
train_data, test_data = np.split(maindf.sample(frac=1, random_state=42), [int(.8*len(maindf))])

In [62]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9600 entries, 1935 to 5330
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  9600 non-null   object
 1   category     9600 non-null   object
dtypes: object(2)
memory usage: 225.0+ KB


In [63]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 4531 to 7270
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  2400 non-null   object
 1   category     2400 non-null   object
dtypes: object(2)
memory usage: 56.2+ KB


In [64]:
subset = maindf[['description','category']]
lang_tuples = [tuple(x) for x in subset.to_numpy()]

In [65]:
train_data.head()

Unnamed: 0,description,category
1935,Excellent movie 👍,0
6494,Anna should join bjp,0
1720,Rahul What's your address? I'll send some burn...,1
9120,"Hi, What Shall be Bottom Range for Nifty in Wo...",0
360,This is surgical strike from Pakistan. India...,0


In [66]:
input_subset = train_data[['description','category']]
tuples = [tuple(x) for x in input_subset.to_numpy()]

In [67]:
print(tuples[3])

('Hi, What Shall be Bottom Range for Nifty in Worst Scenario for April Series.', '0')


In [68]:
input_lang,output_lang = prepare_data(descstring,catstring,lang_tuples) 

In [69]:
iptensor,trgtensor = tensorsFromPair(tuples[0])
iptensor.size(0)

<__main__.Lang object at 0x7fb616c23a90> Excellent movie 👍
<__main__.Lang object at 0x7fb616c23450> 0


4

Adding Teacher Forcing to improve the accuracy

In [70]:
teacher_forcing_ratio = 0.5

In [71]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    #print("input tensor:", input_tensor)
    #print("input length",input_length)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        #print(input_length,"\\",input_tensor[ei],"\\",encoder_hidden)
        #print("0",input_length)
        #print("1",len(encoder_output[0]))
        #print("2",len(encoder_output[0][0]))
        #print("3",len(encoder_output[0, 0]))
        #print("4",len(encoder_outputs))
        #print("4.5",ei)
        #print("5",encoder_outputs[ei])
        #print("6",len(encoder_outputs[ei]))
        encoder_outputs[ei] = encoder_output[0, 0]
        #print("7",encoder_outputs[ei])

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


In [72]:
#training_pairs = [tensorsFromPair(random.choice(tuples)) for i in range(100)]
#len(training_pairs)

In [73]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    
    if percent!=0:
      now = time.time()
      s = now - since
      es = s / (percent)
      rs = es - s
    else:
      return 0
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [74]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(tuples))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(0, n_iters):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        #print(len(input_tensor))
        #print(len(target_tensor))


        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)


In [75]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Defining the evaluation method

In [76]:
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [77]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(tuples)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [78]:
type(input_lang)

__main__.Lang

In [None]:
#encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
#encoder_outputs[ei] = encoder_output[0, 0]

In [79]:
input_lang.n_words

44910

In [None]:
#EncoderRNN(input_lang.n_words, hidden_size).to(device)

Defining the parameters ad Training the Model

In [80]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 20000, print_every=100)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<__main__.Lang object at 0x7fb616c23a90> To deal with them as it was announced on 8th November 2016....DEOMONITISATION....what happened... Every channel and anchor is shy of questioning.....
<__main__.Lang object at 0x7fb616c23450> 0
<__main__.Lang object at 0x7fb616c23a90> Interest in the stock mkts have fainted, how much more can a mkt move up with limited ppl earning
<__main__.Lang object at 0x7fb616c23450> 0
<__main__.Lang object at 0x7fb616c23a90> If there is sound of thunder then to whom he can blame
<__main__.Lang object at 0x7fb616c23450> 2
<__main__.Lang object at 0x7fb616c23a90> Hi Anuj....is it the right time to enter into edelweiss financials
<__main__.Lang object at 0x7fb616c23450> 0
<__main__.Lang object at 0x7fb616c23a90> Whatever you say is correct from point of humanity but not support reality. If we give away Kashmir to Pakistan problem will not be solved. There are Hindus Sikh Shia also as well some Kas

### The graph above shows the variation of NLL LOSS ( Negative Log Likely Loss ) with the number of epochs. 

In [None]:
#saving the encoder and decoder model
encoder_save_name = 'encoder1.pt'
path1 = F"/content/gdrive/My Drive/{encoder_save_name}"
torch.save(encoder1.state_dict(), 'encoder1.dict')
decoder_save_name = 'attn_decoder1.pt'
path2 = F"/content/gdrive/My Drive/{decoder_save_name}" 
#torch.save(adecoder1.state_dict(), path2)
torch.save(attn_decoder1.state_dict(), 'attn_decoder1.dict')

In [None]:
#loading the encoder and decoder model
encoder_model = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

#encoder_save_name = 'encoder.pt'
#path = F"/content/gdrive/My Drive/{encoder_save_name}"
#encoder.load_state_dict(torch.load(path))

#decoder_save_name = 'decoder.pt'
#path = F"/content/gdrive/My Drive/{decoder_save_name}"
#decoder.load_state_dict(torch.load(path))

encoder_model.load_state_dict(torch.load('encoder1.dict'))
attn_decoder.load_state_dict(torch.load('decoder1.dict'))

In [81]:
evaluateRandomly(encoder1,attn_decoder1)

> why are they crying on stage?? coz of launching getting delayed?
= 1
<__main__.Lang object at 0x7fb616c23a90> why are they crying on stage?? coz of launching getting delayed?
< 1 <EOS>

> Dude pay in rouble if it still has value !!!
= 1
<__main__.Lang object at 0x7fb616c23a90> Dude pay in rouble if it still has value !!!
< 1 <EOS>

> Muslims are slaves, this is fact
= 1
<__main__.Lang object at 0x7fb616c23a90> Muslims are slaves, this is fact
< 1 <EOS>

> rain industries !! green even when market in red !!
= 0
<__main__.Lang object at 0x7fb616c23a90> rain industries !! green even when market in red !!
< 0 <EOS>

> Why produce so many children in the first place when you can't afford to raise them?
= 1
<__main__.Lang object at 0x7fb616c23a90> Why produce so many children in the first place when you can't afford to raise them?
< 1 <EOS>

> Mujeeb india is not secular but sickular... Friend...here we have peoples like zakir naik who got freedom to talk all the non sense...about islam...

In [82]:
output_words, attentions = evaluate(encoder1,attn_decoder1, "Most of Private Banks ATM's Like HDFC, ICICI etc are out of cash. Only Public sector bank's ATM working")
plt.matshow(attentions.numpy())
plt.show()

<__main__.Lang object at 0x7fb616c23a90> Most of Private Banks ATM's Like HDFC, ICICI etc are out of cash. Only Public sector bank's ATM working


In [83]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [84]:
def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    out=' '.join(output_words).rstrip(' <EOS>')
    print('output =', out )
    return out
    #showAttention(input_sentence, output_words, attentions)


In [85]:
evaluateAndShowAttention("Most of Private Banks ATM's Like HDFC, ICICI etc are out of cash. Only Public sector bank's ATM working")

<__main__.Lang object at 0x7fb616c23a90> 
input = 
output = 0


'0'

In [86]:
categories = np.unique(maindf['category'].to_list())
categories

array(['0', '1', '2'], dtype='<U1')

In [87]:
cat_ind = np.arange(0,len(categories))
cat_ind

array([0, 1, 2])

In [88]:
category_dictionary = dict(zip(categories,cat_ind))
category_dictionary

{'0': 0, '1': 1, '2': 2}

In [89]:
pred_test=pd.DataFrame()
pred_test['y_actual'] = test_data['category'].apply(lambda x: category_dictionary[x])
pred_test.head()

Unnamed: 0,y_actual
4531,1
365,0
11977,1
147,2
1668,0


In [90]:
pred_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 4531 to 7270
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   y_actual  2400 non-null   int64
dtypes: int64(1)
memory usage: 37.5 KB


In [91]:
test_data['prediected'] = test_data['description'].apply(lambda x: evaluateAndShowAttention(x))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
*सभी भारतीय 90*
*दिन तक*
*कोई*
*भी विदेशी सामान*
*नहीं ख़रीदे...*
.
.
*तो भारत*
*दुनिया का दूसरा सबसे*
*अमीर देश बन सकता है..*
.
.
*सिर्फ 90 दिन में ही भारत के*
*2 रुपये 1 डॉलर के बराबर*
*हो जायेंगे..*
.
.
*हम सबको मिल कर*
*ये कोशिश आजमानी चाहिए*
*क्युकी ये देश है हमारा..!!!!*
.
.
*हम जोक्स फॉरवर्ड करते हे.*
*इसे भी इतना फॉरवर्ड*
*करो की पूरा भारत इसे पढ़े ...*
*और*
*एक आन्दोलन बन जाय……!!*
*Plz plz plz plz plz forward* *this*
*Sirf 1 min lagega*
output = 2
<__main__.Lang object at 0x7fb616c23a90> Rahul Bhaiya wonderful interview and good views on the entire situation
input = Rahul Bhaiya wonderful interview and good views on the entire situation
output = 1
<__main__.Lang object at 0x7fb616c23a90> MUSLIM + AAPTARD= EXTREME AAPTARD!!!
input = MUSLIM + AAPTARD= EXTREME AAPTARD!!!
output = 1
<__main__.Lang object at 0x7fb616c23a90> Hazare is a looser ! Didn't achieve anything ! Left halfway
input = Hazare is a looser ! Didn't

In [92]:
test_data['prediected'] = test_data['prediected'].apply(lambda x: x.replace('& &','&'))
test_data.head()

Unnamed: 0,description,category,prediected
4531,Gadha is more hard working than man,1,1
365,We have followed the notification & not done a...,0,2
11977,"Dominar, LOL come-on bajaj at least think of a...",1,1
147,What has bjp done in last 10 yr in MCD...stop ...,2,1
1668,"""PAKISTANAN'S"" are not crul like your governme...",0,2


In [None]:
#test_data.drop(columns=['T/F','predicted'], inplace = True)
#test_data.head()

In [None]:
test_data["T/F"] = test_data['prediected'].apply(lambda x: 1 if x in category_dictionary else 0)

In [None]:
test_data.loc[(test_data['T/F']==0)]

Unnamed: 0,category,description,prediected,T/F
4460,home decor & festive needs,love background products genuine cash tiedribb...,home & dining,0
8221,tools & hardware,products genuine cash plant replacement home g...,home & dining,0
1403,beauty and personal care,point pinch running intimate flow hands vinega...,home decor dining,0
8515,tools & hardware,designed beautifully products genuine cash pla...,home decor &,0
16365,pens & stationery,products rs250 genuine vintage notebook cash r...,home & dining,0
...,...,...,...,...
18077,kitchen & dining,products genuine cash replacement wipro bulb g...,home decor &,0
18711,home decor & festive needs,products genuine cash replacement showpiece gr...,home & dining,0
18589,furniture,stitched throughview fabric multi preferences ...,home & accessories,0
38191,auto & tires,i-squeegie phone screen cleaner|i-squeegee pho...,home &,0


Here we saw that there are a few rows that do not have the complete category preicted. Sor now, we have randomly matched a category using string matching. 

Note** that the alternative method will be to add the matching while training the model itself

In [None]:
#import re
import difflib
def find_cat(cat):
  for c in category_dictionary:
    if(cat in c):
      return c

test_data['predicted'] = test_data['prediected'].apply(lambda x: difflib.get_close_matches(x,l) )
#x if x in category_dictionary else find_cat(x)

In [None]:
test_data.loc[(test_data['T/F']==0)]

Unnamed: 0,category,description,prediected,T/F,predicted
4460,home decor & festive needs,love background products genuine cash tiedribb...,home & dining,0,"[kitchen & dining, home & kitchen, home furnis..."
8221,tools & hardware,products genuine cash plant replacement home g...,home & dining,0,"[kitchen & dining, home & kitchen, home furnis..."
1403,beauty and personal care,point pinch running intimate flow hands vinega...,home decor dining,0,"[home furnishing, home decor & festive needs]"
8515,tools & hardware,designed beautifully products genuine cash pla...,home decor &,0,[home decor & festive needs]
16365,pens & stationery,products rs250 genuine vintage notebook cash r...,home & dining,0,"[kitchen & dining, home & kitchen, home furnis..."
...,...,...,...,...,...
18077,kitchen & dining,products genuine cash replacement wipro bulb g...,home decor &,0,[home decor & festive needs]
18711,home decor & festive needs,products genuine cash replacement showpiece gr...,home & dining,0,"[kitchen & dining, home & kitchen, home furnis..."
18589,furniture,stitched throughview fabric multi preferences ...,home & accessories,0,"[mobiles & accessories, cameras & accessories]"
38191,auto & tires,i-squeegie phone screen cleaner|i-squeegee pho...,home &,0,"[home, home & kitchen]"


In [None]:
test_data['predicted'] = test_data['predicted'].apply(lambda x: x[0].strip('/[/]'))

In [None]:
test_data.loc[(test_data['T/F']==0)]

Unnamed: 0,category,description,prediected,T/F,predicted
4460,home decor & festive needs,love background products genuine cash tiedribb...,home & dining,0,kitchen & dining
8221,tools & hardware,products genuine cash plant replacement home g...,home & dining,0,kitchen & dining
1403,beauty and personal care,point pinch running intimate flow hands vinega...,home decor dining,0,home furnishing
8515,tools & hardware,designed beautifully products genuine cash pla...,home decor &,0,home decor & festive needs
16365,pens & stationery,products rs250 genuine vintage notebook cash r...,home & dining,0,kitchen & dining
...,...,...,...,...,...
18077,kitchen & dining,products genuine cash replacement wipro bulb g...,home decor &,0,home decor & festive needs
18711,home decor & festive needs,products genuine cash replacement showpiece gr...,home & dining,0,kitchen & dining
18589,furniture,stitched throughview fabric multi preferences ...,home & accessories,0,mobiles & accessories
38191,auto & tires,i-squeegie phone screen cleaner|i-squeegee pho...,home &,0,home


In [93]:
pred_test.head()

Unnamed: 0,y_actual
4531,1
365,0
11977,1
147,2
1668,0


In [None]:
pred_test['y_pred'] = test_data['predicted'].apply(lambda x: category_dictionary[x])

In [97]:
pred_test['y_pred'] = test_data['prediected'].astype('int')

In [98]:
pred_test.head()

Unnamed: 0,y_actual,y_pred
4531,1,1
365,0,2
11977,1,1
147,2,1
1668,0,2


In [99]:
from sklearn.metrics import classification_report
print(classification_report(pred_test['y_actual'], pred_test['y_pred']))

              precision    recall  f1-score   support

           0       0.50      0.42      0.46      1011
           1       0.38      0.56      0.45       863
           2       0.28      0.15      0.20       526

    accuracy                           0.41      2400
   macro avg       0.39      0.38      0.37      2400
weighted avg       0.41      0.41      0.40      2400



**We an see that our model works well with an accuracy of 81%**

To further improve our accuracy, we can add more than 1 attention layers and do cross entropy validation to get the optimal model for this particular technique