- https://www.kaggle.com/kuldeep7688/simple-rnn-using-glove-embeddings-in-pytorch

In [1]:
import torch
from torchtext import data

SEED = 1234
import pandas as pd
import numpy as np
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

import nltk

import random
from sklearn.metrics import classification_report

import pyprind
%matplotlib inline 

In [2]:
main_df = pd.read_csv('data/train_quora.csv')

In [3]:
main_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
print(main_df.shape)
main_df = main_df.sample(n=main_df.shape[0])
main_df = main_df[["question_text", "target"]]
main_df.head()

(1306122, 3)


Unnamed: 0,question_text,target
158469,How can I convence my mom to do sex with mee?,1
382455,Is the difference between IB Biology HL and SL...,0
1179076,How are placements at Manipal Jaipur?,0
288080,How do I get permanent residency in USA and ho...,0
175330,How advanced would humans be if pangea never s...,0


In [5]:
main_df.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [6]:
main_df.target.value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [7]:
o_class = main_df.loc[main_df.target == 0, :]
l_class = main_df.loc[main_df.target == 1, :]

In [8]:
test_o = o_class.iloc[:10000, :]
test_l = l_class.iloc[:10000, :]

valid_o = o_class.iloc[10000:20000, :]
valid_l = l_class.iloc[10000:20000, :]

train_o = o_class.iloc[20000:, :]
train_l = l_class.iloc[20000:, :]

In [9]:
train = pd.concat([train_o, train_l], axis=0)
print(train.shape)

valid = pd.concat([valid_o, valid_l], axis=0)
print(valid.shape)

test = pd.concat([test_o, test_l], axis=0)
print(test.shape)

(1266122, 2)
(20000, 2)
(20000, 2)


In [10]:
train.target.value_counts(), valid.target.value_counts(), test.target.value_counts()

(0    1205312
 1      60810
 Name: target, dtype: int64,
 1    10000
 0    10000
 Name: target, dtype: int64,
 1    10000
 0    10000
 Name: target, dtype: int64)

In [11]:
!mkdir data/torchtext_data

mkdir: cannot create directory ‘data/torchtext_data’: File exists


In [12]:
train.to_csv("data/torchtext_data/train.csv", index=False)
test.to_csv("data/torchtext_data/test.csv", index=False)
valid.to_csv("data/torchtext_data/valid.csv", index=False)

In [13]:
# freeing up some memory
del main_df, train, test, valid, train_l, train_o, test_l, test_o, valid_l,valid_o, o_class, l_class

In [14]:
import spacy
spacy_en = spacy.load('en')
# nltk.download('punkt')

In [15]:
is_cuda = torch.cuda.is_available()
print("Cuda Status on system is {}".format(is_cuda))

Cuda Status on system is False


In [16]:
# sample tokenizer which you can use
def tokenizer(text):
    return [tok for tok in nltk.word_tokenize(text)]

In [17]:
# tokenizer = "spacy" uses spacy's tokenizer
TEXT = data.Field(sequential=True, tokenize="spacy")
LABEL = data.LabelField(dtype=torch.long, sequential=False)

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
train_data, valid_data, test_data = data.TabularDataset.splits(
    path="data/torchtext_data/", train="train.csv", 
    validation="valid.csv", test="test.csv",format="csv", skip_header=True, 
    fields=[('Text', TEXT), ('Label', LABEL)]
)

In [20]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of valid examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 1266122
Number of valid examples: 20000
Number of testing examples: 20000


In [21]:
train_data[0].Text

['What',
 'is',
 'the',
 'best',
 'stratigy',
 'for',
 'classified',
 'sites',
 'for',
 'digital',
 'marketing',
 '?']

In [118]:
train_data[0].Label

'0'

In [22]:
# download the pretrained model from https://www.kaggle.com/takuok/glove840b300dtxt
TEXT.build_vocab(train_data, vectors=torchtext.vocab.Vectors("data/glove.840B.300d.txt"), 
                 max_size=20000, min_freq=10)
LABEL.build_vocab(train_data)

In [23]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 20002
Unique tokens in LABEL vocabulary: 2


In [24]:
BATCH_SIZE = 20

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# keep in mind the sort_key option 
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), sort_key=lambda x: len(x.Text),
    batch_size=BATCH_SIZE,
    device=device)

In [26]:
LABEL.vocab.freqs

Counter({'0': 1205312, '1': 60810})

In [61]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.fc = nn.Linear(embedding_dim, output_dim)
                
    def forward(self, x):
        
        embedded = self.embedding(x)                        
        embedded = embedded.permute(1, 0, 2)        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)                 
        return self.fc(pooled)    
    

In [62]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([20002, 300])


In [63]:
# INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM = 300
# HIDDEN_DIM = 374
# OUTPUT_DIM = 2

# model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = 2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = Net(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

model.embedding.weight.data = pretrained_embeddings

torch.Size([20002, 300])


In [64]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,001,202 trainable parameters


In [65]:
pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

torch.Size([20002, 300])


In [66]:
model.embedding.weight.data = pretrained_embeddings

In [67]:
class_weights = torch.tensor([1.0, 15.0])

In [68]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [69]:
model

Net(
  (embedding): Embedding(20002, 300, padding_idx=1)
  (fc): Linear(in_features=300, out_features=2, bias=True)
)

In [70]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    preds, ind= torch.max(F.softmax(preds, dim=-1), 1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.Text).squeeze(0)
#         print(predictions.shape, batch.Label.shape, model(batch.Text).shape)
        loss = criterion(predictions, batch.Label)
#         print(loss.shape)
        acc = binary_accuracy(predictions, batch.Label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:

            predictions = model(batch.Text).squeeze(0)
            
            loss = criterion(predictions, batch.Label)
            
            acc = binary_accuracy(predictions, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [71]:
N_EPOCHS = 2

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:36:31
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 01 | Train Loss: 0.505 | Train Acc: 93.46% | Val. Loss: 0.412 | Val. Acc: 82.27% |



Total time elapsed: 00:00:01
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:43:16
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 02 | Train Loss: 0.414 | Train Acc: 92.63% | Val. Loss: 0.390 | Val. Acc: 83.50% |



Total time elapsed: 00:00:01


In [72]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

0% [██████████████████████████████] 100% | ETA: 00:00:00

| Test Loss: 0.398 | Test Acc: 83.22% |



Total time elapsed: 00:00:02


In [110]:
def predict_sentiment(sentence):
    tokenized = [tok for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    
    tensor = tensor.unsqueeze(1)
    
#     print(tensor.shape)
    prediction = model(tensor)
#     print(prediction)
#     print(torch.max(F.softmax(prediction.squeeze(0))))    
    
    preds, ind= torch.max(F.softmax(prediction.squeeze(0), dim=-1), 0)
#     print(preds, ind)
    return preds, ind


In [111]:
text = "My voice range is A2-C5. My chest voice goes up to F4. Included sample in my higher chest range. What is my voice type?"
predict_sentiment(text)[1].item()

0

In [112]:
# calculating classification report
test = pd.read_csv("data/torchtext_data/test.csv")

In [113]:
pre = [predict_sentiment(k)[1].item() for k in test.question_text]

In [114]:
print(classification_report(test.target, pre))

              precision    recall  f1-score   support

           0       0.85      0.78      0.81     10000
           1       0.80      0.87      0.83     10000

    accuracy                           0.82     20000
   macro avg       0.83      0.82      0.82     20000
weighted avg       0.83      0.82      0.82     20000



In [115]:
test_df = pd.read_csv("data/test_quora.csv")
print(test_df.shape)
test_df.head()

(375806, 2)


Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [116]:
test_predictions = [int(predict_sentiment(k)[1].item()) for k in test_df.question_text]

In [117]:
test_predictions

[1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
