## Load the dataset and create the vocabulary

In [None]:
!pip install dvc

Collecting dvc
  Downloading dvc-2.8.3-py3-none-any.whl (399 kB)
[K     |████████████████████████████████| 399 kB 13.0 MB/s 
[?25hCollecting python-benedict>=0.24.2
  Downloading python_benedict-0.24.3-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 39 kB/s 
[?25hCollecting voluptuous>=0.11.7
  Downloading voluptuous-0.12.2.tar.gz (48 kB)
[K     |████████████████████████████████| 48 kB 4.9 MB/s 
[?25hCollecting dpath<3,>=2.0.2
  Downloading dpath-2.0.5-py3-none-any.whl (15 kB)
Collecting rich>=10.13.0
  Downloading rich-10.15.2-py3-none-any.whl (214 kB)
[K     |████████████████████████████████| 214 kB 43.7 MB/s 
Collecting distro>=1.3.0
  Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)
Collecting dulwich>=0.20.23
  Downloading dulwich-0.20.26-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (546 kB)
[K     |████████████████████████████████| 546 kB 47.3 MB/s 
Collecting nanotime>=0.5.2
  Downloading n

In [None]:
# Download aita_clean.csv from the iterative/aita_dataset repository with DVC.
!dvc get https://github.com/iterative/aita_dataset aita_clean.csv

[0m

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the posts and keep only those with a score of at least 10.
df = pd.read_csv('aita_clean.csv')
# .copy() gives an independent frame, so adding the 'text' column below does not
# write into a filtered slice of the original (pandas SettingWithCopyWarning).
df = df[df['score'] >= 10].copy()
# Join title and body with a space: without it the last word of the title and
# the first word of the body are fused into a single token. Missing parts
# become empty strings so 'text' is always a str.
df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')
lines = df['text']
labels = df['is_asshole']
# Column order matters: collate_batch unpacks itertuples() rows as
# (index, text, label).
new_df = df[['text', 'is_asshole']].reset_index(drop=True)
# Fixed seed makes the split reproducible; stratifying keeps the class ratio
# the same in both splits.
train_set, test_set = train_test_split(
    new_df,
    test_size=0.3,
    random_state=42,
    stratify=new_df['is_asshole'],
)

In [None]:
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict
from torchtext.vocab import vocab

EMBEDDING_DIM = 50   # width of each word vector (must match the GloVe file used)
VOCAB_SIZE = 20000   # number of most frequent tokens kept in the vocabulary

# spaCy-backed English tokenizer. The full model name is used because the 'en'
# shortcut is no longer resolvable by spaCy 3+.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [None]:
# build the vocab
# The factory is imported under an alias because this cell binds the name
# `vocab` to the resulting Vocab object; calling `vocab(ordered_dict)` again on
# a second run of the cell would otherwise hit the object, not the factory.
from torchtext.vocab import vocab as build_vocab

# Count token frequencies over every post; str() coerces each row to text
# before it is tokenized.
counter = Counter()
for line in lines:
    counter.update(tokenizer(str(line)))

# Keep only the VOCAB_SIZE most frequent tokens, most frequent first.
ordered_dict = OrderedDict(counter.most_common(VOCAB_SIZE))
vocab = build_vocab(ordered_dict)

# insert special tokens and set default index to 'unknown'
# <PAD> sits at index 0, which is also the fill value pad_sequence uses.
vocab.insert_token('<PAD>', 0)
vocab.insert_token('<UNK>', 1)
vocab.set_default_index(1)

## Create embedding vectors from GloVe

In [None]:
import torchtext as text

# load glove embeddings; the dimension follows EMBEDDING_DIM so the matrix
# always matches the width the model is built with
vec = text.vocab.GloVe(name='6B', dim=EMBEDDING_DIM)
# create the embedding matrix, a torch tensor of shape (len(vocab), EMBEDDING_DIM),
# one row per vocabulary index (special tokens included).
# GloVe 6B is uncased, so a token is also looked up in lower case when its
# original casing is not found, instead of falling straight to the unknown vector.
word_emb = vec.get_vecs_by_tokens(vocab.get_itos(), lower_case_backup=True)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.29MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29890.24it/s]


## Build up train/test dataset

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# transform input text to vectors
def process_text(text):
    """Tokenize `text` and look the tokens up in the notebook-level `vocab`.

    Args:
        text: Raw text handed to the notebook-level `tokenizer`.

    Returns:
        Whatever `vocab` returns for the token list (callers wrap it in an
        int64 tensor).
    """
    tokens = tokenizer(text)
    return vocab(tokens)

def collate_batch(batch):
    """Turn a list of dataset rows into padded tensors for one mini-batch.

    Args:
        batch: Sequence of `(index, text, label)` rows.

    Returns:
        A tuple `(labels, texts, lengths)` where
        - labels is a float tensor with one column and one row per sample,
        - texts is an int64 tensor of token ids, padded to the longest sample
          in the batch with pad_sequence's default fill value (batch first),
        - lengths is a float tensor of the unpadded token counts, already
          moved to `device` (labels and texts are left where they were created).
    """
    raw_labels = [_label for (_index, _text, _label) in batch]
    encoded = [
        torch.tensor(process_text(_text), dtype=torch.int64)
        for (_index, _text, _label) in batch
    ]
    token_counts = torch.tensor([ids.size(0) for ids in encoded], dtype=torch.float)

    # unsqueeze(1) gives the labels a trailing singleton column
    label_column = torch.tensor(raw_labels, dtype=torch.float).unsqueeze(1)

    padded_texts = pad_sequence(encoded, batch_first=True)
    return label_column, padded_texts, token_counts.to(device)

# Training split: materialise the (index, text, label) row tuples as a
# map-style dataset, then batch them through collate_batch.
train_dataset = to_map_style_dataset(train_set.itertuples())
train_dataloader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate_batch,
)

# Test split: built the same way, with the same loader settings.
test_dataset = to_map_style_dataset(test_set.itertuples())
test_dataloader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMcustom(nn.Module):
    """Binary text classifier: embedding -> 1D convolution -> LSTM -> sigmoid.

    Args:
        word_vec: 2-D float tensor of pretrained embeddings, one row per
            vocabulary index. The weights are fine-tuned (freeze=False).
        embed_dim: Width of one embedding row; used as the convolution's
            number of input channels, so it must equal word_vec.size(1).
        conv_channels: Number of convolution output channels (LSTM input size).
        hidden_dim: LSTM hidden size.
        kernel_size: Width of the convolution window, in tokens.
    """

    def __init__(self, word_vec, embed_dim, conv_channels=20, hidden_dim=200,
                 kernel_size=2):
        super().__init__()
        # Build the embedding directly from the pretrained matrix; no
        # throw-away randomly initialised layer is allocated first.
        self.embedding = nn.Embedding.from_pretrained(word_vec, freeze=False)
        # 1D convolution over the token axis; its output feeds the LSTM.
        self.cnn = nn.Conv1d(in_channels=embed_dim, out_channels=conv_channels,
                             kernel_size=kernel_size)
        # Single-layer, unidirectional LSTM reading the convolution features.
        self.lstm = nn.LSTM(conv_channels, hidden_dim, 1, bidirectional=False,
                            batch_first=True)

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, text, lengths):
        """Return the positive-class probability for each sequence.

        Args:
            text: Integer tensor of token ids, shape (batch_size, sent_len),
                padded at the end of each row.
            lengths: Unpadded token count per row (one entry per batch row).
                When it is None or does not have exactly one entry per row,
                the last time step of the padded sequence is used instead.

        Returns:
            Float tensor of shape (batch_size, 1) with values in [0, 1].
        """
        embedded = self.embedding(text)         # (batch_size, sent_len, emb_size)
        # Conv1d expects channels before the sequence axis.
        conv_in = embedded.permute(0, 2, 1)     # (batch_size, emb_size, sent_len)
        cnn_out = self.cnn(conv_in)             # (batch_size, conv_channels, sent_len - kernel_size + 1)
        cnn_out = cnn_out.permute(0, 2, 1)      # (batch_size, conv_steps, conv_channels)

        lstm_out, _ = self.lstm(cnn_out)        # (batch_size, conv_steps, hidden_dim)

        batch_size, conv_steps = lstm_out.size(0), lstm_out.size(1)
        last_idx = None
        if lengths is not None:
            lengths = torch.as_tensor(lengths, device=lstm_out.device)
            if lengths.numel() == batch_size:
                # The last convolution window that lies entirely inside the
                # real (unpadded) tokens starts at length - kernel_size.
                # Reading the LSTM there keeps trailing padding from
                # influencing the prediction.
                window = self.cnn.kernel_size[0]
                last_idx = (lengths.reshape(-1).long() - window).clamp(
                    min=0, max=conv_steps - 1)
        if last_idx is None:
            last_idx = torch.full((batch_size,), conv_steps - 1,
                                  dtype=torch.long, device=lstm_out.device)

        rows = torch.arange(batch_size, device=lstm_out.device)
        final_state = lstm_out[rows, last_idx]  # (batch_size, hidden_dim)

        return torch.sigmoid(self.fc(final_state))

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

def train(dataloader):
    """Run one training pass of the notebook-level `model` over `dataloader`.

    Uses the notebook globals `model`, `optimizer`, `criterion` and `device`,
    plus `epoch` for the progress message only.

    Args:
        dataloader: Iterable yielding `(label, text, lengths)` batches.

    Every `log_interval` batches the running accuracy is printed and reset.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 50

    for idx, (label, text, lengths) in enumerate(dataloader):
        # Each batch is used exactly as loaded. Resampling the labels on their
        # own would give them a different length from the text batch, so the
        # predictions and the targets would no longer line up in the loss.
        label = label.to(device)
        text = text.to(device)
        # Real per-sample lengths from the batch, not an empty placeholder.
        lengths = lengths.to(device)
        optimizer.zero_grad()
        # forward propagation
        predicted_label = model(text, lengths)
        # calculate loss and backpropagate to model parameters
        loss = criterion(predicted_label, label)
        loss.backward()
        # cap the gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        # update parameters by stepping the optimizer
        optimizer.step()
        # a prediction counts as correct when thresholding at 0.5 matches the label
        total_acc += ((predicted_label > 0.5) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the notebook-level `model` on `dataloader`.

    Args:
        dataloader: Iterable yielding `(label, text, lengths)` batches.

    Returns:
        Fraction of samples (0.0 to 1.0) whose prediction, thresholded at 0.5,
        equals the label; 0.0 when the dataloader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, lengths in dataloader:
            label = label.to(device)
            text = text.to(device)
            predicted_label = model(text, lengths)
            # The model emits a single probability per sample, so the class is
            # obtained by thresholding. argmax over that one column is always 0
            # and, compared against a column of labels, broadcasts to a
            # batch x batch matrix that over-counts matches.
            total_acc += ((predicted_label > 0.5) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count if total_count else 0.0

## Model Training

In [None]:
EPOCHS = 1  # number of passes over the training data

# Build the classifier from the pretrained embedding matrix and place it on
# the selected device.
model = LSTMcustom(word_vec=word_emb, embed_dim=EMBEDDING_DIM)
model = model.to(device)
# Plain SGD on all model parameters, with binary cross-entropy on the
# sigmoid outputs.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()
total_accu = None


# `epoch` is read by train() for its progress message.
for epoch in range(1, EPOCHS + 1):
    train(dataloader=train_dataloader)

| epoch   1 |    50/  268 batches | accuracy    0.485
| epoch   1 |   100/  268 batches | accuracy    0.620
| epoch   1 |   150/  268 batches | accuracy    0.669
| epoch   1 |   200/  268 batches | accuracy    0.702
| epoch   1 |   250/  268 batches | accuracy    0.724


## Model saving and testing

In [None]:
# List every entry of the model's state_dict together with its tensor size.
print("Model's state_dict:")
for entry_name, entry_tensor in model.state_dict().items():
    print(entry_name, "\t", entry_tensor.size())

# Persist only the parameters (not the module object) to disk.
torch.save(model.state_dict(), 'CNN_NTA_model.pt')

Model's state_dict:
embedding.weight 	 torch.Size([20002, 50])
cnn.weight 	 torch.Size([20, 50, 2])
cnn.bias 	 torch.Size([20])
lstm.weight_ih_l0 	 torch.Size([800, 20])
lstm.weight_hh_l0 	 torch.Size([800, 200])
lstm.bias_ih_l0 	 torch.Size([800])
lstm.bias_hh_l0 	 torch.Size([800])
fc.weight 	 torch.Size([1, 200])
fc.bias 	 torch.Size([1])


In [None]:
accu_test = evaluate(test_dataloader)
# evaluate() returns correct/total, i.e. a ratio; the `%` presentation type
# multiplies by 100 and appends the percent sign, so the number shown really
# is a percentage.
print('test accuracy {:8.2%}'.format(accu_test))

test accuracy    91.58%


In [None]:
def predict(sentence, model):
    """Classify one piece of text with `model` and print the verdict.

    Args:
        sentence: Raw text; it is encoded with `process_text`.
        model: Module called as `model(token_ids, lengths)` that returns one
            probability for the single-row batch.

    Prints the predicted label (threshold 0.5) with the probability of the
    predicted class. Returns None.
    """
    model.eval()
    token_ids = torch.tensor(process_text(sentence), dtype=torch.int64)
    # A batch holding just this one sequence.
    batch = pad_sequence([token_ids], batch_first=True)
    lengths = torch.tensor([token_ids.size(0)], dtype=torch.float)
    with torch.no_grad():
        predicted_label = model(batch.to(device), lengths.to(device))
    is_positive = (predicted_label.cpu().numpy()[0] > 0.5)
    if is_positive:
        print('The predict label is : 1, and the class is asshole with the probability of {}.'.format(predicted_label.item()))
    else:
        print('The predict label is : 0, and the class is no asshole with the probability of {}.'.format(1 - predicted_label.item()))

In [None]:
def acc_for_each_class(model,dataloader):
    """Compute the accuracy separately for each of the two label groups.

    Args:
        model: Module called as `model(text, lengths)`; its output is
            thresholded at 0.5.
        dataloader: Iterable yielding `(label, text, lengths)` batches with
            labels in a single column.

    Returns:
        `(low_acc, high_acc)`: accuracy over samples whose label is below 0.5,
        then accuracy over all remaining samples.

    Raises:
        ZeroDivisionError: if either group has no samples.
    """
    model.eval()
    low_hits, low_total = 0, 0
    high_hits, high_total = 0, 0

    with torch.no_grad():
        for label, text, lengths in dataloader:
            label = label.to(device)
            text = text.to(device)
            hits = ((model(text, lengths) > 0.5) == label)[:, 0]
            # Split the batch by label value instead of looping sample by sample.
            is_low = label[:, 0] < 0.5
            is_high = ~is_low

            low_total += int(is_low.sum())
            low_hits += int((hits & is_low).sum())
            high_total += int(is_high.sum())
            high_hits += int((hits & is_high).sum())

    return low_hits/low_total, high_hits/high_total

In [None]:
# Per-class accuracy on the test split: the first value covers samples labelled
# below 0.5, the second covers the rest.
nonasshole_acc, asshole_acc = acc_for_each_class(model=model, dataloader=test_dataloader)
print('The accuracy for the non-asshole calss is : {}'.format(nonasshole_acc))
print('The accuracy for the asshole calss is : {}'.format(asshole_acc))

The accuracy for the non-asshole calss is : 0.9875344942430297
The accuracy for the asshole calss is : 0.011815770436460092


In [None]:
def get_f1_score(model,dataloader,class_name):
    """Compute the F1 score of `model` for one class over `dataloader`.

    Args:
        model: Module called as `model(text, lengths)`; its output is
            thresholded at 0.5 to get the predicted class.
        dataloader: Iterable yielding `(label, text, lengths)` batches.
            Labels are assumed to be 0/1 values -- TODO confirm against the
            dataset.
        class_name: 'asshole' scores the class with label above 0.5; any other
            value scores the class with label below 0.5.

    Returns:
        TP / (TP + 0.5 * (FP + FN)), or 0.0 when that denominator is zero
        (no true positives, false positives or false negatives at all).
    """
    model.eval()
    TP = 0
    FP = 0
    FN = 0
    score_asshole_class = class_name == 'asshole'

    with torch.no_grad():
        for label, text, lengths in dataloader:
            label = label.to(device)
            text = text.to(device)
            predicted_high = model(text, lengths) > 0.5

            # Express the batch in terms of the class being scored.
            if score_asshole_class:
                actual_pos = label > 0.5
                predicted_pos = predicted_high
            else:
                actual_pos = label < 0.5
                predicted_pos = ~predicted_high

            TP += int((predicted_pos & actual_pos).sum())
            # predicted as the class although it is not
            FP += int((predicted_pos & ~actual_pos).sum())
            # member of the class that was missed
            FN += int((~predicted_pos & actual_pos).sum())

    denominator = TP + 0.5 * (FP + FN)
    return TP / denominator if denominator else 0.0

In [None]:
# F1 on the test split, first with 'asshole' as the scored class ...
f1_score_asshole = get_f1_score(model=model, dataloader=test_dataloader, class_name='asshole')
print('The f1 score for asshole class is : {}'.format(f1_score_asshole))
# ... then with the other class as the scored one.
f1_score_nonasshole = get_f1_score(model=model, dataloader=test_dataloader, class_name='nonasshole')
print('The f1 score for nonasshole class is : {}'.format(f1_score_nonasshole))

The f1 score for asshole class is : 0.026715799170888992
The f1 score for nonasshole class is : 0.8300965042245625


In [None]:
# The held-out split created by train_test_split is named `test_set`; the
# name `test` is not defined anywhere in this notebook.
sentence = test_set['text'].iloc[0]
predict(sentence, model)

The predict label is : 0, and the class is no asshole with the probability of 0.6079334020614624.



## Model Loading

In [None]:
# Rebuild the architecture, then restore the saved parameters into it.
model = LSTMcustom(word_vec=word_emb, embed_dim=EMBEDDING_DIM).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('CNN_NTA_model.pt', map_location=device))
model.eval()

LSTMcustom(
  (embedding): Embedding(20002, 50)
  (cnn): Conv1d(50, 20, kernel_size=(2,), stride=(1,))
  (lstm): LSTM(20, 200, batch_first=True)
  (fc): Linear(in_features=200, out_features=1, bias=True)
)

In [None]:
# The held-out split created by train_test_split is named `test_set`; the
# name `test` is not defined anywhere in this notebook.
sentence = test_set['text'].iloc[50]
predict(sentence, model)

The predict label is : 0, and the class is no asshole with the probability of 0.6067976653575897.



In [None]:
# Classify an example passed in as a literal string rather than read from the
# test split.
sentence = '''It seems like I came off as an asshole. What can I do in the future to avoid this? I thought I was just randomly explaining something, which seems common on Reddit.

I definitely should not have said "Sorry for trying to help" - That was pretty immature and rude. 

Am I the asshole here?

I wrote as little as possible to skew perspective as little as possible.'''
predict(sentence, model)

The predict label is : 0, and the class is no asshole with the probability of 0.6045775413513184.
