In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import torch
from transformers import BertTokenizer
from torchtext.legacy import data
from sklearn.model_selection import train_test_split
from transformers import BertModel
import transformers
import torch.nn as nn
import math
import time

#from transformers import *
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the parent of the current working directory the new working directory,
# so that the relative 'data/...' paths used in later cells resolve from there.
# NOTE(review): this cell is not idempotent — every re-run without a kernel
# restart moves the working directory up one more level.
abspath = os.path.abspath('')
dname = os.path.dirname(abspath)
os.chdir(dname)
print(dname)

c:\Users\wongy\OneDrive\Desktop\duplicate-questions-pair-detection


In [3]:
# Load the preprocessed question-pair table; the path is relative to the
# current working directory.
# Later cells read the 'q1_cleaned', 'q2_cleaned' and 'is_duplicate' columns from it.
df = pd.read_feather('data/processed/full_w_lcs.feather')

In [4]:
def trim_sentence(sent, max_words=128):
    """Truncate a sentence to at most ``max_words`` whitespace-separated words.

    Args:
        sent: Sentence to trim. Non-string values (for example ``None`` or a
            float ``NaN`` coming from a missing cell) are returned unchanged.
        max_words: Maximum number of words to keep. Defaults to 128.

    Returns:
        The kept words re-joined with single spaces, or ``sent`` itself when it
        is not a string.
    """
    # Explicit type check instead of a bare ``except`` so that genuine errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(sent, str):
        return sent
    return " ".join(sent.split()[:max_words])

# Cap both cleaned questions at trim_sentence's word limit before tokenization.
df['q1_trimmed'] = df['q1_cleaned'].apply(trim_sentence)
df['q2_trimmed'] = df['q2_cleaned'].apply(trim_sentence)

In [None]:
# def common_words(row):
#     w1 = set(map(lambda word: word.lower().strip(), row['q1_cleaned'].split(" ")))
#     w2 = set(map(lambda word: word.lower().strip(), row['q2_cleaned'].split(" ")))    
#     return len(w1 & w2)
    
# df['jaccard_dist'] = nltk.jaccard_distance(set(df['q1_cleaned']), set(df['q2_cleaned']))
# df['common_words'] = df.apply(common_words, axis=1)
# df['common_ratio'] = df.apply(lambda row: row['common_words'] / (len(row['q1_cleaned']) + len(row['q2_cleaned'])), axis=1)

In [6]:
# Segment (token type) ids fed to BERT: 0 for the first sentence.
def sent1_token_type(sentence):
    """Return one ``0`` per token of the first sentence.

    Args:
        sentence: Sized collection of tokens.

    Returns:
        ``[0] * len(sentence)``, or ``[]`` when ``sentence`` has no length
        (for example ``None`` or a float ``NaN``).
    """
    try:
        return [0] * len(sentence)
    except TypeError:
        # len() raises TypeError for unsized values; treat them as empty
        # instead of hiding every possible error behind a bare ``except``.
        return []
    
# Segment (token type) ids for the second sentence; also reused in this
# notebook to build an all-ones attention mask.
def sent2_token_type(sentence):
    """Return one ``1`` per element of ``sentence``.

    Args:
        sentence: Sized collection of tokens (or token ids).

    Returns:
        ``[1] * len(sentence)``, or ``[]`` when ``sentence`` has no length
        (for example ``None`` or a float ``NaN``).
    """
    try:
        return [1] * len(sentence)
    except TypeError:
        # len() raises TypeError for unsized values; treat them as empty
        # instead of hiding every possible error behind a bare ``except``.
        return []

# Flatten a token list into a single string.
def combine_seq(seq):
    """Join an iterable of string tokens into one space-delimited string."""
    delimiter = " "
    return delimiter.join(seq)

# Flatten a list of ints (mask / token type ids) into a single string.
def combine_mask(mask):
    """Render each element of ``mask`` with ``str`` and join them with spaces."""
    return " ".join(str(value) for value in mask)

# Turn split mask / token type strings back into ints.
def convert_mask(tok_ids):
    """Convert every element of ``tok_ids`` to ``int`` and return them as a list."""
    return list(map(int, tok_ids))

In [7]:
# Pretrained tokenizer matching the 'bert-base-uncased' encoder loaded later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Special-token strings ...
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
# ... and their vocabulary ids (used as pad/unk values by the torchtext fields).
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
# Maximum sequence length registered for this checkpoint; split_and_cut uses it as the cut-off.
# NOTE(review): `max_model_input_sizes` appears to be deprecated in newer
# transformers releases (`tokenizer.model_max_length` looks like the
# replacement) — confirm against the installed version.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

def tokenize_bert(sentence):
    """Tokenize ``sentence`` with the module-level ``tokenizer`` and return its tokens."""
    return tokenizer.tokenize(sentence)


def split_and_cut(sentence):
    """Split a space-joined string and keep at most ``max_input_length`` items.

    Surrounding whitespace is stripped first; the split is on single spaces.
    """
    pieces = sentence.strip().split(" ")
    # Cut so the sequence never exceeds the module-level length limit.
    return pieces[:max_input_length]

In [8]:
# Wrap the questions in BERT's special tokens: question 1 opens with [CLS],
# and each question is closed by [SEP].
df['q1_padded'] = '[CLS] ' + df['q1_trimmed'] + ' [SEP] '
df['q2_padded'] = df['q2_trimmed'] + ' [SEP]'
# Tokenize each wrapped question with the BERT tokenizer.
df['q1_bert_tokens'] = df['q1_padded'].apply(tokenize_bert)
df['q2_bert_tokens'] = df['q2_padded'].apply(tokenize_bert)

In [9]:
# Per-token segment ids: 0 for every question-1 token, 1 for every question-2 token.
df['q1_token_type'] = df['q1_bert_tokens'].apply(sent1_token_type)
df['q2_token_type'] = df['q2_bert_tokens'].apply(sent2_token_type)

In [10]:
# Concatenate the two token lists into the full model input sequence.
df['sequence'] = df['q1_bert_tokens'] + df['q2_bert_tokens']
# Every real token is attended to, so the mask is all ones (sent2_token_type
# is reused here only because it produces a list of 1s of the right length).
df['attn_mask'] = df['sequence'].apply(sent2_token_type)
# Segment ids for the concatenated sequence.
df['token_type'] = df['q1_token_type'] + df['q2_token_type']

In [11]:
# Serialize the three list-valued inputs as space-joined strings.
df['sequence'] = df['sequence'].apply(combine_seq)
df['attn_mask'] = df['attn_mask'].apply(combine_mask)
df['token_type'] = df['token_type'].apply(combine_mask)

In [12]:
# Token ids: the space-joined token string is split and cut by split_and_cut,
# then mapped to vocabulary ids by the BERT tokenizer. Padding uses the
# tokenizer's own pad id.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
# Label
LABEL = data.LabelField()

# Attention mask: padded positions must be 0 ("do not attend"). This is a
# literal 0 rather than the tokenizer's pad id, which only happened to be 0
# for this checkpoint and would silently break with another tokenizer.
ATTENTION = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_mask,
                  pad_token = 0)
# Token type (segment) ids. Padded positions get segment 1; they are excluded
# from attention by the mask above.
TTYPE = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_mask,
                  pad_token = 1)

In [13]:
# Fixed seed so the train/validation/test membership is identical on every
# run (without it each execution produced a different split).
SPLIT_SEED = 42

# Columns that are not fed to the model.
DROPPED_COLUMNS = ['index', 'id', 'qid1', 'qid2', 'question1', 'question2', 'q1_cleaned', 'q2_cleaned', 'q1_padded', 'q2_padded', 'q1_bert_tokens', 'q2_bert_tokens', 'q1_token_type', 'q2_token_type', 'q1_start', 'q2_start', 'q1_trimmed', 'q2_trimmed', 'lc_substring', 'lc_subsequence', 'jaccard_dist', 'common_words', 'common_ratio']

y = df['is_duplicate']
# drop() returns a new frame, so `df` itself is left untouched.
X = df.drop(columns=['is_duplicate'] + DROPPED_COLUMNS)

# 80/20 train/test split, then 80/20 train/validation split of the remainder,
# both stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state = SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify = y_train, random_state = SPLIT_SEED)

In [15]:
# Preview the first training rows (only the model input columns remain).
X_train.head()

Unnamed: 0,sequence,attn_mask,token_type
374999,[CLS] most important factor in search position...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
47302,[CLS] why do i have lower back pain after i e ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
202590,[CLS] what is your least favorite part of sale...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
280296,[CLS] are queen ant dumb ##er than worker ant ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
373303,[CLS] is there a difference between classical ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1


In [16]:
def _attach_labels(features, labels, label_col='is_duplicate'):
    """Re-index one split from 0 and glue its labels on as the last column.

    Args:
        features: DataFrame with the model input columns of the split.
        labels: Labels aligned row-by-row with ``features``; either a Series
            or, when this cell is re-run, the one-column DataFrame it produced.
        label_col: Name of the label column in the result.

    Returns:
        ``(features, labels, combined)`` where ``features`` has a fresh
        0..n-1 index, ``labels`` is a one-column DataFrame and ``combined`` is
        their column-wise concatenation.
    """
    features = features.reset_index(drop=True)
    if isinstance(labels, pd.DataFrame):
        # Second execution of the cell: unwrap the frame built the first time
        # (a DataFrame has no .tolist(), which used to crash on re-run).
        labels = labels[label_col]
    labels = pd.DataFrame(labels.tolist(), columns=[label_col])
    combined = pd.concat([features, labels], axis=1)
    return features, labels, combined


X_train, y_train, df_train = _attach_labels(X_train, y_train)
X_val, y_val, df_val = _attach_labels(X_val, y_val)
X_test, y_test, df_test = _attach_labels(X_test, y_test)

In [17]:
# The next cells load these three CSV files, so they have to exist on disk.
# They are written together whenever at least one is missing, which keeps the
# files from ever mixing rows of different splits; an existing complete set is
# left untouched.
split_frames = {
    'data/processed/bert_train.csv': df_train,
    'data/processed/bert_val.csv': df_val,
    'data/processed/bert_test.csv': df_test,
}
if not all(os.path.exists(split_path) for split_path in split_frames):
    for split_path, split_frame in split_frames.items():
        split_frame.to_csv(split_path, index = False)

In [18]:
# Preview the training frame: the three model inputs plus the 'is_duplicate' label.
df_train.head()

Unnamed: 0,sequence,attn_mask,token_type,is_duplicate
0,[CLS] most important factor in search position...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0
1,[CLS] why do i have lower back pain after i e ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1,0
2,[CLS] what is your least favorite part of sale...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1,0
3,[CLS] are queen ant dumb ##er than worker ant ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1,0
4,[CLS] is there a difference between classical ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1,0


In [19]:
# (name, field) pairs for the CSV columns; the order here mirrors the column
# order of the split frames (sequence, attn_mask, token_type, is_duplicate).
# NOTE(review): presumably torchtext assigns list-style fields by position — confirm.
fields = [('sequence', TEXT), ('attn_mask', ATTENTION), ('token_type', TTYPE), ('is_duplicate', LABEL)]

# Load the three splits from data/processed/, skipping each file's header row.
train_data, valid_data, test_data = data.TabularDataset.splits(path = 'data/processed/',
                                                    train = 'bert_train.csv',
                                                    validation = 'bert_val.csv',
                                                    test = 'bert_test.csv',
                                                    format = 'csv',
                                                    fields = fields,
                                                    skip_header = True)
#Create iterator
BATCH_SIZE = 16
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [20]:
# Batch iterators for the three splits; batches are placed on `device`.
# sort_key is the number of tokens in the example's sequence.
# NOTE(review): BucketIterator presumably uses it to batch examples of similar
# length together — confirm against the torchtext docs.
train_iterator, valid_iterator, test_iterator= data.BucketIterator.splits((train_data, valid_data, test_data), 
                                                            batch_size = BATCH_SIZE,
                                                            sort_key = lambda x: len(x.sequence),
                                                            sort_within_batch = False, 
                                                            device = device)

In [21]:
# Build the label vocabulary from the training split; its size is used below
# as the classifier's output dimension.
# NOTE(review): the class index of each label comes from this vocab, not from
# the label's numeric value — check LABEL.vocab.stoi before interpreting
# model outputs.
LABEL.build_vocab(train_data)
print(len(LABEL.vocab))

2


In [22]:
# Pretrained encoder from the same checkpoint as the tokenizer above.
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
class BERTNLIModel(nn.Module):
    """Sentence-pair classifier: an encoder followed by one linear layer.

    Args:
        bert_model: Encoder module. Its ``config.to_dict()`` must contain
            ``'hidden_size'`` and it is called with the keyword arguments
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        output_dim: Number of logits produced per example.
    """

    def __init__(self, bert_model, output_dim):
        super().__init__()
        hidden_size = bert_model.config.to_dict()['hidden_size']
        self.bert = bert_model
        # Linear head mapping the encoder's summary vector to class logits.
        self.out = nn.Linear(hidden_size, output_dim)

    def forward(self, sequence, attn_mask, token_type):
        """Return the logits for a batch of encoded sentence pairs."""
        encoder_outputs = self.bert(
            input_ids=sequence,
            attention_mask=attn_mask,
            token_type_ids=token_type,
        )
        # Element 1 of the encoder output feeds the classifier.
        # NOTE(review): for Hugging Face's BertModel this should be the pooled
        # [CLS] representation — confirm.
        pooled = encoder_outputs[1]
        return self.out(pooled)

# One output logit per label in the vocabulary built from the training split.
OUTPUT_DIM = len(LABEL.vocab)
# Wrap the pretrained encoder with the classification head and move it to `device`.
model = BERTNLIModel(bert_model, OUTPUT_DIM).to(device)

In [25]:
# Optimizer over all model parameters (encoder and classification head).
# NOTE(review): transformers.AdamW appears to be deprecated upstream in favour
# of torch.optim.AdamW, which has no `correct_bias` option — confirm that the
# installed transformers version still provides it.
optimizer = transformers.AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
# Cross-entropy over the class logits, moved to the same device as the model.
criterion = nn.CrossEntropyLoss().to(device)
def get_scheduler(optimizer, warmup_steps):
    """Create a constant-with-warmup learning-rate schedule for ``optimizer``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        warmup_steps: Passed on as ``num_warmup_steps``.

    Returns:
        The scheduler built by ``transformers.get_constant_schedule_with_warmup``.
    """
    return transformers.get_constant_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
    )



In [30]:
def categorical_accuracy(preds, y):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        preds: Tensor of per-class scores; the arg-max is taken along dim 1.
        y: Tensor of target class indices, one per row of ``preds``.

    Returns:
        A scalar tensor: number of matching rows divided by ``len(y)``.
    """
    predicted_classes = preds.argmax(dim = 1)
    hits = (predicted_classes == y).float()
    return hits.sum() / len(y)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

def train(model, iterator, optimizer, criterion, scheduler):
    """Run one training epoch and return the per-sample mean loss and accuracy.

    Metrics are weighted by batch size. Averaging per-batch values over the
    number of batches (the previous behaviour) over-weights a short final
    batch and does not equal the true mean over the samples seen.

    Args:
        model: Module called as ``model(sequence, attn_mask, token_type)``
            and returning class logits of shape (batch, classes).
        iterator: Iterable of batches exposing ``sequence``, ``attn_mask``,
            ``token_type`` and ``is_duplicate`` attributes.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, label)``; assumed
            to return the mean loss of the batch (the default reduction).
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``(epoch_loss, epoch_acc)`` as Python floats.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad() # clear gradients first
        # Kept from the original. NOTE(review): releasing the CUDA cache on
        # every step is usually unnecessary and can slow GPU training.
        torch.cuda.empty_cache()

        label = batch.is_duplicate
        predictions = model(batch.sequence, batch.attn_mask, batch.token_type)
        loss = criterion(predictions, label)

        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_size = label.size(0)
        # Undo the batch mean so the epoch value is a true per-sample average.
        total_loss += loss.item() * batch_size
        total_correct += (predictions.argmax(dim = 1) == label).sum().item()
        total_samples += batch_size
    return total_loss / total_samples, total_correct / total_samples

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` without gradients and return per-sample mean loss and accuracy.

    Metrics are weighted by batch size. Averaging per-batch values over the
    number of batches (the previous behaviour) over-weights a short final
    batch and does not equal the true mean over the evaluated samples.

    Args:
        model: Module called as ``model(sequence, attn_mask, token_type)``
            and returning class logits of shape (batch, classes).
        iterator: Iterable of batches exposing ``sequence``, ``attn_mask``,
            ``token_type`` and ``is_duplicate`` attributes.
        criterion: Loss called as ``criterion(predictions, labels)``; assumed
            to return the mean loss of the batch (the default reduction).

    Returns:
        ``(epoch_loss, epoch_acc)`` as Python floats.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            labels = batch.is_duplicate
            predictions = model(batch.sequence, batch.attn_mask, batch.token_type)
            loss = criterion(predictions, labels)

            batch_size = labels.size(0)
            # Undo the batch mean so the result is a true per-sample average.
            total_loss += loss.item() * batch_size
            total_correct += (predictions.argmax(dim = 1) == labels).sum().item()
            total_samples += batch_size
    return total_loss / total_samples, total_correct / total_samples

In [31]:
# Training configuration.
N_EPOCHS = 15
# Share of the planned optimisation steps spent warming up the learning rate.
warmup_percent = 0.2
# Planned number of optimiser steps: N_EPOCHS * len(train_data) / BATCH_SIZE, rounded up.
total_steps = math.ceil(N_EPOCHS*len(train_data)*1./BATCH_SIZE)
warmup_steps = int(total_steps*warmup_percent)
# Constant-with-warmup schedule (see get_scheduler); it is stepped once per batch inside train().
scheduler = get_scheduler(optimizer, warmup_steps)
# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert_based_quora_model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

KeyboardInterrupt: 

In [None]:
# Restore the checkpoint with the best validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# machine also loads on a CPU-only one.
model.load_state_dict(torch.load('bert_based_quora_model.pt', map_location = device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
# Report the held-out metrics (they were computed but never displayed before).
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predict_inference(q1, q2, model, device):
    """Score a single question pair with the trained model.

    The pair is encoded the same way as the training data: each question is
    capped with ``trim_sentence``, wrapped in ``[CLS]``/``[SEP]``, tokenized,
    and the combined sequence is cut to ``max_input_length`` tokens.
    NOTE(review): the training data was built from the ``*_cleaned`` columns;
    callers presumably need to apply the same cleaning first — confirm.

    Args:
        q1: First question (converted with ``str``).
        q2: Second question (converted with ``str``).
        model: Model called as ``model(input_ids, attn_mask, token_type)``.
        device: Device the input tensors are moved to.

    Returns:
        The raw model output for a batch of one. Apply ``argmax(dim=-1)`` to
        obtain the predicted class index.
    """
    model.eval()

    # Same per-question word cap as the training preprocessing.
    q1 = '[CLS] ' + trim_sentence(str(q1)) + ' [SEP]'
    q2 = trim_sentence(str(q2)) + ' [SEP]'

    q1_t = tokenize_bert(q1)
    q2_t = tokenize_bert(q2)

    # Cut to the model limit exactly like split_and_cut does for training
    # rows; longer inputs would otherwise overflow the position embeddings.
    tokens = (q1_t + q2_t)[:max_input_length]
    indexes_type = (sent1_token_type(q1_t) + sent2_token_type(q2_t))[:max_input_length]

    indexes = tokenizer.convert_tokens_to_ids(tokens)
    # Single unpadded example: every position is attended to.
    attn_mask = sent2_token_type(indexes)

    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)
    indexes_type = torch.LongTensor(indexes_type).unsqueeze(0).to(device)
    attn_mask = torch.LongTensor(attn_mask).unsqueeze(0).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = model(indexes, attn_mask, indexes_type)
    return prediction