In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import torch
from transformers import BertTokenizer
from torchtext.legacy import data
from sklearn.model_selection import train_test_split
from transformers import BertModel
import transformers

#from transformers import *
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]


In [2]:
abspath = os.path.abspath('')
dname = os.path.dirname(abspath)
os.chdir(dname)
print(dname)

c:\Users\wongy\OneDrive\Desktop\duplicate-questions-pair-detection


In [4]:
df = pd.read_feather('data/processed/train_w_lcs.feather')

In [5]:
df.head()

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_cleaned,q2_cleaned,q1_start,q2_start,lc_substring,lc_subsequence
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,what,what,56,56
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,what,what,26,41
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,how,how,10,36
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,why,find,2,18
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,which,which,6,25


In [6]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['q1_cleaned'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['q2_cleaned'].split(" ")))    
    return len(w1 & w2)
    
df['jaccard_dist'] = nltk.jaccard_distance(set(df['q1_cleaned']), set(df['q2_cleaned']))
df['common_words'] = df.apply(common_words, axis=1)
df['common_ratio'] = df.apply(lambda row: row['common_words'] / (len(row['q1_cleaned']) + len(row['q2_cleaned'])), axis=1)

In [7]:
df.head(3)

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_cleaned,q2_cleaned,q1_start,q2_start,lc_substring,lc_subsequence,jaccard_dist,common_words,common_ratio
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,what,what,56,56,0.900602,11,0.090909
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,what,what,26,41,0.900602,4,0.031008
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,how,how,10,36,0.900602,4,0.026667


In [8]:
# Masking the tokens to feed into BERT
def sent1_token_type(sentence):
    try:
        return [0]* len(sentence)
    except:
        return []
    
#Get list of 1s
def sent2_token_type(sentence):
    try:
        return [1]* len(sentence)
    except:
        return []

#combine from lists
def combine_seq(seq):
    return " ".join(seq)

#combines from lists of int
def combine_mask(mask):
    mask = list(map(str, mask))
    return " ".join(mask)

#convert attention mask back to list of int
def convert_mask(mask):
    return list(map(int, mask))

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

def tokenize_bert(sentence):
    tokens = tokenizer.tokenize(sentence) 
    return tokens

def split(sentence):
    tokens = sentence.strip().split(" ")
    return tokens

In [10]:
df['q1_padded'] = '[CLS] ' + df['q1_cleaned'] + ' [SEP] '
df['q2_padded'] = df['q2_cleaned'] + ' [SEP]'
df['q1_bert_tokens'] = df['q1_padded'].apply(lambda x: tokenize_bert(x))
df['q2_bert_tokens'] = df['q2_padded'].apply(lambda x: tokenize_bert(x))

In [11]:
df['q1_token_type'] = df['q1_bert_tokens'].apply(lambda x: sent1_token_type(x))
df['q2_token_type'] = df['q2_bert_tokens'].apply(lambda x: sent2_token_type(x))

In [12]:
df['sequence'] = df['q1_bert_tokens'] + df['q2_bert_tokens']
df['attn_mask'] = df['sequence'].apply(lambda x: sent2_token_type(x)) # every word needs attention
df['token_type'] = df['q1_token_type'] + df['q2_token_type']


In [13]:
# Make all the inputs to be sequential in string instead of list
df['sequence'] = df['sequence'].apply(lambda x: combine_seq(x))
df['attn_mask'] = df['attn_mask'].apply(lambda x: combine_mask(x))
df['token_type'] = df['token_type'].apply(lambda x: combine_mask(x))

In [14]:
#For sequence
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
#For label
LABEL = data.LabelField()

#For Attention mask
ATTENTION = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split,
                  preprocessing = convert_mask,
                  pad_token = pad_token_idx)
#For token type ids
TTYPE = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split,
                  preprocessing = convert_mask,
                  pad_token = 1)

In [35]:
X = df.copy()[[x for x in df.columns if x != 'is_duplicate']]
y = df['is_duplicate']
X.drop(['index', 'id', 'qid1', 'qid2', 'question1', 'question2', 'q1_cleaned', 'q2_cleaned', 'q1_padded', 'q2_padded', 'q1_bert_tokens', 'q2_bert_tokens', 'q1_token_type', 'q2_token_type'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify = y_train)

In [36]:
X_train.reset_index(drop=True, inplace=True)
y_train = pd.DataFrame(y_train.tolist(), columns = ['is_duplicate'])
y_train.reset_index(drop=True, inplace=True)

df_train = pd.concat([X_train, y_train],axis=1)

X_val.reset_index(drop=True, inplace=True)
y_val = pd.DataFrame(y_val.tolist(), columns = ['is_duplicate'])
y_val.reset_index(drop=True, inplace=True)
    
df_val = pd.concat([X_val, y_val], axis=1)

X_test.reset_index(drop=True, inplace=True)
y_test = pd.DataFrame(y_test.tolist(), columns = ['is_duplicate'])
y_test.reset_index(drop=True, inplace=True)
    
df_test = pd.concat([X_test, y_test], axis=1)

In [None]:
# df_train.to_feather('data/processed/bert_train.feather')
# df_val.to_feather('data/processed/bert_val.feather')
# df_test.to_feather('data/processed/bert_test.feather')

In [26]:
LABEL.build_vocab(X_train)
#Create iterator
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [15]:
df.head(3)

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_cleaned,q2_cleaned,q1_start,...,common_ratio,q1_padded,q2_padded,q1_bert_tokens,q2_bert_tokens,q1_token_type,q2_token_type,sequence,attn_mask,token_type
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,what,...,0.090909,[CLS] what is the step by step guide to invest...,what is the step by step guide to invest in sh...,"[[CLS], what, is, the, step, by, step, guide, ...","[what, is, the, step, by, step, guide, to, inv...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",[CLS] what is the step by step guide to invest...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 ...
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,what,...,0.031008,[CLS] what is the story of kohinoor kohinoor d...,what would happen if the indian government sto...,"[[CLS], what, is, the, story, of, ko, ##hin, #...","[what, would, happen, if, the, indian, governm...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[CLS] what is the story of ko ##hin ##oor ko #...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 ...
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,how,...,0.026667,[CLS] how can i increase the speed of my inter...,how can internet speed be increased by hacking...,"[[CLS], how, can, i, increase, the, speed, of,...","[how, can, internet, speed, be, increased, by,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",[CLS] how can i increase the speed of my inter...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 ...
