In [1]:
!pip install transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from transformers import *
import numpy as np
import scipy as scipy
import pandas as pd
import os
import ast
import tqdm as tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (BertForSequenceClassification, BertTokenizer, 'bert-base-cased')
PAWS_QQP = False

# **Generate QQP**

In [0]:
data_path = '/content/drive/My Drive/Colab/data_qqp/'

all_data = pd.read_table(os.path.expanduser(data_path + 'quora_duplicate_questions.tsv'), header=0)
all_data = all_data.rename(columns={'is_duplicate': 'label', 'question1': 'sentence1', 'question2': 'sentence2'})

class_1 = all_data[all_data['label']==1]
class_0 = all_data[all_data['label']==0]

class_1_dev_test = class_1.sample(n=10000)
class_1_train = class_1.drop(class_1_dev_test.index)

class_0_dev_test = class_0.sample(n=10000)
class_0_train = class_0.drop(class_0_dev_test.index)

train_data = pd.concat([class_1_train, class_0_train])
train_data = train_data.sample(frac=1).reset_index(drop=True)
test_data = pd.concat([class_1_dev_test[:5000], class_0_dev_test[:5000]])
test_data = test_data.sample(frac=1).reset_index(drop=True)
dev_data = pd.concat([class_1_dev_test[5000:], class_0_dev_test[5000:]])
dev_data = dev_data.sample(frac=1).reset_index(drop=True)

In [0]:
train_data.to_csv(data_path + 'train.tsv', sep='\t')
dev_data.to_csv(data_path + 'dev.tsv', sep='\t')
test_data.to_csv(data_path + 'test.tsv', sep='\t')

# **Preprocess Data** 

In [0]:
DATA_FOLDER = 'data_PAWS_qqp'
FILE_NAMES = ['train.tsv', 'dev.tsv']

In [0]:
class Tokenizer:
    # init
    def __init__(self, tokenizer_class, pretrained_weights, group_sent=False):
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.group_sent = group_sent
        
    # tokenize
    def tokenize_data(self, data, file_path):
        bar = tqdm.tqdm(total=len(data))
        tokenized = pd.DataFrame(columns = ['i1', 's1', 'i2', 's2', 'i', 's', 'y'])
        count = 0
        flag = True
        for index, row in data.iterrows():
            token_row = {
                'i1' : [],
                's1' : [],
                'i2' : [],
                's2' : [],
                'i' : [],
                's' : [],
                'y': 0
            }
            sent1 = str(row['sentence1'])
            encoding1 = self.tokenizer.encode_plus(sent1, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i1'] = encoding1['input_ids']
            token_row['s1'] = encoding1['token_type_ids']

            # indexed1 = self.tokenizer.encode(sent1, max_length=128, pad_to_max_length=True)
            # print(token)
            # indexed1 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed1])
            # segment = np.zeros(len(indexed1), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s1'] = segment_tensor

            sent2 = str(row['sentence2'])
            encoding2 = self.tokenizer.encode_plus(sent2, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i2'] = encoding2['input_ids']
            token_row['s2'] = encoding2['token_type_ids']
            # indexed2 = self.tokenizer.encode(sent2, max_length=128, pad_to_max_length=True)
            # # indexed2 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed2])
            # token_row['i2'] = indexed_tensor
            # segment = np.zeros(len(indexed2), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s2'] = segment_tensor
            
            if self.group_sent == True:
                encoding = self.tokenizer.encode_plus(sent1, sent2, return_token_type_ids=True, max_length=256, pad_to_max_length=True)

                token_row['i'] = encoding2['input_ids']
                token_row['s'] = encoding2['token_type_ids']

                # print(encoding)
                # indexed = self.tokenizer.build_inputs_with_special_tokens(indexed1, indexed2)
                # token_row['i'] = torch.tensor([indexed])
                # # print(torch.tensor([indexed]).shape)

                # max_length = max(len(indexed), max_length)

                # if len(indexed) > 500:
                #     print(sent1 + sent2)
                # segment = self.tokenizer.create_token_type_ids_from_sequences(indexed1, indexed2)
                # token_row['s'] = torch.tensor([segment])
            
            token_row['y'] = row['label']
            tokenized = tokenized.append(token_row, ignore_index=True)
            bar.update()
            count += 1
            if count % 10000 == 0:
                if flag == True:
                    tokenized.to_csv(file_path)
                    flag = False
                else:
                    tokenized.to_csv(file_path, mode='a', header=False)
                tokenized = tokenized.iloc[0:0]
        if count % 10000 != 0:
            if flag == True:
                tokenized.to_csv(file_path)
                flag = False
            else:
                tokenized.to_csv(file_path, mode='a', header=False)
        bar.close()
        return tokenized

In [0]:
data_path = '/content/drive/My Drive/Colab/' + DATA_FOLDER + '/'
tokenizer = Tokenizer(TOKENIZER_CLASS, PRETRAINED, True)

for file_name in FILE_NAMES:
    data = pd.read_table(os.path.expanduser(data_path+file_name),header=0)
    tokens = tokenizer.tokenize_data(data, data_path + file_name.split('.')[0] + '.csv')
    # tokens.to_csv(data_path + file_name.split('.')[0] + '.csv')
    # display(tokens)

100%|██████████| 11988/11988 [00:57<00:00, 208.27it/s]
100%|██████████| 677/677 [00:02<00:00, 238.98it/s]


# **Load Data**
- TBD: Create DataLoader

In [0]:
BASE_QQP_PATH = '/content/drive/My Drive/Colab/data_qqp/'
BASE_PAWS_QQP_PATH = '/content/drive/My Drive/Colab/data_PAWS_qqp/'

In [0]:
# Load QQP sets
data = pd.read_csv(BASE_QQP_PATH + 'train.csv')
data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
train_set = data
# display(train_set)

In [0]:
# Load PAWS QQP if generated
if PAWS_QQP == True:    
    data = pd.read_csv(BASE_PAWS_QQP_PATH + 'train.csv')
    data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
    data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
    train_set = pd.concat([train_set, data])


In [0]:
data = pd.read_csv(BASE_QQP_PATH + 'dev.csv')
data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
dev_set = data

data = pd.read_csv(BASE_QQP_PATH + 'test.csv')
data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
test_set = data

In [0]:
paws_qqp_dev_set = None
if PAWS_QQP == True: 
    data = pd.read_csv(BASE_PAWS_QQP_PATH + 'dev.csv')
    data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
    data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
    paws_qqp_dev_set = data
    dev_set = pd.concat([dev_set, data])

# **BERT Model**

## Model Code

In [0]:
# Custom LR schedule
def get_custom_schedule(optimizer, last_epoch=-1):
    """ Create a schedule with a learning rate that decreases linearly after
    linearly increasing during a warmup period.
    """
    def lr_lambda(current_step):
        return max(1e-3, 1.0/current_step)

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch)

class Model_Classifier:
    # initalize model
    def __init__(self, model_class, pretrained_weights, pretrained_path=None):
        if pretrained_path is not None:
            self.model = model_class.from_pretrained(pretrained_path).cuda()
        else:
            self.model = model_class.from_pretrained(pretrained_weights).cuda()

        self.TRAINING_ITERATIONS = 100000
        self.WARMUP = 1000
        self.BATCH_SIZE = 64
        self.REPORT_FREQUENCY = 100
        self.CHKPT_FREQUENCY = 500
#         self.model.eval()
        

    def train_batch(self, train_data):
        train_ids = torch.tensor(train_data['i'].values.tolist()).cuda()
        train_segments = torch.tensor(train_data['s'].values.tolist()).cuda()
        train_labels = torch.tensor(train_data['y'].values.tolist()).cuda()

        # Uncomment this for BERT
        # outputs = self.model(input_ids = train_ids, token_type_ids = train_segments, labels = train_labels)

        # Uncomment this for DistilBERT
        self.model.train()
        outputs = self.model(input_ids = train_ids, labels = train_labels)
        loss = outputs[0].cuda()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        self.model.zero_grad()
        return loss

    # train classifier
    def train(self, data, validation_set):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
        # self.scheduler = get_custom_schedule(self.optimizer)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=self.TRAINING_ITERATIONS+1)
        prev_acc = 0.0
        bar = tqdm.tqdm(total=self.CHKPT_FREQUENCY)
        for i in range(1, self.TRAINING_ITERATIONS+1):
            train_data = data.sample(n=self.BATCH_SIZE)
            loss = self.train_batch(train_data)
            bar.update(1)
            if i%self.REPORT_FREQUENCY == 0:
                bar.write("%d: Loss - %s" %(i, loss))
            if i%self.CHKPT_FREQUENCY == 0:
                acc = self.test(validation_set)
                bar.write("%d: Validation - %s" %(i, str(acc)))
                if abs(prev_acc-acc) < 0.0001 and loss[0] < 0.001:
                    bar.write('Finish')
                    bar.close()
                    self.save(directory='final')
                    break
                prev_acc = acc
                bar.write("%d Saved" %(i))
                self.save(directory='CHKPT_')
                bar.close()
                bar = tqdm.tqdm(total=self.CHKPT_FREQUENCY)

            # data = data.sample(frac=1).reset_index(drop=True)
            # count = 1
            # for j in range(self.BATCH_SIZE, len(data), self.BATCH_SIZE):
            #     train_data = data.iloc[j-self.BATCH_SIZE:j]
            #     loss = self.train_batch(train_data)
            #     bar.update(1)
            #     if count%self.REPORT_FREQUENCY == 0:
            #         i = count/self.REPORT_FREQUENCY
            #         bar.write("%d: Loss - %s" %(i, loss))
            #     if count%self.CHKPT_FREQUENCY == 0:
            #         i = count/self.REPORT_FREQUENCY
            #         bar.write("%d: Loss - %s" %(i, loss))
            #         acc = self.test(validation_set)
            #         bar.write("%d: Validation - %s" %(i, str(acc)))
            #         if abs(prev_acc-acc) < 0.0001 and loss[0] < 0.001:
            #             bar.write('Finish')
            #             bar.close()
            #             self.save(directory='final')
            #             break
            #         prev_acc = acc
            #         bar.write("%d Saved" %(i))
            #         self.save(directory='CHKPT_')
                # count += 1
            # bar.close()
        # bar.close()
        return

    # save model
    def save(self, directory=None):
      if directory is not None:
          path = '/content/drive/My Drive/Colab/' + directory + '/'
          if not os.path.exists(path):
              os.makedirs(path)
          self.model.save_pretrained('/content/drive/My Drive/Colab/' + directory + '/')
      else:
          self.model.save_pretrained('/content/drive/My Drive/Colab/model/')
    
    # predict values
    def test(self, data, is_print=False):
        test_samples = pd.read_table(os.path.expanduser('/content/drive/My Drive/Colab/data_qqp/test.tsv'),header=0)
        self.model.eval()
        ids = data['i'].values
        segments = data['s'].values
        labels = data['y'].values
        correct = 0
        total = len(labels)
        i = 0
        for i in range(self.BATCH_SIZE, len(ids), self.BATCH_SIZE):
            input_ids = torch.tensor(ids[i-self.BATCH_SIZE:i].tolist()).cuda()
            outputs = self.model(input_ids = input_ids)
            probs = outputs[0]
            softmax = torch.nn.functional.softmax(probs, dim=1)
            prediction = torch.argmax(softmax, dim=1)
            # print(prediction)
            for j in range(i-self.BATCH_SIZE, i):
                if labels[j] == prediction[j-i]:
                    correct += 1
                elif is_print==True:
                    print("Mismatch Betweeen %s \t - \t %s, label (%d) vs prediction (%d)" %(test_samples.iloc[j]['sentence1'], test_samples.iloc[j]['sentence2'], labels[j], prediction[j-i]))
        if i < total:
            input_ids = torch.tensor(ids[i:total].tolist()).cuda()
            outputs = self.model(input_ids = input_ids)
            probs = outputs[0]
            softmax = torch.nn.functional.softmax(probs, dim=1)
            prediction = torch.argmax(softmax, dim=1)
            # print(prediction)
            for j in range(i, total):
                if labels[j] == prediction[j-i]:
                    correct += 1
                elif is_print==True:
                    print("Mismatch Betweeen %s \t - \t %s, label (%d) vs prediction (%d)" %(test_samples.iloc[j]['sentence1'], test_samples.iloc[j]['sentence2'], labels[j], prediction[j-i]))
        return correct/total

## Train Model

In [0]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED)

In [0]:
bert_class.train(train_set, dev_set)

 20%|██        | 100/500 [01:11<04:48,  1.39it/s]

100: Loss - tensor(0.6220, device='cuda:0', grad_fn=<NllLossBackward>)


 40%|████      | 200/500 [02:23<03:34,  1.40it/s]

200: Loss - tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)


 60%|██████    | 300/500 [03:34<02:22,  1.40it/s]

300: Loss - tensor(0.5994, device='cuda:0', grad_fn=<NllLossBackward>)


 80%|████████  | 400/500 [04:46<01:11,  1.39it/s]

400: Loss - tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 500/500 [05:58<00:00,  1.40it/s]

500: Loss - tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 500/500 [06:34<00:00,  1.40it/s]

500: Validation - 0.6037
500 Saved


100%|██████████| 500/500 [06:35<00:00,  1.26it/s]
 20%|██        | 100/500 [01:11<04:46,  1.39it/s]

600: Loss - tensor(0.5824, device='cuda:0', grad_fn=<NllLossBackward>)


 40%|████      | 200/500 [02:23<03:35,  1.39it/s]

700: Loss - tensor(0.5042, device='cuda:0', grad_fn=<NllLossBackward>)


 51%|█████▏    | 257/500 [03:04<02:54,  1.40it/s]

In [0]:
bert_class.save()

HERE


## Test Model


In [0]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/QQP_PAWS_1/')
acc = bert_class.test(paws_qqp_dev_set)
print(acc)

Mismatch Betweeen Why are the droids in Star Wars so stupid? 	 - 	 Why is everyone in Star Wars so racist against droids?, label (0) vs prediction (1)
Mismatch Betweeen Where can I get best assistance in Sydney for any property purchasing? 	 - 	 Where can I get best property transaction support in Sydney?, label (1) vs prediction (0)
Mismatch Betweeen What are the benefits of getting married? 	 - 	 What are the benefits of getting married in this life time?, label (0) vs prediction (1)
Mismatch Betweeen What is the best vegan food? 	 - 	 What is best vegan food in world?, label (0) vs prediction (1)
Mismatch Betweeen How did it feel when you met your soulmate? 	 - 	 How did you feel when you met your soulmate?, label (0) vs prediction (1)
Mismatch Betweeen I was 18 he was 40. He InitIally said to me that he wIll cut his private parts if i say no to marrying him. I married him. I am unhappy what do I do? 	 - 	 I love him a lot. He likes me too. But we haven't told each other openly. Rec

# **Custom Model**

In [0]:
class Model:
    # initalize model
    def __init__(self, model_class, tokenizer_class, pretrained_weights):
        self.model = model_class.from_pretrained(pretrained_weights)
        
    # Generate embeddings
    def get_embeddings(self, data):
        return
        
    # 
    def train(self, data):
        return
    
    # 
    def test(self, data):
        return