In [1]:
!pip install transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from transformers import *
import numpy as np
import scipy as scipy
import pandas as pd
import os
import ast
import tqdm as tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-cased')

# **Preprocess Data** 

In [0]:
data_path = '/content/drive/My Drive/Colab/data_qqp/'

train_data = pd.read_table(os.path.expanduser(data_path+'train.tsv'),header=0)
test_data = pd.read_table(os.path.expanduser(data_path+'dev_and_test.tsv'),header=0)

train_data['sentence1'] = train_data['sentence1'].apply(lambda x: ast.literal_eval(x).decode())
train_data['sentence2'] = train_data['sentence2'].apply(lambda x: ast.literal_eval(x).decode())

test_data['sentence1'] = test_data['sentence1'].apply(lambda x: ast.literal_eval(x).decode())
test_data['sentence2'] = test_data['sentence2'].apply(lambda x: ast.literal_eval(x).decode())

In [0]:
display(test_data)

In [0]:
class Tokenizer:
    # init
    def __init__(self, tokenizer_class, pretrained_weights, group_sent=False):
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.group_sent = group_sent
        
    # tokenize
    def tokenize_data(self, data):
        tokenized = pd.DataFrame(columns = ['i1', 's1', 'i2', 's2', 'i', 's', 'y'])
        for index, row in data.iterrows():
            token_row = {
                'i1' : [],
                's1' : [],
                'i2' : [],
                's2' : [],
                'i' : [],
                's' : [],
                'y': 0
            }
            sent1 = row['sentence1']
            encoding1 = self.tokenizer.encode_plus(sent1, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i1'] = encoding1['input_ids']
            token_row['s1'] = encoding1['token_type_ids']

            # indexed1 = self.tokenizer.encode(sent1, max_length=128, pad_to_max_length=True)
            # print(token)
            # indexed1 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed1])
            # segment = np.zeros(len(indexed1), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s1'] = segment_tensor

            sent2 = row['sentence2']
            encoding2 = self.tokenizer.encode_plus(sent2, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i2'] = encoding2['input_ids']
            token_row['s2'] = encoding2['token_type_ids']
            # indexed2 = self.tokenizer.encode(sent2, max_length=128, pad_to_max_length=True)
            # # indexed2 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed2])
            # token_row['i2'] = indexed_tensor
            # segment = np.zeros(len(indexed2), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s2'] = segment_tensor
            
            if self.group_sent == True:
                encoding = self.tokenizer.encode_plus(sent1, sent2, return_token_type_ids=True, max_length=256, pad_to_max_length=True)

                token_row['i'] = encoding2['input_ids']
                token_row['s'] = encoding2['token_type_ids']

                # print(encoding)
                # indexed = self.tokenizer.build_inputs_with_special_tokens(indexed1, indexed2)
                # token_row['i'] = torch.tensor([indexed])
                # # print(torch.tensor([indexed]).shape)

                # max_length = max(len(indexed), max_length)

                # if len(indexed) > 500:
                #     print(sent1 + sent2)
                # segment = self.tokenizer.create_token_type_ids_from_sequences(indexed1, indexed2)
                # token_row['s'] = torch.tensor([segment])
            
            token_row['y'] = row['label']
            tokenized = tokenized.append(token_row, ignore_index=True)
        return tokenized

In [0]:
tokenizer = Tokenizer(TOKENIZER_CLASS, PRETRAINED, True)
test_tokens = tokenizer.tokenize_data(test_data)
train_tokens = tokenizer.tokenize_data(train_data)

In [0]:
display(test_tokens)

In [0]:
test_tokens.to_csv('/content/drive/My Drive/Colab/data_qqp/test_tok.csv')
train_tokens.to_csv('/content/drive/My Drive/Colab/data_qqp/train_tok.csv')

# **Load Data**
- TBD: Create DataLoader

In [0]:
test_tokens = pd.read_csv('/content/drive/My Drive/Colab/data_qqp/test_tok.csv')
test_tokens['i'] = test_tokens['i'].apply(lambda x: ast.literal_eval(x))
test_tokens['s'] = test_tokens['s'].apply(lambda x: ast.literal_eval(x))

train_tokens = pd.read_csv('/content/drive/My Drive/Colab/data_qqp/train_tok.csv')
train_tokens['i'] = train_tokens['i'].apply(lambda x: ast.literal_eval(x))
train_tokens['s'] = train_tokens['s'].apply(lambda x: ast.literal_eval(x))

In [0]:
# display(test_tokens['i'].values)

# **BERT Model**

In [0]:
class Model_Classifier:
    # initalize model
    def __init__(self, model_class, pretrained_weights):
        self.model = model_class.from_pretrained(pretrained_weights).cuda()
        self.TRAINING_ITERATIONS = 10000
        self.WARMUP = 500
        self.BATCH_SIZE = 64
        self.REPORT_FREQUENCY = 100
#         self.model.eval()
        
    # train classifier
    def train(self, data):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.WARMUP, num_training_steps=self.TRAINING_ITERATIONS)
        self.model.train()
        bar = tqdm.tqdm(total=1)
        for i in range(0, self.TRAINING_ITERATIONS):
            train_data = data.sample(n=self.BATCH_SIZE)
            train_ids = torch.tensor(train_data['i'].values.tolist()).cuda()
            train_segments = torch.tensor(train_data['s'].values.tolist()).cuda()
            train_labels = torch.tensor(train_data['y'].values.tolist()).cuda()
            
            # Uncomment this for BERT
            # outputs = self.model(input_ids = train_ids, token_type_ids = train_segments, labels = train_labels)

            # Comment this for DistilBERT
            outputs = self.model(input_ids = train_ids, labels = train_labels)
            loss = outputs[0].cuda()
            loss.backward()
            optimizer.step()
            scheduler.step()
            self.model.zero_grad()
            bar.update(1)
            if i % self.REPORT_FREQUENCY == 0:
                bar.close()
                print("%d: Loss - %s" %(i, loss))
                bar = tqdm.tqdm(total=100)
        bar.close()
        return

    # save model
    def save(self):
      self.model.save_pretrained('/content/drive/My Drive/Colab/model/')
    
    # predict values
    def test(self, data):
        self.model.eval()
        ids = data['i'].values
        segments = data['s'].values
        labels = data['y'].values
        for i in range(len(ids)):
            outputs = self.model(input_ids = ids[i], token_type_ids = segments[i])
            probs = outputs[0][0]
            print("pred: %f, act: %f" %(probs.max(0)[1], labels[i]))

In [0]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED)

In [0]:
bert_class.train(train_tokens)

100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

0: Loss - tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

100: Loss - tensor(0.4605, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

200: Loss - tensor(0.6807, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

300: Loss - tensor(0.6702, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

400: Loss - tensor(0.7297, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

500: Loss - tensor(0.5044, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

600: Loss - tensor(0.6039, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

700: Loss - tensor(0.5879, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.45it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

800: Loss - tensor(0.5958, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

900: Loss - tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1000: Loss - tensor(0.5613, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1100: Loss - tensor(0.6222, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1200: Loss - tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1300: Loss - tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1400: Loss - tensor(0.5813, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1500: Loss - tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1600: Loss - tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1700: Loss - tensor(0.7098, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1800: Loss - tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1900: Loss - tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2000: Loss - tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2100: Loss - tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward>)


 47%|████▋     | 47/100 [00:32<00:36,  1.45it/s]

In [0]:
bert_class.test(test_tokens)

# **Custom Model**

In [0]:
class Model:
    # initalize model
    def __init__(self, model_class, tokenizer_class, pretrained_weights):
        self.model = model_class.from_pretrained(pretrained_weights)
        
    # Generate embeddings
    def get_embeddings(self, data):
        return
        
    # 
    def train(self, data):
        return
    
    # 
    def test(self, data):
        return