In [0]:
!pip install transformers



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
from transformers import *
import numpy as np
import scipy as scipy
import pandas as pd
import os
import ast
import tqdm as tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-cased')

# **Preprocess Data** 

In [0]:
data_path = '/content/drive/My Drive/Colab/data_qqp/'

train_data = pd.read_table(os.path.expanduser(data_path+'train.tsv'),header=0)
test_data = pd.read_table(os.path.expanduser(data_path+'dev_and_test.tsv'),header=0)

train_data['sentence1'] = train_data['sentence1'].apply(lambda x: ast.literal_eval(x).decode())
train_data['sentence2'] = train_data['sentence2'].apply(lambda x: ast.literal_eval(x).decode())

test_data['sentence1'] = test_data['sentence1'].apply(lambda x: ast.literal_eval(x).decode())
test_data['sentence2'] = test_data['sentence2'].apply(lambda x: ast.literal_eval(x).decode())

In [0]:
display(test_data)

In [0]:
class Tokenizer:
    # init
    def __init__(self, tokenizer_class, pretrained_weights, group_sent=False):
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.group_sent = group_sent
        
    # tokenize
    def tokenize_data(self, data):
        tokenized = pd.DataFrame(columns = ['i1', 's1', 'i2', 's2', 'i', 's', 'y'])
        for index, row in data.iterrows():
            token_row = {
                'i1' : [],
                's1' : [],
                'i2' : [],
                's2' : [],
                'i' : [],
                's' : [],
                'y': 0
            }
            sent1 = row['sentence1']
            encoding1 = self.tokenizer.encode_plus(sent1, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i1'] = encoding1['input_ids']
            token_row['s1'] = encoding1['token_type_ids']

            # indexed1 = self.tokenizer.encode(sent1, max_length=128, pad_to_max_length=True)
            # print(token)
            # indexed1 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed1])
            # segment = np.zeros(len(indexed1), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s1'] = segment_tensor

            sent2 = row['sentence2']
            encoding2 = self.tokenizer.encode_plus(sent2, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i2'] = encoding2['input_ids']
            token_row['s2'] = encoding2['token_type_ids']
            # indexed2 = self.tokenizer.encode(sent2, max_length=128, pad_to_max_length=True)
            # # indexed2 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed2])
            # token_row['i2'] = indexed_tensor
            # segment = np.zeros(len(indexed2), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s2'] = segment_tensor
            
            if self.group_sent == True:
                encoding = self.tokenizer.encode_plus(sent1, sent2, return_token_type_ids=True, max_length=256, pad_to_max_length=True)

                token_row['i'] = encoding2['input_ids']
                token_row['s'] = encoding2['token_type_ids']

                # print(encoding)
                # indexed = self.tokenizer.build_inputs_with_special_tokens(indexed1, indexed2)
                # token_row['i'] = torch.tensor([indexed])
                # # print(torch.tensor([indexed]).shape)

                # max_length = max(len(indexed), max_length)

                # if len(indexed) > 500:
                #     print(sent1 + sent2)
                # segment = self.tokenizer.create_token_type_ids_from_sequences(indexed1, indexed2)
                # token_row['s'] = torch.tensor([segment])
            
            token_row['y'] = row['label']
            tokenized = tokenized.append(token_row, ignore_index=True)
        return tokenized

In [0]:
tokenizer = Tokenizer(TOKENIZER_CLASS, PRETRAINED, True)
test_tokens = tokenizer.tokenize_data(test_data)
train_tokens = tokenizer.tokenize_data(train_data)

In [0]:
display(test_tokens)

In [0]:
test_tokens.to_csv('/content/drive/My Drive/Colab/data_qqp/test_tok.csv')
train_tokens.to_csv('/content/drive/My Drive/Colab/data_qqp/train_tok.csv')

# **Load Data**
- TBD: Create DataLoader

In [0]:
test_tokens = pd.read_csv('/content/drive/My Drive/Colab/data_qqp/test_tok.csv')
test_tokens['i'] = test_tokens['i'].apply(lambda x: ast.literal_eval(x))
test_tokens['s'] = test_tokens['s'].apply(lambda x: ast.literal_eval(x))

train_tokens = pd.read_csv('/content/drive/My Drive/Colab/data_qqp/train_tok.csv')
train_tokens['i'] = train_tokens['i'].apply(lambda x: ast.literal_eval(x))
train_tokens['s'] = train_tokens['s'].apply(lambda x: ast.literal_eval(x))

In [0]:
# display(test_tokens['i'].values)

# **BERT Model**

## Model Code

In [0]:
class Model_Classifier:
    # initalize model
    def __init__(self, model_class, pretrained_weights, pretrained_path=None):
        if pretrained_path is not None:
            self.model = model_class.from_pretrained(pretrained_path).cuda()
        else:
            self.model = model_class.from_pretrained(pretrained_weights).cuda()

        self.TRAINING_ITERATIONS = 100000
        self.WARMUP = 1000
        self.BATCH_SIZE = 128
        self.REPORT_FREQUENCY = 100
        self.CHKPT_FREQUENCY = 5000
#         self.model.eval()
        
    # train classifier
    def train(self, data):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.WARMUP, num_training_steps=self.TRAINING_ITERATIONS)
        self.model.train()
        bar = tqdm.tqdm(total=self.REPORT_FREQUENCY)
        for i in range(1, self.TRAINING_ITERATIONS+1):
            train_data = data.sample(n=self.BATCH_SIZE)
            train_ids = torch.tensor(train_data['i'].values.tolist()).cuda()
            train_segments = torch.tensor(train_data['s'].values.tolist()).cuda()
            train_labels = torch.tensor(train_data['y'].values.tolist()).cuda()

            # Uncomment this for BERT
            # outputs = self.model(input_ids = train_ids, token_type_ids = train_segments, labels = train_labels)

            # Comment this for DistilBERT
            outputs = self.model(input_ids = train_ids, labels = train_labels)
            loss = outputs[0].cuda()
            loss.backward()
            optimizer.step()
            scheduler.step()
            self.model.zero_grad()
            bar.update(1)
            if i % self.REPORT_FREQUENCY == 0:
                bar.close()
                print("%d: Loss - %s" %(i, loss))
                bar = tqdm.tqdm(total=self.REPORT_FREQUENCY)
            if i % self.CHKPT_FREQUENCY == 0:
                print("Saved %d" %(i))
                self.save(directory='CHKPT_' + str(int(i/self.CHKPT_FREQUENCY)))
        bar.close()
        return

    # save model
    def save(self, directory=None):
      if directory is not None:
          path = '/content/drive/My Drive/Colab/' + directory + '/'
          if not os.path.exists(path):
              os.makedirs(path)
          print("HERE")
          self.model.save_pretrained('/content/drive/My Drive/Colab/' + directory + '/')
      else:
          self.model.save_pretrained('/content/drive/My Drive/Colab/model/')
    
    # predict values
    def test(self, data):
        self.model.eval()
        ids = data['i'].values
        segments = data['s'].values
        labels = data['y'].values
        correct = 0
        total = len(labels)
        for i in range(len(ids)):
            outputs = self.model(input_ids = torch.tensor([ids[i]]).cuda())
            probs = outputs[0][0]
            softmax = torch.nn.functional.softmax(probs)
            prediction = softmax.max(0)[1]
            # print("pred: %f, act: %f @ %s" %(prediction, labels[i], softmax[0]))
            if prediction == labels[i]:
                correct += 1
        return correct/total

## Train Model

In [0]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED)

In [7]:
bert_class.train(train_tokens)

100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

100: Loss - tensor(0.6683, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

200: Loss - tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

300: Loss - tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

400: Loss - tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

500: Loss - tensor(0.6048, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

600: Loss - tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

700: Loss - tensor(0.5267, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

800: Loss - tensor(0.4692, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

900: Loss - tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1000: Loss - tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1100: Loss - tensor(0.2774, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1200: Loss - tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1300: Loss - tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1400: Loss - tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1500: Loss - tensor(0.2662, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1600: Loss - tensor(0.1031, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1700: Loss - tensor(0.1782, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1800: Loss - tensor(0.1370, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

1900: Loss - tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2000: Loss - tensor(0.0789, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2100: Loss - tensor(0.0503, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2200: Loss - tensor(0.0679, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2300: Loss - tensor(0.0159, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2400: Loss - tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2500: Loss - tensor(0.0662, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2600: Loss - tensor(0.0453, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2700: Loss - tensor(0.0504, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2800: Loss - tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

2900: Loss - tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3000: Loss - tensor(0.0647, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3100: Loss - tensor(0.0127, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3200: Loss - tensor(0.0247, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3300: Loss - tensor(0.0174, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3400: Loss - tensor(0.0058, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3500: Loss - tensor(0.0396, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3600: Loss - tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3700: Loss - tensor(0.0242, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3800: Loss - tensor(0.0125, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

3900: Loss - tensor(0.0220, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4000: Loss - tensor(0.0373, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4100: Loss - tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4200: Loss - tensor(0.0332, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4300: Loss - tensor(0.0046, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4400: Loss - tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4500: Loss - tensor(0.0025, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4600: Loss - tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4700: Loss - tensor(0.0241, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4800: Loss - tensor(0.0083, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:08<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

4900: Loss - tensor(0.0136, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 100/100 [01:07<00:00,  1.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

5000: Loss - tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward>)
Saved 5000


AssertionError: ignored

In [15]:
bert_class.save()

HERE


## Test Model


In [19]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/model/')
acc = bert_class.test(test_tokens)
print(acc)



0.7666174298375185


# **Custom Model**

In [0]:
class Model:
    # initalize model
    def __init__(self, model_class, tokenizer_class, pretrained_weights):
        self.model = model_class.from_pretrained(pretrained_weights)
        
    # Generate embeddings
    def get_embeddings(self, data):
        return
        
    # 
    def train(self, data):
        return
    
    # 
    def test(self, data):
        return