In [1]:
# Install Hugging Face transformers into the runtime. The version is not pinned;
# the recorded output of this cell shows that 4.15.0 was resolved on the last run.
!pip install transformers
# Reload edited modules automatically and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
# Mount Google Drive at /content/gdrive; later cells write the generated CSV and
# the trained weights under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 531 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 30.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

# NOTE: upload the `data.csv` file to the runtime's working directory before running the cells below

In [2]:
# imports
import pandas as pd
from transformers import RobertaTokenizerFast
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
import csv

In [3]:
# Tokenizer and masked-LM model are loaded from the same pretrained checkpoint
# so that vocabulary and embeddings match.
MODEL_CHECKPOINT = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Collator used by the Trainer to build masked-language-modeling batches
# (mlm enabled, masking probability 0.15).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

# Create the Dataset
First, filter out the bad pairs.

In [4]:
# Tag values for a pair (same / close / wrong).
# NOTE(review): nothing in the visible notebook reads these constants — confirm
# whether they are still needed.
SAME_PAIR, CLOSE_PAIR, WRONG_PAIR = 'same', 'close', 'wrong'

class DataLoader:
    """
    Walks a DataFrame of annotated sentences and yields ``(sentence, C, A)`` triples.

    The frame is expected to provide the columns ``sentence``, ``c1``, ``c2``,
    ``a1``, ``a2``, ``close pair`` and ``A``. For every row:

    * ``c1`` is emitted when ``a1 == 0`` and ``c2`` when ``a2 == 0``
      (presumably 0 marks a pair that was judged good — confirm against the
      annotation scheme);
    * ``close pair`` is emitted additionally when ``consider_close_C`` is true.

    Note: no same/close/wrong tag is produced; the third element of every
    triple is the row's ``A`` value.
    """

    def __init__(self, data, consider_close_C):
        """
        data: the pandas DataFrame (or a slice of one) to read rows from
        consider_close_C: when true, also yield each row's ``close pair`` value
        """
        self.data = data
        self.consider_close_C = consider_close_C

    def read_data(self):
        """Yield ``(sentence, C, A)`` triples row by row, in frame order."""
        for _, row in self.data.iterrows():
            if row['a1'] == 0:
                yield row['sentence'], row['c1'], row['A']

            if row['a2'] == 0:
                yield row['sentence'], row['c2'], row['A']

            # NOTE(review): the close pair is emitted without any filter; if the
            # column can be empty for some rows, those rows will yield a missing value.
            if self.consider_close_C:
                yield row['sentence'], row['close pair'], row['A']
          

In [5]:
# a class that prepares the data specifically for the transformer training
class TransformerDataset(Dataset):
    """
    In-memory dataset of tokenized sentences for transformer training.

    Every ``(sentence, C, A)`` triple produced by the data loader is turned into
    a single string by ``sentence_function`` and tokenized once, at construction.
    """

    def __init__(self, data_loader, tokenizer, sentence_function):
        """
        data_loader: object whose ``read_data()`` yields (sentence, C, A) tuples
        tokenizer: callable applied to each built sentence string
        sentence_function: takes (sentence, C, A) and returns the sentence
            that should be added to the dataset
        """
        self.tokenizer = tokenizer
        self.sentence_function = sentence_function
        self.dataset = []

        # Tokenize eagerly so __getitem__ is a plain list lookup.
        for sentence, C, A in data_loader.read_data():
            self._add_to_dataset(sentence, C, A)

    def _add_to_dataset(self, sentence, C, A):
        """Build the sentence for one triple, tokenize it and store the result."""
        text = self.sentence_function(sentence, C, A)
        self.dataset.append(self.tokenizer(text))

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenizer output stored at position ``idx``."""
        return self.dataset[idx]

### Inference stuff to test our model

In [41]:
# an inference function
def model_inference(model, tokenizer, sentence, k=5):
    """
    Fill the first mask token of ``sentence`` with the model's ``k`` best guesses.

    model: masked-LM model whose forward output exposes ``.logits``
    tokenizer: matching tokenizer (provides ``mask_token``, ``mask_token_id``, ``decode``)
    sentence: text containing the tokenizer's mask token
    k: number of candidate fillings to return

    Returns a list of ``k`` strings, best candidate first, each being ``sentence``
    with its first mask token replaced by one predicted token.

    Raises ValueError when ``sentence`` contains no mask token.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    inputs = tokenizer(sentence, return_tensors="pt")
    model_inputs = {key: val.to(device) for key, val in inputs.items()}
    mask_token_index = torch.where(model_inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        raise ValueError(f"sentence contains no {tokenizer.mask_token} token: {sentence!r}")

    # Inference must not depend on dropout and needs no autograd graph; the
    # model's previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            token_logits = model(**model_inputs).logits
    finally:
        model.train(was_training)

    mask_token_logits = token_logits[0, mask_token_index, :]
    # Only the first mask position is scored ...
    top_k_tokens = torch.topk(mask_token_logits, k, dim=1).indices[0].tolist()
    # ... so only the first mask occurrence is substituted.
    return [
        sentence.replace(tokenizer.mask_token, tokenizer.decode([token]), 1)
        for token in top_k_tokens
    ]


# asks the model to output in a "Therefore, A is C"
def let_model_generate_sentence(model, tokenizer, data_loader, file_path, max_steps=100):
    """
    Let the model continue each sentence after "therefore" and write the results to a CSV.

    For every ``(sentence, C, A)`` triple from ``data_loader.read_data()`` the prompt
    ``"<sentence> therefore <mask>"`` is extended one predicted token at a time
    (via ``model_inference`` with ``k=2``) until the text contains at least two
    periods. While the best candidate would already end the text but the part
    after the first period of the runner-up is shorter than 15 space-separated
    words, the runner-up is taken instead so the continuation keeps growing.

    model, tokenizer: passed through to ``model_inference``
    data_loader: object whose ``read_data()`` yields (sentence, C, A) tuples
    file_path: destination CSV; columns are
        'sentence', 'original C', 'original A', 'output'
    max_steps: upper bound on predicted tokens per sentence. When it is reached
        without the text being finished, the partial continuation is written
        instead of looping further.

    Raises ValueError when ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    # Use the tokenizer's own mask token everywhere (not a literal "[MASK]").
    mask = tokenizer.mask_token

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['sentence', 'original C', 'original A', 'output'])
        writer.writeheader()

        for sentence, C, A in data_loader.read_data():
            full_sentence = f"{sentence} therefore {mask}"
            generated = None

            for _ in range(max_steps):
                outputs = model_inference(model, tokenizer, full_sentence, k=2)
                output = outputs[0]
                # Best guess would finish the text, but the continuation is still
                # short: prefer the second-best token.
                if outputs[0].count('.') >= 2 and len(outputs[1].split('.')[1].split(' ')) < 15:
                    output = outputs[1]

                if output.count('.') >= 2:
                    # Keep everything from the first period on, minus the last character.
                    generated = output[output.find('.'): -1]
                    break

                full_sentence = f'{output} {mask}'

            if generated is None:
                # Step budget exhausted: keep the unfinished text from the first
                # period on (or all of it when there is no period).
                generated = output[max(output.find('.'), 0):]

            writer.writerow({
                'sentence': sentence,
                'original C': C,
                'original A': A,
                'output': generated
            })




## Create our data

In [7]:
# Read the annotated pairs from the working directory.
df = pd.read_csv('data.csv', encoding='unicode_escape')

# Sequential 80/20 split: leading rows train, trailing rows evaluate (no shuffling).
train_size = int(0.8 * len(df))

train_data_loader = DataLoader(data=df.iloc[:train_size], consider_close_C=True)
eval_data_loader = DataLoader(data=df.iloc[train_size:], consider_close_C=True)

def therefore_func(sentence, C, A):
    """Return the sentence followed by the conclusion "therefore <A> is <C>."."""
    template = '{sentence} therefore {A} is {C}.'
    return template.format(sentence=sentence, A=A, C=C)

# Tokenize both splits with the same sentence template.
# NOTE(review): eval_dataset is built here but is not handed to the Trainer
# in the training cell of this notebook.
train_dataset, eval_dataset = (
    TransformerDataset(loader, tokenizer, therefore_func)
    for loader in (train_data_loader, eval_data_loader)
)

## Train the Transformer

In [8]:
# Training configuration. The recorded log of this cell reports 207 examples
# and 26000 optimization steps for these settings.
# NOTE(review): save_steps=0 presumably turns off intermediate checkpoints —
# confirm against the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir='.',
    overwrite_output_dir=True,
    save_steps=0,
    save_total_limit=1,
    per_device_train_batch_size=16,
    num_train_epochs=2000,
    prediction_loss_only=True,
)

# Only the training split is used; no evaluation dataset is supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Run the fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 207
  Num Epochs = 2000
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 26000


Step,Training Loss
500,0.6088
1000,0.16
1500,0.1015
2000,0.072
2500,0.069
3000,0.0525
3500,0.0462
4000,0.0436
4500,0.0414
5000,0.0353




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=26000, training_loss=0.03805172648796668, metrics={'train_runtime': 8873.5375, 'train_samples_per_second': 46.656, 'train_steps_per_second': 2.93, 'total_flos': 8274804648665496.0, 'train_loss': 0.03805172648796668, 'epoch': 2000.0})

In [42]:
# Generate continuations for the held-out rows and store them on the mounted Drive.
let_model_generate_sentence(
    model=model,
    tokenizer=tokenizer,
    data_loader=eval_data_loader,
    file_path='/content/gdrive/MyDrive/output_after_2000_epochs_ver3.csv',
)

In [10]:
# Use the GPU when the runtime has one; otherwise stay on the CPU instead of
# failing in model.to() on a CUDA-less machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForMaskedLM(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [11]:
# Sanity check: show which device the model currently lives on.
model.device

device(type='cuda', index=0)

In [46]:
# Persist the fine-tuned weights to the mounted Drive. Only the state dict is
# written — the model config and the tokenizer are not saved by this call.
torch.save(model.state_dict(), '/content/gdrive/MyDrive/trained_model_2000.pth')


In [44]:
# Show the current working directory (the relative 'data.csv' path and
# output_dir='.' are resolved against it).
!pwd

/content
