<a href="https://colab.research.google.com/github/andreero/TodoToOrg/blob/master/Fine_tuning_GPT2_medium_in_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers -q

[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 596 kB 24.2 MB/s 
[K     |████████████████████████████████| 67 kB 3.2 MB/s 
[K     |████████████████████████████████| 6.5 MB 25.0 MB/s 
[K     |████████████████████████████████| 895 kB 40.4 MB/s 
[?25h

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

import logging
import warnings

# Keep the notebook output quiet: the root logger only lets CRITICAL records
# through, and every Python warning is filtered out.
logging.getLogger().setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Both the tokenizer and the language-model weights come from the same
# pretrained checkpoint; the model is placed on the selected device right away.
pretrained_name = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_name)
model = GPT2LMHeadModel.from_pretrained(pretrained_name).to(device)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` most probable entries of ``probs``.

    The ``n`` largest probabilities are renormalized so they sum to one and a
    single index is drawn from that truncated distribution (top-k sampling).

    Args:
        probs: 1-D array-like of non-negative scores, one per candidate token.
        n: How many of the highest-scoring candidates to sample from. Values
            larger than ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        The selected position in ``probs`` as a plain Python ``int``.

    Raises:
        ValueError: If ``probs`` is empty or ``n`` is smaller than 1.
    """
    probs = np.asarray(probs)  # also accept plain lists / tuples
    if probs.size == 0:
        raise ValueError("probs must contain at least one entry")
    if n < 1:
        raise ValueError("n must be at least 1")

    # argpartition raises when asked for more elements than exist, so never
    # request more candidates than there are entries.
    n = min(n, probs.size)

    # Indices of the n largest entries (in no particular order).
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # renormalize the truncated distribution
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [None]:
!pip install --upgrade gdown -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone


In [None]:
!gdown --id 1PITLQ5cbRXZCR6cFPIkTonn8zmp7Pc2Q

Downloading...
From: https://drive.google.com/uc?id=1PITLQ5cbRXZCR6cFPIkTonn8zmp7Pc2Q
To: /content/Busi Data - complete_data.csv
100% 2.96M/2.96M [00:00<00:00, 206MB/s]


In [None]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv
import pandas as pd
class AbstractsDataset(Dataset):
    """Dataset of abstracts formatted as GPT-2 training strings.

    Each item is the text of one row of the ``'Abstract Text'`` CSV column,
    wrapped as ``"ABSTRACT:<text><|endoftext|>"``.

    Args:
        abstracts_dataset_path: Directory that contains the CSV file. When
            left empty, the file is read from ``/content``, which is where the
            notebook's download cell stores it.
    """

    # Default directory and file name of the downloaded CSV.
    default_dir = '/content'
    file_name = 'Busi Data - complete_data.csv'

    def __init__(self, abstracts_dataset_path = ''):
        super().__init__()

        # Joining with an absolute file path would silently discard the
        # directory argument, so join the directory with the bare file name.
        data_dir = abstracts_dataset_path or self.default_dir
        short_abstracts_path = os.path.join(data_dir, self.file_name)

        self.end_of_text_token = "<|endoftext|>"

        # Rows without an abstract are read as NaN; drop them so the literal
        # text "nan" never ends up as a training sample.
        abstracts = pd.read_csv(short_abstracts_path)['Abstract Text'].dropna()

        self.abstract_list = [
            f"ABSTRACT:{text}{self.end_of_text_token}" for text in abstracts
        ]

    def __len__(self):
        """Return the number of formatted abstracts."""
        return len(self.abstract_list)

    def __getitem__(self, item):
        """Return the formatted abstract string at position ``item``."""
        return self.abstract_list[item]


In [None]:
# Build the dataset with its default CSV location and wrap it in a loader that
# yields one randomly ordered abstract string per iteration.
dataset = AbstractsDataset()
abstract_loader = DataLoader(
    dataset=dataset,
    shuffle=True,
    batch_size=1,
)

### Hyperparameters

I tested several (more than 5) hyperparameter sets until I found the one that works best. I mostly tuned ***BATCH_SIZE*** (in this case, the number of forward-backward passes between optimization steps), ***EPOCHS***, and ***LEARNING_RATE***.



In [None]:
# Number of forward/backward passes accumulated before each optimizer step.
BATCH_SIZE = 16
# Full passes over the abstract loader.
EPOCHS = 5
# Learning rate handed to the optimizer.
LEARNING_RATE = 3e-5
# Only referenced by the commented-out warmup scheduler in the training cell.
WARMUP_STEPS = 5_000
# Upper bound, in tokens, for a single abstract and for a packed sequence.
MAX_SEQ_LEN = 400
from transformers import AdamW#, WarmupLinearSchedule

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Model training

I will train the model and save its weights after each epoch. Then I will generate abstracts with each saved version of the weights to see which one performs best.

In [None]:
# Fine-tune the model on packed abstract sequences.
#
# Abstracts arrive one at a time and are concatenated until the next one would
# push the packed sequence past MAX_SEQ_LEN tokens; the packed sequence is then
# run through the model. Gradients are accumulated over BATCH_SIZE packed
# sequences before each optimizer step, and the weights are saved after every
# epoch.
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# NOTE: no learning-rate scheduler is attached, so WARMUP_STEPS is unused here.
proc_seq_count = 0  # packed sequences processed since the last optimizer step
sum_loss = 0.0      # running loss total, reset each time it is printed
batch_count = 0     # optimizer steps since the last loss printout

tmp_abstracts_tens = None  # packed sequence currently being filled
models_folder = "trained_models"
os.makedirs(models_folder, exist_ok=True)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for abstract in abstract_loader:

        # ---- Pack as many abstracts as fit into MAX_SEQ_LEN tokens ----
        abstract_tens = torch.tensor(tokenizer.encode(abstract[0])).unsqueeze(0).to(device)

        # Skip abstracts that are longer than MAX_SEQ_LEN on their own.
        if abstract_tens.size(1) > MAX_SEQ_LEN:
            continue

        # First abstract of a new packed sequence.
        if tmp_abstracts_tens is None:
            tmp_abstracts_tens = abstract_tens
            continue

        # Still room: append the whole abstract and keep filling. The GPT-2
        # tokenizer does not prepend a special token by default, so every
        # token (including the first) is real text and must be kept.
        if tmp_abstracts_tens.size(1) + abstract_tens.size(1) <= MAX_SEQ_LEN:
            tmp_abstracts_tens = torch.cat([tmp_abstracts_tens, abstract_tens], dim=1)
            continue

        # The abstract does not fit: train on what has been packed so far and
        # start the next packed sequence with this abstract.
        work_abstracts_tens = tmp_abstracts_tens
        tmp_abstracts_tens = abstract_tens

        # ---- Packed sequence ready, run it through the model ----
        outputs = model(work_abstracts_tens, labels=work_abstracts_tens)
        loss = outputs[0]
        loss.backward()
        # .item() keeps the running total a plain float, so no tensors are
        # kept alive and the printout shows a number, not a tensor repr.
        sum_loss += loss.item()

        proc_seq_count += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            # The optimizer holds every model parameter, so this clears all
            # accumulated gradients.
            optimizer.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_abstractr_{epoch}.pt"))
            



Token indices sequence length is longer than the specified maximum sequence length for this model (1413 > 1024). Running this sequence through the model will result in indexing errors




### Generating the abstracts

In [None]:
# Prompt that the generation cell tokenizes and then extends token by token.
# It opens with the same "ABSTRACT:" marker that prefixes every training sample.
input_text = "ABSTRACT: PROJECT SUMMARY The key appeal of polygenic risk scores (PRS) is"

In [None]:
# Generate one abstract from `input_text` with the weights saved after
# epoch MODEL_EPOCH, sampling each next token from the top-n candidates.
MODEL_EPOCH = 4

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_abstractr_{MODEL_EPOCH}.pt")
# map_location lets weights that were saved on a GPU be restored on whatever
# device this session actually runs on (including CPU-only runtimes).
model.load_state_dict(torch.load(model_path, map_location=device))

abstracts_output_file_path = f'generated_{MODEL_EPOCH}.txt'

model.eval()
# NOTE(review): a stale output file is removed here, but nothing in this cell
# writes a new one; the generated text is only printed.
if os.path.exists(abstracts_output_file_path):
    os.remove(abstracts_output_file_path)

abstract_num = 0
with torch.no_grad():
    abstract_finished = False

    cur_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device)

    # Encode the end-of-text marker once instead of on every step.
    end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))

    # Generate at most 1000 tokens, but never grow the sequence beyond the
    # number of positions the model supports.
    max_new_tokens = max(0, min(1000, model.config.n_positions - cur_ids.size(1)))

    for i in range(max_new_tokens):
        # Only the logits are needed, so no labels are passed and no loss
        # is computed.
        logits = model(cur_ids)[0]
        # First (and only) batch entry, prediction for the last position.
        softmax_logits = torch.softmax(logits[0, -1], dim=0)
        # Sample from a wider candidate pool for the first three tokens,
        # then narrow it down.
        n = 20 if i < 3 else 3
        # Randomly (from the top-n probability distribution) select the next token.
        next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
        # Append the chosen token to the running sequence.
        cur_ids = torch.cat([cur_ids, torch.tensor([[next_token_id]], device=device)], dim=1)

        if next_token_id in end_of_text_ids:
            abstract_finished = True
            break

    # The text is printed only when the model emitted the end-of-text token.
    if abstract_finished:

        abstract_num = abstract_num + 1

        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)

        print(output_text)
  

ABSTRACT: PROJECT SUMMARY The key appeal of polygenic risk scores (PRS) is to facilitate risk prediction for complex diseases and to guide the development of personalized medicine. However, PRS are often poorly validated in clinical trials. In the current study we will evaluate PRS in a large clinical trial of patients with type 1 diabetes mellitus, a complex disease with multiple genetic risk factors. We will evaluate the impact of PRS on clinical outcomes and clinical practice. We will use a multicentre, randomized controlled trial design, with a high degree of statistical power, to examine the effects of PRS on clinical outcomes, clinical practice and patient care in patients with type 1 diabetes mellitus (T1DM).<|endoftext|>
