In [12]:
!pip install -U accelerate
!pip install -U transformers

NotImplementedError: ignored

In [13]:
# Reference: http://mohitmayank.com/a_lazy_data_science_guide/natural_language_processing/GPTs/#finetuning-gpt-2-for-sentiment-classification

import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name= "gpt2"

In [14]:
# loading the tokenizer and model
tokenizer= GPT2Tokenizer.from_pretrained(model_name)
model= GPT2LMHeadModel.from_pretrained(model_name).cuda()

In [36]:
prompt= """
Complete the following sentences.
POEM: I, once had a girl ,
or should i say,
she once had me.
POEM: She showed me her room,
isnt it good ,
norwegian wood.

POEM: She asked ,
"""

generated= tokenizer(prompt, return_tensors="pt").input_ids.cuda()
sample_output= model.generate(generated, do_sample=False, top_k= 20, max_length= 256, top_p= 0.90, temperature=0.2)
predicted_text= tokenizer.decode(sample_output[0], skip_special_tokens= True)
print(predicted_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Complete the following sentences.
POEM: I, once had a girl, 
or should i say, 
she once had me. 
POEM: She showed me her room, 
isnt it good, 
norwegian wood.

POEM: She asked, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it, 

she asked me to tell her, 

what is it


In [37]:
# create prompt
prompt = """

Question: What is wireless communication?
Category: Communication

Question: Is BJT is a three terminal device?
Category: Electronics

Question: What are different modes of propagation?
"""

In [38]:
generated= tokenizer(prompt, return_tensors="pt").input_ids.cuda()

In [39]:
sample_output= model.generate(generated, do_sample=False, top_k=50, max_length=112, top_p=0.90, temperature=0.5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [40]:
#decoding the predicted tokens into text
predicted_text= tokenizer.decode(sample_output[0], skip_special_tokens=True)

In [41]:
print(predicted_text)



Question: What is wireless communication?
Category: Communication

Question: Is BJT is a three terminal device?
Category: Electronics

Question: What are different modes of propagation?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics


### Finetuning on sentiment analysis task

In [42]:
import re
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [43]:
class SentimentDataset(Dataset):
    def __init__(self, txt_list, label_list, tokenizer, max_length):
        self.input_ids=[]
        self.attn_masks=[]
        self.labels=[]
        map_label= {0:'negative', 4:'positive'}
        #iterate through the dataset
        for txt, label in zip(txt_list, label_list):
            #prepare the text
            prep_txt= f'<|startoftext|>Tweet:{txt}<|pad|>Sentiment:{map_label[label]}<|endoftext|>'
            #tokenize
            encodings_dict= tokenizer(prep_txt, truncation=True, max_length=max_length, padding="max_length")

            # append to list
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            self.labels.append(map_label[label])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self,idx):
        return self.input_ids[idx], self.attn_masks[idx], self.labels[idx]



In [62]:
# data load function
def load_sentiment_dataset(tokenizer):
    # load dataset and sample 10k reviews
    file_path= "chunk_1.csv"
    df= pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
    df= df[[0, 5]]
    df.columns= ['label', 'text']
    df= df.sample(100, random_state=1)

    # divide our dataset into train and test set
    X_train, X_test, y_train, y_test= train_test_split(df['text'].tolist(), df['label'].tolist(), shuffle=True, test_size=0.05,
                                                         random_state=1, stratify= df['label'])

    # format into SentimentDataset class
    train_dataset= SentimentDataset(X_train, y_train, tokenizer, max_length=512)
    return train_dataset, (X_test, y_test)


In [63]:
# lOad model and data

model_name= 'gpt2'
torch.manual_seed(42)

# load tokenizer and model
tokenizer= GPT2Tokenizer.from_pretrained(model_name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token= '<|pad|>')
model= GPT2LMHeadModel.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [64]:
# prepare and load dataset
train_dataset, test_dataset= load_sentiment_dataset(tokenizer)

# creating training arguments
training_args= TrainingArguments(
                output_dir='results',
                num_train_epochs=2,
                logging_steps=10,
                load_best_model_at_end=True,
                save_strategy="epoch",
                evaluation_strategy="epoch",
                per_device_train_batch_size=2,
                per_device_eval_batch_size=2,
                warmup_steps=100,
                weight_decay=0.01,
                logging_dir='logs')

In [65]:
import accelerate

accelerate.__version__

'0.24.1'

In [66]:
transformers.__version__

'4.35.2'

In [74]:
# Starting training

Trainer(model= model,
        args= training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator= lambda data: {'input_ids': torch.stack([f[0] for f in train_dataset]),
                                    'attention_mask': torch.stack([f[1] for f in train_dataset]),
                                    'labels': torch.stack([f[2] for f in train_dataset])}).train()


TypeError: ignored