In [2]:
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Hugging Face checkpoint identifier shared by the tokenizer and the model below
model_name = "gpt2"

In [3]:
# Fetch the pretrained tokenizer and language-model weights for `model_name`,
# then move the model onto the GPU (a CUDA device is required).
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.cuda()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [49]:
# Few-shot prompt: two question/category examples followed by a third question
# whose category the model is expected to continue with. The text is assembled
# from adjacent literals so every newline is explicit.
prompt = (
    "\n"
    "\n"
    "Question: What is wireless communication?\n"
    "Category: Communication\n"
    "\n"
    "Question: Is BJT is a three terminal device?\n"
    "Category: Electronics\n"
    "\n"
    "Question: What are different modes of propagation?\n"
)

In [50]:
# Tokenize the prompt into PyTorch tensors and place the token ids on the GPU
generated = tokenizer(prompt, return_tensors="pt")["input_ids"].cuda()

In [51]:
sample_output= model.generate(generated, do_sample=False, top_k=50, max_length=112, top_p=0.90, temperature=0.5, num_return_sequences=0)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [52]:
# Convert the first generated sequence of token ids back into a string,
# leaving out special tokens.
predicted_text = tokenizer.decode(
    sample_output[0],
    skip_special_tokens=True,
)

In [53]:
# print() rather than a bare expression so newlines render as line breaks
# instead of as escaped "\n" in the string repr
print(predicted_text)



Question: What is wireless communication?
Category: Communication

Question: Is BJT is a three terminal device?
Category: Electronics

Question: What are different modes of propagation?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics

Question: What is the difference between a wireless and a two terminal device?

Category: Electronics


### Fine-tuning on a sentiment analysis task

In [54]:
import re
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
class SentimentDataset(Dataset):
    """Tokenized (tweet, sentiment) examples for fine-tuning a language model.

    Every pair is rendered into the template
    ``<|startoftext|>Tweet:{txt}<|pad|>Sentiment:{sentiment}<|endoftext|>`` and
    passed to ``tokenizer`` with truncation and ``padding="max_length"``.

    Args:
        txt_list: Iterable of tweet strings.
        label_list: Iterable of numeric labels parallel to ``txt_list``; ``0`` maps
            to ``'negative'`` and ``4`` maps to ``'positive'``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
            NOTE(review): the template relies on ``<|startoftext|>`` and ``<|pad|>``
            being known to the tokenizer and on a pad token being configured --
            confirm this where the tokenizer is created.
        max_length: Length handed to the tokenizer for truncation and padding.

    Raises:
        KeyError: If a label is neither ``0`` nor ``4``.
    """

    def __init__(self, txt_list, label_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        map_label = {0: 'negative', 4: 'positive'}
        # iterate through the dataset
        for txt, label in zip(txt_list, label_list):
            sentiment = map_label[label]
            # prepare the text
            prep_txt = f'<|startoftext|>Tweet:{txt}<|pad|>Sentiment:{sentiment}<|endoftext|>'
            # tokenize
            encodings_dict = tokenizer(prep_txt, truncation=True, max_length=max_length, padding="max_length")

            # append to list; both tensors must come from the same `encodings_dict`
            # (the mask previously read an undefined name and raised NameError)
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            self.labels.append(sentiment)

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, sentiment_string)`` for example ``idx``."""
        # the list built in __init__ is `attn_masks` (plural)
        return self.input_ids[idx], self.attn_masks[idx], self.labels[idx]



In [55]:
# data load function
def load_sentiment_dataset(tokenizer, file_path=" ", sample_size=10000):
    """Load the sentiment CSV, sample it, and split it into train and test parts.

    Args:
        tokenizer: Tokenizer forwarded to ``SentimentDataset`` for the train split.
        file_path: Location of the header-less CSV. The default is the blank
            placeholder the notebook shipped with and must be overridden with a
            real path, otherwise ``pd.read_csv`` fails.
        sample_size: Maximum number of rows to keep (capped at the file's row
            count so small files do not raise).

    Returns:
        ``(train_dataset, (X_test, y_test))`` where ``train_dataset`` is a
        ``SentimentDataset`` built from 95% of the sampled rows and the tuple
        holds the remaining raw texts and labels.
    """
    df = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
    # header=None gives integer column names, so name the two columns we keep.
    # NOTE(review): assumes column 0 is the label and column 5 is the tweet
    # text (Sentiment140 layout) -- confirm against the actual CSV.
    df = df[[0, 5]].rename(columns={0: 'label', 5: 'text'})
    df = df.sample(n=min(sample_size, len(df)), random_state=1)

    # divide our dataset into train and test set, keeping the label ratio equal
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        shuffle=True,
        test_size=0.05,
        random_state=1,
        stratify=df['label'],
    )

    # format into SentimentDataset class
    train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_length=512)
    return train_dataset, (X_test, y_test)
