In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
# Load the pretrained DistilGPT-2 checkpoint: the language-model head and its
# tokenizer are both fetched under the same checkpoint name.
model_name = "distilgpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [21]:
# Sanity check before fine-tuning: encode a prompt as PyTorch tensors and let
# the pretrained model continue it with its default generation settings.
prompt_inputs = tokenizer(
    "Yesterday i was playing in the court and suddenly ",
    return_tensors="pt",
)
tokens = model.generate(**prompt_inputs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [22]:
# Show the raw tensor returned by model.generate() (token ids, not text).
print(tokens)

tensor([[28065,  1312,   373,  2712,   287,   262,  2184,   290,  6451,   220,
          3711,   510,    13,   198,   198,   198,   198,    40,   373,  2712]])


In [23]:
# Turn the first sequence of generated ids back into a readable string.
tokenizer.decode(tokens[0])

'Yesterday i was playing in the court and suddenly iced up.\n\n\n\nI was playing'

### Creating dataset class

In [25]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    """Dataset of prompt/reply training strings built from a JSON chat log.

    The JSON file is expected to hold a list of conversations, each a mapping
    with a ``'dialog'`` list whose items carry the utterance under ``'text'``.
    All utterances are flattened into one sequence and every utterance is
    paired with the one that follows it, giving examples of the form::

        <startofstring>{utterance}<bot>: {next utterance}<endofstring>

    NOTE: because the utterances are flattened before pairing, the last
    utterance of one conversation is paired with the first utterance of the
    next conversation.

    Attributes:
        data: the parsed JSON content.
        X: the formatted training strings (one fewer than the utterances).
        X_encoded: the tokenizer output for ``X``.
        input_ids: ``X_encoded['input_ids']``.
        attention_mask: ``X_encoded['attention_mask']``.
    """

    def __init__(self, path: str, tokenizer):
        """Load the chat log at ``path`` and tokenize the formatted examples.

        Args:
            path: path to the fine-tuning dataset (JSON file).
            tokenizer: callable invoked as
                ``tokenizer(list_of_str, truncation=True, padding=True)`` and
                returning a mapping with ``'input_ids'`` and
                ``'attention_mask'``.

        Raises:
            IndexError: if the file yields fewer than two utterances, so no
                prompt/reply pair can be built.
        """
        # Context manager so the file handle is closed once parsing is done.
        with open(path, "r") as data_file:
            self.data = json.load(data_file)

        utterances = [
            turn['text']
            for conversation in self.data
            for turn in conversation['dialog']
        ]

        # Pair each utterance with its successor. The previous version indexed
        # the list with the utterance string itself (``self.X[i+1]``); the
        # resulting TypeError was swallowed by a bare ``except`` and no example
        # was ever formatted.
        self.X = [
            "<startofstring>" + prompt + "<bot>: " + reply + "<endofstring>"
            for prompt, reply in zip(utterances, utterances[1:])
        ]
        # Preview the first example so the formatting can be checked by eye.
        print(self.X[0])

        self.X_encoded = tokenizer(self.X, truncation=True, padding=True)
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of formatted training examples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the example at ``idx``."""
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
# Register the markers that ChatData wraps around every training example.
# The bos token must be "<startofstring>" -- the exact literal the dataset
# emits -- otherwise that marker is split into ordinary sub-word pieces.
tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<startofstring>",
                              "eos_token": "<endofstring>"})

# add_tokens takes a str or a list of str; a set literal is not accepted.
tokenizer.add_tokens(["<bot>: "])

# The vocabulary just grew, so the model's embedding matrix has to grow with
# it; otherwise the new token ids index past the end of the embeddings.
model.resize_token_embeddings(len(tokenizer));

In [None]:
# Build the fine-tuning dataset from the chat log, tokenized with the
# tokenizer configured above.
chatData = ChatData(path="chat_data.json", tokenizer=tokenizer)