In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.6 MB/s[0m eta [36m0:00:0

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
from torch.utils.data import Dataset
import json
import torch


device= "cuda" if torch.cuda.is_available() else "cpu"

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):  # the path is the finetuning dataset path
        self.data= json.load(open(path, "r"))
        self.X= []

        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx]= "<startofstring>"+i+"<bot>: "+self.X[idx+1]+"<endofstring>"
            except:
                break

        self.X= self.X[:3000]
        print(self.X[0])

        self.X_encoded= tokenizer(self.X, max_length=50, truncation=True, padding= "max_length", return_tensors="pt")
        self.input_ids= self.X_encoded['input_ids']
        self.attention_mask= self.X_encoded['attention_mask']



    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [5]:
model_name = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model= model.to(device)


tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<startoftoken>",
                              "eos_token": "<endofstring>"})

tokenizer.add_tokens({"<bot>: "})
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50261, 768)

In [6]:
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, model, optim):
    epochs= 10

    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X= X.to(device)
            a= a.to(device)

            optim.zero_grad()
            loss= model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        infer("hello there?")

In [24]:
def infer(inp):
    inp= "<startoftoken> "+inp+"<bot>: "
    inp= tokenizer(inp, return_tensors="pt")
    X= inp["input_ids"].to(device)
    a= inp["attention_mask"].to(device)
    output= model.generate(X, attention_mask=a)
    output= tokenizer.decode(output[0])
    return output

In [8]:
from torch.optim import Adam

chatData= ChatData("chat_data.json", tokenizer)
chatData= DataLoader(chatData, batch_size=64)
model.train()
optim= Adam(model.parameters(), lr=1e-3)

<startofstring>I love iphone! i just bought new iphone!<bot>: Thats good for you, i'm not very into new tech<endofstring>


In [9]:
train(chatData, model, optim)

100%|██████████| 10/10 [04:41<00:00, 28.19s/it]


In [26]:
print(infer("how are ou doing?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> how are ou doing?<bot>:  I am doing great, how are you?


In [27]:
while True:
  inp= input()
  print(infer(inp))

wass up


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> wass up<bot>:  I am a student <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
which department of student?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> which department of student?<bot>:  I am a student of university <endofstring> <pad> <pad> <pad>
which university?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> which university?<bot>:  I am a student of university <endofstring> <pad> <pad> <pad> <pad> <pad>
how old are you?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> how old are you?<bot>:  I am a student <endofstring> <pad> <pad> <pad> <pad> <pad>
are you male or female?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> are you male or female?<bot>:  I am a hairdresser <endofstring> <pad>
what does hairdresser do?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> what does hairdresser do?<bot>:  I am a hairdresser
what is hairdresser?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> what is hairdresser?<bot>:  I am a hairdresser.
hi


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> hi<bot>:  i am a very experienced person. <endofstring> <pad> <pad> <pad> <pad> <pad> <pad>
what expericence?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> what expericence?<bot>:  I am a graduate student. <endofstring> <pad> <pad> <pad>
from which college?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startoftoken> from which college?<bot>:  I am a student of a college<bot>: I


KeyboardInterrupt: ignored