#### Imports

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from Gpt2Data import Gpt2Dataset
from FinanceData import FinanceDataset
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


#### Check for CUDA

In [2]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

cuda


#### Pretrained text generation

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_text(input_text, max_length=50):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [4]:
input_text = "Once upon a time"

print(generate_text(input_text, max_length=100))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2024-04-16 21:36:48.334954: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great


#### Model training and inference functions

In [5]:
def train(chatData, model, optim):

    epochs = 12

    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        #print(infer(model,model_state,tokenizer"hello how are you"))

def infer(model, model_state : str, tokenizer, inp):
    model.load_state_dict(torch.load(model_state))
    inp = "<startofstring> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output


#### Train model

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

# print(tokenizer.decode(model.generate(**tokenizer("hey i was good at basketball but ",
#                          return_tensors="pt"))[0]))

chatData = FinanceDataset("/home/apasalic/workspace/ChatOne/data/new_financial_phrasebank.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

model.train()

optim = Adam(model.parameters(), lr=1e-3)

print("training .... ")
train(chatData, model, optim)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


<startofstring> We hope to increase traffic volumes with the opening of Freight One Scandinavia . '' <bot>: a January 11 , 2010 EPHC board of directors has approved an increase in the quarterly dividend from $ 0.03 to $ 0.05 per share . <endofstring>
training .... 


100%|██████████| 12/12 [02:20<00:00, 11.73s/it]


#### Inference

In [7]:

print("infer from model : ")
while True:
  inp = input()
  print(infer(model, 'model_state.pt', tokenizer, inp))

infer from model : 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hello there <bot>: , kids,'s <endofstring> <pad> <pad> <pad> <pad>  49ers,'s current


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> nvidia <bot>:  s U.S. shares fell sharply in early afternoon trade after the cargo handling


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> berkshire <bot>: com, a service company involved in the sale, declined to comment the settlement


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> bezos <bot>:  s long-standing MD, Dino Bavelloni, has retired at the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> jeff bezos <bot>: He wore a black beanie-type cap and a black jacket.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Portfolio rise <bot>:  Soon after taking the share issue, the +ä+ñnekoski


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> <bot>: The company said that the move will lower the price of development projects by about EUR 3mn


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> <bot>: A spokeswoman for the Italian fashion house declined comment on the settlement. <bot>: All other charges were


KeyboardInterrupt: Interrupted by user