In [1]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import kaggle

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [12]:
dataset = load_dataset("squad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
traindata = dataset['train']
print(traindata)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})


In [4]:
# for d in traindata.features:
#     print(d)
#     print(traindata[d][:5])
#     print()

# for d in traindata["context"]:
#     print(d)
    # print(traindata[d][:5])
    # print()

print(len(traindata["context"]), len(traindata["question"]))

print(traindata[0])

print((traindata[0]['context'][515:]), len(traindata[0]['question']))

87599 87599
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}
Saint Bernadette Soubirous in 1858. At the

In [5]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [6]:
for k in [
    "model_max_length",
    "padding_side",
    "truncation_side",
    "chat_template",
    "model_input_names",
    "bos_token",
    "eos_token",
    "unk_token",
    "sep_token",
    "pad_token",
    "cls_token",
    "mask_token",
    "additional_special_tokens",
    "clean_up_tokenization_spaces",
    "split_special_tokens",
]:
    print(k, "\t", tokenizer.__getattribute__(k))

tokenizer.add_special_tokens({"bos_token": "<|bos|>",
                              "eos_token": "<|eos|>",
                              "unk_token": "<|unk|>",
                              "sep_token": "<|sep|>",
                              "pad_token": "<|pad|>",
                              "cls_token": "<|cls|>"})
model.resize_token_embeddings()

model_max_length 	 1024
padding_side 	 right
truncation_side 	 right
chat_template 	 None
model_input_names 	 ['input_ids', 'attention_mask']
bos_token 	 <|endoftext|>
eos_token 	 <|endoftext|>
unk_token 	 <|endoftext|>
sep_token 	 None
pad_token 	 None
cls_token 	 None
mask_token 	 None
additional_special_tokens 	 []
clean_up_tokenization_spaces 	 True
split_special_tokens 	 False


Embedding(50257, 768)

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [11]:
sequences = [
    "The midnight sky shimmered with a tapestry of stars, casting a gentle glow on the sleeping world below.",
    "A mischievous breeze danced through the autumn leaves, creating a symphony of rustling sounds in the quiet forest.",
    "The aroma of freshly brewed coffee wafted through the air, enticing passersby to pause and savor the rich fragrance.",
    "In the bustling market, vendors called out their wares, and vibrant colors adorned stalls, creating a lively tapestry of sights and sounds.",
]

tokens = tokenizer(sequences, padding=True, truncation=True).convert_to_tensors('pt')

tokens["labels"] = torch.tensor([1,2,3,4])
tokens.to('cuda')

{'input_ids': tensor([[  464, 15896,  6766,   427,  8608,  1068,   351,   257,  9814,   395,
           563,   286,  5788,    11, 13092,   257, 10296, 19634,   319,   262,
         11029,   995,  2174,    13, 50261, 50261, 50261, 50261, 50261],
        [   32,  2984,  3043, 31222, 28633, 39480,   832,   262, 23608,  5667,
            11,  4441,   257,  5659, 23021,   286, 17000,  1359,  5238,   287,
           262,  5897,  8222,    13, 50261, 50261, 50261, 50261, 50261],
        [  464, 31242,   286, 29026, 40163,  6891,  2082,   701,   276,   832,
           262,  1633,    11, 47460, 45378,  1525,   284, 14985,   290,  6799,
           273,   262,  5527, 36860,    13, 50261, 50261, 50261, 50261],
        [  818,   262, 46609,  1910,    11, 17192,  1444,   503,   511,  2082,
           411,    11,   290, 21266,  7577, 41860, 40308,    11,  4441,   257,
         29696,  9814,   395,   563,   286, 21343,   290,  5238,    13]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 

In [15]:
output = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])
(output.past_key_values[0][0]).shape
output

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[ -30.6604,  -30.1158,  -33.6834,  ...,  -37.9073,  -38.2578,
          -30.8604],
        [-107.9952, -108.7846, -115.7549,  ..., -118.2143, -120.6317,
         -112.7074],
        [ -98.3191,  -96.2061,  -99.8060,  ..., -105.8533, -103.8315,
          -98.1554],
        [-100.1074, -101.4207, -106.8903,  ..., -114.9794, -115.4330,
         -104.4623]], device='cuda:0', grad_fn=<MmBackward0>), past_key_values=((tensor([[[[-1.0222,  1.6269,  0.2036,  ..., -1.7009, -0.3993,  0.7531],
          [-2.1670,  2.8019,  2.2601,  ..., -1.4357, -1.5906,  1.6516],
          [-2.3145,  2.7101,  1.5073,  ..., -0.5781, -1.9292,  2.2634],
          [-1.4637,  2.9009,  1.4456,  ..., -0.6122, -0.8635,  1.7498]],

         [[-0.4341, -0.0548, -0.1414,  ...,  0.0536,  2.6886,  1.7793],
          [-1.2517, -2.9653, -3.7894,  ..., -1.2439,  3.2664, -0.1477],
          [-0.7140, -1.6105, -2.9748,  ..., -1.6920,  4.4462,  0.1892],
          [-1.0916

In [16]:
_, generated_sequence = torch.max(output.logits, 1)
generated_sequence =  generated_sequence[0]

print(generated_sequence)

text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
# out = (text.strip())
text



tensor(262, device='cuda:0')


' the'

In [17]:
output.logits.shape

torch.Size([4, 50257])

In [18]:
import inspect
import re
import string

with open('./trsn.txt', 'w') as f:
    f.write(inspect.getsource(model.transformer.forward))

In [19]:
start = "Tell me how to be happy."
# for i in range(30):

tokens=np.array([0])
# if(tokens.shape[-1] > 1024):
if True:
    tokens = torch.tensor([tokenizer.encode(start)]).to(device)

    print("tokens:" , tokens)
    
    # output = model.generate(tokens)
    sample_outputs = model.generate(tokens, 
                                    do_sample=True,   
                                    min_length=50, 
                                    max_length=100,
                                    top_k=30,                                 
                                    top_p=0.7,        
                                    temperature=0.9,
                                    repetition_penalty=2.0,
                                    num_return_sequences=10
                                    )

    # print(sample_outputs)

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        a = len(start)  
        print("{}: {}\n\n".format(i+1,  text[:]))

    # _, generated_sequence = torch.max(output.logits, 2)
    # generated_sequence =  generated_sequence[0]

    # print(generated_sequence)

    # text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
    # text = (text.strip())
    # print("Start ||  ", start)
    # print("text || ", text, '\n')

    # start += " " + text


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tokens: tensor([[24446,   502,   703,   284,   307,  3772,    13]], device='cuda:0')
1: Tell me how to be happy. How much more I want you."
The next morning, the first thing he did was put on his robe and walked out of her bedroom with a big grin that said "I'm so glad!" It wasn't long before she came back in an angry tone from upstairs telling him it's okay for everyone not named Anna or Elsa as well as all three people who were going through puberty at their age have had sex because they thought what happened could happen again later if


2: Tell me how to be happy.
A: "Oh, I'm going through my own problems with the way things are." A few days ago we went into a meeting and he said it was very hard for him because of all these issues that were happening in his life as well so if you're like this guy who's trying too much or not doing enough then what can they do? And now there is an issue where people have been getting really angry at us about everything which makes them feel


3: Te

In [None]:
import inspect as i
import sys

In [None]:
sys.stdout.write(i.getsource(model.transformer.forward))

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool

9439

In [None]:
!wget https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail/download?datasetVersionNumber=2 -P /scratch/aneesh.chavan/datasets

zsh:1: no matches found: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail/download?datasetVersionNumber=2


In [None]:
# !mkdir
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -P /scratch/aneesh.chavan/datasets
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -P /scratch/aneesh.chavan/datasets

--2023-11-10 16:51:16--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.109.153, 185.199.111.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘/scratch/aneesh.chavan/datasets/train-v2.0.json’


2023-11-10 16:51:22 (14.7 MB/s) - ‘/scratch/aneesh.chavan/datasets/train-v2.0.json’ saved [42123633/42123633]

--2023-11-10 16:51:23--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.108.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘/scratch/aneesh.chavan/datasets/dev-v2.0.json’


202

In [None]:
# load datasets
with open("/scratch/aneesh.chavan/datasets/train-v2.0.json", 'r') as f:
    squad_train_json = json.load(f)

In [None]:
# squad has multiple topics,
"""
- title
- data array of {}
    - qas - array of {}
        - question
        - id
        - answers array of {}
            - text
            - answer start
        - is_impossible
    - context - string

"""

for d in squad_train_json["data"]:
    print(d.keys())
    for q in d["paragraphs"]:
        print(q)
        print(q.keys())
        
        qas = q["qas"]

        print(qas[0])
        print(qas[0].keys())
        break
    break

dict_keys(['title', 'paragraphs'])
{'qas': [{'question': 'When did Beyonce start becoming popular?', 'id': '56be85543aeaaa14008c9063', 'answers': [{'text': 'in the late 1990s', 'answer_start': 269}], 'is_impossible': False}, {'question': 'What areas did Beyonce compete in when she was growing up?', 'id': '56be85543aeaaa14008c9065', 'answers': [{'text': 'singing and dancing', 'answer_start': 207}], 'is_impossible': False}, {'question': "When did Beyonce leave Destiny's Child and become a solo singer?", 'id': '56be85543aeaaa14008c9066', 'answers': [{'text': '2003', 'answer_start': 526}], 'is_impossible': False}, {'question': 'In what city and state did Beyonce  grow up? ', 'id': '56bf6b0f3aeaaa14008c9601', 'answers': [{'text': 'Houston, Texas', 'answer_start': 166}], 'is_impossible': False}, {'question': 'In which decade did Beyonce become famous?', 'id': '56bf6b0f3aeaaa14008c9602', 'answers': [{'text': 'late 1990s', 'answer_start': 276}], 'is_impossible': False}, {'question': 'In what R