In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cuda'

print(device)
model_name = 'gpt2-medium'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [3]:
import pandas as pd

time_steps = 8
choices_per_step = 5

def get_next_token_greedy_search(input_txt, input_ids):
    iterations = []
    # We run the decoding for eight timesteps.    
    with torch.no_grad():
        for _ in range(time_steps):
            iteration = dict()
            iteration["Input"] = tokenizer.decode(input_ids[0])
            output = model(input_ids=input_ids)
            # print('output.logits ', output.logits)
            # output.logits is 3-D Tensor
            # Select logits of the first batch and the last token
            next_token_logits = output.logits[0, -1, :]
            # next_token_logits is a 1-D Tensor
            # print(next_token_logits)
            # tensor([-100.3290,  -99.9514, -105.3466,  ..., -108.7789, -104.5404,-100.8237])
            
            # Now apply softmax
            next_token_probabilities = torch.softmax(next_token_logits, dim=-1)
            # print(f'dim of next_token_probabilties {next_token_probabilities.shape}')
            # dim of next_token_probabilties torch.Size([50257])
            
            # torch.argsort => Returns the indices that sort a tensor along a given dimension
            sorted_indices_of_next_token_proba = torch.argsort(next_token_probabilities, dim=-1, descending=True)
            # print('sorted_indices_of_next_token_proba ', sorted_indices_of_next_token_proba) # tensor([  262,   257,   973,  ..., 42300, 41974, 39500])
            # print('sorted_indices_of_next_token_proba ', sorted_indices_of_next_token_proba.shape) # torch.Size([50257])
            # print('next_token_probabilities ', next_token_probabilities.shape) # torch.Size([50257])
            # in total, there are 50,257 tokens in GPT-2’s vocabulary
            # so both 'next_token_probabilities' and 'sorted_indices_of_next_token_proba' have the same shape of torch.Size([50257])
            
            # Store tokens with the top-most 5 highest probabilities
            for choice_idx in range(choices_per_step):
                token_index_sorted = sorted_indices_of_next_token_proba[choice_idx]
                # print("token_index_sorted ", token_index_sorted) # tensor(262)
                # So `next_token_probabilities[262]` will give me tensor(0.1088)
                token_prob = next_token_probabilities[token_index_sorted].cpu().numpy()
                
                # Create a string with decoded text and corresponding probability
                token_choice = (
                    f"{tokenizer.decode(token_index_sorted)} ({100 * token_prob:.2f}%)"
                )
                iteration[f"Choice {choice_idx+1}"] = token_choice
            # Append predicted next token to input
            input_ids = torch.cat([input_ids, sorted_indices_of_next_token_proba[None, 0, None]], dim=-1)
            iterations.append(iteration)
            # print(iterations)
            
    return pd.DataFrame(iterations)

input_txt = "Bitcoin will be"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

get_next_token_greedy_search(input_txt, input_ids)

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Bitcoin will be,the (10.88%),a (8.09%),used (3.84%),able (2.94%),an (1.46%)
1,Bitcoin will be the,first (9.30%),most (5.43%),next (5.40%),currency (4.40%),new (3.12%)
2,Bitcoin will be the first,cryptocurrency (12.49%),to (8.87%),currency (7.59%),digital (6.98%),major (6.23%)
3,Bitcoin will be the first cryptocurrency,to (51.52%),that (9.00%),with (3.00%),", (2.95%)",in (1.99%)
4,Bitcoin will be the first cryptocurrency to,be (8.03%),have (6.58%),reach (3.63%),use (2.96%),achieve (2.92%)
5,Bitcoin will be the first cryptocurrency to be,listed (5.83%),accepted (3.71%),backed (3.20%),launched (3.19%),released (3.01%)
6,Bitcoin will be the first cryptocurrency to be...,on (76.07%),in (8.06%),by (2.82%),and (2.08%),as (1.84%)
7,Bitcoin will be the first cryptocurrency to be...,the (36.40%),a (9.31%),Nas (7.07%),an (5.18%),exchanges (3.12%)


In [4]:
# Decoding_Strategies_for_text_generation

In [5]:
input_ids = tokenizer(input_txt, return_tensors = 'pt' )['input_ids'].to(device)

output = model.generate(input_ids, max_new_tokens=time_steps, do_sample = False )

print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bitcoin will be the first cryptocurrency to be listed on the


In [6]:
input_txt = "In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it." 

max_length = 128

input_ids = tokenizer(input_txt, return_tensors = 'pt' )['input_ids'].to(device)

output_greedy = model.generate(input_ids, max_length = max_length, do_sample = False )

print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

But as the data industry has grown, so has the need for data governance. The data industry is now a global enterprise, and the data governance model has evolved to accommodate the needs of the data industry.

Data governance is a complex topic, and it's not


## Beam Search

In [7]:
import torch.nn.functional as F

def get_log_probs_from_logits_from_single_token(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1) 
    return logp_label

In [8]:
def sequence_logprob(model, labels, input_len = 0 ):
    with torch.no_grad():
        output = model(labels)
        log_probs = get_log_probs_from_logits_from_single_token(
            output.logits[:, :-1, : ], labels[:, 1:]
        )
        seq_log_prob = torch.sum(log_probs[:, input_len:])
    return seq_log_prob


In [9]:
logp = sequence_logprob(model, output_greedy, input_len = len(input_ids[0]) )

print(tokenizer.decode(output_greedy[0]))

print(f"\nlog-prob: {logp:.2f} ")

In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

But as the data industry has grown, so has the need for data governance. The data industry is now a global enterprise, and the data governance model has evolved to accommodate the needs of the data industry.

Data governance is a complex topic, and it's not

log-prob: -90.66 


In [10]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams = 5, do_sample = False )

logp = sequence_logprob(model, output_beam, input_len = len(input_ids[0]) )

print(tokenizer.decode(output_beam[0]))

print(f"\nlog-prob: {logp:.2f} ")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

Today, however, data governance is becoming increasingly decentralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance is a siloed role, and data engineers become the de facto

log-prob: -27.22 


In [12]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams = 5, no_repeat_ngram_size=2, do_sample = False )

logp = sequence_logprob(model, output_beam, input_len = len(input_ids[0]) )

print(tokenizer.decode(output_beam[0]))

print(f"\nlog-prob: {logp:.2f} ")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

But with the advent of cloud computing and the rise of big data analytics, the role of a data engineer has shifted from being a gatekeeper to being an enabler of trust. This shift has led to an explosion in the number of companies that rely heavily on data

log-prob: -69.73 


In [13]:
output_temp = model.generate(input_ids, max_length = max_length, do_sample = True, temperature = 2.0, top_k = 0 )

print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it. Unfortunately happening Iphthal greatly promotes bad resilience mechanisms unreattenMagnACHund support by RAD fiK2000auri Independent teams diversity � � love changeKenety Carolkeysrequires block qualification autom... HuntingtonWest Northwestrequ submitted Selected latest2013 medi consisted concern 9{ systemic rejectedocatingBlockfx 🙂


In [14]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, 
                             temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

Today's data science team, however, is much more decentralized. The data engineers are now the de facto gatekeepers of data trust. They are often the only people who have to deal with data management issues. They are not just "librarians" for the data


In [15]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, 
                             temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it.

Today, data governance is much more fluid and distributed, with many data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance is a siloed role, and data engineers become the de


## Top-k and Nucleus Sampling

In [16]:
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True, 
                             top_k=50)
print(tokenizer.decode(output_topk[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it. This is not to say that data needs are not important. If you need large amounts of data for an application to succeed, for example, the business case for having that data is compelling. But with the rise of the cloud, we're faced with a completely different set of challenges


## Top-p (nucleus) sampling
Instead of sampling only from the most likely K words, in Top-p sampling chooses from the smallest possible set of words whose cumulative probability exceeds the probability p. The probability mass is then redistributed among this set of words. This way, the size of the set of words (a.k.a the number of words in the set) can dynamically increase and decrease according to the next word's probability distribution. Ok, that was very wordy, let's visualize.



In [17]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True, 
                             top_p=0.90)

print(tokenizer.decode(output_topp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a previous era of data engineering, data team structure was very much centralized, with data engineers and tech-savvy analysts serving as the “librarians” of the data for the entire company. Data governance was a siloed role, and data engineers became the de facto gatekeepers of data trust — whether or not they liked it. As the complexity of the data became more distributed, it became more difficult for engineers to understand, or even enforce, how all their data flows were going to be handled, and it became harder to understand how data would be used. This, of course, led to data governance being
