# Beam search with recursion restarts

## Load model

In [None]:
%cd ..

In [2]:
import torch

import numpy as np
from typing import Optional, List
from transformers import AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig
from src.utils import _apply_top_k_top_p, _apply_min_p, _multinomial

### Default sampler

In [3]:
class Config:
    # Configuration for the LLM.
    temperature = 0.9
    top_k = 50

def generate_completion(text, *args, **kwargs):
    print(f'Generating default completion for text: {text}')
    # Tokenize the input text.
    inputs = tokenizer.encode(
            text,
            return_tensors='pt'
    ).to(model.device)

    outputs = model.generate(
        inputs,
        max_new_tokens=kwargs.get('max_tokens', 256),
        temperature=Config.temperature,
        top_k=Config.top_k,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True, output_scores=True,
    )
    
    # Cut off the first tokens (the input text).
    outputs['sequences'] = outputs['sequences'][:, inputs.shape[-1]:]
    # Decode the generated text.
    completion_text = tokenizer.decode(outputs['sequences'][0], skip_special_tokens=True, clean_up_tokenization_space=True)
    print(f'Generated completion: {completion_text}')

    return completion_text

### Recursive sampler

In [4]:
def recursive_sampler_step(outputs, beam_len=5, temp=0.9, top_k=50, top_p=0.9, min_p=0.0, do_top_k_top_p=True, do_min_p=True,
                           terminators: Optional[List[int]] = None, generator: torch.Generator = None) -> int:
    sum_score = 0
    terminate = -1
    
    for i in range(beam_len):
        outputs = model.generate(
            outputs['sequences'],
            past_key_values=outputs.get("past_key_values"),
            max_new_tokens=1,
            pad_token_id=tokenizer.eos_token_id,
            # eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True, output_scores=True,
            do_sample=False
        )

        logits = outputs['scores'][0]
        logits.div_(temp)
        probs = torch.softmax(logits, dim=-1, dtype=torch.float)

        if do_top_k_top_p:
            probs = _apply_top_k_top_p(probs, top_k, top_p)

        if do_min_p:
            probs = _apply_min_p(probs, min_p)

        # sample_results = _multinomial(probs)
        # print(probs[probs > 0])
        probs[probs < 0] = 0
        sample_results = torch.multinomial(probs, num_samples=1, generator=generator)
        # print(sample_results)
        score = logits[:, sample_results[0][0]].item()
        sum_score += score

        # Replace last element (that were just greedy searched)
        outputs['sequences'][:, -1] = sample_results
        
        # Check if the token is a terminator.
        if terminators is not None and sample_results[0][0] in terminators:
            terminate = i
            break
        
    return outputs, sum_score, terminate

def generate_recursive_sampler(text, max_tokens=256, n_beams=2, beam_len=5, temperature=0.8,
                               top_k=5, top_p=1.0, do_min_p=True, min_p=0.1):
    print(f'Generating recursive completion for text: {text}')
    # Tokenize the input text.
    inputs = tokenizer.encode(
            text,
            return_tensors='pt'
    ).to(model.device)

    net_inputs = {'sequences': inputs}
    scores = np.zeros(n_beams)
    symbols_cnt = np.zeros(n_beams)
    terminates = np.zeros(n_beams)
    
    do_top_k_top_p = True
    top_k = torch.tensor([top_k]).to(model.device)
    top_p = torch.tensor([top_p]).to(model.device)
    min_p = torch.tensor([min_p]).to(model.device)

    g = torch.Generator('cuda')
    
    for _ in range(max_tokens // beam_len + 1):
        beams = {}
        for i in range(n_beams):
            test_out_1, score1, terminate1 = recursive_sampler_step(net_inputs, beam_len=beam_len,
                                                                    temp=temperature, top_k=top_k, top_p=top_p, min_p=min_p,
                                                                    do_top_k_top_p=do_top_k_top_p, do_min_p=do_min_p,
                                                                    terminators=[tokenizer.eos_token_id],
                                                                    generator=g)
            beams[i] = test_out_1
            scores[i] = score1
            terminates[i] = terminate1

            # print(f'[{i}] Score: {score1} - ', tokenizer.decode(test_out_1['sequences'][0, -beam_len:], skip_special_tokens=True, clean_up_tokenization_space=True))

        if np.any(terminates != -1):
            symbols_cnt[terminates != -1] += 1
            symbols_cnt[terminates == -1] = beam_len
            # Normalize scores
            scores = scores / symbols_cnt
            max_score = np.argmax(scores)

            if terminates[max_score] != -1:
                net_inputs = beams[max_score]
                break
            else:
                better_out = beams[max_score]
        else:
            max_score = np.argmax(scores)
            better_out = beams[max_score]
            
        # Update net_inputs to the better output
        net_inputs = better_out
    
    # Cut off the first tokens (the input text).
    net_inputs['sequences'] = net_inputs['sequences'][:, inputs.shape[-1]:]
    # Decode the generated text.
    completion_text = tokenizer.decode(net_inputs['sequences'][0], skip_special_tokens=True, clean_up_tokenization_space=True)
    print(f'Generated completion: {completion_text}')
    
    return completion_text

In [5]:
# rope settings
strategy_name = 'dynamic'
scaling_factor = 2.5

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading Hermes-2-Pro-Llama-3 model...")
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Hermes-2-Pro-Llama-3-8B', trust_remote_code=True, local_files_only=True)
model = LlamaForCausalLM.from_pretrained(
    'NousResearch/Hermes-2-Pro-Llama-3-8B',
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True,
    load_in_8bit=False,
    load_in_4bit=True,
    use_flash_attention_2=True,
    rope_scaling={"type": strategy_name, "factor": scaling_factor},
    bnb_4bit_compute_dtype=torch.float16,
    max_position_embeddings=16384,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading Hermes-2-Pro-Llama-3 model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Generate with default transformers sampler and recursive

### Story generation

In [57]:
prompts = [
    f"""<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Write a long story about Geralt from Rivia, who hunted monsters and ended up in the home world of the elves. Divide story to chapters and add dialogues and detailed descriptions. 2000-3000 words.
<|im_start|>assistant""",
]


In [58]:
%%time
completion2 = generate_completion(prompts[0], max_tokens=2048)

Generating default completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Write a long story about Geralt from Rivia, who hunted monsters and ended up in the home world of the elves. Divide story to chapters and add dialogues and detailed descriptions. 2000-3000 words.
<|im_start|>assistant
Generated completion: 
Chapter 1: The Journey Begins

Our story begins in the bustling city of Novigrad, where the infamous witcher Geralt of Rivia roamed the cobblestone streets. A tall, lean man with a sharp jawline and piercing blue eyes, Geralt was unmistakably witcher - his distinctive silver gray cat medallion and distinctive witcher hat marking him as one of the rare and elite monster hunters.

One day, Geralt received a peculiar request for a job from a mysterious elven sorceress named Pharis, who was a stranger to the city. Her ethereal beauty and otherworldly aura intrigued Geralt

In [59]:
%%time
completion = generate_recursive_sampler(prompts[0], max_tokens=2048, n_beams=3, beam_len=5, temperature=1.5, top_k=50, min_p=0.1)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Write a long story about Geralt from Rivia, who hunted monsters and ended up in the home world of the elves. Divide story to chapters and add dialogues and detailed descriptions. 2000-3000 words.
<|im_start|>assistant
Generated completion: 
Chapter 1: The Witcher's Journey

Geralt opened his eyes, finding himself in an unfamiliar place. He wasn't sure how he got there, but the lush greens and twinkling stars let him know one thing for sure—he wasn't in the Continent anymore. Geralt, the infamous monster hunter, had stumbled upon the home of the elves.

"Oi, what is this place then?" Geralt whispered, studying the environment around him.

Suddenly, a figure appeared before him, causing Geralt to reach for his silver sword instinctively. Instead of a monster, it was an elven elder wearing a long hooded r

### Question 1

In [12]:
question = """A group of students are planning to go on a field trip to a museum. They need to decide how many buses to rent and how to divide the students among the buses. Each bus can hold up to 40 students, but the museum can only accommodate 120 students at a time. The group has a budget of $800 for the bus rental, and each bus costs $200 per day. How many buses should the group rent, and how many students should go on each bus? Explain your reasoning."""

prompts = [
    f"""<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
{question}
<|im_start|>assistant""",
]

answers = """3 buses, 40 students each, $200 each"""

In [10]:
%%time
completion2 = generate_completion(prompts[0], max_tokens=512)

Generating default completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
A group of students are planning to go on a field trip to a museum. They need to decide how many buses to rent and how to divide the students among the buses. Each bus can hold up to 40 students, but the museum can only accommodate 120 students at a time. The group has a budget of $800 for the bus rental, and each bus costs $200 per day. How many buses should the group rent, and how many students should go on each bus? Explain your reasoning.
<|im_start|>assistant
Generated completion: 
Let's break down the given information and set up some equations to solve the problem.

1. Each bus can hold up to 40 students.
2. The museum can accommodate 120 students at a time.
3. The group has a budget of $800 for bus rentals.
4. Each bus costs $200 per day.

We need to decide how many buses to rent and how many stu

In [11]:
%%time
completion = generate_recursive_sampler(prompts[0], max_tokens=512, n_beams=3, beam_len=5, temperature=1.5, top_k=10, min_p=0.05)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
A group of students are planning to go on a field trip to a museum. They need to decide how many buses to rent and how to divide the students among the buses. Each bus can hold up to 40 students, but the museum can only accommodate 120 students at a time. The group has a budget of $800 for the bus rental, and each bus costs $200 per day. How many buses should the group rent, and how many students should go on each bus? Explain your reasoning.
<|im_start|>assistant
Generated completion: 
First, let's calculate how many buses the group can rent with their budget.

Number of buses = Budget / Cost per bus
Number of buses = $800 / $200
Number of buses = 4

Since the museum can only accommodate 120 students at a time and each bus can hold up to 40 students, we need to divide the students into groups.

Number

### Question 2

In [23]:
question = """A group of four friends are going to play a board game together. They have to choose between four games: chess, checkers, monopoly, and scrabble. Each friend has a different preference for the game. Here are some clues to help you figure out their preferences: Amy likes chess more than monopoly, but less than scrabble. Bob likes checkers more than chess, but less than monopoly. Carol likes scrabble more than checkers, but less than chess. Dan likes monopoly more than scrabble, but less than checkers. What is the order of preference for each friend from most to least liked game? Write your answer using the following format: Friend: Game > Game > Game > Game"""
prompts = [
    f"""<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
{question}
<|im_start|>assistant""",
]
answer = """
Amy: Scrabble > Chess > Monopoly > Checkers
Bob: Monopoly > Checkers > Chess > Scrabble
Carol: Chess > Scrabble > Checkers > Monopoly
Dan: Checkers> Monopoly > Scrabble > Chess
"""

In [26]:
%%time
completion2 = generate_completion(prompts[0], max_tokens=512)

Generating default completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
A group of four friends are going to play a board game together. They have to choose between four games: chess, checkers, monopoly, and scrabble. Each friend has a different preference for the game. Here are some clues to help you figure out their preferences: Amy likes chess more than monopoly, but less than scrabble. Bob likes checkers more than chess, but less than monopoly. Carol likes scrabble more than checkers, but less than chess. Dan likes monopoly more than scrabble, but less than checkers. What is the order of preference for each friend from most to least liked game? Write your answer using the following format: Friend: Game > Game > Game > Game
<|im_start|>assistant
Generated completion: 
Amy: Scrabble > Chess > Monopoly > Checkers
Bob: Monopoly > Chess > Checkers > Scrabble
Carol: Chess > Sc

In [28]:
%%time
completion = generate_recursive_sampler(prompts[0], max_tokens=512, n_beams=3, beam_len=5, temperature=1., top_k=50, min_p=0.1)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
A group of four friends are going to play a board game together. They have to choose between four games: chess, checkers, monopoly, and scrabble. Each friend has a different preference for the game. Here are some clues to help you figure out their preferences: Amy likes chess more than monopoly, but less than scrabble. Bob likes checkers more than chess, but less than monopoly. Carol likes scrabble more than checkers, but less than chess. Dan likes monopoly more than scrabble, but less than checkers. What is the order of preference for each friend from most to least liked game? Write your answer using the following format: Friend: Game > Game > Game > Game
<|im_start|>assistant
Generated completion: 
Amy: Scrabble > Chess > Monopoly > Checkers
Bob: Monopoly > Chess > Checkers > Scrabble
Carol: Chess > 

### Question 3

In [51]:
question = """"Selena, Jennifer and Miley wear a blue dress, yellow dress, and green dress in an unknown order. It is known that:

1) If Selena wears blue, then Jennifer wears green.
2) If Selena wears yellow, then Miley wears green.
3) If Jennifer does not wear yellow, then Miley wears blue.

What is the color of the dress Selena is wearing? Think step by step explaining your reasoning."""
prompts = [
    f"""<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
{question}
<|im_start|>assistant""",
]
answer = """Green"""

In [47]:
%%time
completion2 = generate_completion(prompts[0], max_tokens=512)

Generating default completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
"Selena, Jennifer and Miley wear a blue dress, yellow dress, and green dress in an unknown order. It is known that:

1) If Selena wears blue, then Jennifer wears green.
2) If Selena wears yellow, then Miley wears green.
3) If Jennifer does not wear yellow, then Miley wears blue.

What is the color of the dress Selena is wearing? Think step by step explaining your reasoning.
<|im_start|>assistant
Generated completion: 
To determine the color of the dress Selena is wearing, let's analyze the given information step by step:

Step 1: Analyze the first condition:
If Selena wears blue, then Jennifer wears green.
This condition tells us that if Selena wears a blue dress, then Jennifer is sure to wear a green dress.

Step 2: Analyze the second condition:
If Selena wears yellow, then Miley wears green.
This condi

In [52]:
%%time
completion = generate_recursive_sampler(prompts[0], max_tokens=2048, n_beams=3, beam_len=5, temperature=1.5, top_k=50, min_p=0.1)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
"Selena, Jennifer and Miley wear a blue dress, yellow dress, and green dress in an unknown order. It is known that:

1) If Selena wears blue, then Jennifer wears green.
2) If Selena wears yellow, then Miley wears green.
3) If Jennifer does not wear yellow, then Miley wears blue.

What is the color of the dress Selena is wearing? Think step by step explaining your reasoning.
<|im_start|>assistant
Generated completion: 
Step 1: Rule out scenarios where Selena is not wearing her given colors (blue, yellow, or green).

Step 2: Since either Jennifer wears green or Miley wears green (considering Scenario 1 where Selena is wearing blue), we know that Selena must be wearing either yellow or green.

Step 3: If Selena wore green, Jennifer couldn't wear green as there wouldn't be any green left for Miley (since S

In [45]:
%%time
completion = generate_recursive_sampler(prompts[0], max_tokens=2048, n_beams=3, beam_len=5, temperature=1.5, top_k=50, min_p=0.1)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
"Selena, Jennifer and Miley wear a blue dress, yellow dress, and green dress in an unknown order. It is known that:

1) If Selena wears blue, then Jennifer wears green.
2) If Selena wears yellow, then Miley wears green.
3) If Jennifer does not wear yellow, then Miley wears blue.

What is the color of the dress Selena is wearing? Think step by step explaining your reasoning.
<|im_start|>assistant
Generated completion: 
Step 1: Rule out scenarios where Selena is not wearing her given colors (blue, yellow, or green).

Step 2: Since either Jennifer wears green or Miley wears green (considering Scenario 1 where Selena is wearing blue), we know that Selena must be wearing either yellow or green.

Step 3: If Selena wore green, Jennifer couldn't wear green as there wouldn't be any green left for Miley (since S

### Test Sampler class

In [6]:
from src.sampler import RecursiveSampler

In [7]:
prompts = [
    f"""<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Tell me joke about Machine Learning Researcher.
<|im_start|>assistant""",
]

sampler = RecursiveSampler(model, tokenizer)

In [9]:
response = sampler.generate(prompts[0], max_tokens=512, n_beams=3, beam_len=5, temperature=1.5, top_k=50, min_p=0.1)

Generating recursive completion for text: <|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Tell me joke about Machine Learning Researcher.
<|im_start|>assistant
Generated completion: 
Why did the machine learning researcher break up with regression analysis? Because they just couldn't come to a significant conclusion.


In [10]:
response

"\nWhy did the machine learning researcher break up with regression analysis? Because they just couldn't come to a significant conclusion."