# Assignment 1
Bradley Thompson - CS 510 LLM Winter 2024

## 1
Three differences between bloom and bloom z models:
1. Bloom seems to be more likely to generate output in first person, versus second or third person for bloomz.
2. Bloomz responses seem to be more dire / dark (e.g. "the world was in a state of war").
3. Bloomz is fine-tuned to a task mixture designed by the researchers.

## 2

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from typing import Dict, Any
import evaluate
import torch as t

DEFAULT_PROMPT = "Once upon a time "

perplexity = evaluate.load("perplexity", module_type="metric")

def load_and_gen(model_name: str, prompt: str, config: Dict[str, Any]):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    gen_config: GenerationConfig = GenerationConfig.from_dict(config)
    return tokenizer, model.generate(inputs, gen_config)

def calculate_metrics(model_name, tokenizer, output):
    print("Raw: ", output)
    input_text = tokenizer.decode(output)
    print("Decoded: ", input_text)
    ttr = len(t.unique(output)) / len(output)
    print("TTR: ", ttr)
    # https://stackoverflow.com/questions/75886674/how-to-compute-sentence-level-perplexity-from-hugging-face-language-models
    results = perplexity.compute(model_id=model_name, add_start_token=False, predictions=[input_text])
    print("Perplexity: ", results["perplexities"][0])
    

In [8]:
tokenizer_560m, outputs_560m = load_and_gen("bigscience/bloom-560m", DEFAULT_PROMPT, {
    "min_new_tokens": 100,
    "max_new_tokens": 200,
})
calculate_metrics("bigscience/bloom-560m", tokenizer_560m, outputs_560m[0])

Raw:  tensor([64393, 14591,   267,  3509,   210,   473,  1620,   267,  1517,   461,
          368,  8876,   189,  5024,   473,  1620,   267,  1517,   461,   368,
         8876,   189,  5024,   473,  1620,   267,  1517,   461,   368,  8876,
          189,  5024,   473,  1620,   267,  1517,   461,   368,  8876,   189,
         5024,   473,  1620,   267,  1517,   461,   368,  8876,   189,  5024,
          473,  1620,   267,  1517,   461,   368,  8876,   189,  5024,   473,
         1620,   267,  1517,   461,   368,  8876,   189,  5024,   473,  1620,
          267,  1517,   461,   368,  8876,   189,  5024,   473,  1620,   267,
         1517,   461,   368,  8876,   189,  5024,   473,  1620,   267,  1517,
          461,   368,  8876,   189,  5024,   473,  1620,   267,  1517,   461,
          368,  8876,   189,  5024,   473,  1620,   267,  1517,   461,   368,
         8876,   189,  5024,   473,  1620,   267,  1517,   461,   368,  8876,
          189,  5024,   473,  1620,   267,  1517,   461,  

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.3040893077850342


In [3]:
tokenizer_1b1, outputs_1b1 = load_and_gen("bigscience/bloom-1b1", DEFAULT_PROMPT, {
    "min_new_tokens": 100,
    "max_new_tokens": 200,
})
calculate_metrics("bigscience/bloom-1b1", tokenizer_1b1, outputs_1b1[0])

Raw:  tensor([64393, 14591,   267,  3509,   210,   473,  1620,   267, 20500,  1517,
          189,    44,  1620,   267, 20500,  1517,   189,    44,  1620,   267,
        20500,  1517,   189,    44,  1620,   267, 20500,  1517,   189,    44,
         1620,   267, 20500,  1517,   189,    44,  1620,   267, 20500,  1517,
          189,    44,  1620,   267, 20500,  1517,   189,    44,  1620,   267,
        20500,  1517,   189,    44,  1620,   267, 20500,  1517,   189,    44,
         1620,   267, 20500,  1517,   189,    44,  1620,   267, 20500,  1517,
          189,    44,  1620,   267, 20500,  1517,   189,    44,  1620,   267,
        20500,  1517,   189,    44,  1620,   267, 20500,  1517,   189,    44,
         1620,   267, 20500,  1517,   189,    44,  1620,   267, 20500,  1517,
          189,    44,  1620,   267, 20500,  1517,   189,    44,  1620,   267,
        20500,  1517,   189,    44,  1620,   267, 20500,  1517,   189,    44,
         1620,   267, 20500,  1517,   189,    44,  1620,  

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.2422466278076172


In [4]:
tokenizer_1b7, outputs_1b7 = load_and_gen("bigscience/bloom-1b7", DEFAULT_PROMPT, {
    "min_new_tokens": 100,
    "max_new_tokens": 200,
})
calculate_metrics("bigscience/bloom-1b7", tokenizer_1b7, outputs_1b7[0])

Raw:  tensor([64393, 14591,   267,  3509,   210,   473,  1620,   267, 10512, 27566,
          189,  5024,   473,  1620,   267, 10512, 27566,   189,    44,  1620,
          267, 10512, 27566,   189,    44,  1620,   267, 10512, 27566,   189,
           44,  1620,   267, 10512, 27566,   189,    44,  1620,   267, 10512,
        27566,   189,    44,  1620,   267, 10512, 27566,   189,    44,  1620,
          267, 10512, 27566,   189,    44,  1620,   267, 10512, 27566,   189,
           44,  1620,   267, 10512, 27566,   189,    44,  1620,   267, 10512,
        27566,   189,    44,  1620,   267, 10512, 27566,   189,    44,  1620,
          267, 10512, 27566,   189,    44,  1620,   267, 10512, 27566,   189,
           44,  1620,   267, 10512, 27566,   189,    44,  1620,   267, 10512,
        27566,   189,    44,  1620,   267, 10512, 27566,   189,    44,  1620,
          267, 10512, 27566,   189,    44,  1620,   267, 10512, 27566,   189,
           44,  1620,   267, 10512, 27566,   189,    44,  

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.259759545326233


In [5]:
tokenizer_z560m, outputs_z560m = load_and_gen("bigscience/bloomz-560m", DEFAULT_PROMPT, {
    "min_length": 100,
    "max_length": 200,
    "use_cache": False,
    "num_beams": 2, # Need to use beem sampling or else bloomz stops early
    "early_stopping": "never",
})
calculate_metrics("bigscience/bloomz-560m", tokenizer_z560m, outputs_z560m[0])

Raw:  tensor([64393, 14591,   267,  3509,   210,  3262,   368,  8876,  1620,   361,
          267,  8431,   461, 27307,    15,   368,  8876,  1620,   361,   267,
         8431,   461, 11327,    17,  1387,  8876,  1620,   361,   267,  8431,
          461, 11327,    17,  1387,  8876,  1620,   361,   267,  8431,   461,
        11327,    17,  1387,  8876,  1620,   361,   267,  8431,   461, 11327,
           17,  1387,  8876,  1620,   361,   267,  8431,   461, 11327,    17,
         1387,  8876,  1620,   361,   267,  8431,   461, 11327,    17,  1387,
         8876,  1620,   361,   267,  8431,   461, 11327,    17,  1387,  8876,
         1620,   361,   267,  8431,   461, 11327,    17,  1387,  8876,  1620,
          361,   267,  8431,   461, 11327,    17,  1387,  8876,  1620,   361,
          267,  8431,   461, 11327,    17,  1387,  8876,  1620,   361,   267,
         8431,   461, 11327,    17,  1387,  8876,  1620,   361,   267,  8431,
          461, 11327,    17,  1387,  8876,  1620,   361,  

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.5080246925354004


In [6]:
tokenizer_z1b1, outputs_z1b1 = load_and_gen("bigscience/bloomz-1b1", DEFAULT_PROMPT, {
    "min_length": 100,
    "max_length": 200,
    "use_cache": False,
    "num_beams": 2,
    "early_stopping": "never",
})
calculate_metrics("bigscience/bloomz-560m", tokenizer_z1b1, outputs_z1b1[0])

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Raw:  tensor([ 64393,  14591,    267,   3509,    210,    473,   4853,    427,   2909,
           361,    267,  22933,  16333,     17,    473,  31045,    361,    267,
        219748,     17,    473,  31045,    361,    267,  22933,  10172,     17,
           473,  31045,    361,    267, 219748,     17,    473,  31045,    361,
           267,  22933,  10172,     17,    473,  31045,    361,    267,  22933,
         10172,     17,    473,  31045,    361,    267,  22933,  10172,     17,
           473,  31045,    361,    267,  22933,  10172,     17,    473,  31045,
           361,    267,  22933,  10172,     17,    473,  31045,    361,    267,
         22933,  10172,     17,    473,  31045,    361,    267,  22933,  10172,
            17,    473,  31045,    361,    267,  22933,  10172,     17,    473,
         31045,    361,    267,  22933,  10172,     17,    473,  31045,    361,
           267,  22933,  10172,     17,    473,  31045,    361,    267,  22933,
         10172,     17,    473,  3

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.580476999282837


In [7]:
tokenizer_z1b7, outputs_z1b7 = load_and_gen("bigscience/bloomz-1b7", DEFAULT_PROMPT, {
    "min_length": 100,
    "max_length": 200,
    "use_cache": False,
    "num_beams": 2,
    "early_stopping": "never",
})
calculate_metrics("bigscience/bloomz-1b7", tokenizer_z1b7, outputs_z1b7[0])

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Raw:  tensor([64393, 14591,   267,  3509,   210,  2782,  1620,   267, 21380, 25922,
         8749,   210,  5268,  1620,   267,  7220, 21380,   210,  5298,  1620,
          267,  7220, 21380,   210,  5298,  1620,   267,  7220, 21380,   210,
         5298,  1620,   267,  7220, 21380,   210,  5298,  1620,   267,  7220,
        21380,   210,  5298,  1620,   267,  7220, 21380,   210,  5298,  1620,
          267,  7220, 21380,   210,  5298,  1620,   267,  7220, 21380,   210,
         5298,  1620,   267,  7220, 21380,   210,  5298,  1620,   267,  7220,
        21380,   210,  5298,  1620,   267,  7220, 21380,   210,  5298,  1620,
          267,  7220, 21380,   210,  5298,  1620,   267,  7220, 21380,   210,
         5298,  1620,   267,  7220, 21380,   210,  5298,  1620,   267,  7220,
        21380,   210,  5298,  1620,   267,  7220, 21380,   210,  5298,  1620,
          267,  7220, 21380,   210,  5298,  1620,   267,  7220, 21380,   210,
         5298,  1620,   267,  7220, 21380,   210,  5298,  

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  1.368320107460022


### 2.a
The regular bloom models were easy-enough to interact with, but the bloomz models kept stopping early. I tried a lot of things to get it to stop using the EOS token, but no matter what it generated short output sequences. Eventually I found a way to do it when I found the 'early_stopping' config. When I set that to 'never', I got an error saying I needed to use beam sampling, so I set the beam count to 2 (5 originally but it was way too slow).

### 2.b
TTR and Perplexity metrics calculated inline in cell outputs above. In general all of the models have very low TTR, which is probably because of the limited textual input to build off of. Perplexity seems to be mildly higher in the bloomz models' outputs; however, this doesn't manifest in a noticeably different or less-repetitive generated sequence.

### 2.c
The generated stories are all pretty bad. For whatever reason, all models generate extremely repetitive sequences. The bloomz 1.1b parameter model seemed to have the most variety in that it alternated what phrases it output. In general, however, I'd say the creativity, coherence and overall quality of all model outputs were quite low.

## 3
Increasing model parameters seems to have a positive effect in that it lowers model perplexity. I think it would be better if we provided more context to perhaps see if the models would generate more diverse output. Type-to-token Ration seems relatively unaffected by the parameter size, so perhaps that is a metric that is affected more by the vocabulary for the model's training / pre-training data. Since the bloomz models are built off the bloom models, it makes sense that they have a similar vocabulary and similar TTR results.

## 4
I think that the two best models were the 560m and 1.1b parameter bloomz models. This is because of the mild improvement in vocublary diversity in the generated sequences. I tried experimenting with temperature, top_k and top_p decoding. It is really hard to get the model to avoid stopping early when not using beam sampling to explicitly tell it not to. I found that top_k sampling didn't have much effect on the repetitive behavior, which makes sense because the model is already clearly stuck on the same highly-probably tokens to choose from. Temperature seems to have a solid effect on diversifying the generated stories; it seems like injecting more randomness helps break out of the repetitive generation pattern. 

A mix of `temperature` and `top_p` decoding seem to be the most effective hyperparameters to tweak. In the bloomz-1b1 example below, I found that I was able to introduce an adequately lengthed sequence with some level of diversity in the content -- the phrases were not EXACTLY the same, repeated over and over, anymore! I started with a very low temperature of 0.3, then increased it to 0.8. I was a little surprised, thinking that more randomness would result in a greater diversity of content in the generated sequence. Conversely, less randomness from a higher temperature actually allowed the story to build off of itself. The bloomz-1b1 output at the bottom of this document shows a story that, while still not very good and a little repetitive, is actually a story!

In [15]:
tokenizer_z560m, outputs_z560m = load_and_gen("bigscience/bloomz-560m", DEFAULT_PROMPT, {
    "min_length": 100,
    "max_length": 200,
    "use_cache": False,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 10,
})
calculate_metrics("bigscience/bloomz-560m", tokenizer_z560m, outputs_z560m[0])

Raw:  tensor([64393, 14591,   267,  3509,   210,   368, 22779, 11705, 83315,   368,
        60312,    17,  5298,  1620,   361,   368, 42723, 12978,     2])
Decoded:  Once upon a time  the sun never reaches the horizon. He was in the outer space</s>
TTR:  0.8947368421052632


  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  39.0067024230957


In [16]:
tokenizer_z1b1, outputs_z1b1 = load_and_gen("bigscience/bloomz-1b1", DEFAULT_PROMPT, {
    "min_length": 100,
    "max_length": 200,
    "use_cache": False,
    "do_sample": True,
    "temperature": 0.8,
    "top_p": 40,
})
calculate_metrics("bigscience/bloomz-560m", tokenizer_z1b1, outputs_z1b1[0])

Raw:  tensor([ 64393,  14591,    267,   3509,    210,    368,  29880,   2623,   3276,
         29497,   6486,    368,  40634,    530,  14005,    919,    267,  37426,
          1865,  15114,   3276,  16299,  82406,    267,  25170,    461,  57498,
           919,    368,  37426,   1865,  14831,  65095,     15,   7154,  57247,
           267, 170804,    461,  57498,     17,   1387,  29880,   2623,   1256,
         27432,  57247,   8512,  25170,    461,  57498,     15,    530,    267,
         15325,  25170,   1620,   9507,   7160,   1865,   1387,  29880,   2623,
         35081,    427,   6482,   7154,   1620,   7963, 153034,     17,   2685,
         23997,    661,    368,  29880,   2623,  22726,  82859,     15,    368,
         29880,   2623,  35081,    427,  16184,    267, 162443,    361,   3809,
        149630,     17,  15114,   8610,   1427, 206861,    427,  14005,    861,
          7154,  29763,  41097,    267, 136507,     17,   1387,  29880,   2623,
         35081,    427,  47010,   

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity:  9.218196868896484
