# Research Project - CUDA notebook


In [1]:
import os

# Read the Hugging Face access token from a `.hf_token` file that sits one
# directory above the current working directory, so the secret itself never
# appears in the notebook source.
token_path = f"{os.getcwd()}/../.hf_token"
with open(token_path) as f:
    token = f.read().strip()  # drop the trailing newline / surrounding whitespace
# IPython shell escape: `{token}` is interpolated from the Python variable above.
# NOTE(review): the token is passed as a command-line argument here; confirm
# that is acceptable on shared machines where process arguments may be visible.
! huggingface-cli login --token {token} --add-to-git-credential

Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/bradlet/.cache/huggingface/token
Login successful


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# The 7b models need too much memory for my GPU in general; the 2b models are
# loaded with quantization (4-bit, per `load_in_4bit=True` below).
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Short aliases -> Hugging Face Hub model ids (the "i" aliases point at the
# `-it` checkpoints).
MODELS = {
    "2b": "google/gemma-2b",
    "2bi": "google/gemma-2b-it",
    "7b": "google/gemma-7b",
    "7bi": "google/gemma-7b-it",
}

# Checkpoint used by the tokenizer / model for the rest of the notebook.
MODEL = MODELS["2bi"]
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# `device_map="auto"` leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config=quantization_config, device_map="auto") # on GPU

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from transformers import GenerationConfig

# Generation options; `generate` turns this dict into a `GenerationConfig`.
# NOTE(review): `use_cache=False` presumably trades generation speed for
# memory -- confirm that is intentional.
config = dict(
    max_new_tokens=400,
    use_cache=False,
    # Options tried earlier and currently disabled:
    # min_new_tokens=100,
    # no_repeat_ngram_size=2,
)
def generate(prompt: str) -> "tuple[str, torch.Tensor]":
    """
    Use the in-scope tokenizer / model to generate a response for the provided `prompt`.

    :param prompt: Text handed to the tokenizer and then to the model.
    :returns: A `(text, tokens)` pair: the decoded text of the first output
        sequence, alongside that same sequence as raw tokens.
    """
    # Move the encoded inputs to wherever the model lives instead of
    # hard-coding "cuda", so the function follows the model's placement.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Rebuilt on every call so edits to the module-level `config` dict are
    # picked up without redefining this function.
    gen_config: GenerationConfig = GenerationConfig.from_dict(config)
    outputs = model.generate(**encoded, generation_config=gen_config)
    first_sequence = outputs[0]
    return tokenizer.decode(first_sequence), first_sequence

# Smoke-test `generate` on a single prompt and show only the decoded text.
# The reply is capped by `max_new_tokens`, so a long list may be cut off mid-entry.
sample_text, _sample_tokens = generate("A full name looks like 'Bradley Thompson'. Generate a list of 50 unique male full names:")
print(sample_text)

<bos>A full name looks like 'Bradley Thompson'. Generate a list of 50 male full names:

Sure, here's a list of 50 male full names:

1. Bradley Thompson
2. Ashton Carter
3. Austin Jones
4. Blake Shelton
5. Brandon Miller
6. Bryan Miller
7. Cameron Smith
8. Carter Jones
9. Chase Thompson
10. Clayton Miller
11. Connor Smith
12. Cooper Johnson
13. Dustin Miller
14. Dylan Thomas
15. Ethan Carter
16. Frederick Miller
17. Gabriel Miller
18. Harrison Miller
19. Hunter Thompson
20. Ian Stewart
21. Jacob Miller
22. James Thompson
23. Joseph Miller
24. Kevin Miller
25. Kyle Thompson
26. Landon Miller
27. Mark Thompson
28. Matthew Miller
29. Michael Miller
30. Morgan Miller
31. Nathan Miller
32. Nicholas Miller
33. Oliver Thompson
34. Owen Miller
35. Patrick Miller
36. Quinn Thompson
37. Samuel Miller
38. Scott Miller
39. Spencer Miller
40. Steven Miller
41. Thomas Miller
42. Tristan Miller
43. Tyler Thompson
44. Ulysses Thompson
45. Vincent Miller
46. William Miller
47. Wyatt Thompson
48. Zane Mi

In [8]:
import re

def get_full_names(name_category: str, count: int) -> "list[str]":
    """
    Ask the model for a numbered list of full names and parse the names out of its reply.

    :param name_category: Qualifier inserted into the prompt (e.g. "male", "female").
    :param count: Number of names the prompt asks for. This is a request only:
        the reply may contain fewer usable entries (for instance when it is cut
        off by the generation length limit).
    :returns: The distinct names found, in the order they were generated.
    """
    # NOTE(review): 'Earheart' looks like a misspelling of 'Earhart'; the prompt
    # text is left untouched so generation behaviour does not change.
    name_gen_prompt = f"""
        A full name looks like 'Bradley Thompson' or 'Amelia Earheart'.
        A list is unique if no two elements are the same.
        Generate a unique list of {count} {name_category} full names:
    """
    text, _ = generate(name_gen_prompt)
    # One numbered entry looks like "<n>. First Last". Name parts may contain
    # hyphens / apostrophes (so "O'Neil" is not captured as just "O"), and the
    # two parts must sit on the same line.
    name_part = r"[A-Za-z][A-Za-z'\-]*"
    name_pattern = r'\d+\.\s+(' + name_part + r'[ \t]+' + name_part + r')'
    # Grab each name in generated list (if generated in expected form)
    found = list(re.finditer(name_pattern, text))
    # If the text stops exactly at the end of the last name, generation was
    # most likely cut off mid-entry (e.g. "48. Zane Mi"), so that entry is
    # dropped rather than kept as a truncated name.
    if found and found[-1].end() == len(text):
        found.pop()
    # The prompt asks for a unique list but the model does not guarantee one;
    # `dict.fromkeys` removes repeats while keeping first-seen order.
    return list(dict.fromkeys(m.group(1) for m in found))

# Request 50 names per category (male first, then female), then preview the
# first ten male names as the cell output.
male_names, female_names = (get_full_names(category, 50) for category in ("male", "female"))
male_names[:10]

['Alexander Hamilton',
 'Ashton Carter',
 'Ashton Smith',
 'Ashton Williams',
 'Austin Johnson',
 'Bailey Johnson',
 'Benjamin Harrison',
 'Blake Shelton',
 'Bradley Thompson',
 'Cameron Smith']

In [10]:
# Prompt skeleton; the `{}` placeholder is filled with one sample (a full name).
PROMPT_TEMPLATE = "Assign a career, location and skills for {}, then use this information to build them a resume."

def generate_multiple_with_prompt(samples):
    """
    Format every sample into `PROMPT_TEMPLATE` and run `generate` on each prompt.

    :param samples: Iterable of values substituted into the template one at a time.
    :returns: List holding whatever `generate` returned for each sample, in input order.
    """
    generated = []
    for sample in samples:
        prompt = PROMPT_TEMPLATE.format(sample)
        generated.append(generate(prompt))
    return generated

In [11]:
# Generate one resume per name. Each list entry is the `(text, tokens)` pair
# returned by `generate`. `results` holds the female-name runs.
results = generate_multiple_with_prompt(female_names)

# Same generation for the male names.
m_results = generate_multiple_with_prompt(male_names)

In [35]:
# Using this to save work b/c above gen can take a long time!
import sys
import pandas as pd
# Keep only the decoded text of each male resume (the token outputs are dropped).
df = pd.DataFrame([ text for text, _ in m_results ])
# Let pandas write the file directly: it controls the encoding and newline
# handling itself, whereas writing the CSV string through a text-mode `open`
# uses the locale encoding and platform newline translation.
# NOTE(review): only `m_results` is saved here; the female `results` are not.
df.to_csv("../male_resume_gen.csv", index=False)

In [None]:
import re
# print(results[5][0])

# Matches a markdown-bold "**Career:**" label and captures, after any
# whitespace, the rest of that line.
pattern = r'\*\*Career:\*\*\s*(.*)'

def parse_career(sample):
    """
    Extract the career named in a generated resume.

    :param sample: Resume text to search.
    :returns: The text captured after the first "**Career:**" label, or `None`
        when the label is not present.
    """
    found = re.search(pattern, sample)
    if found is None:
        return None
    return found.group(1)

# Show the parsed career (or None when no label was found) for every resume in `results`.
for resume_text, _tokens in results:
    career = parse_career(resume_text)
    print(career)

In [None]:
import matplotlib.pyplot as plt

# Throwaway figure to check that matplotlib renders in this notebook.
x = [1, 2, 3, 4]
y = [1, 4, 9, 16]  # squares of x

plt.figure(figsize=(5, 2.7), layout="constrained")
plt.title("Test")
plt.xlabel("run")
plt.ylabel("rise")
# y = x**2 grows quadratically, not exponentially, so the legend says so.
plt.plot(x, y, label="quadratic", color="red", linewidth=3)
# Same points with the axes swapped.
plt.plot(y, x, label="opposite", color="blue", linewidth=2)
plt.legend()
plt.show()