### **Imports**

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops sentencepiece
!pip install accelerate
!pip install openai==0.28



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import random
import openai
import requests
import json

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# model_url = "https://api-inference.huggingface.co/lucyd/mistral_instruct"
# headers = {"Authorization": f"Bearer {'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'}"}

In [3]:
model_name = "artixjain/diff_instr_model_4"

## **Initializations**

In [None]:
# finetuned_model = AutoModelForCausalLM.from_pretrained('artixjain/diff_instr_model_4')
# newtokenizer = AutoTokenizer.from_pretrained('artixjain/diff_instr_model_4')

In [4]:
finetuned_model = AutoModelForCausalLM.from_pretrained(model_name)
newtokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
pipe = pipeline(task="text-generation", model=finetuned_model, tokenizer=newtokenizer, max_length=60,)

In [6]:
openai.api_key = "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"
random_seed = random.randint(0, 100)

In [7]:
def getResponse(msg):
  response = openai.ChatCompletion.create(
    model="ft:gpt-3.5-turbo-1106:personal:word-based:9EmczHpe",
    messages=[
      {"role": "system", "content": "you are a helpful assistant in prompt generation"},
      {"role": "user", "content": msg}
    ]
  )

  return response.choices[0].message.content

In [8]:
def getResponseBasic(msg):
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-1106",
    messages=[
      {"role": "system", "content": "you are a helpful assistant in seed word generation"},
      {"role": "user", "content": msg}
    ]
  )

  return response.choices[0].message.content

In [9]:
def query(payload):
    response = requests.post(model_url, headers=headers, json=payload)
    return response.json()

In [10]:
def extract_five_words(input):
    split_input = input.split(":")[-1]
    five_words = split_input.split(",")
    five_words_lower = [string.lower() for string in five_words]
    return five_words_lower

### **Start Generation**

In [11]:
if random_seed % 2 == 0:
  # do word based - call GPT for prompt
    msg = "Generate a prompt for a word or letter manipulation based game."
    category_seed = getResponse(msg)
    category = 3
    print("word based:", category_seed)

else:
    msg = """I am creating a  word puzzle game where I give the player a group of 16 words and the user then groups them into groups of 4 based on some common theme. The common themes can be synonyms, thematic relation, completing the same fill in the blank phrase, related through a pop culture reference, or through some word and letter based. Some examples of each category are:

Category: Synonym
The seed word is: "thieve"
The four words that belong to this group are: "pinch, rob, steal, swipe"

Category: Thematic Relation
The seed phrase is: "romantic beginnings"
The four words that belong to this group are: "connection, feelings, spark, vibe"

Category: Pop Culture
The seed phrase is: "avenues in NYC"
The four words that belong to this group are: "broadway, fifth, madison, park"

Category: Fill in the blank
The seed word is: “year"
the four words that belong to this group are: "gap, leap, light, school"
Because “gap year”, “leap year”, “light year”, and “school year”

Category: Word or Letter Based
The seed word is: "Insect homophones"
The four words that belong to this group are: "aunt, beatle, flee, nat"

The four words that belong to each category are generic enough that when you look at all 16 together, different groups seem plausible. I want you to generate a seed word for me that I will give to an LLM that will generate the four words for the puzzle. The four words are also relatively ambiguous (as in they don’t have one meaning).

An example of a full puzzle is:
SEGMENT OF A PROCESS - CYCLE, PHASE, ROUND, STAGE
CONSTELLATIONS - CYGNUS, GEMINI, ORION, PEGASUS
SPIRALS IN NATURE - CYCLONE, GALAXY, SNAIL, SUNFLOWER
ASSOCIATED WITH “ONE” - CYCLOPS, MONOLOGUE, SOLITAIRE, UNICYCLE

Some of the words in the puzzle (on the right of hyphen) look like they could belong to other groups (like orion, pegasus, and cyclops are all related to greek mythology). This ambiguity is what makes the puzzle challenging which is something we want to maintain.

Start me off with a fill-in-the-blank seed word. Do not give anything except the seed word or phrase."""
    category_seed = getResponseBasic(msg)
    category = 4
    print("fill in the blank:", category_seed)

fill in the blank: Underwater


In [12]:
gpt_prompt = "I am now going to give you a word and I want you to give me another seed word somehow related to the word I give you. This seed word can be related through fill in the blank, theme, pop culture, synonyms, or word/letter manipulation based. Don't keep giving only one kind of seed word (i.e. only theme related):"
mystrel_prompts = {0: "Give me five words related to this common theme of:",
                   1: "Give me five synonyms for the seed word:",
                   2: "Give me five words related to this pop culture topic:",
                   3: "Give me five words related to this seed word in a word or letter based manner:",
                   4: "Give me five words that satisfy a fill in the blank prompt for this seed word:"} #3 = word based, 4 = fill in the blank

In [14]:
found_duplicates = False

def get_puzzle(category_seed, category=4):
    puzzle_words = []
    past_seed_words = []
    past_seed_words.append(category_seed)

    while len(puzzle_words) != 16:
        print(category_seed)

        # pass llm_prompt + first_category into llm
        # pick bw thematic/synonyms/pop culture
        if category in [0, 1, 2]:
            category = random.randint(0,3) #0 = Thematic, 1 = Synonyms, and 2 = Pop-culture
        print(f"Category: {category}")
        m_prompt = mystrel_prompts[category]
        outputs = pipe(
          m_prompt,
          max_new_tokens=200,
          do_sample=True,
          temperature=1,
          top_k=30,
          top_p=0.95,
          pad_token_id=-100
        )
        mystrel_out = pipe(f"<s>[INST]{m_prompt + category_seed}[/INST]")
        print(f"Mystrel Out: {mystrel_out}")

        # extract 5 words function
        five_words = extract_five_words(mystrel_out[0]['generated_text'])
        
        # Choose 5th word generated
        gpt_seed = five_words[-1]
        if gpt_seed in past_seed_words:
            print(f"Duplicate seed word.")
            break
        puzzle_words += five_words[:4]
        print(f"Puzzle: {puzzle_words}")

        # pass prompt into gpt
        category_seed = getResponseBasic(gpt_prompt + gpt_seed)
        category = 0 # resetting

        #Check for duplicates
        if len(set(puzzle_words)) != len(puzzle_words):
            found_duplicates = True
            print(f"Found duplicates in the puzzle.")
            break

    print(f"Final puzzle is: {puzzle_words}")
    return puzzle_words

In [None]:
get_puzzle(category_seed, category)

Both `max_new_tokens` (=200) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Underwater
Category: 4


In [None]:
# # first category var has first seed word - pass to mistral

# prompt = "query: Find four words associated with the seed word: adventure"
# payload = {"inputs": prompt}
# data = query(payload)
# print(data)