In [48]:
import sys, json, re
import pandas as pd
from pathlib import Path
from decouple import config
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForMaskedLM,  file_utils
# transformers' default cache location, wrapped as a Path.
cache_dir = Path(file_utils.default_cache_path)

# Generous pandas display limits so wide or long frames are not truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)

# The `data` folder under the parent of the current working directory.
data_dir = Path(".").absolute().parent / "data"


def ls(p):
  """Print every entry of directory `p`, one path per line."""
  print("\n".join(str(entry) for entry in p.iterdir()))


ls(data_dir)

# Hugging Face model used by the cells below; the commented line is an alternative.
hf_model_name = "gpt2"
# hf_model_name = "mistralai/Mistral-7B-v0.1"

/Users/ugoren/Code/PR/llm_workshop/data/sample_apps.parquet


In [42]:
# Load the complete table once: `categories` is a label vocabulary and must be
# computed over every app, not over a random handful of rows.
apps_all = pd.read_parquet(data_dir / "sample_apps.parquet")

# `category_names` is a comma-separated string; count each lower-cased label.
categories = (
  apps_all["category_names"]
  .str.lower()
  .str.split(',')
  .explode()
  .value_counts()
)

# `df` stays a small 9-row sample, but seeded so a re-run shows the same rows.
df = apps_all.sample(min(9, len(apps_all)), random_state=42)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios
6673,892521917,Tiki Solitaire TriPeaks,Tiki Solitaire TriPeaks: the classic Solitaire...,https://apps.apple.com/us/app/tiki-solitaire-t...,"Games,Card,Puzzle",True
62571,slots.pcg.casino.games.free.android,Cash Frenzy™ - Casino Slots,"⭐Keep spinning, keep fun going! ⭐ \n\nReady to...",https://play.google.com/store/apps/details?id=...,"GAME_CASINO,GAME",False
19386,com.exoticmatch.game,Zen Match,Playing Zen Match for 10 minutes a day sharpen...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
39809,com.playrix.fishdomdd.gplay,Fishdom,Never Fishdomed before? Take a deep breath and...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
24056,com.gramgames.mergedragons,Merge Dragons!,Discover a magical land of entertainment and m...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
15561,com.coupang.mobile,쿠팡 (Coupang),Coupang is the perfect place for savvy shopper...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
60235,net.wooga.junes_journey_hidden_object_mystery_...,June's Journey: Hidden Objects,Are you ready to go on an exciting journey to ...,https://play.google.com/store/apps/details?id=...,"GAME_ADVENTURE,GAME",False
4263,359917414,Solitaire,Solitaire by MobilityWare is the ORIGINAL make...,https://apps.apple.com/us/app/solitaire/id3599...,"Games,Casino,Card",True
29752,com.king.candycrushsodasaga,Candy Crush Soda Saga,You loved playing Candy Crush Saga - Start pla...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False


# Verbalizers

## Verbalizers as masks

Most of the generation models we have used so far are `CausalLM` models, which are trained to predict the next token.

However, if we are looking for a completion in the middle of a sentence, we can use `MaskedLM` models instead (these tend to be smaller).

In [46]:
def masked_lm_yes_or_no(txt, model_str):
  """Answer a yes/no question with a masked language model.

  The model scores every vocabulary entry at the mask position of `txt`; only
  the logits of the "yes" and "no" tokens (the verbalizers) are compared.

  Args:
    txt: Prompt containing the placeholder "<mask>" where the answer belongs,
      e.g. "Is an apple a fruit? answer: <mask>". The placeholder is expected
      to follow a space, as in that example.
    model_str: Model name or path passed to `AutoTokenizer.from_pretrained`
      and `AutoModelForMaskedLM.from_pretrained`.

  Returns:
    An integer tensor with one entry per input row (a single entry here):
    1 if "yes" scores strictly higher than "no", otherwise 0.

  Raises:
    AssertionError: if `txt` does not contain "<mask>".
    ValueError: if the tokenizer defines no mask token, or no mask token is
      present after tokenization.
  """
  assert "<mask>" in txt
  tokenizer = AutoTokenizer.from_pretrained(model_str)
  if tokenizer.mask_token is None:
    raise ValueError(f"tokenizer of {model_str!r} does not define a mask token")
  # Tokenizers may spell their mask differently (e.g. "[MASK]"); translate the
  # notebook's "<mask>" placeholder into whatever this tokenizer expects.
  if tokenizer.mask_token != "<mask>":
    txt = txt.replace("<mask>", tokenizer.mask_token)

  # Byte-level BPE vocabularies hold separate entries for a word at the start of
  # a text ("yes") and the same word after a space (" yes"). Encoding
  # "yes or no" therefore mixes a sentence-initial "yes" with a mid-sentence
  # " no". Encode each answer in the same mid-sentence context and keep its
  # last token so both candidates are comparable.
  yes, no = (
    tokenizer.encode(f"answer: {word}", add_special_tokens=False)[-1]
    for word in ("yes", "no")
  )

  model = AutoModelForMaskedLM.from_pretrained(model_str)
  X = tokenizer.encode(txt, return_tensors="pt")
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    y = model(X)

  # Column indices of mask tokens; the first one is the answer slot.
  mask_positions = (X == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
  if mask_positions.numel() == 0:
    raise ValueError("no mask token found in the tokenized prompt")
  mask_idx = mask_positions[0].item()

  # Order is [no, yes], so argmax yields 0 for "no" and 1 for "yes".
  return y.logits[:, mask_idx, [no, yes]].argmax(dim=-1)

In [47]:
# Try the masked-LM verbalizer on a simple question with BART.
masked_lm_yes_or_no(
  txt="Is an apple a fruit? answer: <mask>",
  model_str="facebook/bart-large",
)

tensor([0])

## Verbalizers from generation models

In [52]:
def causal_lm_yes_or_no(txt, model_str):
  """Answer a yes/no question with a causal (next-token) language model.

  The model scores every vocabulary entry as the continuation of `txt`; only
  the logits of the "yes" and "no" tokens (the verbalizers) are compared.

  Args:
    txt: Prompt that stops right where the answer should begin, e.g.
      "Is an apple a fruit? answer:". It should not end with a space, because
      the answer tokens compared here carry their own leading space.
    model_str: Model name or path passed to `AutoTokenizer.from_pretrained`
      and `AutoModelForCausalLM.from_pretrained`.

  Returns:
    An integer tensor with one entry per input row (a single entry here):
    1 if "yes" scores strictly higher than "no", otherwise 0.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_str)

  # Byte-level BPE vocabularies hold separate entries for a word at the start of
  # a text ("yes") and the same word after a space (" yes"). Encoding
  # "yes or no" therefore mixes a sentence-initial "yes" with a mid-sentence
  # " no". Encode each answer in the same mid-sentence context and keep its
  # last token so both candidates are comparable.
  yes, no = (
    tokenizer.encode(f"answer: {word}", add_special_tokens=False)[-1]
    for word in ("yes", "no")
  )

  model = AutoModelForCausalLM.from_pretrained(model_str)
  X = tokenizer.encode(txt, return_tensors="pt")
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    y = model(X)

  # Logits at the last position score the next token. Order is [no, yes], so
  # argmax yields 0 for "no" and 1 for "yes".
  return y.logits[:, -1, [no, yes]].argmax(dim=-1)

In [53]:
# A causal LM scores the token that comes right after the prompt, so the prompt
# simply stops where the answer should start. "<mask>" is not a special token
# for GPT-2 and would be read as literal text, moving the scored position past it.
causal_lm_yes_or_no("Is an apple a fruit? answer:", "gpt2")

tensor([0])

# JSONFormer
JSONFormer constrains the decoder to output only the most likely token that still results in valid JSON according to a predefined schema.

In [None]:
from jsonformer import Jsonformer

# Tokenizer and weights for the model named by `hf_model_name`.
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
model = AutoModelForCausalLM.from_pretrained(hf_model_name)

# JSON schema the generated answer has to follow.
json_schema = dict(
    type="object",
    properties=dict(
        name=dict(type="string"),
        age=dict(type="number"),
        is_for_kids=dict(type="boolean"),
        categories=dict(type="array", items=dict(type="string")),
    ),
)

prompt = "Please describe 'Candy crush' with the following schema"

# Build the schema-constrained generator, then call it to produce the data.
jsonformer = Jsonformer(model, tokenizer, json_schema, prompt)
generated_data = jsonformer()

print(generated_data)

# Guidance
Guidance is a very popular library for constrained decoding, and it is much more "user-friendly" than JSONFormer.

In [None]:
from guidance import gen, models, select

# Wrap the model named by `hf_model_name` in a guidance Transformers model object.
llm = models.Transformers(hf_model_name)

In [None]:
prompt = "Please categorize the mobile app 'slotomania'"
# The prompt must be appended to the model state. `gen`'s first positional
# argument is the name under which the generated text is captured, so passing
# the prompt there would generate from an empty context.
llm + prompt + gen(max_tokens=10)

In [26]:
app = "Solitaire Grand Harvest"

# Constrain the completion to one of the category labels in `categories`.
llm + f'{app} is ' + select(categories.index.tolist())

# Exercise 3
Answer the questions in exercise 1 with `Mistral-7B`