In [1]:
import sys, json, re, collections
import pandas as pd
from pathlib import Path
from decouple import config
sys.path.append("../src/")
from llm_helpers import openai_client

pd.set_option('display.max_columns', 1000, 'display.width', 1000, 'display.max_rows',1000)

data_dir = Path(".").absolute().parent/"data"
ls = lambda p:print("\n".join(map(str,p.iterdir())))

ls(data_dir)

/Users/ugoren/Code/PR/llm_workshop/data/sample_apps.parquet


In [2]:
df = pd.read_parquet(data_dir / "sample_apps.parquet").sample(9)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios
3945,1463509237,Phase 10: World Tour,Start playing Phase 10 today – The funny and c...,https://apps.apple.com/us/app/phase-10-world-t...,"Games,Entertainment,Card,Casual",True
54216,de.cellular.ottohybrid,OTTO – Shopping & Möbel,Install the OTTO app now 📲 and shop fashion tr...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
45897,com.staplegames.blocksClassicSGGP,Block Puzzle - Classic Style,Block Puzzle - Classic Style\n\nHow to play:\n...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
7478,com.alibaba.intl.android.apps.poseidon,Alibaba.com - B2B marketplace,What is Alibaba.com?\nAlibaba.com is one of th...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False
29751,com.king.candycrushsaga,Candy Crush Saga,Master the legendary match 3 puzzle game from ...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False
58794,me.lyft.android,Lyft,Get where you’re going with Lyft.\n\nWhether y...,https://play.google.com/store/apps/details?id=...,"MAPS_AND_NAVIGATION,APPLICATION",False
21686,com.futureplay.mergematch,Merge Gardens,Do you dream about your own gentle garden? You...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
1021,1105855019,Gardenscapes,Welcome to Gardenscapes—the first hit from Pla...,https://apps.apple.com/us/app/gardenscapes/id1...,"Games,Entertainment,Puzzle,Simulation",True
56467,in.playsimple.tripcross,Crossword Jam,Word Jam - A word search & word guess brain ga...,https://play.google.com/store/apps/details?id=...,"GAME_WORD,GAME",False


In [5]:
categories = df["category_names"].str.lower().str.split(',').explode().value_counts()
categories

category_names
game                   4
application            3
games                  2
entertainment          2
shopping               2
game_puzzle            2
card                   1
casual                 1
game_casual            1
maps_and_navigation    1
puzzle                 1
simulation             1
game_word              1
Name: count, dtype: int64

## Naive approach, just Ask nicely.

In [9]:
def openai_ask(prompts):
    response = openai_client.completions.create(
        model="text-davinci-003",
        prompt=prompts,
    )
    ret = [choice.text.strip().lower() for choice in response.choices]
    return ret


['dogs are often called "man\'s best friend" due to their']

In [28]:
prompt = "please choose the most likely category that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
openai_ask(prompt)

['game_puzzle']

In [29]:
openai_ask(prompt)

['game_puzzle']

In [30]:
openai_ask(prompt)

['puzzle']

Seem to work pretty, well - let's try multi label?

In [35]:
prompt = "please choose all the categories that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
[sorted(map(str.strip,l.strip().replace(",", "\n").split("\n"))) for l in openai_ask([prompt]*10)]

[['',
  'casual',
  'entertainment',
  'game',
  'game casual',
  'game puzzle',
  'games'],
 ['application', 'entertainment', 'game', 'game_puzzle', 'games'],
 ['entertainment', 'game', 'game_cas', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games'],
 ['application', 'entertainment', 'game', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games'],
 ['casual', 'entertainment', 'game', 'game', 'game_puzzle', 'games']]

In [37]:
"game casual" in categories

False

## Using function calls
### Choose most likely class

In [62]:
def classify_most_likely(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "class": {"type": "string", "enum": list(categories.index)},
                    },
                    "required": ["class"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["class"]

classify_most_likely("please choose the most likely category that apply to 'Crossword Jam'")

'game_word'

In [64]:
def classify_multiclass(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "classes": {"type": "array", "items": {"type": "string", "enum": list(categories.index)}},
                    },
                    "required": ["classes"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["classes"]

classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_word', 'puzzle', 'game_puzzle', 'entertainment']

In [65]:
classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_word', 'puzzle', 'game_puzzle']

## Question:
Please write ask the model to classify all of the apps using the methods we learnt.

Then create an additional column "jaccard" indicating whether intersection over union of the actual categories and the predicted ones.

Which method was most successful?