# Classification Example
Classify the text of customer reviews into categories using an LLM with function calling.

Inspiration: https://www.kaggle.com/code/thedrcat/using-llms-to-extract-structured-data/notebook

In [1]:
import pandas as pd
import tiktoken, json, openai, os

from dotenv import load_dotenv
# Load variables from a .env file into the process environment, then hand the
# key to the OpenAI client. The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Configurables

In [2]:
# Chat model used for every completion request and for selecting the tokenizer.
GPT_MODEL = "gpt-3.5-turbo-0613"
# Upper bound, in tokens, on the review text sent to the model; longer text is truncated.
MAX_TOKENS = 1000
# CSV file holding the reviews to classify.
DATA_FILE = "./data/fine_food_reviews_1k.csv"
# Name of the column in DATA_FILE that contains the review text.
TEXT_COLUMN = "Text"

## Init static variables

In [3]:
# Tokenizer matching GPT_MODEL; used to count and truncate prompt tokens.
ENCODING = tiktoken.encoding_for_model(GPT_MODEL)
# Review data, loaded once. index_col=False tells pandas not to use the first
# column of the file as the index.
DF = pd.read_csv(DATA_FILE, index_col=False)

# Function schema advertised to the model: a single function, CategoryList,
# whose only (required) argument is `categories`, an array of strings.
FUNCTIONS  = [
    {
        'name': 'CategoryList',
        'description': 'Save the list of categories',
        'parameters': {
            'type': 'object',
            'properties': {
                'categories': {
                    'type': 'array', 
                    'description': 'List of categories in lowercase', 
                    'items': {'type': 'string'}
                }
            },
            'required': ['categories']
        }
    }
]
# Passed as `function_call` so the model is directed to call CategoryList
# rather than choosing freely between text and function output.
FUNCTION_CALL = {"name": "CategoryList"}

# System prompt for the first ("thinking") request: the model lists clues,
# reasons about intent, and proposes generic categories as free text.
# NOTE(review): the prompt text contains typos ("a an", "complain"); it is left
# unchanged here because editing prompt wording changes what is sent to the model.
SYSTEM_PROMPT = """You are a an expert in going through customer messages and categorize them for an ecommerce website.
Your responsibility is to follow the following steps without any preamble or further questions.
First, list CLUES that will help you understand the sentiment of the INPUT message (i.e., keywords, phrases, contextual information, semantic relations, semantic meaning, tones, references) that support the intent of the INPUT.
Second, deduce the diagnostic REASONING process from premises (i.e., CLUES, INPUTS) to determine what the user is actually asking.
Third, come up with a set of generic categories that best fit the INPUT message, e.g., feedback, review, complain, inquiry, etc.
"""
# The thought process has to be kept separate from the function_call prompt,
# so the save instruction is sent as its own follow-up user message.
SAVE_PROMPT = """
Finally, call the CategoryList function to save the categories as items in a list.
"""


## Init utility functions

In [4]:
def count_tokens(_text):
    """Return how many tokens ENCODING produces for `_text`."""
    token_ids = ENCODING.encode(_text)
    return len(token_ids)

def limit_tokens(_text):
    """Return `_text` cut down to its first MAX_TOKENS tokens.

    The text is encoded with ENCODING, the token list is sliced, and the
    kept tokens are decoded back into a string.
    """
    return ENCODING.decode(ENCODING.encode(_text)[:MAX_TOKENS])

In [5]:
# Session-wide log of every message sent to / received from the API, in order.
allmsg = []


def _parse_categories(function_args):
    """Return the `categories` value from a function-call argument string, or None if unusable."""
    try:
        parsed = json.loads(function_args)
    except (TypeError, ValueError) as err:  # json.JSONDecodeError is a ValueError
        # The arguments string is model-generated, so it is not guaranteed to be valid JSON.
        print("Could not parse function_call arguments:", err)
        return None
    if not isinstance(parsed, dict):
        return None
    return parsed.get("categories")


def chat_request(system_prompt, method_description, save_prompt, functions=None, function_call=None, model=GPT_MODEL):
    """Classify one piece of text with a two-step chat exchange.

    Step 1 sends the system prompt plus the (token-limited) text and lets the
    model reason in free text. Step 2 appends that reply and `save_prompt`,
    then asks the model to answer through a function call.

    Args:
        system_prompt: System message for the conversation.
        method_description: The text to classify; truncated to MAX_TOKENS tokens if longer.
        save_prompt: Follow-up user message asking the model to save its result.
        functions: Optional function schemas for the second request.
        function_call: Optional function-call directive for the second request.
        model: Model name used for both requests.

    Returns:
        The `categories` value from the function-call arguments, or None when
        the reply contains no function call or its arguments cannot be parsed.

    Side effects:
        Prints progress information and appends the whole exchange, plus the
        token usage of both requests, to the module-level `allmsg` list.
    """
    # Count once; only truncate when the text is actually over the limit.
    original_count = count_tokens(method_description)
    adjusted_prompt = limit_tokens(method_description) if original_count > MAX_TOKENS else method_description
    print("Original Token count:", original_count, "Limited Token count:", count_tokens(adjusted_prompt))

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": adjusted_prompt},
    ]

    # Call API for initial thought prompting (no functions offered at this stage)
    response_thinking = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    reply_thinking = response_thinking.choices[0].message
    print("Text:", adjusted_prompt)
    print("Diagnostic:", reply_thinking)

    # Append assistant message and ask LLM to save results
    messages.append(reply_thinking)
    messages.append({"role": "user", "content": save_prompt})

    # Only forward the function parameters that were supplied: sending an
    # explicit None for them is rejected by the API, which made the defaults unusable.
    function_kwargs = {}
    if functions is not None:
        function_kwargs["functions"] = functions
    if function_call is not None:
        function_kwargs["function_call"] = function_call

    # Call API for secondary function call prompt
    # TODO: we could summarize the previous message to reduce token usage here
    response_function = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
        **function_kwargs
    )
    reply_function = response_function.choices[0].message

    # Check function call; parse its arguments a single time
    requested_call = reply_function.get("function_call")
    if requested_call:
        categories = _parse_categories(requested_call.get("arguments"))
        print("Categories:", categories)
    else:
        categories = None

    # Save msg: extend the shared log in place instead of copying it on every call
    allmsg.extend(messages)
    reply_function["usage_thinking"] = response_thinking.usage
    reply_function["usage_function"] = response_function.usage
    allmsg.append(reply_function)

    print("="*50)

    return categories

## Testing

In [6]:
# Classify a small sample of reviews. The sample is seeded so that a
# Restart & Run All picks the same rows, and capped so a short file does not raise.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42

test_df = DF.sample(n=min(SAMPLE_SIZE, len(DF)), random_state=SAMPLE_SEED)
test_df['categories'] = test_df[TEXT_COLUMN].apply(
    lambda review_text: chat_request(
        SYSTEM_PROMPT,
        review_text,
        SAVE_PROMPT,
        functions=FUNCTIONS,
        function_call=FUNCTION_CALL,
    )
)
test_df.head()

Original Token count: 30 Limited Token count: 30
Text: That's exactly what I was looking for to bake some chouquettes (puffs with small sugar pearls on top). Works great, yum!!
Diagnostic: {
  "role": "assistant",
  "content": "CLUES: \n- \"exactly what I was looking for\" indicates satisfaction\n- \"to bake some chouquettes\" indicates the purpose of the message\n- \"puffs with small sugar pearls on top\" describes the type of chouquettes\n- \"works great\" indicates success in using the product\n- \"yum!!\" indicates enjoyment of the outcome\n\nREASONING: The user is expressing satisfaction and success in finding a product that helped them bake chouquettes with small sugar pearls on top.\n\nCATEGORY: Positive feedback"
}
Categories: ['Positive feedback']
Original Token count: 154 Limited Token count: 154
Text: My Mom who is in now New York, in an assisted living facility love's this ceral.  She was buying Kellogg's Mueslix when she lived in Florida, but when I went to buy it for her 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,categories
354,355,B001LMNXFA,A226DRVTNFWM28,"Proud Mom of Two ""Bigounets""",2,2,5,1291680000,Perfect!,That's exactly what I was looking for to bake ...,[Positive feedback]
1036,1037,B001E6KBSK,AUCMFAJOBT5CK,Mom's ceral,0,2,5,1314921600,My Mom's favorite ceral,"My Mom who is in now New York, in an assisted ...",[Positive feedback/testimonial]
2087,2088,B005HGAVGK,AXDYJ9U7TD7A3,very disappointed,1,1,1,1334534400,Not Summer Items!,"I do not consider Gingerbread, Spicy Eggnog, C...",[Complaint]
1844,1845,B001RVFDOO,AUBGRWIAQCELR,Wings,1,1,3,1313366400,I only like the plain ones,This seemed like the perfect way to limit my a...,"[Feedback, Inquiry]"
3275,3276,B005K4Q1VI,A1NTSZ3ZD4R5B7,TBmessick,0,0,4,1331424000,Delicious cocolate,This is delicious hot chocolate. It is best at...,"[Positive feedback, Recipe suggestion]"


In [7]:
# Flatten the per-review category lists into one label per row, then tally
# how often each label occurs.
categories = (
    test_df["categories"]
    .explode()
    .value_counts()
)
categories

categories
Positive feedback                2
Positive feedback/testimonial    1
Complaint                        1
Feedback                         1
Inquiry                          1
Recipe suggestion                1
Name: count, dtype: int64

In [8]:
# Inspect the accumulated conversation log: prompts, model replies, and the
# token usage attached to each function-call reply.
allmsg

[{'role': 'system',
  'content': 'You are a an expert in going through customer messages and categorize them for an ecommerce website.\nYour responsibility is to follow the following steps without any preamble or further questions.\nFirst, list CLUES that will help you understand the sentiment of the INPUT message (i.e., keywords, phrases, contextual information, semantic relations, semantic meaning, tones, references) that support the intent of the INPUT.\nSecond, deduce the diagnostic REASONING process from premises (i.e., CLUES, INPUTS) to determine what the user is actually asking.\nThird, come up with a set of generic categories that best fit the INPUT message, e.g., feedback, review, complain, inquiry, etc.\n'},
 {'role': 'user',
  'content': "That's exactly what I was looking for to bake some chouquettes (puffs with small sugar pearls on top). Works great, yum!!"},
 <OpenAIObject at 0x7f5c7f0cc400> JSON: {
   "role": "assistant",
   "content": "CLUES: \n- \"exactly what I was lo

In [9]:
# Disabled debugging aid: replays the logged conversation and asks the model to
# explain why it returned nothing.
# NOTE(review): the filter below drops a key named 'usage', but chat_request
# stores 'usage_thinking' and 'usage_function' — confirm the key names before re-enabling.
# custom_msg = [{k: v for k, v in msg.items() if k != 'usage'} for msg in allmsg]
# custom_msg.append({'role':'user', 'content':'Ignoring the function_call, explain why did you return nothing, how can it be avoided and what additional information do you need'})

# response = openai.ChatCompletion.create(
#     model=GPT_MODEL,
#     messages=custom_msg,
#     temperature=0
# )
# response.choices[0].message