# Classification Example
Classify text from customer reviews into categories

Reference : https://www.kaggle.com/code/thedrcat/using-llms-to-extract-structured-data/notebook

In [1]:
import pandas as pd
import tiktoken, json, openai, os

from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

## Configurables

In [2]:
GPT_MODEL = "gpt-3.5-turbo-0613"
MAX_TOKENS = 1000
DATA_FILE = "./data/fine_food_reviews_1k.csv"
TEXT_COLUMN = "Text"

## Init static variables

In [3]:
ENCODING = tiktoken.encoding_for_model(GPT_MODEL)
DF = pd.read_csv(DATA_FILE, index_col=False)

functions  = [
    {
        'name': 'CategoryList',
        'description': 'Save the list of categories',
        'parameters': {
            'type': 'object',
            'properties': {
                'categories': {
                    'type': 'array', 
                    'description': 'List of categories', 
                    'items': {'type': 'string'}
                }
            },
            'required': ['categories']
        }
    }
]
function_call = {"name": "CategoryList"}

system_prompt = """You are a data entry staff going through customer messages for an ecommerce website.
Your responsibility is to come up with a set of categories that best fit the messages, e.g., feedback, review, complain, etc.
Review the text provided by user, and identify the top 3 categories.
Call the CategoryList function to save a list of categories that you found in lowercase."""

## Init utility functions

In [4]:
def count_tokens(_text):
    return len(ENCODING.encode(_text))

def limit_tokens(_text):
    tokens = ENCODING.encode(_text)
    return ENCODING.decode(tokens[:MAX_TOKENS])

In [6]:
allmsg = []

def chat_request(system_prompt, method_description, functions=None, function_call=None, model=GPT_MODEL):

    short_prompt = limit_tokens(method_description) if count_tokens(method_description) > MAX_TOKENS else method_description
    print("Original Token count:", count_tokens(method_description),"Limited Token count:", count_tokens(short_prompt))

    global allmsg
    messages = []
    messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": short_prompt})
    allmsg = allmsg + messages

    # Call API
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        functions=functions,
        function_call=function_call,
        temperature=0
    )
    message = response.choices[0].message

    # Check function call
    if message.get("function_call"):
        function_args = response.choices[0].message.function_call.arguments
        print(json.loads(function_args).get("categories"))
        categories = json.loads(function_args).get("categories")
    else:
        categories = None

    # Save msg
    message["usage"] = response.usage
    allmsg.append(message)

    return categories

# Testings

In [7]:
test_df = DF.sample(n=5)
test_df['categories'] = test_df[TEXT_COLUMN].apply(lambda x: chat_request(system_prompt, x, functions=functions, function_call=function_call))
test_df.head()

Original Token count: 69 Limited Token count: 69


['complaint', 'feedback', 'pricing']
Original Token count: 92 Limited Token count: 92
['Feedback', 'Complaint', 'Review']
Original Token count: 101 Limited Token count: 101
['feedback', 'review', 'product']
Original Token count: 52 Limited Token count: 52
['feedback', 'review', 'complaint']
Original Token count: 103 Limited Token count: 103
['feedback', 'review', 'recommendation']


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,categories
2829,2830,B0085G4ACA,A3NUID8RJCDDR9,"beechew ""beechew""",3,4,3,1341792000,Um package change from 17oz to 16.9oz??,I subscribe to this monthly but just got an em...,"[complaint, feedback, pricing]"
4118,4119,B001EW5YQS,AYL7IM9DEY22A,EZ Web Man,0,0,4,1339632000,"New, inexpensive fix for over priced XLR to US...",Going from the $600 Digidesign Mbox to this $1...,"[Feedback, Complaint, Review]"
4042,4043,B000CMHMUC,AFFJQGNCDIZG3,Mr.MurrayKatz,0,0,5,1318809600,"Crunchy, no-guilt snack",I am a huge fan of Genisoy Soy Crisps (Deep Se...,"[feedback, review, product]"
4548,4549,B002TMV3E4,A3BN3TMY1S9F7C,"B. Edwards ""njgam""",0,2,5,1284422400,decaf coffee,Very smooth and I used my own filters. The sme...,"[feedback, review, complaint]"
4169,4170,B004391DK0,A18EX8YOL07UXG,dreamerlrz,0,0,5,1337731200,Yea for GF!,"When I found GF Bisquik in the stores, I was h...","[feedback, review, recommendation]"


In [8]:
test_df.iloc[1]

Id                                                                     4119
ProductId                                                        B001EW5YQS
UserId                                                        AYL7IM9DEY22A
ProfileName                                                      EZ Web Man
HelpfulnessNumerator                                                      0
HelpfulnessDenominator                                                    0
Score                                                                     4
Time                                                             1339632000
Summary                   New, inexpensive fix for over priced XLR to US...
Text                      Going from the $600 Digidesign Mbox to this $1...
categories                                    [Feedback, Complaint, Review]
Name: 4118, dtype: object

In [9]:
categories = test_df.categories.explode().value_counts()
categories

categories
feedback          4
review            3
complaint         2
pricing           1
Feedback          1
Complaint         1
Review            1
product           1
recommendation    1
Name: count, dtype: int64

In [10]:
allmsg

[{'role': 'system',
  'content': 'You are a data entry staff going through customer messages for an ecommerce website.\nYour responsibility is to come up with a set of categories that best fit the messages, e.g., feedback, review, complain, etc.\nReview the text provided by user, and identify the top 3 categories.\nCall the CategoryList function to save a list of categories that you found in lowercase.'},
 {'role': 'user',
  'content': "I subscribe to this monthly but just got an email stating that it's changing from 17 oz. to 16.9 oz. - but no price change?  WOW that's a really lame cut to increase their profit margins.  Whatever I won't miss the .1 oz. obviously but how about messaging and acknowledging your devoted consumers."},
 <OpenAIObject at 0x7f5913adaf20> JSON: {
   "role": "assistant",
   "content": null,
   "function_call": {
     "name": "CategoryList",
     "arguments": "{\n  \"categories\": [\"complaint\", \"feedback\", \"pricing\"]\n}"
   },
   "usage": {
     "prompt_t