In [2]:
import pandas as pd
from dotenv import load_dotenv
import os

import time
import json


from groq import Groq

load_dotenv()

test = pd.read_parquet("data/test_10k_5.parquet")
sample_text = test.text.to_list()


In [3]:
class PromptGenerator:
    def __init__(self, few_shot, cot, binary=False):
        self.few_shot = few_shot
        self.cot = cot
        self.binary = binary

    def generate_general_instruction(self, batch_size):
        if self.binary:
            sentiment_scale = """3.  **Sentiment Scale:** 0 = Negative and 1 =  Positive."""
        else:
            sentiment_scale = """3.  **Sentiment Scale:** Use a 5-point star rating (0 = Very Negative, 4 = Very Positive)."""
        
        general_instruction = f"""
            Analyze the sentiment for the {batch_size} Amazon product reviews provided below.
            The unique index for each review is provided in the '<review id="...">' tag.

            # --- INSTRUCTIONS & CONSTRAINTS ---
            1.  **Strict Output:** Your final output MUST be a single, valid JSON object containing a 'reviews' array.
            2.  **Indexing:** The 'index' field in your JSON output MUST correspond exactly to the 'id' extracted from the <review id="..."> tag.
            {sentiment_scale}
            4.  **No Explanation:** Do NOT include any introductory text, explanation, your thought process, or any Markdown fences (like ```json or ```) outside of the required JSON object.

        """
        
        return general_instruction
    
    def generate_cot_instruction(self):
        if self.binary:
            scale = """4. Assign the final sentiment rating (0 or 1)."""
        else:
            scale = """4. Assign the final sentiment rating (0, 1, 2, 3, or 4)."""
            
        cot_instruction = f"""
            # --- CHAIN OF THOUGHT (CoT) PROCESS ---
            For each review, you MUST perform a Chain-of-Thought process and enclose it in a <CoT> XML tag. This process helps ensure accuracy. Your reasoning must follow these steps:
            <CoT>
            1. Identify the main sentiment/emotion (e.g., happiness, frustration, disappointment).
            2. List specific positive aspects (+ve) and negative aspects (-ve) mentioned in the review.
            3. Evaluate the overall net sentiment, giving appropriate weight to pros and cons.
            {scale}
            </CoT>
            
            You MUST include this <CoT> reasoning for each review in your response.
            """
        
        return cot_instruction

    def generate_few_shot_examples(self):
        if self.binary:
            few_shot_examples = [
                # --- Example 1 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'1\\'>So glad I could get my deodorant online at Amazon. This has a great scent too.</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '1', "sentiment_rating": "1"}}""",
                },
                # --- Example 2 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'5\\'>It is not organic , it's made in china, left my hair dry ... returning .</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '5', "sentiment_rating": "0"}}""",
                },
                # --- End of Few-Shot Examples ---
            ]
        else:
            few_shot_examples = [
                # --- Example 1 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'1\\'>So glad I could get my deodorant online at Amazon. This has a great scent too.</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '1', "sentiment_rating": "4"}}""",
                },
                # --- Example 2 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'2\\'>extremely metallic, two coats does the trick. however, the chemical smell is EXTREMELY strong. you need to open a window and run a fan while applying.</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '2', "sentiment_rating": "3"}}""",
                },
                # --- Example 3 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'3\\'>Very, very thin,, not to absorbent</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '3', "sentiment_rating": "2"}}""",
                },
                # --- Example 4 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'4\\'>Relatively short and not good for kinky hair.</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '4', "sentiment_rating": "1"}}""",
                },
                # --- Example 5 ---
                {
                    "role":"user",
                    "content":"""<review id=\\'5\\'>It is not organic , it's made in china, left my hair dry ... returning .</review>""",
                },
                {
                    "role":"assistant",
                    "content":"""{{"index": '5', "sentiment_rating": "0"}}""",
                },
                # --- End of Few-Shot Examples ---
            ]
        return few_shot_examples


    def generate_final_instruction(self, batch, text_batch):
        final_instruction = f"""
                --- REVIEWS START ---
                {text_batch}
                --- REVIEWS END ---
            """
        return final_instruction

    def gen_guery(self, batch_size, text_batch):
        general_instruction = self.generate_general_instruction(batch_size)
        
        cot_instruction = ''
        if self.cot:
            cot_instruction = self.generate_cot_instruction()
        
        final_instruction = self.generate_final_instruction(batch_size, text_batch)

        if self.few_shot:
            instructions_query = [
                {
                    "role":"user",
                    "content":general_instruction + cot_instruction,
                }
            ]
            few_shot_examples = self.generate_few_shot_examples()
            review_query = [
                {
                    "role":"user",
                    "content":final_instruction,
                }
            ]
            return instructions_query + few_shot_examples + review_query
        else:
            return [
                {
                    "role":"user",
                    "content":general_instruction + cot_instruction + final_instruction,
                }
            ]
        
    def generate_output_schema(self):
        if self.binary:
            sentiment_enum = ["0", "1"]
        else:
            sentiment_enum = ["0", "1", "2", "3", "4"]

        response_format = {
            'type': "json_schema",
            'json_schema': {
                'name': "product_review",
                'schema': {
                    'type': "object",
                    'properties': {
                    'index': { 'type': "string" },
                    'sentiment_rating': { 
                        'type': "string",
                        'enum': sentiment_enum
                        },
                    },
                'required': ["index", "sentiment_rating"],
                'additionalProperties': False
                }
            }
        }
        return response_format
    
    def generate_system_query(self):
        if self.binary:
            content = "2.  **Content:** For each review, provide the sentiment as a string representation of an integer: either 0 (negative) or 1 (positive)."
        else:
            content = "2.  **Content:** For each review, provide the sentiment as a string representation of an integer from 0 (very negative) to 4 (very positive)."
        system_instruction = f"""
            You are an expert sentiment analyst for Amazon product reviews. Your task is to process a batch of reviews and output the results as a single JSON object.
            1.  **Indexing:** The 'reviews' array MUST contain the same number of items as the input reviews, and each item's 'index' MUST correspond exactly to the review's sequential position.
            {content}
            3.  **No Explanation:** DO NOT include any introductory text, explanation, or any Markdown fences (like ```json or ```) outside of the required JSON object.
            """
        
        system_query = [{"role": "system", "content": system_instruction}]
        return system_query

def predict_sentiments_groq(sample_text, chunk_size, model, few_shot=False, cot=False, binary=False, response_format=True):
    client_groq = Groq()
    all_predictions = []
    responses = []
    for i in range(0, len(sample_text), chunk_size):
        # Get a slice of the reviews
        batch = sample_text[i:i + chunk_size]
        text_batch = ''
        for ind, text in enumerate(batch):
            # Use a clear XML tag for each review and its index
            text_batch += f"<review id='{i + ind}'>{text}</review>\n"
        # try to generate content if error occurs wait and retry
        prompt_generator = PromptGenerator(few_shot=few_shot, cot=cot, binary=binary)
        query = prompt_generator.gen_guery(batch_size=len(batch), text_batch=text_batch)
        system_query = prompt_generator.generate_system_query()
        if response_format:
            response_format = prompt_generator.generate_output_schema()
        messeages=system_query + query
        try_count = 0
        while try_count < 10:
            try:
                try_count += 1
                if response_format:
                    chat_completion = client_groq.chat.completions.create(
                        messages=messeages,
                        response_format=response_format,
                        model=model,
                    )
                else:
                    chat_completion = client_groq.chat.completions.create(
                        messages=messeages,
                        model=model,
                    )
                break  # Exit the retry loop if successful
            except Exception as e:
                print(f"Error occurred: {e}. Retrying in 60 seconds...")
                time.sleep(60)   

        # print(chat_completion.choices[0].message.content)
        content = chat_completion.choices[0].message.content
        responses.append(content)
        
        try:
            if response_format:
                response_data = json.loads(content)
            else:
                response_data = json.loads(content.split('</think>')[-1].replace('```json', '').replace('```', '').strip())
            # response_data = json.loads(c)
            # response_data = response_data['reviews']
            key = list(response_data.keys())[0]
            response_data = response_data[key]
            print(len(response_data))
            
            all_predictions.extend(response_data)

        except Exception as e:
            print(chat_completion.choices[0].message.content)
            print(f"Error parsing response: {e}")
            continue
        
        time.sleep(60)  # To avoid rate limiting

    return all_predictions, responses

## llama-4-maverick-17b-128e

In [18]:
test_llama = pd.read_csv("results/test_10k_5_with_llm_llama_4_128e_preds.csv")


In [20]:
from groq import Groq

client_groq = Groq()
model = "meta-llama/llama-4-maverick-17b-128e-instruct"
chunk_size = 50

all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=False,
                                         cot=False,
                                         binary=False) 

50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50


In [22]:
test_llama['pred_0s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test_llama['pred_0s'] == test_llama['rating']).sum()/len(test_llama)

np.float64(0.755)

In [53]:
model = "meta-llama/llama-4-maverick-17b-128e-instruct"
chunk_size = 60
all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                          chunk_size=chunk_size, 
                                          model=model, 
                                          few_shot=True, 
                                          cot=False, 
                                          binary=False)

test['pred_5s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test['pred_5s'] == test['rating']).sum()/len(test)

60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
40


np.float64(0.746)

In [None]:
model = "meta-llama/llama-4-maverick-17b-128e-instruct"
chunk_size = 60
all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                          chunk_size=chunk_size, 
                                          model=model, 
                                          few_shot=False, 
                                          cot=True, 
                                          binary=False)

test['pred_cot'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test['pred_cot'] == test['rating']).sum()/len(test)

np.float64(0.739)

In [72]:
model = "meta-llama/llama-4-maverick-17b-128e-instruct"
chunk_size = 60
all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                          chunk_size=chunk_size, 
                                          model=model, 
                                          few_shot=True, 
                                          cot=True, 
                                          binary=False)
test['pred_cot_5s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test['pred_cot_5s'] == test['rating']).sum()/len(test)

60
60
60
60
60
60
60
60
60
60
60
60
60
Error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-maverick-17b-128e-instruct` in organization `org_01kahypfxmfw0sea725ejt1d2j` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 498956, Requested 3366. Please try again in 6m41.241599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}. Retrying in 60 seconds...
Error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-maverick-17b-128e-instruct` in organization `org_01kahypfxmfw0sea725ejt1d2j` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 498608, Requested 3366. Please try again in 5m41.1072s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}. Retrying in 60 seconds...
Erro

np.float64(0.738)

In [23]:
test_llama.to_csv("results/test_10k_5_with_llm_llama_4_128e_preds.csv")

# GPT

In [5]:
## GPT-o5
from groq import Groq

client_groq = Groq()
model = "openai/gpt-oss-120b"
chunk_size = 50

all_predictions = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=False,
                                         cot=False,
                                         binary=False) 

50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50


In [10]:
test['pred_gpt_0s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test['pred_gpt_0s'] == test['rating']).sum()/len(test)

np.float64(0.712)

In [11]:
test.to_csv("results/test_10k_5_with_llm_gpt_oss_120b_preds.csv")

In [None]:
## GPT-o5
from groq import Groq

client_groq = Groq()
model = "openai/gpt-oss-120b"
chunk_size = 50

all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=True,
                                         cot=False,
                                         binary=False) 
test['pred_gpt_5s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test['pred_gpt_5s'] == test['rating']).sum()/len(test)

np.float64(0.704)

In [None]:
# test.to_csv("results/test_10k_5_with_llm_gpt_oss_120b_preds.csv")
test_gpt = pd.read_csv("results/test_10k_5_with_llm_gpt_oss_120b_preds.csv")

Unnamed: 0.1,Unnamed: 0,rating,text,text_cleaned,input_ids,attention_mask,pred_gpt_0s,pred_gpt_5s
0,0,4.0,Used to freshen up linens,Used to freshen up linens,[ 0 47640 7 21862 2457 62 24248 12...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,3,3
1,1,0.0,need more proof:(to order!!!!,need more proof:(to order!!!!,[ 0 30484 55 6461 48329 560 645 323...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1,1
2,2,4.0,"This is a good flat iron, it has different tem...","This is a good flat iron, it has different tem...",[ 0 713 16 10 205 3269 6440 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,4
3,3,4.0,This is the best gel I’ve tried. Super hold w...,This is the best gel I’ve tried. Super hold wi...,[ 0 713 16 5 275 17916 38 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,4
4,4,4.0,It comes with 1 set of the 4 smaller sizes and...,It comes with 1 set of the 4 smaller sizes and...,[ 0 243 606 19 112 278 9 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,3,4
...,...,...,...,...,...,...,...,...
995,995,4.0,Very effective. Can see good change :),Very effective. Can see good change :),[ 0 25101 2375 4 2615 192 205 4...,[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,4
996,996,4.0,This product works! The treatment works on sh...,This product works! The treatment works on she...,[ 0 713 1152 1364 328 20 1416 13...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,4
997,997,4.0,"Great price. Great product, and very fast ship...","Great price. Great product, and very fast ship...",[ 0 19065 425 4 2860 1152 6 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0...,4,4
998,998,3.0,I actually liked the hair. It did shed a littl...,I actually liked the hair. It did shed a littl...,[ 0 100 888 6640 5 2549 4 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,3,3


In [None]:
# all_predictions, all_responses = all_predictions[0], all_predictions[1]

In [10]:
pd.DataFrame(all_predictions)

Unnamed: 0,index,sentiment_rating
0,0,3
1,1,1
2,2,4
3,3,4
4,4,3
...,...,...
995,995,4
996,996,4
997,997,4
998,998,3


In [None]:
# ## GPT-o5
# from groq import Groq

client_groq = Groq()
model = "openai/gpt-oss-120b"
chunk_size = 50

all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=False,
                                         cot=True,
                                         binary=False) 
test_gpt['pred_gpt_cot'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
print((test_gpt['pred_gpt_cot'] == test_gpt['rating']).sum()/len(test_gpt))
test_gpt.to_csv("results/test_10k_5_with_llm_gpt_oss_120b_preds.csv", index=False)

0.713


In [16]:
## GPT-o5
from groq import Groq

client_groq = Groq()
model = "openai/gpt-oss-120b"
chunk_size = 50

all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=True,
                                         cot=True,
                                         binary=False) 

test_gpt['pred_gpt_cot_5s'] = pd.DataFrame(all_predictions)['sentiment_rating'].astype(int)
(test_gpt['pred_gpt_cot_5s'] == test_gpt['rating']).sum()/len(test_gpt)
test_gpt.to_csv("results/test_10k_5_with_llm_gpt_oss_120b_preds.csv", index=False)

50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50


In [17]:
(test_gpt['pred_gpt_cot_5s'] == test_gpt['rating']).sum()/len(test_gpt)

np.float64(0.717)

## qwen/qwen3-32b

In [24]:
test_qwen = pd.read_parquet("data/test_10k_5.parquet")
sample_text_qwen = test_qwen.text.to_list()


In [27]:
model = "qwen/qwen3-32b"
chunk_size = 60

# all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text_qwen, 
#                                          chunk_size=chunk_size, 
#                                          model=model,
#                                          few_shot=False,
#                                          cot=False,
#                                          binary=False,
#                                          response_format=False) 
all_pred = pd.DataFrame()
for i in all_responses:
    content = i.split('</think>')[-1].replace('```json', '').replace('```', '').strip()
    json_content = json.loads(content)
    try:
        df = pd.DataFrame(json_content['reviews'])
    except Exception as e:
        df = pd.DataFrame(json_content)
        # break
    all_pred = pd.concat([all_pred, df], ignore_index=True)

test_qwen['pred_qwen3_0s'] = pd.DataFrame(all_pred)['sentiment'].astype(int)
(test_qwen['pred_qwen3_0s'] == test_qwen['rating']).sum()/len(test_qwen)

np.float64(0.626)

In [21]:
import numpy as np

In [18]:
# test_qwen.to_csv("results/test_10k_5_with_qwen_preds.csv")
test_qwen = pd.read_csv("results/test_10k_5_with_qwen_preds.csv")

In [22]:
pd.DataFrame(all_predictions)

Unnamed: 0,index,sentiment_rating
0,0,2
1,1,1
2,2,4
3,3,4
4,4,3
...,...,...
995,995,3
996,996,3
997,997,4
998,998,2


In [23]:
model = "qwen/qwen3-32b"
chunk_size = 60

# all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
#                                          chunk_size=chunk_size, 
#                                          model=model,
#                                          few_shot=True,
#                                          cot=False,
#                                          binary=False,
#                                          response_format=False) 
all_pred = pd.DataFrame()
for i in all_responses:
    content = i.split('</think>')[-1].replace('```json', '').replace('```', '').strip()
    json_content = json.loads(content)
    try:
        df = pd.DataFrame(json_content['reviews'])
    except Exception as e:
        df = pd.DataFrame(json_content)
        # break
    all_pred = pd.concat([all_pred, df], ignore_index=True)

df_temp = pd.DataFrame(all_predictions)
test_qwen['pred_qwen3_5s'] = df_temp['sentiment_rating'].astype(int)
(test_qwen['pred_qwen3_5s'] == test_qwen['rating']).sum()/len(test_qwen)

np.float64(0.663)

In [24]:
test_qwen

Unnamed: 0.1,Unnamed: 0,rating,text,text_cleaned,input_ids,attention_mask,pred_qwen3_0s,pred_qwen3_5s
0,0,4.0,Used to freshen up linens,Used to freshen up linens,[ 0 47640 7 21862 2457 62 24248 12...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2,2
1,1,0.0,need more proof:(to order!!!!,need more proof:(to order!!!!,[ 0 30484 55 6461 48329 560 645 323...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1,1
2,2,4.0,"This is a good flat iron, it has different tem...","This is a good flat iron, it has different tem...",[ 0 713 16 10 205 3269 6440 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,3,4
3,3,4.0,This is the best gel I’ve tried. Super hold w...,This is the best gel I’ve tried. Super hold wi...,[ 0 713 16 5 275 17916 38 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,4
4,4,4.0,It comes with 1 set of the 4 smaller sizes and...,It comes with 1 set of the 4 smaller sizes and...,[ 0 243 606 19 112 278 9 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,2,3
...,...,...,...,...,...,...,...,...
995,995,4.0,Very effective. Can see good change :),Very effective. Can see good change :),[ 0 25101 2375 4 2615 192 205 4...,[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0...,3,3
996,996,4.0,This product works! The treatment works on sh...,This product works! The treatment works on she...,[ 0 713 1152 1364 328 20 1416 13...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,3
997,997,4.0,"Great price. Great product, and very fast ship...","Great price. Great product, and very fast ship...",[ 0 19065 425 4 2860 1152 6 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0...,4,4
998,998,3.0,I actually liked the hair. It did shed a littl...,I actually liked the hair. It did shed a littl...,[ 0 100 888 6640 5 2549 4 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,2,2


In [25]:
test_qwen.to_csv("results/test_10k_5_with_qwen_preds.csv", index=False)

In [26]:
model = "qwen/qwen3-32b"
chunk_size = 60

all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
                                         chunk_size=chunk_size, 
                                         model=model,
                                         few_shot=False,
                                         cot=True,
                                         binary=False,
                                         response_format=False) 
all_pred = pd.DataFrame()
for i in all_responses:
    content = i.split('</think>')[-1].replace('```json', '').replace('```', '').strip()
    json_content = json.loads(content)
    try:
        df = pd.DataFrame(json_content['reviews'])
    except Exception as e:
        df = pd.DataFrame(json_content)
        # break
    all_pred = pd.concat([all_pred, df], ignore_index=True)

test_qwen['pred_qwen3_cot'] = pd.DataFrame(all_predictions)['sentiment'].astype(int)
(test_qwen['pred_qwen3_cot'] == test_qwen['rating']).sum()/len(test_qwen)

Error occurred: Connection error.. Retrying in 60 seconds...
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
40


np.float64(0.654)

In [27]:
test_qwen

Unnamed: 0.1,Unnamed: 0,rating,text,text_cleaned,input_ids,attention_mask,pred_qwen3_0s,pred_qwen3_5s,pred_qwen3_cot
0,0,4.0,Used to freshen up linens,Used to freshen up linens,[ 0 47640 7 21862 2457 62 24248 12...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2,2,3
1,1,0.0,need more proof:(to order!!!!,need more proof:(to order!!!!,[ 0 30484 55 6461 48329 560 645 323...,[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1,1,1
2,2,4.0,"This is a good flat iron, it has different tem...","This is a good flat iron, it has different tem...",[ 0 713 16 10 205 3269 6440 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,3,4,4
3,3,4.0,This is the best gel I’ve tried. Super hold w...,This is the best gel I’ve tried. Super hold wi...,[ 0 713 16 5 275 17916 38 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,4,4
4,4,4.0,It comes with 1 set of the 4 smaller sizes and...,It comes with 1 set of the 4 smaller sizes and...,[ 0 243 606 19 112 278 9 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,2,3,3
...,...,...,...,...,...,...,...,...,...
995,995,4.0,Very effective. Can see good change :),Very effective. Can see good change :),[ 0 25101 2375 4 2615 192 205 4...,[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0...,3,3,4
996,996,4.0,This product works! The treatment works on sh...,This product works! The treatment works on she...,[ 0 713 1152 1364 328 20 1416 13...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,4,3,3
997,997,4.0,"Great price. Great product, and very fast ship...","Great price. Great product, and very fast ship...",[ 0 19065 425 4 2860 1152 6 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0...,4,4,4
998,998,3.0,I actually liked the hair. It did shed a littl...,I actually liked the hair. It did shed a littl...,[ 0 100 888 6640 5 2549 4 ...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,2,2,3


In [28]:
test_qwen.to_csv("results/test_10k_5_with_qwen_preds.csv", index=False)

In [34]:
model = "qwen/qwen3-32b"
chunk_size = 60

# all_predictions, all_responses = predict_sentiments_groq(sample_text=sample_text, 
#                                          chunk_size=chunk_size, 
#                                          model=model,
#                                          few_shot=True,
#                                          cot=True,
#                                          binary=False,
#                                          response_format=False) 
all_pred = pd.DataFrame()
for i in all_responses:
    content = i.split('</think>')[-1].replace('```json', '').replace('```', '').strip()
    json_content = json.loads(content)
    try:
        df = pd.DataFrame(json_content['reviews'])
    except Exception as e:
        df = pd.DataFrame(json_content)
        # break
    all_pred = pd.concat([all_pred, df], ignore_index=True)
all_pred['sentiment_rating'] = np.where(all_pred['sentiment_rating'].notna(),
                                       all_pred['sentiment_rating'],
                                        all_pred['sentiment_rate'])

test_qwen['pred_qwen3_cot_5s'] = all_pred['sentiment_rating'].astype(int)
(test_qwen['pred_qwen3_cot_5s'] == test_qwen['rating']).sum()/len(test_qwen)

np.float64(0.684)

In [35]:
test_qwen.to_csv("results/test_10k_5_with_qwen_preds.csv", index=False)

In [31]:
pd.DataFrame(all_predictions)

Unnamed: 0,index,sentiment_rating,sentiment_rate
0,0,2,
1,1,1,
2,2,4,
3,3,4,
4,4,3,
...,...,...,...
995,995,4,
996,996,4,
997,997,4,
998,998,3,
