### Working Environment

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
!ls

/content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
amazon_alexa.tsv  amazon_f_handson.ipynb


### Import Dataset

In [1]:
import pandas as pd

data = pd.read_csv('/content/amazon_alexa.tsv', sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [2]:
mydata = data[['verified_reviews','feedback']]
mydata.columns = ['review','label']

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [3]:
mydata.value_counts('label')

label
1    2893
0     257
Name: count, dtype: int64

In [4]:
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Get the number of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly
if rows_to_drop > 0:
   data_majority = mydata[mydata["label"] == 1]
   data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
   data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

label
1    257
0    257
Name: count, dtype: int64


## Data Preprocessing

In [5]:
import re

def clean_text(text):
  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Remove HTML tags
  text = re.sub(r"<[^>]*>", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  return text

In [6]:
import numpy as np

# Replace NaN values with empty strings
data_balanced['review'].fillna('', inplace=True)

# Extract the review column as a list
reviews = data_balanced['review'].tolist()

# Clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# Add the cleaned reviews as a new column to the DataFrame
data_balanced['clean_reviews'] = cleaned_reviews


In [7]:
data_balanced

Unnamed: 0,review,label,clean_reviews
8,looks great,1,looks great
10,"I sent it to my 85 year old Dad, and he talks ...",1,sent it to my 85 year old dad and he talks to ...
12,I purchased this for my mother who is having k...,1,purchased this for my mother who is having kne...
19,I liked the original Echo. This is the same bu...,1,liked the original echo this is the same but s...
28,Fun item to play with and get used to using. ...,1,fun item to play with and get used to using so...
...,...,...,...
3097,I have a friend now.,1,have friend now
3116,I enjoy it. Still discovering new uses.,1,enjoy it still discovering new uses
3119,It's pretty fun and nice to have a speaker tha...,1,it pretty fun and nice to have speaker that re...
3138,Este producto llegó y a la semana se quedó sin...,1,este producto llegó la semana se quedó sin olo...


## Data Split

In [8]:
import pandas as pd

# Assuming your DataFrame is called "df"
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

# Get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

## Sentiment w/ LLM

### Setting up Gemini API

In [9]:
!pip install -q -U google-generativeai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/142.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.1/142.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/663.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m512.0/663.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m663.6/663.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Used to securely store your API key
from google.colab import userdata

In [14]:
!pip install genai



In [20]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [21]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [22]:
model = genai.GenerativeModel('gemini-pro')

In [23]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 50.4 ms, sys: 9.2 ms, total: 59.6 ms
Wall time: 7.65 s


> The meaning of life is a subject that has been puzzled by philosophers, religious leaders, and individuals throughout history. Over time, various perspectives emerged:
> 
> **1. Objective Meaning:**
> 
> * Some believe that life has an inherent objective meaning or purpose given by a higher power or the universe, such as:
>     * Fulfilling a divine plan or destiny
>     * Contributing to the greater good of humanity
>     * Advancing knowledge and progress
> 
> **2. Subjective Meaning:**
> 
> * Others assert that the meaning of life is subjective and depends on the individual. It can be found through:
>     * Personal values, passions, and goals
>     * Relationships with others
>     * Experiences and accomplishments that bring fulfillment
> 
> **3. Existential Viewpoints:**
> 
> * Existentialists emphasize the absurdity of life and the responsibility of individuals to create their own meaning.
>     * Absurdism: Recognizes the inherent meaninglessness but advocates for embracing life's contradictions and seeking purpose in spite of them.
>     * Existentialism: Stresses personal freedom, authenticity, and responsibility for defining one's own existence.
> 
> **4. Scientific Theories:**
> 
> * Some scientific theories propose that life is:
>     * A product of random evolution, without inherent meaning
>     * A complex system that emerged through natural selection, driven by the desire to survive and reproduce
>     * A self-organized process that seeks order and complexity
> 
> **5. Cultural and Social Factors:**
> 
> * Cultural and societal norms play a significant role in shaping our understanding of the meaning of life.
>     * Traditions, beliefs, and values can provide individuals with a framework for finding purpose and direction.
>     * Social interactions and community engagement can also contribute to a sense of belonging and purpose.
> 
> **Personal Perspectives:**
> 
> Ultimately, the meaning of life is a deeply personal question. There is no single, universally agreed-upon answer. It is a journey of self-discovery, exploration, and reflection. Through experiences, relationships, and a search for purpose, individuals can find meaning that is both unique and fulfilling to them.

#### Single API Call

In [24]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
524,,0,,
2183,awesome device,1,awesome device,
2964,Ask it to play Motown radio on Pandora and it ...,0,ask it to play motown radio on pandora and it ...,
787,Good sound works well,1,good sound works well,
29,Just like the other one,1,just like the other one,
1173,Love these!,1,love these,
2975,"Been debating on getting one, but am happy I d...",1,been debating on getting one but am happy did ...,
741,"It's like Siri, in fact, Siri answers more acc...",0,it like siri in fact siri answers more accurat...,
1112,,0,,
3097,I have a friend now.,1,have friend now,


In [25]:
# Convert the DataFrame to JSON using the to_json() method

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

# Print the JSON data
print(json_data)

[{"clean_reviews":"","pred_label":""},{"clean_reviews":"awesome device","pred_label":""},{"clean_reviews":"ask it to play motown radio on pandora and it keeps asking if want to add salsa station motown isn close to salsa phonetically","pred_label":""},{"clean_reviews":"good sound works well","pred_label":""},{"clean_reviews":"just like the other one","pred_label":""},{"clean_reviews":"love these","pred_label":""},{"clean_reviews":"been debating on getting one but am happy did still more to explore thanks","pred_label":""},{"clean_reviews":"it like siri in fact siri answers more accurately then alexa don see real need for it in my household though it was good bargain on prime day deals","pred_label":""},{"clean_reviews":"","pred_label":""},{"clean_reviews":"have friend now","pred_label":""},{"clean_reviews":"seems to work ok but no youtube tv really can believe they would ban an app that so many people want to use really wanted to like it but when couldn install that particular app it b

In [26]:
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"","pred_label":""},{"clean_reviews":"awesome device","pred_label":""},{"clean_reviews":"ask it to play motown radio on pandora and it keeps asking if want to add salsa station motown isn close to salsa phonetically","pred_label":""},{"clean_reviews":"good sound works well","pred_label":""},{"clean_reviews":"just like the other one","pred_label":""},{"clean_reviews":"love these","pred_label":""},{"clean_reviews":"been debating on getting one but am happy did still more to explore thanks","

In [27]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"","pred_label":1},{"clean_reviews":"awesome device","pred_label":1},{"clean_reviews":"ask it to play motown radio on pandora and it keeps asking if want to add salsa station motown isn close to salsa phonetically","pred_label":0},{"clean_reviews":"good sound works well","pred_label":1},{"clean_reviews":"just like the other one","pred_label":1},{"clean_reviews":"love these","pred_label":1},{"clean_reviews":"been debating on getting one but am happy did still more to explore thanks","pred_label":1},{"clean_reviews":"it like siri in fact siri answers more accurately then alexa don see real need for it in my household though it was good bargain on prime day deals","pred_label":0},{"clean_reviews":"","pred_label":1},{"clean_reviews":"have friend now","pred_label":1},{"clean_reviews":"seems to work ok but no youtube tv really can believe they would ban an app that so many people want to use really wanted to like it but when couldn install that particular app it became 

In [28]:
import json

# Clean the data by stripping the backticks
json_data = response.text.strip("`")

# Load the cleaned data and convert to DataFrame
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,,1
1,awesome device,1
2,ask it to play motown radio on pandora and it ...,0
3,good sound works well,1
4,just like the other one,1
5,love these,1
6,been debating on getting one but am happy did ...,1
7,it like siri in fact siri answers more accurat...,0
8,,1
9,have friend now,1


In [29]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
524,,0,,1
2183,awesome device,1,awesome device,1
2964,Ask it to play Motown radio on Pandora and it ...,0,ask it to play motown radio on pandora and it ...,0
787,Good sound works well,1,good sound works well,1
29,Just like the other one,1,just like the other one,1
1173,Love these!,1,love these,1
2975,"Been debating on getting one, but am happy I d...",1,been debating on getting one but am happy did ...,1
741,"It's like Siri, in fact, Siri answers more acc...",0,it like siri in fact siri answers more accurat...,0
1112,,0,,1
3097,I have a friend now.,1,have friend now,1


In [30]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

array([[9, 2],
       [0, 9]])

### OpenAI API Config

In [31]:
!pip install openai==0.27.0

Collecting openai==0.27.0
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.27.10
    Uninstalling openai-0.27.10:
      Successfully uninstalled openai-0.27.10
Successfully installed openai-0.27.0


In [48]:
import openai
from google.colab import userdata

OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
openai.api_key  = OPENAI_API_KEY

In [49]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)

  return response.choices[0].message["content"]

In [50]:
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

In [51]:
chatgpt_response

"The sky appears blue to our eyes because of the way the Earth's atmosphere scatters sunlight. The molecules in the Earth's atmosphere, particularly nitrogen and oxygen, scatter shorter wavelengths of light (blue and violet) more effectively than longer wavelengths (red and yellow). This scattering causes the blue light to be more visible to us, giving the sky its blue color."

#### Batching API Calls (Single Shot)

In [52]:
test_set.shape

(488, 3)

In [53]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1080,I love my new Echo Spot!,1,love my new echo spot,
368,I returned 2 Echo Dots & am only getting refun...,0,returned 2 echo dots am only getting refund fo...,
3097,I have a friend now.,1,have friend now,
2643,Good speaker for bedroom or office,1,good speaker for bedroom or office,
2196,I love the Alexa remote and easy to use.,1,love the alexa remote and easy to use,
...,...,...,...,...
2576,Work pretty well,1,work pretty well,
3027,"Works great.Having fun playing music,checking ...",1,works great having fun playing music checking ...,
2383,Garbage...Even trying to watch Amazon Prime Vi...,0,garbage even trying to watch amazon prime vide...,
376,Doesn't always respond when spoken to with pro...,0,doesn always respond when spoken to with promp...,


In [54]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [55]:
import time

def gpt_completion_function(batch,current_batch,total_batch,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [56]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"love my new echo spot","pred_label":""},{"clean_reviews":"returned 2 echo dots am only getting refund for 1 returned 2 in one package want my refunds for both of them they are in the same package can you not see them both","pred_label":""},{"clean_reviews":"have friend now","pred_label":""},{"clean_reviews":"

In [58]:
import json
import pandas as pd

# Initialize an empty list to store DataFrame objects
dfs = []

for response in responses:
    # Clean the data by stripping the backticks
    json_data = response.strip("`")

    # Load the cleaned data
    data = json.loads(json_data)

    # Convert to DataFrame
    df_temp = pd.DataFrame(data)

    # Append the DataFrame to the list
    dfs.append(df_temp)

# Concatenate all DataFrames in the list into a single DataFrame
df_total = pd.concat(dfs, ignore_index=True)

print(df_total)  # Display the final DataFrame


                                        clean_reviews  pred_label
0                               love my new echo spot           1
1   returned 2 echo dots am only getting refund fo...           0
2                                     have friend now           1
3                  good speaker for bedroom or office           1
4               love the alexa remote and easy to use           1
..                                                ...         ...
95                                   work pretty well           1
96  works great having fun playing music checking ...           1
97  garbage even trying to watch amazon prime vide...           0
98  doesn always respond when spoken to with promp...           0
99   dislike the volume it does not sound loud enough           0

[100 rows x 2 columns]


In [59]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1080,I love my new Echo Spot!,1,love my new echo spot,1
368,I returned 2 Echo Dots & am only getting refun...,0,returned 2 echo dots am only getting refund fo...,0
3097,I have a friend now.,1,have friend now,1
2643,Good speaker for bedroom or office,1,good speaker for bedroom or office,1
2196,I love the Alexa remote and easy to use.,1,love the alexa remote and easy to use,1
...,...,...,...,...
2576,Work pretty well,1,work pretty well,1
3027,"Works great.Having fun playing music,checking ...",1,works great having fun playing music checking ...,1
2383,Garbage...Even trying to watch Amazon Prime Vi...,0,garbage even trying to watch amazon prime vide...,0
376,Doesn't always respond when spoken to with pro...,0,doesn always respond when spoken to with promp...,0


In [60]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[41  0]
 [ 6 53]]

Accuracy: 0.94


### Batching API Calls: Gemini API

In [61]:
test_set.shape

(488, 3)

In [62]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2929,"We really love our Echo Dot, we can get questi...",1,we really love our echo dot we can get questio...,
436,Have now used for several months without problem.,1,have now used for several months without problem,
350,Item no longer works after just 5 months of us...,0,item no longer works after just 5 months of us...,
493,"Works good. However, the price went down the w...",1,works good however the price went down the wee...,
3036,Easy to set up and does everything it’s a post...,1,easy to set up and does everything it poster d...,
...,...,...,...,...
1899,It is as if we are having a conversation with ...,1,it is as if we are having conversation with an...,
1225,You need a Harvard law degree to operate this ...,0,you need harvard law degree to operate this th...,
1104,"As an Alexa device, it works just as well as a...",0,as an alexa device it works just as well as an...,
1406,Very good,1,very good,


In [63]:
batches = []
batch_size = 25

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [64]:
import time

def gemini_completion_function(batch,current_batch,total_batch):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling Gemini API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)
  response = model.generate_content(prompt)
  time.sleep(5)

  return response

In [65]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gemini_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 4
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"we really love our echo dot we can get questions answered right away and play any music we can think of","pred_label":""},{"clean_reviews":"have now used for several months without problem","pred_label":""},{"clean_reviews":"item no longer works after just 5 months of use will not connect to wifi and unrespon

In [67]:
import json
import pandas as pd

dfs = []  # Initialize an empty list to store DataFrame objects

for response in responses:
    # Clean the data by stripping the backticks
    json_data = response.text.strip("`")

    # Load the cleaned data
    data = json.loads(json_data)

    # Convert to DataFrame
    df_temp = pd.DataFrame(data)

    # Append the DataFrame to the list
    dfs.append(df_temp)

# Concatenate all DataFrames in the list into a single DataFrame
df_total = pd.concat(dfs, ignore_index=True)

print(df_total)  # Display the final DataFrame


                                        clean_reviews  pred_label
0   we really love our echo dot we can get questio...           1
1    have now used for several months without problem           1
2   item no longer works after just 5 months of us...           0
3   works good however the price went down the wee...           1
4   easy to set up and does everything it poster d...           1
..                                                ...         ...
95  it is as if we are having conversation with an...           1
96  you need harvard law degree to operate this th...           0
97  as an alexa device it works just as well as an...           0
98                                          very good           1
99  it ok the speaker is pretty terrible google ho...           0

[100 rows x 2 columns]


In [68]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2929,"We really love our Echo Dot, we can get questi...",1,we really love our echo dot we can get questio...,1
436,Have now used for several months without problem.,1,have now used for several months without problem,1
350,Item no longer works after just 5 months of us...,0,item no longer works after just 5 months of us...,0
493,"Works good. However, the price went down the w...",1,works good however the price went down the wee...,1
3036,Easy to set up and does everything it’s a post...,1,easy to set up and does everything it poster d...,1
...,...,...,...,...
1899,It is as if we are having a conversation with ...,1,it is as if we are having conversation with an...,1
1225,You need a Harvard law degree to operate this ...,0,you need harvard law degree to operate this th...,0
1104,"As an Alexa device, it works just as well as a...",0,as an alexa device it works just as well as an...,0
1406,Very good,1,very good,1


In [69]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

confusion_matrix(y_true, y_pred)

array([[49,  0],
       [ 4, 47]])

## Batching API Calls: ChatGPT (Few Shot)

In [70]:
test_set.shape

(488, 3)

In [71]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2515,The volume is very low,0,the volume is very low,
2655,It works OK. This speaker quality is not much ...,1,it works ok this speaker quality is not much b...,
900,It's got great sound and bass but it doesn't w...,0,it got great sound and bass but it doesn work ...,
2349,I don’t like that it goes into sleep mode afte...,0,don like that it goes into sleep mode after 20...,
2823,Nope. Still a lot to be improved. For most of ...,0,nope still lot to be improved for most of the ...,
...,...,...,...,...
1583,Fairly easy to use,1,fairly easy to use,
1306,Can’t be used as a baby monitor which is why I...,0,can be used as baby monitor which is why had p...,
1278,Great product EXCEPT there is no possible way ...,0,great product except there is no possible way ...,
2878,love it as I do all of my echos,1,love it as do all of my echos,


In [72]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [73]:
import time

def gpt_completion_function(batch,current_batch,total_batch,train_sample,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  sample_json_data = train_sample[['clean_reviews','label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  {json_data}
  ```
  ####
  {sample_json_data}
  ####
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [74]:
train_sample = train_set.sample(4)

batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count,train_sample))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  [{"clean_reviews":"the volume is very low","pred_label":""},{"clean_reviews":"it works ok this speaker quality is not much better than my iphone would spend 

In [76]:
import json
import pandas as pd

# Initialize an empty list to store DataFrame objects
dfs = []

for response in responses:
    # Clean the data by stripping the backticks
    json_data = response.strip("`")

    # Load the cleaned data and convert to DataFrame
    data = json.loads(json_data)
    df_temp = pd.DataFrame(data)

    # Append the DataFrame to the list
    dfs.append(df_temp)

# Concatenate all DataFrames in the list into a single DataFrame
df_total = pd.concat(dfs, ignore_index=True)

print(df_total)  # Display the final DataFrame


                                        clean_reviews  pred_label
0                              the volume is very low           0
1   it works ok this speaker quality is not much b...           0
2   it got great sound and bass but it doesn work ...           0
3   don like that it goes into sleep mode after 20...           0
4   nope still lot to be improved for most of the ...           0
..                                                ...         ...
95                                 fairly easy to use           1
96  can be used as baby monitor which is why had p...           0
97  great product except there is no possible way ...           0
98                      love it as do all of my echos           1
99                                            awesome           1

[100 rows x 2 columns]


In [77]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2515,The volume is very low,0,the volume is very low,0
2655,It works OK. This speaker quality is not much ...,1,it works ok this speaker quality is not much b...,0
900,It's got great sound and bass but it doesn't w...,0,it got great sound and bass but it doesn work ...,0
2349,I don’t like that it goes into sleep mode afte...,0,don like that it goes into sleep mode after 20...,0
2823,Nope. Still a lot to be improved. For most of ...,0,nope still lot to be improved for most of the ...,0
...,...,...,...,...
1583,Fairly easy to use,1,fairly easy to use,1
1306,Can’t be used as a baby monitor which is why I...,0,can be used as baby monitor which is why had p...,0
1278,Great product EXCEPT there is no possible way ...,0,great product except there is no possible way ...,0
2878,love it as I do all of my echos,1,love it as do all of my echos,1


In [78]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[49  1]
 [ 5 45]]

Accuracy: 0.94
