# Working Environment

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
%cd /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis
!ls|

/content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis
/bin/bash: -c: line 2: syntax error: unexpected end of file


# Import Dataset

In [36]:
import pandas as pd

data = pd.read_csv('amazon_alexa.tsv', sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [37]:
mydata = data[['verified_reviews','feedback']]
mydata.columns =['review','label']

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [38]:
mydata.value_counts('label')

label
1    2893
0     257
Name: count, dtype: int64

In [39]:
# Data is imbalanced. so, Will do down sampling/upper sampling

# Count the occurrences of each label
label_counts = mydata['label'].value_counts()

# Get the no. of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly
if rows_to_drop >0:
  data_majority = mydata[mydata['label'] == 1]
  data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
  data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced['label'].value_counts())

# Nopw the data is balanced

label
1    257
0    257
Name: count, dtype: int64


# Data Preprocessing

In [40]:
import re

def clean_text(text):
  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Remove HTML tags
  text = re.sub(r"<[^>]*>", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  return text

In [41]:
import pandas as pd

# Extract the review column as a list
reviews = data_balanced['review'].tolist()

# Clean the text in the list
cleaned_reviews = []
for review in reviews:
    if not isinstance(review, str):
        review = str(review)
    cleaned_reviews.append(clean_text(review))




In [42]:
# Add the cleaned reviews as a new column to the DataFrame
data_balanced['clean_reviews'] = cleaned_reviews

In [43]:
data_balanced

Unnamed: 0,review,label,clean_reviews
9,Love it! I’ve listened to songs I haven’t hear...,1,love it ve listened to songs haven heard since...
12,I purchased this for my mother who is having k...,1,purchased this for my mother who is having kne...
20,Love the Echo and how good the music sounds pl...,1,love the echo and how good the music sounds pl...
22,Have only had it set up for a few days. Still ...,1,have only had it set up for few days still add...
23,I love it. It plays my sleep sounds immediatel...,1,love it it plays my sleep sounds immediately w...
...,...,...,...
3125,This product is easy to use and very entertain...,1,this product is easy to use and very entertain...
3127,works great but speaker is not the good for mu...,1,works great but speaker is not the good for mu...
3129,We have six of these throughout our home and t...,1,we have six of these throughout our home and t...
3140,Barry,1,barry


# Data Split

In [44]:
import pandas as pd

# Assuming your dataframe is 'df'
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

In [45]:
# Randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

In [46]:
# get the reamining rows for the test set
train_set = data_balanced.drop(test_set.index)

# Sentiment w/LLM

### Setting up Gemini API

In [47]:
!pip install -q -U google-generativeai

In [48]:
# Necessary packages

import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Used to securely store your API key
from google.colab import userdata


In [49]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [50]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [51]:
model = genai.GenerativeModel('gemini-pro')

In [52]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 98.6 ms, sys: 8.65 ms, total: 107 ms
Wall time: 6.42 s


> The meaning of life is a deeply personal and philosophical question that has been pondered by humans for centuries. There is no one definitive answer, as the meaning of life can vary based on individual beliefs, values, and experiences.
> 
> Some common perspectives on the meaning of life include:
> 
> **Purpose-Driven:** This view suggests that life has a specific purpose or goal, such as fulfilling a particular role in society, achieving personal fulfillment, or making a positive impact on the world.
> 
> **Experiential:** This perspective focuses on the value of experiences and moments, emphasizing the importance of living life to the fullest and appreciating the present.
> 
> **Connection:** This view emphasizes the importance of relationships and connections with others, suggesting that life's meaning is found in our interactions and the love we share.
> 
> **Growth and Learning:** This perspective sees life as a journey of personal growth and learning, where the pursuit of knowledge, wisdom, and self-improvement brings meaning.
> 
> **Meaning-Making:** This view suggests that life doesn't have an inherent meaning, but rather that individuals create meaning for themselves through their choices, actions, and beliefs.
> 
> **Subjective:** This perspective acknowledges that the meaning of life is entirely subjective and varies widely from person to person, depending on their own beliefs, values, and circumstances.
> 
> Ultimately, the meaning of life is something that each individual must explore and define for themselves. There is no right or wrong answer, and the journey of discovering one's purpose and meaning can be an ongoing and evolving process.

#### Single API Call

In [53]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1208,I love my echo spot that my daughter and son i...,1,love my echo spot that my daughter and son in ...,
2307,I like being able to watch pretty much anythin...,1,like being able to watch pretty much anything ...,
2446,Great entertainment,1,great entertainment,
1277,ALEXA IS NOT AS SMART AS SIRI! CANNOT CANCEL S...,0,alexa is not as smart as siri cannot cancel sn...,
1744,"This was my 4th echo device, 1st echo show it...",0,this was my 4th echo device 1st echo show it s...,
2909,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,
248,I bought this to replace a &#34;Dot&#34; in my...,1,bought this to replace 34 dot 34 in my living ...,
2866,The volume is very low,0,the volume is very low,
2304,"I’m very unhappy with this Firestick, every ti...",0,very unhappy with this firestick every time we...,
994,Too difficult to set up. It keeps timing out ...,0,too difficult to set up it keeps timing out be...,


In [54]:
# Convert the DataFrame to JSON using the to_json() method

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

# Print the JSON data
print(json_data)

[{"clean_reviews":"love my echo spot that my daughter and son in law got me for my birthday alexa wakes me each morning and plays music when ask","pred_label":""},{"clean_reviews":"like being able to watch pretty much anything want at anytime works great","pred_label":""},{"clean_reviews":"great entertainment","pred_label":""},{"clean_reviews":"alexa is not as smart as siri cannot cancel snooze after alarm goes off have to cancel pre set alarm for weekdays","pred_label":""},{"clean_reviews":"this was my 4th echo device 1st echo show it seemed to have huge potential and really wanted to like it but soon found out it is quite limited bit disappointing but not the end of the world used it like use my other screenless echos for little under 2 month when the screen started flickering mostly on the bottom of the screen but not just quick look in some forums made me understand it is pointless asking for replacement as some people had 2 3 devices replaced only to start flickering again shortly

In [55]:
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"love my echo spot that my daughter and son in law got me for my birthday alexa wakes me each morning and plays music when ask","pred_label":""},{"clean_reviews":"like being able to watch pretty much anything want at anytime works great","pred_label":""},{"clean_reviews":"great entertainment","pred_label":""},{"clean_reviews":"alexa is not as smart as siri cannot cancel snooze after alarm goes off have to cancel pre set alarm for weekdays","pred_label":""},{"clean_reviews":"this was my 4th

In [56]:
response = model.generate_content(prompt)
print(response.text)

```
[{"clean_reviews":"love my echo spot that my daughter and son in law got me for my birthday alexa wakes me each morning and plays music when ask","pred_label":1},{"clean_reviews":"like being able to watch pretty much anything want at anytime works great","pred_label":1},{"clean_reviews":"great entertainment","pred_label":1},{"clean_reviews":"alexa is not as smart as siri cannot cancel snooze after alarm goes off have to cancel pre set alarm for weekdays","pred_label":0},{"clean_reviews":"this was my 4th echo device 1st echo show it seemed to have huge potential and really wanted to like it but soon found out it is quite limited bit disappointing but not the end of the world used it like use my other screenless echos for little under 2 month when the screen started flickering mostly on the bottom of the screen but not just quick look in some forums made me understand it is pointless asking for replacement as some people had 2 3 devices replaced only to start flickering again shortly

In [72]:
import json

# Clean the data by stripping the backticks
json_data = response.text.strip("`")

# Load the cleaned data and convert to DataFrame
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,love my echo spot that my daughter and son in ...,1.0
1,like being able to watch pretty much anything ...,1.0
2,great entertainment,1.0
3,alexa is not as smart as siri cannot cancel sn...,0.0
4,this was my 4th echo device 1st echo show it s...,0.0
5,for the price the product is nice quality and ...,0.0
6,bought this to replace 34 dot 34 in my living ...,0.0
7,the volume is very low,0.0
8,very unhappy with this firestick every time we...,0.0
9,too difficult to set up it keeps timing out be...,0.0


In [73]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1208,I love my echo spot that my daughter and son i...,1,love my echo spot that my daughter and son in ...,1.0
2307,I like being able to watch pretty much anythin...,1,like being able to watch pretty much anything ...,1.0
2446,Great entertainment,1,great entertainment,1.0
1277,ALEXA IS NOT AS SMART AS SIRI! CANNOT CANCEL S...,0,alexa is not as smart as siri cannot cancel sn...,0.0
1744,"This was my 4th echo device, 1st echo show it...",0,this was my 4th echo device 1st echo show it s...,0.0
2909,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,0.0
248,I bought this to replace a &#34;Dot&#34; in my...,1,bought this to replace 34 dot 34 in my living ...,0.0
2866,The volume is very low,0,the volume is very low,0.0
2304,"I’m very unhappy with this Firestick, every ti...",0,very unhappy with this firestick every time we...,0.0
994,Too difficult to set up. It keeps timing out ...,0,too difficult to set up it keeps timing out be...,0.0


In [76]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import accuracy_score

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

accuracy_score(y_true, y_pred)

In [68]:
# Accuracy = TP+TN/ Total = 9+7/20 ~ 80%