# Sentiment Analysis and Topic Inferring using ChatGPT Prompt Engineering
Author: Amelia Tang

In [1]:
import openai
import Constants
import os
import pandas as pd
import numpy as np
import time # The free trial tier users for gpt-3.5-turbo have a RPM (request per minute) of 3. Need this to deplay executing Python code. 

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key  = Constants.API_KEY

## Sentiment Analysis 

Since free trial tier users of `gpt-3.5-turbo` have a limit of 3 RPM (requests per minute), I decided to examine the first ten reviews as a demo and compare the results with the polarity/sentiment scores assigned by TextBlob and VADER (implemented using NLTK).

In [2]:
topics_df = pd.read_csv("../data/topics.csv",  usecols=['Review_Text','Topic','LDA_Topic'])
topics_df = topics_df.rename(columns = {'Topic':'HDBSCAN_Topic'})
original_doc_df = pd.read_csv("../data/Raw/DisneylandReviews.csv", usecols=["Review_Text"])
topics_df['Review_Text'] = original_doc_df.Review_Text
topics_df

Unnamed: 0,Review_Text,HDBSCAN_Topic,LDA_Topic
0,If you've ever been to Disneyland anywhere you...,-1,3
1,Its been a while since d last time we visit HK...,-1,5
2,Thanks God it wasn t too hot or too humid wh...,20,6
3,HK Disneyland is a great compact park. Unfortu...,43,3
4,"the location is not in the city, took around 1...",47,7
...,...,...,...
9995,Disneyland truly is a magical place! Christmas...,0,8
9996,"So if you delight in joining 85,000 other folk...",-1,9
9997,we visitied the Disneyland park for 2 days. Ev...,-1,3
9998,Disneyland is by far the Happiest Place on Ear...,-1,8


In [3]:
sentiment_df = pd.read_csv("../data/sentiment.csv")
sentiment_df_10 = sentiment_df.iloc[:10]
sentiment_df_10 

Unnamed: 0.1,Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,polarity,NLTK_polarity,compare
0,0,670772142,4,2019-4,Australia,if you've ever been to disneyland anywhere you...,Disneyland_HongKong,0.243981,0.7069,True
1,1,670682799,4,2019-5,Philippines,its been a while since d last time we visit ho...,Disneyland_HongKong,0.236131,0.9892,True
2,2,670623270,4,2019-4,United Arab Emirates,thanks god it wasn t too hot or too humid wh...,Disneyland_HongKong,0.160498,0.992,True
3,3,670607911,4,2019-4,Australia,hongkong disneyland is a great compact park. u...,Disneyland_HongKong,0.189286,0.8489,True
4,4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,0.266667,0.2846,True
5,5,670591897,3,2019-4,Singapore,"have been to disney world, disneyland anaheim ...",Disneyland_HongKong,-0.065476,0.9653,False
6,6,670585330,5,2019-4,India,great place! your day will go by and you wo no...,Disneyland_HongKong,0.185,0.7489,True
7,7,670574142,3,2019-3,Malaysia,think of it as an intro to disney magic for th...,Disneyland_HongKong,0.054722,0.8345,True
8,8,670571027,2,2019-4,Australia,"feel so let down with this place,the disneylan...",Disneyland_HongKong,-0.067284,0.5195,False
9,9,670570869,5,2019-3,India,i can go on talking about disneyland. whatever...,Disneyland_HongKong,0.204497,0.9645,True


In [4]:
# Code from deeplearning.ai ChatGPT Prompt Engineering for Developers Short Course 
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

In [5]:
# Prompt adpated from deeplearning.ai ChatGPT Prompt Engineering for Developers Short Course 
def sentiment_full(start_index, end_index):
    """Full sentiment analysis using ChatGPT gpt-3.5-turbo.

    Use ChatGPT to describe the reviews' sentiments.  

    Parameters
    ----------
    start_index : int
        The start index of the Review_Text in the sentiment_df dataframe we want to examine
    end_index : int
        The end index of the Review_Text in the sentiment_df dataframe we want to examine

    Returns
    -------
    None
        Only print out the sentiment of each review text using ChatGPT as a string in a list

    Examples
    --------
    >>> sentiment_full(10, 12)
   ['The sentiment of the review is positive.']
   ['The sentiment of the review on Disneyland is positive.']
    """
    ChatGPT_sentiment_full = []
    for i in np.arange(start=start_index, stop=end_index):
        disney_review = topics_df.Review_Text[i]
        prompt = f"""
        What is the sentiment of the following review on Disneyland,
        which is delimited with triple backticks?
        Review text: '''{disney_review}'''
        """
        sentiment = get_completion(prompt)
        ChatGPT_sentiment_full.append(sentiment)
        print(ChatGPT_sentiment_full)

In [6]:
sentiment_full(0, 3) 
time.sleep(60) # The free trial tier users for gpt-3.5-turbo have a RPM (request per minute) of 3. Need this to deplay executing Python code.

['The sentiment of the review is positive.']
['The sentiment of the review is positive.', 'The sentiment of the review on Disneyland is mixed. The reviewer expresses excitement and enjoyment for certain attractions like the Iron Man Experience and Space Mountain, but also expresses disappointment with the behavior of the cast members and the overall atmosphere of the park. They do mention some positive aspects such as the bakery and the addition of a Starbucks inside the park.']
['The sentiment of the review is positive.', 'The sentiment of the review on Disneyland is mixed. The reviewer expresses excitement and enjoyment for certain attractions like the Iron Man Experience and Space Mountain, but also expresses disappointment with the behavior of the cast members and the overall atmosphere of the park. They do mention some positive aspects such as the bakery and the addition of a Starbucks inside the park.', 'The sentiment of the review on Disneyland is generally positive.']


In [7]:
def sentiment_rating(start_index, end_index):
    """sentiment rating between -1 and 1 using ChatGPT gpt-3.5-turbo.

    Use ChatGPT to rate the reviews' sentiments.  

    Parameters
    ----------
    start_index : int
        The start index of the Review_Text in the sentiment_df dataframe we want to examine
    end_index : int
        The end index of the Review_Text in the sentiment_df dataframe we want to examine

    Returns
    -------
    list
        A list of strings that report the ChatGPT ratings

    Examples
    --------
    >>> sentiment_rating(10, 12)
    ['0.9', '0.8']
    """
    time.sleep(60) # The free trial tier users for gpt-3.5-turbo have a RPM (request per minute) of 3. Need this to deplay executing Python code. 
    sentiment_rating = []
    for i in np.arange(start=start_index, stop=end_index):
        disney_review = topics_df.Review_Text[i]
        prompt = f"""
        What is the sentiment of the following review on Disneyland,
        which is delimited with triple backticks?
        Review text: '''{disney_review}'''
        Rate the sentiment on a scale of -1 to 1 where 1 means the most positive and -1 means the most negative.
        The output rating should be a float number between -1 and 1 only. 
        Do not add any text in additional to the output rating. 
        """
        sentiment = get_completion(prompt)
        sentiment_rating.append(sentiment)
    return sentiment_rating

In [8]:
sentiment_rating_10 = []
#time.sleep(60)  # The free trial tier users for gpt-3.5-turbo have a RPM (request per minute) of 3. Need this to deplay executing Python code. 
sentiment_rating_10.extend(sentiment_rating(0,3))
#time.sleep(60)
sentiment_rating_10.extend(sentiment_rating(3,6))
#time.sleep(60)
sentiment_rating_10.extend(sentiment_rating(6,9))
#time.sleep(60)
sentiment_rating_10.extend(sentiment_rating(9,10))

In [9]:
sentiment_rating_10 

['0.6', '0.4', '0.4', '0.5', '0.4', '-0.8', '0.8', '0.2', '-0.9', '0.8']

In [10]:
sentiment_df_10['ChatGPT_rating'] = np.array(sentiment_rating_10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment_df_10['ChatGPT_rating'] = np.array(sentiment_rating_10)


In [11]:
sentiment_df_10

Unnamed: 0.1,Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,polarity,NLTK_polarity,compare,ChatGPT_rating
0,0,670772142,4,2019-4,Australia,if you've ever been to disneyland anywhere you...,Disneyland_HongKong,0.243981,0.7069,True,0.6
1,1,670682799,4,2019-5,Philippines,its been a while since d last time we visit ho...,Disneyland_HongKong,0.236131,0.9892,True,0.4
2,2,670623270,4,2019-4,United Arab Emirates,thanks god it wasn t too hot or too humid wh...,Disneyland_HongKong,0.160498,0.992,True,0.4
3,3,670607911,4,2019-4,Australia,hongkong disneyland is a great compact park. u...,Disneyland_HongKong,0.189286,0.8489,True,0.5
4,4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,0.266667,0.2846,True,0.4
5,5,670591897,3,2019-4,Singapore,"have been to disney world, disneyland anaheim ...",Disneyland_HongKong,-0.065476,0.9653,False,-0.8
6,6,670585330,5,2019-4,India,great place! your day will go by and you wo no...,Disneyland_HongKong,0.185,0.7489,True,0.8
7,7,670574142,3,2019-3,Malaysia,think of it as an intro to disney magic for th...,Disneyland_HongKong,0.054722,0.8345,True,0.2
8,8,670571027,2,2019-4,Australia,"feel so let down with this place,the disneylan...",Disneyland_HongKong,-0.067284,0.5195,False,-0.9
9,9,670570869,5,2019-3,India,i can go on talking about disneyland. whatever...,Disneyland_HongKong,0.204497,0.9645,True,0.8


In the two instances where the polarity/sentiment scores assigned by TextBlob and VADER have different signs (positive/negative), ChatGPT agrees with TextBlob. I decided to examine these two reviews, and it appeared that the sentiment of both reviews was rather negative.

In [12]:
sentiment_df_10.loc[5][5]

'have been to disney world, disneyland anaheim and tokyo disneyland but i feel that disneyland hongkong is really too small to be called a disneyland. it has way too few rides and attractions. souvenirs, food and even entrance tickets are slightly more expensive than other disneyland as well. basically, this park is good only for small children and people who has never been to disney. the food choices were acceptable, mostly fast food, and not too expensive. bottled water, however, was very expensive but they do have water fountains around for you to refill your water bottles. the parade was pretty good. it was crowded not a problem but what was the problem was the people were just so rude, the pushing and shoving cutting in lines for the rides, gift shops, food stands was just to much to take. forget trying to see one of the shows its a free for all for seats, i do not see how disney can let this happen, it was by far the worst managed disney property.'

In [13]:
sentiment_df_10.loc[8][5]

'feel so let down with this place,the disneyland train was fantastic until you get past the station,bad signage, terrible staff who just did not want to be there ,it was rainning and rides were not working could not find a map of place, over priced fun park. disney characters only 2 seen under a cabana having photos with visitors queuing up in the rain. dont waste your money. walt disney would be horrified, not enough eating places for the amount of people or rest rooms. seems they are building something but who knows what. no atmosphere of fun and fantasy. '

## Topic Modeling 

First, I loaded the topic modeling results using HDBSCAN and LDA. HDBSCAN identified 4,673 reviews as noise, and topic clusters 29, 45, and 26 are among the largest clusters. In LDA, topics 8, 7, and 9 contained the most reviews.

In [14]:
topics_df.iloc[:, 0:2].groupby('HDBSCAN_Topic').count().sort_values('Review_Text', ascending=False).head(10)

Unnamed: 0_level_0,Review_Text
HDBSCAN_Topic,Unnamed: 1_level_1
-1,4673
29,297
45,284
26,205
16,184
5,178
20,165
11,154
35,153
47,145


In [15]:
topics_df.iloc[:, np.r_[0, 2]].groupby('LDA_Topic').count().sort_values('Review_Text', ascending=False)

Unnamed: 0_level_0,Review_Text
LDA_Topic,Unnamed: 1_level_1
8,1552
7,1451
9,1408
6,1150
2,1101
3,1011
5,832
4,767
0,452
1,276


I used ChatGPT to infer the topic of the top 3 large clusters for both assigned by HDBSCAN and LDA. 

In [16]:
def ChatGPT_topic_infer(topic_num, n_sample = 5, method = 'HDBSCAN_Topic'):
    """Topic identification using ChatGPT gpt-3.5-turbo.

    Use ChatGPT to examine the topics identified by HDBSCAN and
    LDA in the Disney Reviews.

    Parameters
    ----------
   topic_num : int
        The topic number of the topic / cluster to examine
    n_sample : int
        The amount of samples we draw from each topic, default to 5. 
    method : str
        The method used to identify the topic
        default to 'HDBSCAN_Topic'
        can also use 'LDA_Topic'

    Returns
    -------
    None
        Only print out the 5 topics identified by ChatGPT 

    Examples
    --------
    >>> ChatGPT_topic_infer(29, 10)
    1. Disneyland
    2. Fireworks
    3. Rides
    4. Characters
    5. Family trip
    """
    time.sleep(60)
    topics_df_sample = topics_df.loc[topics_df[method] == topic_num].sample(n_sample)
    corpus = ' '.join(topics_df_sample["Review_Text"])
    

    prompt = f"""
    Determine five topics that are being discussed in the \
    following text, which is delimited by triple backticks.
    Make each item one or two words long. 
    Format your response as a list of items separated by commas.

    Text sample: '''{corpus}'''
    Format: 
    1. Topic 1
    2. Topic 2
    3. Topic 3
    4. Topic 4
    5. Topic 5
    """
    response = get_completion(prompt)
    print(response)

### Topic 29, 45 and 26 by HDBSCAN
Initially, I read sampled comments in each topic and summarized as below: 
<br>- Topic 29: firework
<br>- Topic 45: family members
<br>- Topic 26: disappointments
<br>It looked like topic 29 was likely to be related to fireworks. 

In [17]:
ChatGPT_topic_infer(29, 5)

1. Disneyland experience
2. Rides and attractions
3. Age suitability
4. Parade and fireworks
5. Shopping and merchandise


In [18]:
ChatGPT_topic_infer(45, 5)

1. Disneyland experience
2. Rides and attractions
3. Dining options
4. Character meet and greets
5. Crowd management and staff behavior


In [19]:
ChatGPT_topic_infer(26, 5)

1. Family & kids
2. Queues
3. Rides
4. Food prices
5. Disney characters


### Topic 8, 7 and 9 by LDA

Initially, I read sampled comments in each topic and summarized as below: 
<br> - Topic 8: family, best, fantastic 
<br> - Topic 7: Recommend, worth visiting
<br> - Topic 9: Travel, ticket, rides, directions
<br> There were some overlapping topics between the topics identified by ChatGPT and by me. 

In [20]:
ChatGPT_topic_infer(8, 5, 'LDA_Topic')

1. Disney character
2. Disneyland
3. Rides
4. Shows
5. Tokyo Disneyland


In [21]:
ChatGPT_topic_infer(7, 5, 'LDA_Topic')

1. Disneyland experience
2. Size of HK Disneyland
3. Wait times for rides
4. Seating and food options
5. Fireworks and parades


In [22]:
ChatGPT_topic_infer(9, 5, 'LDA_Topic')

1. Employee behavior
2. Attractions and activities
3. Expenses and prices
4. Transportation
5. Food and dining


### ChatGPT Topic Modeling 
I used a prompt to allow ChatGPT to assign topics. Due to the restrictions of the free tier user of the ChatGPT API, I only examined the first 10 reviews as a demo. Among those reviews, ChatGPT and I identified some common topics, including fireworks, rides, and queues.   

In [23]:
def ChatGPT_Topic_Assignment(start_index, end_index):
    """Assign topics to each Disney Review using ChatGPT gpt-3.5-turbo.

    Parameters
    ----------
    start_index : int
        The start index of the Review_Text in the topics_df dataframe we want to examine
    end_index : int
        The end index of the Review_Text in the topics_df dataframe we want to examine

    Returns
    -------
    list
        A list of strings that contains topics identified by ChatGPT

    Examples
    --------
    >>> ChatGPT_Topic_Assignment(0, 3)
    ['rides, queues', 'fireworks, rides, queues', 'rides, attractions, queues']
    """
    time.sleep(60)
    Chat_topic = []
    for i in np.arange(start=start_index, stop=end_index):
        review_text = topics_df.Review_Text[i]
        prompt = f"""
        What is the topic of the following review on Disneyland, \
        which is delimited with triple backticks? \
        Review text: '''{review_text}''' \
        Give me one to three high-level topics like fireworks, rides and queues \
        each topic in one word and separate each topic by a comma  
        """
        topic = get_completion(prompt)
        Chat_topic.append(topic)
    return Chat_topic

In [24]:
Chat_topic_10 = []
Chat_topic_10 = ChatGPT_Topic_Assignment(0, 3)

In [25]:
Chat_topic_10.extend(ChatGPT_Topic_Assignment(3,6))

In [26]:
Chat_topic_10.extend(ChatGPT_Topic_Assignment(6,9))

In [27]:
Chat_topic_10.extend(ChatGPT_Topic_Assignment(9,10))

In [28]:
topics_df_10 = topics_df.head(10)
topics_df_10['ChatGPT_Topics'] = Chat_topic_10
topics_df_10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics_df_10['ChatGPT_Topics'] = Chat_topic_10


Unnamed: 0,Review_Text,HDBSCAN_Topic,LDA_Topic,ChatGPT_Topics
0,If you've ever been to Disneyland anywhere you...,-1,3,"rides, queues"
1,Its been a while since d last time we visit HK...,-1,5,"fireworks, rides, queues"
2,Thanks God it wasn t too hot or too humid wh...,20,6,"rides, attractions, queues"
3,HK Disneyland is a great compact park. Unfortu...,43,3,"maintenance, crowds, prices"
4,"the location is not in the city, took around 1...",47,7,"crowded, hot"
5,"Have been to Disney World, Disneyland Anaheim ...",-1,2,"rides, queues, crowds"
6,Great place! Your day will go by and you won't...,-1,7,"rides, queues, parade"
7,Think of it as an intro to Disney magic for th...,-1,9,"food, attractions, service"
8,"Feel so let down with this place,the Disneylan...",26,4,"signage, staff, rides"
9,I can go on talking about Disneyland. Whatever...,24,9,"fireworks, rides, queues"
