### This code takes input from the TripAdvisor reviews data and generates reviews using ChatGPT

In [1]:
import pandas as pd
import sys
import json

# Do not truncate long cell values (e.g. full review texts) when a DataFrame
# is displayed; None lifts the column-width limit.
pd.set_option('display.max_colwidth', None)

In [2]:
import json

def flatten_json(nested_json):
    """Flatten a nested dict into a single-level dict with '_'-joined keys.

    Keys along the path from the root are concatenated with underscores, so
    ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``. Only dicts (including dict
    subclasses) are descended into; every other value, lists included, is
    stored unchanged as a leaf.

    Args:
        nested_json: The (possibly nested) dict to flatten. Keys must be
            strings because they are concatenated into the output keys.

    Returns:
        A new flat dict. Empty nested dicts contribute no entries. If two
        paths produce the same joined key, the later value overwrites the
        earlier one. A non-dict ``nested_json`` yields ``{'': nested_json}``.
    """
    flattened_json = {}

    def flatten(x, name=''):
        # isinstance (not an exact type comparison) so that dict subclasses
        # such as OrderedDict are flattened too instead of kept as leaves.
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, name + key + '_')
        else:
            # Drop the trailing '_' separator appended by the parent call.
            flattened_json[name[:-1]] = x

    flatten(nested_json)
    return flattened_json

def get_data(file_path):
    """Load a reviews JSON file and flatten every review into a flat dict.

    Args:
        file_path: Path to a JSON file whose top-level object has a
            ``'Reviews'`` key holding the reviews to iterate over.

    Returns:
        A list with one dict per review, each produced by ``flatten_json``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the top-level object has no ``'Reviews'`` key.
    """
    # The context manager closes the file even when parsing fails, and the
    # JSON is decoded as UTF-8 explicitly rather than with the platform
    # default encoding.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)

    return [flatten_json(review) for review in data['Reviews']]

In [3]:
def get_casted_data(df):
    """Cast every rating column of ``df`` to pandas' nullable ``Int64`` dtype.

    The frame is modified in place and also returned. ``Ratings_Overall`` is
    converted to float before the ``Int64`` cast (presumably because it is
    stored as a decimal string such as "4.0" -- confirm against the data).

    Args:
        df: DataFrame containing all nine ``Ratings_*`` columns listed below.

    Returns:
        The same DataFrame object with the rating columns cast.
    """
    rating_columns = (
        'Ratings_Cleanliness',
        'Ratings_Service',
        'Ratings_Overall',
        'Ratings_Value',
        'Ratings_Sleep Quality',
        'Ratings_Rooms',
        'Ratings_Business service (e.g., internet access)',
        'Ratings_Location',
        'Ratings_Check in / front desk',
    )

    for column in rating_columns:
        values = df[column]
        if column == 'Ratings_Overall':
            # Only the overall rating takes the detour through float.
            values = values.astype(float)
        df[column] = values.astype('Int64')

    return df

In [4]:
def flag_extreme_reviews(df, attr):
    """Add a column ``attr`` flagging reviews whose aspect rating is far from the overall rating.

    Heuristic: when a guest rates an aspect clearly higher (or lower) than
    their own overall rating, the review text likely mentions that aspect.
    Such reviews are used as seed data for generating artificial reviews
    with ChatGPT.

    The new column holds 1 where ``Ratings_<attr> - Ratings_Overall > 1``,
    -1 where that difference is ``< -1``, and 0 otherwise.

    Args:
        df: DataFrame with ``'Ratings_' + attr`` and ``Ratings_Overall`` columns.
        attr: Aspect name; also used as the name of the new flag column.

    Returns:
        The same DataFrame object (modified in place) with the flag column.
    """
    rating_gap = df['Ratings_' + attr] - df['Ratings_Overall']
    rated_much_higher = df[rating_gap > 1].index
    rated_much_lower = df[rating_gap < -1].index

    # Start from "not extreme" everywhere, then mark the two extremes.
    df[attr] = [0] * len(df)
    df.loc[rated_much_higher, attr] = 1
    df.loc[rated_much_lower, attr] = -1

    return df

def get_review_ratings(df):
    """Add one extreme-rating flag column per aspect via ``flag_extreme_reviews``.

    Args:
        df: DataFrame holding ``Ratings_Overall`` and a ``Ratings_<aspect>``
            column for each aspect listed below.

    Returns:
        The DataFrame returned by the last ``flag_extreme_reviews`` call,
        carrying a flag column named after each aspect.
    """
    aspects = (
        'Service',
        'Cleanliness',
        'Value',
        'Sleep Quality',
        'Rooms',
        'Business service (e.g., internet access)',
        'Check in / front desk',
    )

    for aspect in aspects:
        df = flag_extreme_reviews(df, aspect)

    return df

In [5]:
# df[['Service', 'Cleanliness', 'Value', 'Rooms']].value_counts()

In [6]:
import os
from openai import OpenAI


# SECURITY: the API token must never be hard-coded in the notebook. Export
# LAS_API_TOKEN in the environment before starting the kernel. A token that
# was ever committed to source should be treated as leaked and revoked.
if not os.environ.get("LAS_API_TOKEN"):
    raise RuntimeError(
        "LAS_API_TOKEN is not set; export it in the environment before running this cell."
    )

client = OpenAI(
    api_key=os.environ.get("LAS_API_TOKEN"),
)

In [7]:
import os
from openai import OpenAI

def get_chatpgt_summary(prompt, review):
    """Fill the prompt template with the reviews and return the model's reply.

    Args:
        prompt: Template text; each ``{reviews}`` placeholder is replaced
            with ``review``.
        review: Review text to insert into the template.

    Returns:
        The message content of the first choice of the chat completion
        obtained from the module-level ``client`` with model "gpt-4o".
    """
    user_message = {
        "role": "user",
        "content": prompt.replace('{reviews}', review),
    }
    response = client.chat.completions.create(
        messages=[user_message],
        model="gpt-4o",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def get_reviews(df, attr, senti):
    """Concatenate the texts of all reviews whose ``attr`` flag equals ``senti``.

    Args:
        df: DataFrame with a ``Content`` column and a flag column ``attr``.
        attr: Name of the flag column to filter on.
        senti: Flag value a row must have to be included.

    Returns:
        A string that starts with a single space and then holds one
        ``review <n>: <text>`` entry per matching row (numbered from 1),
        each followed by a ``=======`` separator line. With no matching
        rows the result is just the single space.
    """
    matching_texts = df[df[attr] == senti].Content
    numbered_entries = (
        'review ' + str(position) + ": " + text + "\n=======\n"
        for position, text in enumerate(matching_texts, start=1)
    )
    return " " + "".join(numbered_entries)

In [8]:
# reviews = get_reviews(df, 'Service', 'Rooms', -1)
# print(reviews)

In [9]:
def generate_reviews(df, prompt, senti, attr, attr_sub):
    """Generate one artificial review covering two aspects with a given sentiment.

    Seed reviews flagged for ``attr`` and for ``attr_sub`` are collected with
    ``get_reviews``, combined into one text, and sent to the model through
    ``get_chatpgt_summary`` together with the filled-in prompt template.

    Args:
        df: DataFrame with a ``Content`` column and flag columns named
            ``attr`` and ``attr_sub``.
        prompt: Template text; the ``{topic_1}``, ``{topic_2}`` and
            ``{pos_neg}`` placeholders are filled here, ``{reviews}`` is
            filled by ``get_chatpgt_summary``.
        senti: Either ``'positive'`` (flag value 1) or ``'negative'``
            (flag value -1).
        attr: First aspect to focus on.
        attr_sub: Second aspect to focus on.

    Returns:
        The generated review text returned by ``get_chatpgt_summary``.

    Raises:
        ValueError: If ``senti`` is neither ``'positive'`` nor ``'negative'``.
    """
    flag_by_sentiment = {'positive': 1, 'negative': -1}
    # Validate up front: an unknown sentiment used to fall through both
    # branches and fail later with an opaque UnboundLocalError.
    if senti not in flag_by_sentiment:
        raise ValueError(
            "senti must be 'positive' or 'negative', got " + repr(senti)
        )
    flag = flag_by_sentiment[senti]

    review = get_reviews(df, attr, flag)
    review_sub = get_reviews(df, attr_sub, flag)

    heading = senti.title()
    rev = (
        heading + ' Reviews for ' + attr + '\n\n' + review
        + '\n\n+++++++\n\n'
        + heading + ' Reviews for ' + attr_sub + '\n\n' + review_sub
    )

    prompt = prompt.replace('{topic_1}', attr)
    prompt = prompt.replace('{topic_2}', attr_sub)
    prompt = prompt.replace('{pos_neg}', senti)

    return get_chatpgt_summary(prompt, rev)

In [10]:
# Accumulators filled by the later cells: generated negative/positive reviews,
# the aspect pairs used for each of them, and the hotel (json file) id.
rev_neg, rev_pos, attr_neg, attr_pos, rev_cls = [], [], [], [], []

In [11]:
### Download the TripAdvisor dataset in JSON format (not the CSV version)
### and unzip it into the data folder.
### Each JSON file contains the reviews of a single hotel.

### Id (file name without the .json extension) of the hotel whose reviews seed the generation
cls_id = '115484'

### Load -> cast the rating columns -> add the per-aspect flag columns
df = get_review_ratings(
    get_casted_data(
        pd.DataFrame(get_data('../data/json/' + cls_id + '.json'))
    )
)
df.shape

(652, 22)

In [12]:
# Aspect names that can be requested for generation; they match the flag
# columns created by get_review_ratings and are selected by position below.
attr = ['Service', 'Cleanliness', 'Value', 'Sleep Quality', 'Rooms', 'Business service (e.g., internet access)', 'Check in / front desk']

In [None]:
# Read the prompt template; the context manager closes the file right away
# instead of leaving the handle open for the garbage collector.
with open('../prompts/generate_reviews.txt') as prompt_file:
    prompt = prompt_file.read()

### Specify the aspects to focus on in the newly generated review.
### The possible aspects are the entries of the attr list defined above.
### The prompt takes two aspects and generates a positive/negative review for them.

i_1, i_2 = 0, 4
gen_rev_pos = generate_reviews(df, prompt, 'positive', attr[i_1], attr[i_2])

In [None]:
# Show the generated positive review for manual inspection.
print(gen_rev_pos)

In [None]:
# Re-read the prompt template; the context manager closes the file right away
# instead of leaving the handle open for the garbage collector.
with open('../prompts/generate_reviews.txt') as prompt_file:
    prompt = prompt_file.read()

# Indices into attr of the two aspects the negative review should focus on.
j_1, j_2 = 1, 4
gen_rev_neg = generate_reviews(df, prompt, 'negative', attr[j_1], attr[j_2])

In [None]:
# Show the generated negative review for manual inspection.
print(gen_rev_neg)

In [None]:
# Record this hotel's generated reviews together with the aspect pairs and the
# hotel id; the five lists grow in lockstep, one entry per run of this cell.
rev_neg.append(gen_rev_neg)
rev_pos.append(gen_rev_pos)
attr_neg.append([attr[j_1], attr[j_2]])
attr_pos.append([attr[i_1], attr[i_2]])
rev_cls.append(cls_id)

In [None]:
# Empty output table; its columns are filled from the accumulator lists.
df_ = pd.DataFrame()

In [None]:
# One row per recorded run: generated negative/positive review, the aspect
# pair used for each, and the hotel id.
df_['rev_neg'] = rev_neg
df_['rev_pos'] = rev_pos
df_['attr_neg'] = attr_neg
df_['attr_pos'] = attr_pos
df_['cls'] = rev_cls

In [None]:
# Save the collected reviews as CSV using the pandas defaults.
df_.to_csv('../data/conflicting_reviews_dataset.csv')