In [1]:
!pip install pandas nltk scikit-learn gensim openai



In [4]:
import pandas as pd
import re
import numpy as np
import itertools
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

try:
    df = pd.read_csv('ucsd_delaware_reviews_combined.csv')
except FileNotFoundError as e:
    print(str(e))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/quanganh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
analyzer = SentimentIntensityAnalyzer()
df.dropna(subset=['text'], inplace=True)

# sentiment score
df['sentiment_score'] = df['text'].apply(lambda text: analyzer.polarity_scores(str(text))['compound'])

# metadata
df['review_length'] = df['text'].str.len()

df['user_review_count'] = df.groupby('user_id')['user_id'].transform('count')
df['rating_deviation'] = (df['rating'] - df['avg_rating']).fillna(0)

visit_keywords = [
    'visited', 'went to', 'ate here', 'dined here', 'was there',
    'stayed at', 'my visit', 'our visit', 'ordered', 'tried the'
]
df['has_visit_keyword'] = df['text'].str.contains('|'.join(visit_keywords), case=False, na=False)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel

df['text_clean'] = df['text_clean'].fillna('')
tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_clean'])
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

def extract_top_keywords(doc, top_n=5):
    tfidf_vector = tfidf_vectorizer.transform([doc])
    sorted_indices = np.argsort(tfidf_vector.toarray()).flatten()[::-1]
    top_keywords = feature_names[sorted_indices[:top_n]]
    return ', '.join(top_keywords)

def get_dominant_topic(doc):
    bow = dictionary.doc2bow(doc.split())
    topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    dominant_topic = sorted(topics, key=lambda x: x[1], reverse=True)[0][0]
    return dominant_topic

tokenized_data = [text.split() for text in df['text_clean']]
dictionary = corpora.Dictionary(tokenized_data)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=10, random_state=42)

df['topic'] = df['text_clean'].apply(get_dominant_topic)

In [7]:
df['datetime'] = pd.to_datetime(df['time'], unit='ms')

df = df.sort_values(by=['user_id', 'datetime'])
df['time_delta_seconds'] = df.groupby('user_id')['datetime'].diff().dt.total_seconds().fillna(0)

In [8]:
# advertisement
promo_text_1 = "Great place! visit www.mypromo.com for a 10% discount!"
df.loc[len(df)] = df.iloc[0]
df.loc[df.index[-1], 'text'] = promo_text_1

#url to external website
url_pattern = r'(https|http|www)[^\s]+'
df['has_url'] = df['text'].str.contains(url_pattern, case=False, na=False)

def create_multilabels(row):
    labels = []
    if row['sentiment_score'] < -0.5 and not row['has_visit_keyword']:
        labels.append('rant_no_visit')
    if row['has_url']:
        labels.append('ad')
    if row['topic'] == 1:
        labels.append('irrelevant')
    if not labels:
        labels.append('clean')
    return labels

df['multilabels'] = df.apply(create_multilabels, axis=1)

  df['has_url'] = df['text'].str.contains(url_pattern, case=False, na=False)


In [9]:
# model with scikit learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Single Label
def create_single_label(row):
    if 'ad' in row['multilabels']: return 'ad'
    if 'rant_no_visit' in row['multilabels']: return 'rant_no_visit'
    if 'irrelevant' in row['multilabels']: return 'irrelevant'
    return 'clean'

df['final_label'] = df.apply(create_single_label, axis=1)


X_train, X_test, y_train, y_test = train_test_split(
    df['text_clean'], df['final_label'],
    test_size=0.2, random_state=42, stratify=df['final_label']
)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
pipeline.fit(X_train, y_train)


df['sklearn_classification'] = pipeline.predict(df['text_clean'])



In [10]:
# Evaluate sklearn model
from sklearn.metrics import accuracy_score
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(f"Accuracy of the model is {accuracy}")

Accuracy of the model is 0.8103448275862069


In [15]:
import openai
import json
import time
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPEN_API_KEY")
client = openai.OpenAI(api_key=api_key)

def classify_review_with_openai(review_text, category):
    system_prompt = """
    You are an AI assistant for Google Maps. Your task is to analyze a review and classify it based on quality policies.
    A review can have one or more violation labels. If no violations are found, classify it as "clean".

    Possible Labels:
    - "ad": Contains advertisements, promotions, or external links.
    - "irrelevant": The content is not related to the given category.
    - "rant_no_visit": A strong complaint that shows no evidence of a real visit.

    You must respond ONLY with a valid JSON object containing a single key "labels" which is a list of strings.
    For example: {"labels": ["clean"]} or {"labels": ["ad", "irrelevant"]}.
    """

    user_prompt = f"""
    Please classify the following review for a place in the category "{category}".
    Review Text: "{review_text}"
    """

    try:
        response = client.chat.completions.create(
            # SỬA LỖI Ở ĐÂY: Thêm 'model='
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        return {"labels": [f"error_{str(e)}"]}

sample_df_openai = df.tail(10).copy()

openai_results = sample_df_openai.apply(
    lambda row: classify_review_with_openai(row['text'], row['category']),
    axis=1
)

sample_df_openai['openai_labels'] = [res.get('labels', ['error']) for res in openai_results]

print(sample_df_openai[['text', 'multilabels', 'openai_labels']])

                                                    text   multilabels  \
25777  First time I tried their food will say it wasn...       [clean]   
39737  I have been going here for years. All the staf...       [clean]   
35036                                            Awesome       [clean]   
1534                              Very helpful and nice.       [clean]   
42774  A nice beach!  Fairly busy, arrive early to ge...  [irrelevant]   
42827  A nice beach!  Fairly busy, arrive early to ge...  [irrelevant]   
22297  Good food, better if you eat seafood.  Friendl...       [clean]   
22299  Good food, better if you eat seafood.  Friendl...       [clean]   
19157                 Food was good, staff was friendly.       [clean]   
36516  Great place! visit www.mypromo.com for a 10% d...          [ad]   

                                           openai_labels  
25777  [error_Error code: 429 - {'error': {'message':...  
39737  [error_Error code: 429 - {'error': {'message':...  
35036  [