In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import spacy
import nltk
from nltk.corpus import stopwords

df = pd.read_csv('data/Tampa_mexican_restaurants.csv')
df = df.drop(columns=['state', 'latitude', 'longitude', 'address'])

def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_phone_numbers(text):
    pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    return re.sub(pattern, '', text)

df['cleaned_text'] = df['text'].apply(remove_html_tags)
df['cleaned_text'] = df['cleaned_text'].apply(remove_punctuation)
df['cleaned_text'] = df['cleaned_text'].apply(remove_phone_numbers)
df['cleaned_text'] = df['cleaned_text'].str.lower()

nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenize_and_filter(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if token.text.isalpha() and len(token.text) > 1 and token.text not in stop_words]
    return tokens

df['tokens'] = df['cleaned_text'].apply(tokenize_and_filter)

business_review_counts = df['business_id'].value_counts()
business_ids_with_100plus_reviews = business_review_counts[business_review_counts > 100].index
filtered_df = df[df['business_id'].isin(business_ids_with_100plus_reviews)]

filtered_df.to_csv('data_for_LDA.csv', index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liziming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
da = pd.read_csv('data_for_LDA.csv')
da.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27014 entries, 0 to 27013
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   review_id     27014 non-null  object 
 1   user_id       27014 non-null  object 
 2   business_id   27014 non-null  object 
 3   stars_x       27014 non-null  float64
 4   useful        27014 non-null  int64  
 5   funny         27014 non-null  int64  
 6   cool          27014 non-null  int64  
 7   text          27014 non-null  object 
 8   date          27014 non-null  object 
 9   name          27014 non-null  object 
 10  city          27014 non-null  object 
 11  postal_code   27014 non-null  float64
 12  stars_y       27014 non-null  float64
 13  review_count  27014 non-null  int64  
 14  is_open       27014 non-null  int64  
 15  categories    27014 non-null  object 
 16  hours         26629 non-null  object 
 17  cleaned_text  27014 non-null  object 
 18  tokens        27014 non-nu

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import ast


da['tokens'] = da['tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

da['cleaned_text'] = da['tokens'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(da['cleaned_text'])

lda = LatentDirichletAllocation(n_components=5, random_state=0) 
lda.fit(X)

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

print_top_words(lda, vectorizer.get_feature_names(), 10)


topic_distributions = lda.transform(X)
da['topic'] = topic_distributions.argmax(axis=1)
da.to_csv('data_with_topics.csv', index=False)


Topic #0: great food good service place mexican restaurant really back delicious
Topic #1: food nt place like good get go location taco tacos
Topic #2: tacos food taco place great good delicious best mexican fresh
Topic #3: nt food us service order time back minutes would came
Topic #4: nt ordered chips chicken salsa like good rice taco tacos



In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

da = pd.read_csv("data_for_lda.csv")


high_rating = da[da['stars_y'] >= 4] 
low_rating = da[da['stars_y'] <= 3]   

def create_dtm(dataframe):
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    dtm = vectorizer.fit_transform(dataframe['cleaned_text'])
    return dtm, vectorizer


def train_lda(dtm, n_topics=3):
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(dtm)
    return lda

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

high_dtm, high_vectorizer = create_dtm(high_rating)
high_lda = train_lda(high_dtm, n_topics=3) 
print("High-rate topics:")
print_top_words(high_lda, high_vectorizer.get_feature_names(), n_top_words=10) 


low_dtm, low_vectorizer = create_dtm(low_rating)
low_lda = train_lda(low_dtm, n_topics=3)  
print("Low-rate topics:")
print_top_words(low_lda, low_vectorizer.get_feature_names(), n_top_words=10)  


High-rate topics:
Topic #0: food place great service good time restaurant just order mexican
Topic #1: tacos taco great place good delicious fresh love food definitely
Topic #2: tacos good food mexican salsa taco chicken chips like place

Low-rate topics:
Topic #0: food order time service just minutes like ordered asked got
Topic #1: food location chipotle place great just love ive staff service
Topic #2: food good mexican place tacos great taco salsa like service



In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

da['compound'] = [analyzer.polarity_scores(text)['compound'] for text in da['cleaned_text']]

high_topic_results = high_lda.transform(high_dtm)  
low_topic_results = low_lda.transform(low_dtm)   


high_rate = da[da['stars_y'] >= 4].copy()  
low_rate = da[da['stars_y'] <= 3].copy() 

high_rate['high_topic'] = high_lda.transform(high_dtm).argmax(axis=1)
low_rate['low_topic'] = low_lda.transform(low_dtm).argmax(axis=1)

da.loc[high_rate.index, 'high_topic'] = high_rate['high_topic']
da.loc[low_rate.index, 'low_topic'] = low_rate['low_topic']

high_topic_sentiment = da.loc[da['stars_y'] >= 4].groupby('high_topic')['compound'].mean()
low_topic_sentiment = da.loc[da['stars_y'] <= 3].groupby('low_topic')['compound'].mean()

print("sentiment score for high rate:\n", high_topic_sentiment)
print("sentiment score for low rate:\n", low_topic_sentiment)


sentiment score for high rate:
 high_topic
0.0    0.697675
1.0    0.880435
2.0    0.745810
Name: compound, dtype: float64
sentiment score for low rate:
 low_topic
0.0   -0.143757
1.0    0.514104
2.0    0.662211
Name: compound, dtype: float64
