In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Load the review table that feeds the topic models below.
da = pd.read_csv("data_for_lda.csv")

# Partition the reviews on `stars_y`: 4 and above versus 3 and below.
# NOTE(review): a rating strictly between 3 and 4 (and any missing rating)
# lands in neither subset -- confirm `stars_y` only holds whole stars.
is_high = da['stars_y'] >= 4
is_low = da['stars_y'] <= 3
high_rating = da[is_high]
low_rating = da[is_low]

def create_dtm(dataframe, text_column='cleaned_text'):
    """Build a bag-of-words document-term matrix from one text column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding one document per row.
    text_column : str, default 'cleaned_text'
        Name of the column whose text is vectorized.

    Returns
    -------
    dtm : scipy sparse matrix
        Term counts, one row per row of ``dataframe`` in the same order.
    vectorizer : CountVectorizer
        The fitted vectorizer, needed afterwards to recover the vocabulary.

    Notes
    -----
    Missing texts (for example an empty string that came back as NaN after a
    CSV round-trip) are replaced by ``''`` instead of being dropped, so the
    matrix rows stay aligned with ``dataframe`` and per-row results can be
    written back by position.
    """
    # max_df=0.95 ignores terms found in more than 95% of the documents;
    # min_df=2 ignores terms found in fewer than two documents.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    documents = dataframe[text_column].fillna('')
    dtm = vectorizer.fit_transform(documents)
    return dtm, vectorizer


def train_lda(dtm, n_topics=3, random_state=0):
    """Fit a Latent Dirichlet Allocation model on a document-term matrix.

    Parameters
    ----------
    dtm : array-like or sparse matrix
        Document-term counts, one row per document.
    n_topics : int, default 3
        Number of topics (passed as ``n_components``).
    random_state : int or None, default 0
        Seed forwarded to the estimator. The default keeps the previously
        hard-coded seed, so existing calls reproduce the same topics.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.
    """
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(dtm)
    return lda

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-term weights.
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``components_``.
    n_top_words : int
        How many words to list per topic.

    One line ``Topic #<k>: w1 w2 ...`` is printed per topic, heaviest word
    first, followed by a single blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; reverse it so the heaviest terms come first.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = " ".join(feature_names[i] for i in ranked)
        print("Topic #%d: %s" % (topic_idx, top_words))
    print()

def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` across scikit-learn versions.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out()``; prefer the new method and
    fall back to the old one only when the new one is unavailable.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Fit one topic model per rating group and list each topic's top words.
high_dtm, high_vectorizer = create_dtm(high_rating)
high_lda = train_lda(high_dtm, n_topics=3)
print("High-rate topics:")
print_top_words(high_lda, _feature_names(high_vectorizer), n_top_words=10)


low_dtm, low_vectorizer = create_dtm(low_rating)
low_lda = train_lda(low_dtm, n_topics=3)
print("Low-rate topics:")
print_top_words(low_lda, _feature_names(low_vectorizer), n_top_words=10)

High-rate topics:
Topic #0: food place great service good time restaurant just order mexican
Topic #1: tacos taco great place good delicious fresh love food definitely
Topic #2: tacos good food mexican salsa taco chicken chips like place

Low-rate topics:
Topic #0: food order time service just minutes like ordered asked got
Topic #1: food location chipotle place great just love ive staff service
Topic #2: food good mexican place tacos great taco salsa like service



In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# VADER compound score for every review. Rows whose text is missing keep a
# NaN score (na_action='ignore') instead of crashing the analyzer, and are
# therefore skipped by the group means below.
# NOTE(review): VADER relies on punctuation and capitalisation cues; if
# `cleaned_text` has been lower-cased / stripped, scoring the raw review text
# may be more faithful -- confirm.
da['compound'] = da['cleaned_text'].map(
    lambda text: analyzer.polarity_scores(text)['compound'],
    na_action='ignore',
)

# Document-topic distributions, computed once per model and reused below
# (transform is the expensive step).
high_topic_results = high_lda.transform(high_dtm)
low_topic_results = low_lda.transform(low_dtm)

# Same filters that produced high_dtm / low_dtm, so row i of each topic
# matrix corresponds to row i of the matching subset.
high_rate = da[da['stars_y'] >= 4].copy()
low_rate = da[da['stars_y'] <= 3].copy()

# Fail with a clear message if the matrices came from a different `da`
# (e.g. stale kernel state) rather than mis-assigning topics.
assert high_topic_results.shape[0] == len(high_rate), "high_dtm is out of sync with da"
assert low_topic_results.shape[0] == len(low_rate), "low_dtm is out of sync with da"

# Dominant topic per review = index of the largest topic weight.
high_rate['high_topic'] = high_topic_results.argmax(axis=1)
low_rate['low_topic'] = low_topic_results.argmax(axis=1)

# Write the labels back onto the full table; rows outside a group stay NaN
# in that group's topic column.
da.loc[high_rate.index, 'high_topic'] = high_rate['high_topic']
da.loc[low_rate.index, 'low_topic'] = low_rate['low_topic']

# Mean compound sentiment per dominant topic, within each rating group.
high_topic_sentiment = da.loc[da['stars_y'] >= 4].groupby('high_topic')['compound'].mean()
low_topic_sentiment = da.loc[da['stars_y'] <= 3].groupby('low_topic')['compound'].mean()

print("sentiment score for high rate:\n", high_topic_sentiment)
print("sentiment score for low rate:\n", low_topic_sentiment)


sentiment score for high rate:
 high_topic
0.0    0.697675
1.0    0.880435
2.0    0.745810
Name: compound, dtype: float64
sentiment score for low rate:
 low_topic
0.0   -0.143757
1.0    0.514104
2.0    0.662211
Name: compound, dtype: float64
