In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# Location of the hotel review CSV. Set the HOTEL_DATA_PATH environment
# variable to point at another copy of the file; when it is unset, the
# original hard-coded location is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "HOTEL_DATA_PATH",
    "/Users/hyunwoo/Desktop/PADA_hotel_data/hotel_final.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:


# Assumes df['Review_Text'] holds the raw review text, one review per row.

# Tunable settings for the bag-of-words vocabulary and the topic model.
MAX_VOCAB_SIZE = 1000  # keep only the most frequent terms; adjust as necessary
N_TOPICS = 10  # number of latent topics
LDA_MAX_ITER = 20  # maximum number of passes over the corpus
LDA_LEARNING_OFFSET = 50.0  # down-weights early iterations in online learning
RANDOM_SEED = 42  # for reproducibility

# Step 1: Preprocess the reviews (assuming 'Review_Text' column).
# CountVectorizer rejects NaN documents, so missing reviews are replaced with
# empty strings (rows stay aligned with df) and every value is coerced to str.
review_texts = df['Review_Text'].fillna('').astype(str)
vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE)
count_matrix = vectorizer.fit_transform(review_texts)  # document-term count matrix

# Step 2: Fit the LDA model
lda_model = LDA(
    n_components=N_TOPICS,
    max_iter=LDA_MAX_ITER,
    learning_method='online',  # alternative: 'batch'
    learning_offset=LDA_LEARNING_OFFSET,
    random_state=RANDOM_SEED,
)
lda_model.fit(count_matrix)

# Step 3: Compute topic proportions for each sentence
def get_sentence_topic_proportions(text, model, vectorizer):
    """Infer the topic proportions of a single piece of text.

    Parameters
    ----------
    text : str
        The sentence (or other short text) to score.
    model : object
        Fitted topic model; its ``transform`` method is applied to the
        vectorized text.
    vectorizer : object
        Fitted vectorizer; its ``transform`` method receives a one-item
        list containing ``text``.

    Returns
    -------
    The first (and only) row of ``model.transform``'s output, i.e. the
    topic proportions for ``text``.
    """
    # The vectorizer works on batches of documents, so wrap the text in a list.
    single_doc_counts = vectorizer.transform([text])
    return model.transform(single_doc_counts)[0]

# Step 4: Aggregate sentence-level topic proportions to review-level topic proportions
def aggregate_to_review_level(review_text, model, vectorizer):
    """Average sentence-level topic proportions into one review-level vector.

    The review is split on '.', each non-blank fragment is treated as a
    sentence, and the sentences' topic proportions are averaged.

    Parameters
    ----------
    review_text : str
        Full review text. Non-string values (e.g. None or NaN from a missing
        DataFrame cell) are treated as an empty review.
    model : object
        Fitted topic model exposing ``transform``.
    vectorizer : object
        Fitted vectorizer exposing ``transform``.

    Returns
    -------
    numpy.ndarray
        Mean of the per-sentence topic proportions (one value per topic).
    """
    # Missing reviews have no ``split``; handle them as empty text.
    if not isinstance(review_text, str):
        review_text = ''

    # Basic sentence split. Blank fragments (such as the one produced after a
    # trailing '.') are dropped so they do not dilute the average.
    sentences = [part for part in review_text.split('.') if part.strip()]
    if not sentences:
        # Nothing to score: use the whole (blank) text as a single document so
        # the result still has one entry per topic.
        sentences = [review_text]

    # Vectorize and score every sentence in one batch rather than one call each.
    sentence_matrix = vectorizer.transform(sentences)
    sentence_proportions = model.transform(sentence_matrix)

    # Calculate the average topic proportions for the entire review
    return np.mean(sentence_proportions, axis=0)

# Step 5: Calculate Shannon entropy for each review
def calculate_entropy(topic_proportions):
    """Return the Shannon entropy, in bits, of a topic distribution.

    Parameters
    ----------
    topic_proportions : array-like
        Topic weights for one review; passed unchanged to
        ``scipy.stats.entropy``.

    Returns
    -------
    float
        Entropy computed with logarithm base 2.
    """
    shannon_bits = entropy(topic_proportions, base=2)
    return shannon_bits

# Step 6: Compute ContentDepth for each review
def compute_content_depth(review_text, model, vectorizer):
    """Compute the ContentDepth score of one review.

    ContentDepth is the negative Shannon entropy of the review-level topic
    proportions, divided by the number of sentences in the review.

    Parameters
    ----------
    review_text : str
        Full review text. Non-string values (e.g. None or NaN from a missing
        DataFrame cell) are treated as an empty review.
    model : object
        Fitted topic model, forwarded to ``aggregate_to_review_level``.
    vectorizer : object
        Fitted vectorizer, forwarded to ``aggregate_to_review_level``.

    Returns
    -------
    float
        ``-entropy / num_sentences``.
    """
    # Missing reviews have no ``split``; handle them as empty text.
    if not isinstance(review_text, str):
        review_text = ''

    # Aggregate topic proportions to review-level
    avg_topic_proportions = aggregate_to_review_level(review_text, model, vectorizer)

    # Calculate entropy for the review
    entropy_value = calculate_entropy(avg_topic_proportions)

    # Count only non-blank fragments: splitting 'Nice stay.' on '.' yields a
    # trailing empty string that must not be counted as a second sentence.
    num_sentences = sum(1 for part in review_text.split('.') if part.strip())
    # Guard against division by zero for blank reviews.
    num_sentences = max(num_sentences, 1)

    # Calculate ContentDepth as the negative value of the normalized entropy
    return -entropy_value / num_sentences

# Step 7: Apply ContentDepth calculation to each review in the DataFrame
df['ContentDepth'] = df['Review_Text'].apply(
    compute_content_depth, args=(lda_model, vectorizer)
)

# Optional: Also calculate ContentBreadth (for example, inverse of ContentDepth, or other measures)
# df['ContentBreadth'] = -df['ContentDepth']  # Placeholder if you define breadth differently

# View the results. ContentBreadth is included only when the column exists:
# its assignment above is commented out, and selecting a missing column
# raises KeyError.
result_columns = [
    col
    for col in ('Review_Text', 'ContentDepth', 'ContentBreadth')
    if col in df.columns
]
print(df[result_columns].head())
