In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy.stats import entropy

# Hard-coded absolute location of the hotel review CSV.
# NOTE(review): consider a configurable data directory so the notebook runs elsewhere.
HOTEL_REVIEWS_CSV = "/Users/hyunwoo/Desktop/PADA_hotel_data/hotel_final.csv"
df = pd.read_csv(HOTEL_REVIEWS_CSV)

In [3]:
import numpy as np
import pandas as pd
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from scipy.stats import entropy

# Assumes the DataFrame `df` already contains the review text
# df = pd.DataFrame({'Review_Text': [...]})

# Step 1: Preprocess the reviews
def preprocess_reviews(df, text_column):
    """Run whitespace preprocessing over one text column of a DataFrame.

    Parameters
    ----------
    df : mapping-like (e.g. pandas.DataFrame)
        Object whose ``df[text_column]`` yields the review strings.
    text_column : str
        Name of the column holding the raw review text.

    Returns
    -------
    tuple
        ``(preprocessed_documents, unprocessed_corpus, vocab)`` exactly as
        produced by ``WhiteSpacePreprocessing.preprocess()``.
    """
    # The stop-word language is passed positionally: WhiteSpacePreprocessing
    # names this parameter `stopwords_language`, so the previous
    # `stopwords_list="english"` keyword is rejected with a TypeError.
    preprocessor = WhiteSpacePreprocessing(list(df[text_column]), "english")

    # Recent library releases append a fourth element (retained indices) to
    # the result; slicing keeps the unpacking valid for 3- and 4-tuples.
    preprocessed_documents, unprocessed_corpus, vocab = preprocessor.preprocess()[:3]
    return preprocessed_documents, unprocessed_corpus, vocab

preprocessed_docs, unprocessed_corpus, vocab = preprocess_reviews(df, 'Review_Text')

# Step 2: Train the CTM model
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

# Sentence-transformer used for the contextual embeddings (multilingual).
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Width of the vectors that model emits. This MiniLM checkpoint produces
# 384-dimensional embeddings (768 belongs to BERT-base-sized models);
# CombinedTM's contextual_size has to match or fitting fails on a shape mismatch.
CONTEXTUAL_SIZE = 384
N_TOPICS = 10    # Number of topics
NUM_EPOCHS = 20  # Training epochs

tp = TopicModelDataPreparation(EMBEDDING_MODEL_NAME)
training_dataset = tp.fit(text_for_contextual=unprocessed_corpus, text_for_bow=preprocessed_docs)

ctm_model = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=CONTEXTUAL_SIZE,
    n_components=N_TOPICS,
    num_epochs=NUM_EPOCHS,
)
ctm_model.fit(training_dataset)

# Step 3: Compute topic proportions for each sentence
def get_sentence_topic_proportions_ctm(text, model, tp):
    """Return the topic proportions the trained model assigns to one sentence.

    Parameters
    ----------
    text : str
        A single sentence.
    model : object
        Trained topic model exposing ``get_thetas(dataset)``.
    tp : object
        Fitted data-preparation object exposing
        ``transform(text_for_contextual=..., text_for_bow=...)``.

    Returns
    -------
    The first row of ``model.get_thetas(...)``, i.e. the topic proportions
    of ``text``.
    """
    try:
        # Stop-word language passed positionally: the class has no
        # `stopwords_list` keyword. Slice-free indexing tolerates both the
        # 3-tuple and the newer 4-tuple return value.
        outputs = WhiteSpacePreprocessing([text], "english").preprocess()
        bow_texts, contextual_texts = outputs[0], outputs[1]
    except ValueError:
        # NOTE(review): the underlying vectorizer is expected to raise
        # ValueError when no token survives (e.g. a sentence made only of
        # stop words) - confirm against the installed library version.
        bow_texts, contextual_texts = [], []

    if not bow_texts:
        # Preprocessing discarded the sentence entirely. Fall back to an empty
        # bag-of-words plus the raw sentence for the contextual embedding so
        # that indexing the result below cannot fail on an empty dataset.
        bow_texts, contextual_texts = [""], [text]

    # Transform text into the model's required format
    sentence_data = tp.transform(text_for_contextual=contextual_texts, text_for_bow=bow_texts)
    topic_proportions = model.get_thetas(sentence_data)
    return topic_proportions[0]  # Only one sentence was submitted

# Step 4: Aggregate sentence-level topic proportions to review-level topic proportions
def aggregate_to_review_level_ctm(review_text, model, tp):
    """Average the per-sentence topic proportions of one review.

    The review is split on '.' (a basic sentence split); blank fragments are
    ignored and every remaining fragment is scored with
    ``get_sentence_topic_proportions_ctm``.

    Parameters
    ----------
    review_text : str
        Full review text. Non-string values (e.g. the float NaN pandas uses
        for a missing cell) are treated as a review without sentences.
    model : object
        Trained topic model; ``model.n_components`` sizes the zero fallback.
    tp : object
        Fitted data-preparation object forwarded to the sentence scorer.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the sentence-level proportions, or a zero vector
        of length ``model.n_components`` when there is nothing to score.
    """
    # A missing review has no .split(); treat it like an empty one.
    if not isinstance(review_text, str):
        return np.zeros(model.n_components)

    sentence_proportions = [
        get_sentence_topic_proportions_ctm(sentence, model, tp)
        for sentence in review_text.split('.')  # Basic sentence split
        if sentence.strip()  # Skip empty sentences
    ]

    if not sentence_proportions:
        return np.zeros(model.n_components)

    # Calculate the average topic proportions for the entire review
    return np.mean(sentence_proportions, axis=0)

# Step 5: Calculate Shannon entropy for each review
def calculate_entropy(topic_proportions):
    """Return the Shannon entropy, in bits, of a topic-proportion vector.

    The computation is delegated to ``scipy.stats.entropy`` with ``base=2``.
    """
    shannon_bits = entropy(pk=topic_proportions, base=2)
    return shannon_bits

# Step 6: Compute ContentDepth for each review
def compute_content_depth_ctm(review_text, model, tp):
    """Compute the ContentDepth score of one review.

    ContentDepth is the negated Shannon entropy of the review-level topic
    proportions, divided by the number of non-blank '.'-separated sentences.

    Parameters
    ----------
    review_text : str
        Full review text. Non-string values (e.g. NaN for a missing review)
        are scored as 0.
    model : object
        Trained topic model, forwarded to ``aggregate_to_review_level_ctm``.
    tp : object
        Fitted data-preparation object, forwarded likewise.

    Returns
    -------
    ``-entropy / num_sentences``, or ``0`` when the review has no sentences.
    """
    # A missing review has no .split(); it also has no sentences to score.
    if not isinstance(review_text, str):
        return 0

    # Count sentences first: a review without any would otherwise run the
    # entropy over an all-zero vector (NaN plus a runtime warning) only for
    # the result to be thrown away.
    num_sentences = sum(1 for fragment in review_text.split('.') if fragment.strip())
    if num_sentences == 0:
        return 0

    # Aggregate topic proportions to review-level
    avg_topic_proportions = aggregate_to_review_level_ctm(review_text, model, tp)

    # Calculate entropy for the review
    entropy_value = calculate_entropy(avg_topic_proportions)

    # ContentDepth is the negative entropy normalized by the sentence count
    return -entropy_value / num_sentences

# Step 7: Apply ContentDepth calculation to each review in the DataFrame
# Series.apply forwards the extra positional arguments (model, data preparation)
# to the scoring function after each review text.
df['ContentDepth'] = df['Review_Text'].apply(compute_content_depth_ctm, args=(ctm_model, tp))

# Optional: Define ContentBreadth if necessary
# df['ContentBreadth'] = -df['ContentDepth']  # Example: Placeholder logic

# View the results
print(df.loc[:, ['Review_Text', 'ContentDepth']].head())


ModuleNotFoundError: No module named 'contextualized_topic_models'

In [None]:
# NOTE(review): bare name evaluated as an expression. The visible cells only use
# `from contextualized_topic_models... import ...`, which does not bind this name,
# so running this cell raises NameError. Presumably a leftover from trying to
# install or inspect the missing package - confirm and remove, or replace with a
# pinned `%pip install contextualized-topic-models==<version>` in a top cell.
contextualized_topic_models