# In [None]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Load the comment data set from a local CSV file.
# NOTE(review): the path is absolute and machine-specific — confirm it before
# running this anywhere else.
csv_path = '/Users/sherryxue/Desktop/Xu.csv'
data = pd.read_csv(csv_path)

# Simple sentiment analysis
def analyze_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Any value that is not a ``str`` (for example a missing cell) is scored as
    a neutral ``0.0`` and is never handed to TextBlob.

    NOTE(review): the key phrases used elsewhere in this file are Chinese,
    while TextBlob's default analyzer is presumably English-oriented —
    confirm it suits the data.
    """
    if not isinstance(text, str):
        return 0.0
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every comment and store the result in a new 'sentiment' column.
comment_sentiments = data['Comment'].apply(analyze_sentiment)
data['sentiment'] = comment_sentiments

# Key-phrase analysis
def contains_key_phrases(text, phrases):
    """Report whether ``text`` contains at least one entry of ``phrases``.

    Matching is plain substring containment (``phrase in text``). A ``text``
    that is not a ``str`` always yields ``False``, and ``phrases`` is not
    examined in that case.
    """
    return isinstance(text, str) and any(phrase in text for phrase in phrases)

# Marker phrases: the first list flags a comment as "rational", the second
# as "civil".
key_phrases_rationality = ['因为', '证据', '研究表明']
key_phrases_civility = ['谢谢', '请', '欣赏']

# Boolean columns: True when a comment contains any phrase from the matching
# list. The phrase list is forwarded to contains_key_phrases via ``args``.
comments = data['Comment']
data['rational'] = comments.apply(contains_key_phrases, args=(key_phrases_rationality,))
data['civil'] = comments.apply(contains_key_phrases, args=(key_phrases_civility,))

# Topic modelling
# Build a bag-of-words document-term matrix from the comments. Missing
# comments are replaced by empty documents first: a bare ``astype(str)`` would
# turn NaN into the literal text "nan", which could then enter the vocabulary
# and show up in the topics.
# NOTE(review): stop_words='english' and the default tokenizer are kept as-is;
# the key phrases in this file are Chinese, so confirm that this tokenization
# is appropriate for the comment language.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(data['Comment'].fillna('').astype(str))
# Fit a 5-topic LDA model. The seed is fixed so the topics are reproducible
# from run to run.
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(dtm)

# Visualise the topics
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its heaviest words.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            row of per-feature weights per topic.
        feature_names: Indexable collection mapping a feature index to the
            label drawn on the bar chart.
        n_top_words: Number of highest-weighted features plotted per topic.
        title: Figure-level title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # One panel per topic instead of a fixed five, so models with any number
    # of topics plot without an IndexError or blank panels. At least one
    # panel is requested because plt.subplots rejects zero columns.
    n_panels = max(len(model.components_), 1)
    # 6 inches of width per panel (30x15 for five topics). squeeze=False keeps
    # ``axes`` a 2-D array even when there is only a single panel.
    fig, axes = plt.subplots(
        1, n_panels, figsize=(6 * n_panels, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Reverse slice of the ascending argsort: indices of the n_top_words
        # largest weights, largest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}', fontdict={'fontsize': 30})
        # Put the heaviest word at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    # The title belongs to the whole figure, so it is set once, after the loop.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

# Plot the ten highest-weighted words of every topic in the fitted LDA model.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
plot_top_words(lda, feature_names, n_top_words, title='Topics in LDA model')

# Visualise the sentiment analysis results as a 50-bin histogram of the
# per-comment scores stored in the 'sentiment' column.
sentiment_fig, sentiment_ax = plt.subplots(figsize=(10, 6))
sentiment_ax.hist(data['sentiment'], bins=50, color='blue', edgecolor='black')
sentiment_ax.set_title('Sentiment Analysis Distribution')
sentiment_ax.set_xlabel('Sentiment Score')
sentiment_ax.set_ylabel('Frequency')
plt.show()