In [15]:
import pandas as pd
from flair.models import TextClassifier
from flair.data import Sentence


# Load the raw reviews and keep only rows that carry a usable comment:
# rows whose 'comments' value is NaN are dropped first, then rows whose
# comment is an empty string once surrounding whitespace is stripped.
df = (
    pd.read_csv('./Data/reviews_original.csv')
    .dropna(subset=['comments'])
    .loc[lambda frame: frame['comments'].str.strip().ne('')]
)


  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Count how many distinct listing_id values a frame contains.
def unique_num_listingid(df):
    """Print the number of distinct values in the 'listing_id' column of ``df``.

    The count is only printed; nothing is returned.
    """
    listing_ids = df['listing_id']
    n_distinct = listing_ids.nunique()
    print(f'Number of unique listing IDs: {n_distinct}')

# Report the distinct listing_id count for the loaded reviews frame.
unique_num_listingid(df)

Number of unique listing IDs: 39527


In [2]:
# Show the (rows, columns) dimensions of df.
print(df.shape)

(1050255, 6)


In [27]:
import pandas as pd
from flair.models import TextClassifier
from flair.data import Sentence
import math

# Load the 'en-sentiment' sentiment-analysis model through flair's TextClassifier.
classifier = TextClassifier.load('en-sentiment')

# Sentiment scoring helper: one text in, one signed confidence score out.
def analyze_sentiment(text):
    """Return a signed sentiment score for ``text``.

    The text is wrapped in a flair ``Sentence`` and passed to the module-level
    ``classifier``. The first predicted label decides the result.

    Args:
        text: The comment text to score.

    Returns:
        ``score`` when the first label's value is ``'POSITIVE'``, ``-score``
        for any other label value, and ``nan`` when the classifier attached
        no label at all.
    """
    sentence = Sentence(text)
    classifier.predict(sentence)

    labels = sentence.labels
    if not labels:
        # No prediction was attached (e.g. the text produced no tokens).
        # Returning NaN instead of raising IndexError keeps a long batch run
        # alive, and NaN is skipped by pandas mean aggregations.
        return float('nan')

    top_label = labels[0]
    return top_label.score if top_label.value == 'POSITIVE' else -top_label.score

# Batch driver: score comments chunk by chunk and checkpoint each chunk to CSV.
def process_in_batches(df, batch_size):
    """Score every comment in ``df`` batch by batch, saving each batch to CSV.

    Rows are taken positionally in consecutive slices of ``batch_size``. Each
    slice is copied, given a ``sentiment_score`` column computed with
    ``analyze_sentiment``, and written to ``batch_<n>_results.csv`` (1-based
    ``n``) in the current working directory, so finished batches survive an
    interrupted run. ``df`` itself is not modified.

    Args:
        df: DataFrame with a ``comments`` column. Any index is accepted.
        batch_size: Number of rows per batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(df)
    # Round up so a trailing partial batch is still processed.
    num_batches = math.ceil(total_rows / batch_size)

    for batch_number in range(num_batches):
        start_index = batch_number * batch_size
        # Clamp so the final batch never runs past the end of df.
        end_index = min(start_index + batch_size, total_rows)
        batch_df = df.iloc[start_index:end_index].copy()
        print(f"Starting batch {batch_number+1}/{num_batches}...")

        # Collect scores in row order and assign them as one column. Writing
        # by index label is only correct for a fresh 0..n-1 index; on any
        # other index it appends new rows and leaves the real rows empty.
        scores = []
        for processed, text in enumerate(batch_df['comments'], start=1):
            scores.append(analyze_sentiment(text))
            # Report progress every 100 comments.
            if processed % 100 == 0:
                print(f"Batch {batch_number+1}, Processed {processed}/{len(batch_df)} items.")
        batch_df['sentiment_score'] = scores

        # Checkpoint this batch so progress is not lost if a later one fails.
        batch_df.to_csv(f'batch_{batch_number+1}_results.csv', index=False)

        print(f"Finished batch {batch_number+1}/{num_batches}")

# Example run: reload the reviews, drop rows whose comment is NaN or blank
# after stripping whitespace, and renumber the index from 0.
# NOTE(review): this reads 'reviews_original.csv' from the working directory,
# while an earlier cell reads './Data/reviews_original.csv' — confirm the path.
df = (
    pd.read_csv('reviews_original.csv')
    .dropna(subset=['comments'])
    .loc[lambda frame: frame['comments'].str.strip().ne('')]
    .reset_index(drop=True)
)

# Score the comments in batches of 100,000 rows.
process_in_batches(df, batch_size=100000)


Starting batch 1/11...
Batch 1, Processed 100/100000 items.
Batch 1, Processed 200/100000 items.
Batch 1, Processed 300/100000 items.
Batch 1, Processed 400/100000 items.
Batch 1, Processed 500/100000 items.
Batch 1, Processed 600/100000 items.
Batch 1, Processed 700/100000 items.
Batch 1, Processed 800/100000 items.
Batch 1, Processed 900/100000 items.
Batch 1, Processed 1000/100000 items.
Batch 1, Processed 1100/100000 items.
Batch 1, Processed 1200/100000 items.
Batch 1, Processed 1300/100000 items.
Batch 1, Processed 1400/100000 items.
Batch 1, Processed 1500/100000 items.
Batch 1, Processed 1600/100000 items.
Batch 1, Processed 1700/100000 items.
Batch 1, Processed 1800/100000 items.
Batch 1, Processed 1900/100000 items.
Batch 1, Processed 2000/100000 items.
Batch 1, Processed 2100/100000 items.
Batch 1, Processed 2200/100000 items.
Batch 1, Processed 2300/100000 items.
Batch 1, Processed 2400/100000 items.
Batch 1, Processed 2500/100000 items.
Batch 1, Processed 2600/100000 items

In [9]:
import pandas as pd
import os

# Directory holding the per-batch result files. Forward slashes are used
# throughout: the previous backslash formed the invalid escape '\s' and only
# acted as a path separator on Windows.
directory_path = './Data/sentiment_processed_data/'

# Names of the per-batch result files (batches 1 through 11).
batch_file_names = [f'batch_{i}_results.csv' for i in range(1, 12)]

# Read every batch file that exists, then concatenate once; calling
# pd.concat inside the loop re-copies all earlier rows on every iteration.
batch_frames = []
for file_name in batch_file_names:
    file_path = os.path.join(directory_path, file_name)
    if os.path.exists(file_path):
        batch_frames.append(pd.read_csv(file_path))
    else:
        print(f"File {file_name} not found.")

all_data = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

# When anything was loaded, average the sentiment score per listing_id and
# print the listings ordered from highest to lowest mean score.
if not all_data.empty:
    average_scores = all_data.groupby('listing_id')['sentiment_score'].mean()
    average_scores_sorted = average_scores.sort_values(ascending=False)
    print(average_scores_sorted)
else:
    print("No data has been loaded.")


listing_id
20957905    0.999985
7581690     0.999984
27705424    0.999983
24423321    0.999982
21026999    0.999981
              ...   
13485043   -0.999998
28186884   -0.999998
22101392   -0.999999
21749674   -0.999999
9511685    -0.999999
Name: sentiment_score, Length: 39527, dtype: float64


In [2]:
# Number of rows in all_data.
len(all_data)

1050255

In [12]:
# Mean sentiment score per listing, keyed by listing_id.
average_scores = (
    all_data
    .groupby('listing_id')['sentiment_score']
    .mean()
)

# Preview the first ten entries.
print(average_scores.iloc[:10])


listing_id
2515    0.841051
2539    0.964923
2595    0.880561
3330    0.878424
3831    0.915440
5022    0.992777
5099    0.722448
5121    0.919812
5172    0.868364
5178    0.774277
Name: sentiment_score, dtype: float64


In [21]:
# Column header used for the saved values.
# NOTE(review): average_scores holds per-listing mean sentiment scores, yet the
# column is labelled 'comments' and the file is named reviews_cleaned.csv —
# confirm this naming is intended.
new_column_names = ['comments']
# Write average_scores to a CSV file using that header.
average_scores.to_csv('./Data/sentiment_processed_data/reviews_cleaned.csv',header=new_column_names)

# To pass further options, such as an encoding:
# df.to_csv('analyzed_comments_scores.csv', index=False, encoding='utf-8-sig')


In [10]:
# Batch-count check: 1050000 / 100000 rounded up to a whole number of batches.
# NOTE(review): presumably approximates the review count and batch size used
# for the sentiment run — confirm. Relies on `math` imported in another cell.
num_batches = math.ceil(1050000 / 100000)
num_batches

11

In [3]:
# NOTE(review): scratch arithmetic; nothing in this notebook explains what
# these constants represent — document them or remove the cell.
0.8*0.68-0.02-0.02-0.1

0.404