In [11]:
import pandas as pd
import os
import json

# Directory that holds the per-snapshot Google review CSV files.
# Raw string: the Windows path contains backslash sequences (\c, \d, \e, \o)
# that are not valid escapes and raise a SyntaxWarning on Python 3.12+.
directory = r'D:\chris_exp\dataCollection\exp2\output'
# Maps each file's date prefix -> {hotel name: mean rating of its five most recent reviews}.
all_files_results = {}

# Walk every file in the directory; sorted() makes the processing order deterministic.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('_google_reviews.csv'):
        file_path = os.path.join(directory, filename)

        # The text before the first underscore is used as the key,
        # e.g. "2023-09-28_google_reviews.csv" -> "2023-09-28".
        date_key = filename.split('_')[0]

        # Load the snapshot.
        df = pd.read_csv(file_path, encoding='utf-8')

        # The rating column is named either 'review_rate' or 'overall_rate';
        # files with neither are reported and skipped.
        if 'review_rate' in df.columns:
            rate_column = 'review_rate'
        elif 'overall_rate' in df.columns:
            rate_column = 'overall_rate'
        else:
            print(f"Error: review_rate or overall_rate column not found in {filename}")
            continue

        # Parse review_date as datetimes so the sort below is chronological.
        df['review_date'] = pd.to_datetime(df['review_date'])

        # Group rows by hotel, newest review first within each hotel.
        df_sorted = df.sort_values(by=['review_hotel', 'review_date'], ascending=[True, False])

        # Keep the first five (most recent) reviews per hotel, then average
        # their ratings, rounded to one decimal place.
        top5_average_rate = (
            df_sorted.groupby('review_hotel')
            .head(5)
            .groupby('review_hotel')[rate_column]
            .mean()
            .round(1)
        )

        # Store this file's per-hotel averages under its date key.
        all_files_results[date_key] = top5_average_rate.to_dict()

        # Report how many hotels this file contributed.
        hotels_count = len(top5_average_rate)
        print(f"File: {filename} contains {hotels_count} hotels' data.")


File: 2023-09-28_google_reviews.csv contains 22 hotels' data.
File: 2023-10-02_google_reviews.csv contains 23 hotels' data.
File: 2023-10-09_google_reviews.csv contains 23 hotels' data.
File: 2023-10-16_google_reviews.csv contains 23 hotels' data.
File: 2023-10-24_google_reviews.csv contains 23 hotels' data.
File: 2023-10-30_google_reviews.csv contains 23 hotels' data.
File: 2023-11-01_google_reviews.csv contains 23 hotels' data.
File: 2023-11-06_google_reviews.csv contains 23 hotels' data.
File: 2023-11-13_google_reviews.csv contains 23 hotels' data.
File: 2023-11-20_google_reviews.csv contains 23 hotels' data.
File: 2023-11-27_google_reviews.csv contains 23 hotels' data.
File: 2023-12-04_google_reviews.csv contains 23 hotels' data.
File: 2023-12-12_google_reviews.csv contains 23 hotels' data.
File: 2023-12-13_google_reviews.csv contains 23 hotels' data.
File: 2023-12-25_google_reviews.csv contains 23 hotels' data.
File: 2024-01-01_google_reviews.csv contains 22 hotels' data.
File: 20

In [12]:
# Persist the per-file, per-hotel averages as a JSON file.
# Raw string: the backslash sequences in this Windows path are not valid
# escapes and raise a SyntaxWarning on Python 3.12+; the resulting path text
# is unchanged.
json_file_path = r'D:\chris_exp\dataCollection\exp2\output/expectation_result.json'
# ensure_ascii=False keeps non-ASCII hotel names readable in the file.
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(all_files_results, json_file, ensure_ascii=False, indent=4)


In [43]:
import json

# Load the saved expectation results.
# Raw string: the backslash sequences in this Windows path (including \e)
# are not valid escapes and raise a SyntaxWarning on Python 3.12+.
json_file_path = r'D:\chris_exp\dataCollection\exp2\output\expectation_result.json'
with open(json_file_path, 'r', encoding='utf-8') as file:
    expectations = json.load(file)

# Select the per-hotel expectation values stored under the 2023-12-25 key.
# NOTE(review): presumably this is the snapshot preceding the file annotated
# in the next cell - confirm the intended date.
expectation_data = expectations['2023-12-25']


In [44]:
import pandas as pd
import os

# Load the review snapshot that will be annotated.
filename = '2024-01-01_google_reviews.csv'
directory = r'D:\chris_exp\dataCollection\exp2\output'  # raw string keeps the backslashes literal
file_path = os.path.join(directory, filename)
df = pd.read_csv(file_path, encoding='utf-8')

# Add an "expectation" column by looking up each row's hotel name in
# expectation_data (defined in an earlier cell). Hotels with no entry get a
# missing value (NaN), which to_csv writes as an empty field.
df['expectation'] = df['review_hotel'].map(expectation_data)

# Write to a separate folder and file name so the original CSV is not overwritten.
# Path components are joined with os.path.join instead of concatenating
# "\withExpectation", whose "\w" is an invalid escape sequence.
output_dir = os.path.join(directory, 'withExpectation')
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
new_file_path = os.path.join(output_dir, f'updated_{filename}')
df.to_csv(new_file_path, index=False, encoding='utf-8')
