# 1 原始数据读取和处理

In [None]:
import json
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# 1. Load the eight Parquet shards into one frame, keep only rows that have a
#    purchase_history, and drop exact duplicate rows (first occurrence wins).
paths_10G = [f"../data/10G_data_new/part-{index:05d}.parquet" for index in range(8)]
df = (
    pd.concat([pd.read_parquet(path) for path in paths_10G], ignore_index=True)
    .dropna(subset=['purchase_history'])
    .drop_duplicates(keep='first')
    .reset_index(drop=True)  # renumber rows 0..n-1 after the two filters
)

In [None]:
# Print pandas' summary of the cleaned frame.
df.info()

In [None]:
# Show the purchase_history value of one sample row, selected by position.
# NOTE(review): the position 5624999 is hard-coded; iloc raises IndexError if
# the frame has fewer rows than that.
print(df.iloc[5624999]['purchase_history'])

In [None]:
# 2. Read the product catalog JSON used for the category mapping.
with open('../data/product_catalog.json', 'r', encoding='utf-8') as f:
    map_json = json.load(f)

# Lookup tables keyed by product id, built once the file has been parsed.
id2category = {product['id']: product['category'] for product in map_json['products']}
id2price = {product['id']: product['price'] for product in map_json['products']}

# 3. Function that rewrites one 'purchase_history' value
def item_mapping(x):
    """Return a purchase-history dict whose items carry catalog category and price.

    Args:
        x: One ``purchase_history`` value: either the raw JSON string or an
            already-parsed dict (which is what the column holds if the
            notebook cell applying this function is run a second time).

    Returns:
        dict: The record without its ``categories`` and ``avg_price`` keys and
        with ``items`` rebuilt as ``{'id', 'category', 'price'}`` dicts, where
        category and price are looked up in the module-level ``id2category``
        and ``id2price`` maps.

    Raises:
        KeyError: If the record has no ``items`` key, or an item id is missing
            from ``id2category`` / ``id2price``.
    """
    # Accept an already-parsed record so a re-run does not fail inside
    # json.loads; shallow-copy it so the caller's dict is left untouched.
    json_x = dict(x) if isinstance(x, dict) else json.loads(x)
    # pop(..., None) tolerates records where these summary keys are absent.
    json_x.pop('categories', None)
    json_x.pop('avg_price', None)
    json_x['items'] = [
        {
            'id': item['id'],
            'category': id2category[item['id']],
            'price': id2price[item['id']],
        }
        for item in json_x['items']
    ]
    return json_x

# Replace every purchase_history value with its mapped form.
df['purchase_history'] = df['purchase_history'].apply(item_mapping)

In [None]:
# The categories inside purchase_history have now been replaced, and each
# purchase_history value is a dict.
print(df.iloc[5624999]['purchase_history'])

# 2 商品类别关联规则挖掘

In [None]:
# Pull the category list, payment info, purchase date and a high-value flag
# out of each purchase_history dict.
df['category_list'] = [
    list({entry['category'] for entry in record['items']})  # distinct categories per order
    for record in df['purchase_history']
]
df['payment_status'] = [record['payment_status'] for record in df['purchase_history']]
df['payment_method'] = [record['payment_method'] for record in df['purchase_history']]
df['purchase_date'] = pd.to_datetime(
    [record['purchase_date'] for record in df['purchase_history']]
)
# An order is high-value when at least one of its items costs more than 5000.
df['high_value'] = [
    any(entry['price'] > 5000 for entry in record['items'])
    for record in df['purchase_history']
]

In [None]:
### 任务1：商品类别关联规则挖掘 ###
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

from matplotlib import rcParams

# Configure a font for Chinese text (SimHei as the example) so the plot title
# renders, and keep the minus sign displayable.
rcParams['axes.unicode_minus'] = False
rcParams['font.sans-serif'] = ['SimHei']

# One-hot encode the per-order category lists.
te = TransactionEncoder()
te.fit(df['category_list'])
te_ary = te.transform(df['category_list'])
df_te = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent itemsets (support >= 0.02) and rules (confidence >= 0.5).
frequent_itemsets = apriori(df_te, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Rules that mention the '电子产品' category on either side.
in_antecedents = rules['antecedents'].apply(lambda itemset: '电子产品' in itemset)
in_consequents = rules['consequents'].apply(lambda itemset: '电子产品' in itemset)
rules_electronics = rules[in_antecedents | in_consequents]

# Persist the three result tables.
frequent_itemsets.to_csv('task1_frequent_itemsets.csv', index=False)
rules.to_csv('task1_association_rules.csv', index=False)
rules_electronics.to_csv('task1_electronics_related_rules.csv', index=False)

# Optional visualisation: support vs. confidence of every rule.
ax = rules.plot(x='support', y='confidence', kind='scatter')
ax.set_title('任务1：商品类别关联规则 支持度 vs 置信度')
ax.set_xlabel('support')
ax.set_ylabel('confidence')
ax.figure.savefig('task1_rules_scatter.png')
plt.close()

# 3 支付方式与商品类别关联分析

In [None]:
# Assumes df already has the 'category_list' and 'payment_method' columns.
# Build one transaction per order: its categories plus its payment method.
# Zipping the two columns avoids DataFrame.apply(axis=1), which materialises a
# Series for every row of the full frame just to concatenate two values.
df['transaction_items'] = [
    categories + [method]
    for categories, method in zip(df['category_list'], df['payment_method'])
]

# Encode the transactions with a TransactionEncoder.
te2 = TransactionEncoder()
te2_ary = te2.fit(df['transaction_items']).transform(df['transaction_items'])
df_te2 = pd.DataFrame(te2_ary, columns=te2.columns_)

# Mine frequent itemsets (support >= 0.01) and rules (confidence >= 0.6).
frequent_paysets = apriori(df_te2, min_support=0.01, use_colnames=True)
rules_pay = association_rules(frequent_paysets, metric="confidence", min_threshold=0.6)

# Payment-method labels; used to keep only "payment method <-> category" rules.
payment_methods = set(df['payment_method'].unique())

def is_payment_method_item(itemset):
    """Return True when *itemset* shares at least one item with ``payment_methods``."""
    return not payment_methods.isdisjoint(itemset)

def is_category_item(itemset):
    """Return True when no item of *itemset* is in ``payment_methods``."""
    return payment_methods.isdisjoint(itemset)

# Classify both sides of every rule once.
antecedent_has_payment = rules_pay['antecedents'].apply(is_payment_method_item)
consequent_has_payment = rules_pay['consequents'].apply(is_payment_method_item)
antecedent_only_categories = rules_pay['antecedents'].apply(is_category_item)
consequent_only_categories = rules_pay['consequents'].apply(is_category_item)

# Keep a rule when a payment method is in the antecedent and the consequent is
# categories only, or the other way round.
payment_to_category = antecedent_has_payment & consequent_only_categories
category_to_payment = consequent_has_payment & antecedent_only_categories
rules_pm_cat = rules_pay[payment_to_category | category_to_payment].copy()

# Persist the results.
frequent_paysets.to_csv('task2_frequent_itemsets_pay.csv', index=False)
rules_pm_cat.to_csv('task2_payment_category_rules.csv', index=False)

# 2.3 Payment-method distribution among high-value orders.
high_value_methods = df.loc[df['high_value'], 'payment_method'].value_counts()
high_value_counts = high_value_methods.to_frame(name='count')
high_value_counts.to_csv('task2_high_value_payment_methods.csv')

# 4 时间序列模式挖掘

In [None]:
### Task 3: temporal pattern mining ###
# Derive calendar features from the purchase date. Only 'month' is consumed in
# this cell; 'quarter' and 'weekday' are stored on df as well.
df['purchase_date'] = pd.to_datetime(df['purchase_date'])
df['quarter'] = df['purchase_date'].dt.to_period("Q")
df['month'] = df['purchase_date'].dt.month
df['weekday'] = df['purchase_date'].dt.dayofweek

# 3.1 Purchase counts per category for each calendar month (1-12).
# Explode only the two columns the groupby needs: exploding the whole frame
# would repeat every other column (including the purchase_history dicts) once
# per category without changing the counts.
monthly_stats = (
    df[['month', 'category_list']]
      .explode('category_list')
      .groupby(['month', 'category_list'])
      .size()
      .unstack(fill_value=0)
)
monthly_stats.to_csv('task3_monthly_category_trends.csv')

# Visualisation (example): one line per category across the months.
monthly_stats.plot(figsize=(12, 6))
plt.title('任务3：各类别月度购买趋势')
plt.ylabel('购买次数')
plt.savefig('task3_monthly_trends.png')
plt.close()

# 3.2 Per-user "bought A first, then B" sequential patterns
sequence_pairs = []

# Order the rows by user and purchase date so each user's orders are
# chronological, then walk every user's consecutive order pairs.
# NOTE(review): users are identified by 'user_name' here; if names are not
# unique, distinct users are merged. Confirm whether an id column should be used.
df_sorted = df.sort_values(['user_name', 'purchase_date'])

for _, orders in df_sorted.groupby('user_name'):
    baskets = list(orders['category_list'])
    # zip(baskets, baskets[1:]) yields (previous order, next order) pairs.
    for earlier, later in zip(baskets, baskets[1:]):
        # Every category A of the earlier order paired with every different
        # category B of the later order.
        sequence_pairs.extend(
            (A, B) for A in earlier for B in later if A != B
        )

# Count each (A, B) pair, most frequent first, and save.
sequence_df = pd.DataFrame(sequence_pairs, columns=['A', 'B'])
pair_sizes = sequence_df.groupby(['A', 'B']).size()
sequence_counts = pair_sizes.reset_index(name='count').sort_values('count', ascending=False)
sequence_counts.to_csv('task3_sequence_A_to_B_counts.csv', index=False)

print("任务3 的用户内类别时序对已生成并保存：task3_sequence_A_to_B_counts.csv")

# 5 退款模式分析

In [None]:
# Flag orders whose payment status is '已退款' or '部分退款' and keep only those.
df['is_refund'] = df['payment_status'].isin(['已退款', '部分退款'])
refund_df = df[df['is_refund']]

# Encode the refunded orders with a dedicated encoder. Refitting the shared
# Task 1 encoder `te` would silently overwrite its fitted columns_; Task 2
# likewise uses its own encoder (te2).
te_refund = TransactionEncoder()
te_ary_refund = te_refund.fit(refund_df['category_list']).transform(refund_df['category_list'])
df_te_refund = pd.DataFrame(te_ary_refund, columns=te_refund.columns_)

# 4.1 Mine frequent itemsets (support >= 0.005) and rules (confidence >= 0.4).
refund_itemsets = apriori(df_te_refund, min_support=0.005, use_colnames=True)
refund_rules = association_rules(refund_itemsets, metric="confidence", min_threshold=0.4)

# Persist the results.
refund_itemsets.to_csv('task4_refund_frequent_itemsets.csv', index=False)
refund_rules.to_csv('task4_refund_association_rules.csv', index=False)

print("所有任务的频繁项集和关联规则已保存到当前目录下。")