In [23]:
import sqlite3
import pandas as pd
import numpy as np
from typing import Dict, Any
from datetime import timedelta

In [15]:
def connect_to_db(db_path: str) -> sqlite3.Connection:
    """Efficiently connect to SQLite database"""
    return sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)

def preprocess_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Central datetime preprocessing to reduce redundant operations"""
    datetime_columns = ['创建时间', '预定日期', '下单时间']
    for col in datetime_columns:
        if col in df.columns:
            if not pd.api.types.is_datetime64_any_dtype(df[col]):
                try:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                except Exception as e:
                    print(f"转换 {col} 列时出错：{e}")
    return df

In [27]:
def analyze(conn):
    """主分析函数"""
    catering_df = pd.read_sql_query("SELECT * FROM Catering", conn)

    catering_df = preprocess_datetime(catering_df)
    user_df= user_analysis(catering_df)
    
    today = catering_df['下单时间'].max()

    return user_df

In [4]:
def get_db_connection():
    conn = sqlite3.connect('db/ideapod.db')
    conn.row_factory = sqlite3.Row
    return conn


In [51]:
def user_analysis(catering_df: pd.DataFrame) -> Dict[str, pd.DataFrame]:

    catering_df['member_id'] = np.where(
        catering_df['会员号'].isna(),
        catering_df.index.map(lambda x: f'8888000{x:04d}'),
        catering_df['会员号']
    )
    # 设置时间范围（过去180天）
    today = catering_df['下单时间'].max()
    begin_day = today - timedelta(days=180)  # 统计过去6个月的数据
    df_filtered = catering_df[catering_df['下单时间'] >= begin_day].copy()
    
    # 按会员号聚合计算所需指标
    rfm_analysis = df_filtered.groupby('member_id').agg({
        '下单时间': 'max',  # 最近一次下单时间
        '实收': 'sum',      # 总消费金额
        '订单号': 'count'   # 消费次数
    }).reset_index()
    
    # 重命名列
    rfm_analysis.columns = ['会员号', '最近下单时间', '总消费金额', '消费次数']
    rfm_analysis['Recency'] = (today - rfm_analysis['最近下单时间']).dt.days
    
    # 计算最近购买时间指数: e^(-λ * Recency), λ=0.0115对应60天下降到50
    lambda_param = 0.0115
    rfm_analysis['最近一次消费指数'] = np.exp(-lambda_param * rfm_analysis['Recency']) * 100
    
    # 计算原始的消费力和消费频次指数
    rfm_analysis['monetary_raw'] = np.log(rfm_analysis['总消费金额'] + 1)
    rfm_analysis['frequency_raw'] = np.log(rfm_analysis['消费次数']+ 1)
    
    # 进行min-max标准化
    monetary_min = rfm_analysis['monetary_raw'].min()
    monetary_max = rfm_analysis['monetary_raw'].max()
    frequency_min = rfm_analysis['frequency_raw'].min()
    frequency_max = rfm_analysis['frequency_raw'].max()
    
    rfm_analysis['消费力指数'] = (
        (rfm_analysis['monetary_raw'] - monetary_min) / (monetary_max - monetary_min)
    ) * 100 if monetary_max != monetary_min else 0
    
    rfm_analysis['消费频次指数'] = (
        (rfm_analysis['frequency_raw'] - frequency_min) / (frequency_max - frequency_min)
    ) * 100 if frequency_max != frequency_min else 0
    
    # 计算user_value
    weight_monetary = 0.4  # 餐饮用户单价较低，频次更重要
    weight_frequency = 1 - weight_monetary  
    
    rfm_analysis['用户价值'] = (
        rfm_analysis['最近一次消费指数'] * 
        (rfm_analysis['消费力指数'] * weight_monetary + 
         rfm_analysis['消费频次指数'] * weight_frequency)
    ) / 100
    
    # 创建0-100的区间（以1为间隔）
    bins = np.arange(0, 101, 1)
    
    # 对四个指标进行分布统计
    value_dist = pd.cut(rfm_analysis['用户价值'], bins=bins, include_lowest=True).value_counts().sort_index()
    recency_dist = pd.cut(rfm_analysis['最近一次消费指数'], bins=bins, include_lowest=True).value_counts().sort_index()
    monetary_dist = pd.cut(rfm_analysis['消费力指数'], bins=bins, include_lowest=True).value_counts().sort_index()
    frequency_dist = pd.cut(rfm_analysis['消费频次指数'], bins=bins, include_lowest=True).value_counts().sort_index()
    
    # 创建包含0-100的第一列
    score_range = pd.Series(range(1, 101), name='分数区间')

    # 创建结果DataFrame
    distribution_result = pd.DataFrame({
        '分数区间': score_range,
        '用户价值分布': value_dist.values,
        '最近一次消费指数分布': recency_dist.values,
        '消费力指数分布': monetary_dist.values,
        '消费频次指数分布': frequency_dist.values
    })

    df_sorted = rfm_analysis.sort_values(by='消费次数', ascending=False)
    
    return df_sorted


In [52]:
print (analyze(get_db_connection()))

                会员号              最近下单时间      总消费金额  消费次数  Recency    最近一次消费指数  \
6385            nan 2025-03-17 09:44:14  300569.39  5011        0  100.000000   
1     91330000116.0 2025-03-11 14:01:10    7236.64   144        5   94.412189   
2035  91330021317.0 2025-03-10 11:23:04    4023.80    78        6   93.332668   
335   91330007774.0 2025-01-27 14:24:48    1364.00    35       48   57.579706   
1853  91330021019.0 2024-12-06 15:04:06    1658.00    33      100   31.663677   
...             ...                 ...        ...   ...      ...         ...   
2913  91330022679.0 2024-11-09 17:20:28      28.00     1      127   23.212019   
2912  91330022678.0 2024-11-09 17:20:45      85.80     1      127   23.212019   
2911  91330022677.0 2024-11-09 17:04:29      32.00     1      127   23.212019   
2910  91330022676.0 2024-11-09 16:49:58      66.00     1      127   23.212019   
3193  91330023116.0 2024-11-16 17:12:36      76.00     1      120   25.157855   

      monetary_raw  frequen