In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [5]:
# --- paths ---
# Assumes the kernel's working directory is the repo's notebooks/ folder,
# so its parent is the repo root.
ROOT = Path.cwd().parent
PROCESSED_PATH = ROOT / "data" / "processed"

# Load the processed 2023Q1 price table.
df = pd.read_csv(PROCESSED_PATH / "processed_price_data_2023Q1.csv")

# Strip the "Rs. " prefix and any commas from the price strings, then cast to
# float. Both replacements are explicitly literal (regex=False) so the result
# does not depend on the installed pandas version's default for `regex`.
df['Price2'] = (
    df['Price']
    .str.replace('Rs. ', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Show the raw and parsed prices side by side as a quick sanity check.
df[['Price', 'Price2']].head()

Unnamed: 0,Price,Price2
0,Rs. 58,58.0
1,Rs. 2998,2998.0
2,Rs. 2997,2997.0
3,Rs. 899,899.0
4,Rs. 756,756.0


In [6]:
def extract_days(val):
    """Parse the leading day count out of a validity value.

    Parameters
    ----------
    val : object
        One raw validity value, e.g. ``"28 days"``.

    Returns
    -------
    int or float
        The first whitespace-separated token converted to ``int`` when the
        text contains "day" (case-insensitive) and that token is an integer.
        ``np.nan`` when the value is missing, equals the ``'N.A.'``
        placeholder, does not mention "day", or its first token is not an
        integer.
    """
    if pd.isna(val) or val == 'N.A.':
        return np.nan

    text = str(val).lower()
    if 'day' not in text:
        return np.nan

    try:
        return int(text.split()[0])
    except ValueError:
        # The first token is not an integer (e.g. "unlimited days"). Only
        # ValueError is caught so that KeyboardInterrupt and genuine bugs
        # are no longer silently swallowed.
        return np.nan

# Day count parsed from the free-text validity column (NaN when unparseable).
df['Days2'] = df['Validity'].apply(extract_days)

# Split each tag at its first run of whitespace: first token -> Tag1, the
# remainder -> Tag2. The pattern requires at least one whitespace character,
# so a tag consisting of a single token does not match and yields NaN in both
# columns. NOTE(review): confirm that single-token tags should be dropped.
df[['Tag1', 'Tag2']] = df['Tags'].str.extract(r'([^ ]+)\s+(.*)')

# Price per day of validity. Non-positive day counts are masked to NaN first;
# otherwise a 0-day row would divide to inf and turn its group's mean into inf.
valid_days = df['Days2'].where(df['Days2'] > 0)
df['clean_price_per_day'] = df['Price2'] / valid_days

# Compute the average price per day for each (circle, company) pair.
# Tag1 is used as the company label.
df_price_long = df[['Circle', 'Tag1', 'clean_price_per_day']].copy()
df_price_long.columns = ['circle', 'company', 'avg_price']
df_price_long = (
    df_price_long
    .groupby(['circle', 'company'], as_index=False)['avg_price']
    .mean()
)

# Save the aggregated table as an Excel file next to the processed inputs.
df_price_long.to_excel(
    PROCESSED_PATH / "price_final_2023Q1.xlsx",
    index=False,
)