# 分桶采样

In [10]:
from utils import *

In [3]:
df_1 = pd.read_csv(f'{ASSETS}labeled_L2__1_technical.csv')
df_2 = pd.read_csv(f'{ASSETS}labeled_L2__2_payment.csv')
df_3 = pd.read_csv(f'{ASSETS}labeled_L2__3_delivery.csv')
df_4 = pd.read_csv(f'{ASSETS}labeled_L2__4_account.csv')
df_5 = pd.read_csv(f'{ASSETS}labeled_L2__5_service.csv')
df_6 = pd.read_csv(f'{ASSETS}labeled_L2__6_pricing.csv')
df_7 = pd.read_csv(f'{ASSETS}labeled_L2__7_legal.csv')

| 分类          | 样本数量         | 说明         |
|-------------|--------------|------------|
| 头部类（>30k）   | 抽样 1500 条    | 控制占比，避免过拟合 |
| 中部类（5k–30k） | 抽样 1000 条    | 保留一定代表性    |
| 稀有类（<5k）    | 全部使用 or 抽600 | 最大保留信息     |

### Mark
样本中可能含有单一数据被标记多类情况，使用按权重保留策略。
考虑到抽样的动态规划特性，无法满足既定等，所以，仅设定抽样区间范围。

In [14]:
# 分类标签样本数
# technical	47,410
# payment	39,851
# delivery	60,139
# account	6,060
# service	32,386
# pricing	2,687
# legal	    5,499


Step 1: 计算占比
| **Label** | **候选池数量**   | **占比（%）** |
|-----------|-------------|-----------|
| technical | 47,410      | 25.2%     |
| payment   | 39,851      | 21.2%     |
| delivery  | 60,139      | 31.9%     |
| account   | 6,060       | 3.2%      |
| service   | 32,386      | 17.2%     |
| pricing   | 2,687       | 1.4%      |
| legal     | 5,499       | 2.9%      |
| **合计**    | **194,032** | **100%**  |

<br>
Step 2: 目标采样数量（假设总样本为 7500）

x 7500 得出建议采样数：

| **Label** | **占比**   | **建议采样数** |
|-----------|----------|-----------|
| technical | 25.2%    | 1890      |
| payment   | 21.2%    | 1590      |
| delivery  | 31.9%    | 2390      |
| account   | 3.2%     | 240       |
| service   | 17.2%    | 1290      |
| pricing   | 1.4%     | 105       |
| legal     | 2.9%     | 220       |
| **合计**    | **7725** |           |




In [7]:
df_list = [df_1, df_2, df_3, df_4, df_5, df_6, df_7]

# 采样参数参照建议采样数
sample_sizes = {
    'technical': 1800,
    'payment': 1500,
    'delivery': 2100,
    'account': 300,
    'service': 1200,
    'pricing': 150,
    'legal': 200
}

# 合并所有数据（并保留原始标签）
df_combined = pd.concat(df_list, ignore_index=True)
df_combined.head()
# 194032

Unnamed: 0,tweet_id,text,label
0,212,@ATVIAssist @115755 I'm trying to buy the digi...,technical
1,267,@115768 Jeffrey I am sorry to hear that. If yo...,technical
2,323,@1520 @XboxSupport the 5 app on Xboxes not wo...,technical
3,600,@115811 Hi - are you receiving an error messag...,technical
4,1161,@TMobileHelp trying to redeem a free tuesday c...,technical


In [9]:
# 去除重复 tweet_id，只保留优先级高的标签
df_distinct = df_combined.sort_values(
    by='label',
    key=lambda col: col.map(lambda x: RiskLabel[x.upper()].value)
)

df_distinct = df_distinct.drop_duplicates(subset='tweet_id', keep='first')
df_distinct.head()

Unnamed: 0,tweet_id,text,label
194031,2987073,@118374 Boycotting Argos; reserved item at hal...,legal
190359,1014759,@AmazonHelp Last u are a joke just attract us ...,legal
190360,1014763,@AmazonHelp Public must know your scam,legal
190361,1014770,@AmazonHelp It's a scam they harras us,legal
190362,1015516,@117176 I just received a call from your India...,legal


In [11]:
df_sample = sample_and_clean_enum(df_distinct, sample_sizes)
df_sample.head()
# 7250

Unnamed: 0,tweet_id,text,label
0,1553020,@480508 @480507 @480506 @XboxSupport It worked...,technical
1,433087,UPDATE iOS 11.0.2 😤 iPHONE IS EXTREMELY SLOW N...,technical
2,2093386,@467250 Sorry for the inconvenience. Do you s...,technical
3,401610,@210972 I'm sorry for the trouble. Have you tr...,technical
4,326051,@193675 We're sorry that you are seeing errors...,technical


In [12]:
# 样本分布，与原始数据集分布保持一致，表现良好
df_sample['label'].value_counts()

label
delivery     2100
technical    1800
payment      1500
service      1200
account       300
legal         200
pricing       150
Name: count, dtype: int64

In [13]:
df_sample.to_csv(f"{ASSETS}labeled_L2__bucket_sampled.csv", index=False, encoding='utf-8-sig')