In [12]:
import pandas as pd
from Utils import RiskLabel

In [13]:
# Load the seven per-category labeled CSV files, one DataFrame per file.
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/labeled_L2__1_technical.csv',
        '../assets/labeled_L2__2_payment.csv',
        '../assets/labeled_L2__3_delivery.csv',
        '../assets/labeled_L2__4_account.csv',
        '../assets/labeled_L2__5_service.csv',
        '../assets/labeled_L2__6_pricing.csv',
        '../assets/labeled_L2__7_legal.csv',
    )
)

| 分类          | 样本数量         | 说明         |
|-------------|--------------|------------|
| 头部类（>30k）   | 抽样 1500 条    | 控制占比，避免过拟合 |
| 中部类（5k–30k） | 抽样 1000 条    | 保留一定代表性    |
| 稀有类（<5k）    | 全部使用或抽样 600 条 | 最大保留信息     |

样本中可能存在同一条数据被标记为多个类别的情况，此时按标签权重（优先级）只保留其中一个标签。
由于去重后各类别的实际数量会动态变化，无法保证精确达到既定的抽样数量，因此仅设定抽样区间范围。

| 类别                                    | 推荐抽样数范围   |
|---------------------------------------|-----------|
| technical, payment, delivery, service | 1000–1500 条 |
| account, legal, pricing               | 500–800 条  |


In [14]:
# 分类标签样本数
# technical	47,410
# payment	39,851
# delivery	60,139
# account	6,060
# service	32,386
# pricing	2,687
# legal	    5,499


In [15]:
def sample_and_clean_enum(df_list, sample_sizes: dict, label_priority=None, random_state=42):
    """Merge labeled frames, keep one label per tweet, and sample each class.

    Steps:
      1. Concatenate every frame in ``df_list``.
      2. Drop rows whose label is missing, then de-duplicate on ``tweet_id``,
         keeping the row whose label has the smallest priority value.
      3. For each label in ``sample_sizes`` keep at most the requested number
         of rows (all rows when the class is not larger than the request).
      4. Concatenate the per-label samples in ``sample_sizes`` order.

    Args:
        df_list: Sequence of DataFrames, each with ``tweet_id`` and ``label``
            columns.
        sample_sizes: Mapping of label name (case-insensitive) to the maximum
            number of rows to draw for that label.
        label_priority: Optional mapping of label name (case-insensitive) to a
            sortable priority; the smallest value wins de-duplication. When
            ``None`` (default) ``RiskLabel[NAME].value`` is used.
        random_state: Seed forwarded to ``DataFrame.sample`` (default 42).

    Returns:
        A DataFrame with a fresh 0..n-1 index. When ``sample_sizes`` is empty
        an empty frame with the merged columns is returned.

    Raises:
        KeyError: A label found in the data or in ``sample_sizes`` has no
            known priority.
    """
    if label_priority is None:
        def priority_of(label):
            return RiskLabel[str(label).upper()].value
    else:
        normalized = {str(name).upper(): rank for name, rank in label_priority.items()}

        def priority_of(label):
            return normalized[str(label).upper()]

    # 1. Merge all frames (original labels are kept as-is).
    merged = pd.concat(df_list, ignore_index=True)

    # Rows without a label can never be selected below and would break the
    # priority lookup, so discard them up front.
    merged = merged[merged['label'].notna()]

    # 2. De-duplicate tweet_id, keeping the highest-priority label.
    # Each distinct label is resolved once instead of once per row.
    rank_by_label = {label: priority_of(label) for label in merged['label'].unique()}
    merged = merged.sort_values(
        by='label',
        key=lambda col: col.map(rank_by_label),
        # Stable sort: rows of equal priority keep their input order, so the
        # surviving duplicate and the seeded sample do not depend on the
        # sorting backend.
        kind='mergesort',
    )
    merged = merged.drop_duplicates(subset='tweet_id', keep='first')

    # 3. Sample per label. The lower-cased labels are loop-invariant.
    labels_lower = merged['label'].astype(str).str.lower()
    df_samples = []
    for label_name, size in sample_sizes.items():
        priority_of(label_name)  # fail fast on an unknown label name
        df_sub = merged[labels_lower == label_name.lower()]
        if len(df_sub) > size:
            df_sub = df_sub.sample(n=size, random_state=random_state)
        df_samples.append(df_sub)

    # 4. Assemble the final training set.
    if not df_samples:
        # pd.concat([]) raises; return an empty frame with the same columns.
        return merged.iloc[0:0].reset_index(drop=True)
    return pd.concat(df_samples, ignore_index=True)


Step 1: 计算占比
| **Label** | **候选池数量**   | **占比（%）** |
|-----------|-------------|-----------|
| technical | 47,410      | 25.2%     |
| payment   | 39,851      | 21.2%     |
| delivery  | 60,139      | 31.9%     |
| account   | 6,060       | 3.2%      |
| service   | 32,386      | 17.2%     |
| pricing   | 2,687       | 1.4%      |
| legal     | 5,499       | 2.9%      |
| **合计**    | **194,032** | **100%**  |

<br>
Step 2: 目标采样数量（假设总样本为 7500）

乘以 7500 得出建议采样数：

| **Label** | **占比**   | **建议采样数** |
|-----------|----------|-----------|
| technical | 25.2%    | 1890      |
| payment   | 21.2%    | 1590      |
| delivery  | 31.9%    | 2390      |
| account   | 3.2%     | 240       |
| service   | 17.2%    | 1290      |
| pricing   | 1.4%     | 105       |
| legal     | 2.9%     | 220       |
| **合计**    |          | **7725**  |

注：上表各类占比相加为 103.0%（而非 100%），因此建议采样数合计为 7725，高于假设的 7500。




In [16]:
# Per-label sample sizes, chosen with reference to the suggested sampling counts.
sample_sizes = {
    'technical': 1800, 'payment': 1500, 'delivery': 2100,
    'account': 300, 'service': 1200, 'pricing': 150,
    'legal': 200,
}

# Frames to merge, in the order the files were loaded.
df_list = [
    df_1, df_2, df_3, df_4,
    df_5, df_6, df_7,
]

In [17]:
# Build the de-duplicated, per-label sample and preview its first five rows.
df_sample = sample_and_clean_enum(
    df_list=df_list,
    sample_sizes=sample_sizes,
)
df_sample.head(5)

Unnamed: 0,tweet_id,text,label
0,2460800,"@704611 Hi Mike, can you give us a little more...",technical
1,1638473,@221850 @17344 @17345 We're aware of this prin...,technical
2,2864565,"@795659 Hi Kyle, we’re so sorry to hear this. ...",technical
3,2842751,@708320 Check out the next article to troubles...,technical
4,1778347,@AskPlayStation do you have an outage. I'm not...,technical


In [18]:
# Non-null count per column; the recorded run shows 7250 for every column.
df_sample.count()

tweet_id    7250
text        7250
label       7250
dtype: int64

In [19]:
# Sample distribution by label: it stays consistent with the original dataset's distribution, which looks good.
df_sample['label'].value_counts()

label
delivery     2100
technical    1800
payment      1500
service      1200
account       300
legal         200
pricing       150
Name: count, dtype: int64

In [20]:
# Persist the sample without the index column; 'utf-8-sig' prepends a UTF-8 BOM.
df_sample.to_csv(
    path_or_buf="../assets/labeled_L2__sample_enum.csv",
    index=False,
    encoding='utf-8-sig',
)