## 中文评估


In [1]:
# Load the labelled corpus: one sample per line, formatted as "<label>,<text>".
file_path = "../.data/all_raretrible.txt"

# Read through a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(file_path, "r", encoding="utf-8") as corpus_file:
    raw_lines = corpus_file.read().split("\n")

# Split on the first comma only, because the text itself may contain commas.
# partition() yields an empty text for a line that has no comma at all.
data = [
    [label, text]
    for label, _sep, text in (
        line.strip().partition(",") for line in raw_lines if len(line) > 0
    )
]
print(data[0:2])

[['感激', '好的,那我谢谢您了。'], ['中性', '没有词汇课的优惠券吗?']]


In [2]:
import pandas as pd

# Column names for the parsed rows, in the order they appear in each row.
columns = ["label", "text"]

# Turn the list of [label, text] rows into a pandas DataFrame.
df = pd.DataFrame(data=data, columns=columns)


In [3]:
from dspy.datasets import DataLoader

# Create the dspy DataLoader used to wrap the DataFrame rows as Examples.
dl = DataLoader()

# `fields` selects the columns to keep; `input_keys` marks `text` as the input.
all_dataset = dl.from_pandas(df, fields=columns, input_keys=("text",))
print(all_dataset[:2])

[Example({'label': '感激', 'text': '好的,那我谢谢您了。'}) (input_keys={'text'}), Example({'label': '中性', 'text': '没有词汇课的优惠券吗?'}) (input_keys={'text'})]


In [5]:
from collections import defaultdict
import random


def stratified_sample(from_dataset, label_attr, *dataset_sizes, seed=None):
    """Split a dataset into several disjoint, label-stratified samples.

    Every label contributes ``size // number_of_labels`` items to each
    requested split. When that leaves a split short of its target size, the
    gap is filled from items that have not been placed in any split yet, so
    no item appears twice within a split or in more than one split.

    Args:
        from_dataset: Iterable of items exposing the label as an attribute.
        label_attr: Name of the attribute that holds the label.
        *dataset_sizes: Target size of each split, in output order.
        seed: Optional seed for a private random generator. When ``None``
            (the default) the module-level ``random`` state is used.

    Returns:
        A list with one list of items per requested size. A split is shorter
        than requested only when the dataset has too few unused items left.
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global random state; without a seed, fall back to the global generator.
    rng = random if seed is None else random.Random(seed)

    label_groups = defaultdict(list)
    for item in from_dataset:
        label_groups[getattr(item, label_attr)].append(item)

    result_sets = [[] for _ in dataset_sizes]
    leftovers = []
    label_count = len(label_groups)

    for items in label_groups.values():
        rng.shuffle(items)
        start = 0
        for result_set, target_size in zip(result_sets, dataset_sizes):
            end = start + target_size // label_count
            result_set.extend(items[start:end])
            start = end
        # Whatever this label did not hand out stays available for top-ups.
        leftovers.extend(items[start:])

    # Top up short splits from unused items only, consuming them as we go so
    # that the splits stay mutually disjoint.
    rng.shuffle(leftovers)
    for result_set, target_size in zip(result_sets, dataset_sizes):
        shortage = target_size - len(result_set)
        if shortage > 0:
            result_set.extend(leftovers[:shortage])
            del leftovers[:shortage]

    return result_sets


In [6]:
from collections import Counter

# Draw the three stratified splits once (target sizes: 20 / 50 / 100).
train_set, test_set, sample_set = stratified_sample(all_dataset, "label", 20, 50, 100)

# Print the per-label counts of each split to check how balanced it is.
for dataset in [train_set, test_set, sample_set]:
    label_counts = Counter(item.get("label") for item in dataset)
    print(label_counts)

Counter({'生气': 5, '中性': 3, '焦急': 3, '高兴': 3, '感激': 2, '惊讶': 2, '抱怨': 2})
Counter({'高兴': 8, '感激': 7, '中性': 7, '焦急': 7, '惊讶': 7, '抱怨': 7, '生气': 7})
Counter({'惊讶': 15, '生气': 15, '感激': 14, '中性': 14, '焦急': 14, '高兴': 14, '抱怨': 14})


In [7]:
# Evaluate the Chinese test split with the project-local `analysis` module.
import importlib
import analysis

# Reload so that edits to the module take effect without restarting the kernel.
importlib.reload(analysis)

# NOTE(review): Evaluate.do() presumably runs the sentiment prompt over
# `test_set` and returns a result summary, and show_error() presumably reports
# the misclassified samples — confirm against the analysis module.
ana = analysis.Evaluate(data_set=test_set)
ret = ana.do()
print(ret)
ana.show_error()

You are a customer service quality inspection expert., sentiment analysis from Question

You should pay special attention to the emotions of customers being angry and complaining.

---

Follow the following format.

Question: list of input text
Answer:
    Return format: a list of "Index, Emotion Category, Emotion Subcategory, Emotion Subcategory Score"
    Emotion Subcategory Score (0-1.00).
    Example: 
        1, pessimistic, complaining, 0.90
        2, optimistic, happy, 0.75
Please choose emotion categories and subcategories only from the following options:
    Emotion categories: ["pessimistic", "optimistic", "neutral"]
    Emotion subcategories: ['neutral', 'surprised', 'thankful', 'complaining', 'urgent','anxious', 'angry', 'happy']

Here is some examples

---

Question:
1: 好的谢谢[亲]
Answer:
1,optimistic,thankful,0.90

---

Question: 
1: 在吗?回复太慢了
Answer:
1,pessimistic,complaining,0.85

---

Question: 
1: 都要举报
Answer:
1,pessimistic,angry,0.95

---

Question: 
1: 我一直都是用的这个号,怎么突然就

## 测试印尼语


In [35]:
# Draw a fresh 50-sample stratified split for the Indonesian run. Requesting a
# single size avoids sampling a throwaway second split and avoids rebinding `_`,
# which IPython uses for the most recent cell output.
(test_set,) = stratified_sample(all_dataset, "label", 50)

In [36]:
# Translate the test split with the project-local `transfer_lang` module.
import importlib
import transfer_lang

# Reload so that edits to the module take effect without restarting the kernel.
importlib.reload(transfer_lang)

# NOTE(review): from_dataset() presumably collects the `text` of every Example
# and do() presumably sends one numbered translation request; the printed
# `content` shows the numbered Indonesian lines — confirm against the module.
trans = transfer_lang.TransferLang.from_dataset(test_set)
ret = trans.do()
print(ret.content)

把下面内容翻译成 印尼语: 
0: 好吧谢谢了再见
1: 好的,谢谢了,没了,感谢
2: 好的非常感谢,辛苦你们了,新年好
3: 再次感谢糖糖
4: 拜拜,辛苦了,祝你新年好
5: 好的,谢谢你啊
6: 欧,好的谢谢您
7: 她要退,但海外的又不能退,这个你们怎么解决呢?
8: 昨天去到哪
9: 你能订一下吗
10: 自动结算的话,我们绑定的有两个账户,能不能我们自己选择一个绑定
11: 这样子能看清?
12: 是不是中病毒了
13: 公式都是乱码
14: 麻烦您帮忙查一下,尽快安排发货,谢谢了[emoji050]
15: 带鱼和蒜蓉粉丝扇贝还没发货?
16: 你回复能不能快一点急死人,你这客服怎么搞的
17: 我的怎么还没有到
18: 我真的着急呀
19: 好的,麻烦尽快安排物流,谢谢了
20: 您好,我的押金什么时间可以退回,可以告诉我一个准确时间吗?最近家里出了事,急用钱,很急,谢谢您,麻烦了
21: 这还不一定?
22: 明明我插进去了
23: 你回复也太快了?
24: 不交诚意金聊不了嘛不交诚意金聊不了嘛
25: 早就还了,怎么扣那么多钱
26: 这个你们不管?
27: 明明抢到了呀
28: 喔哈哈哈
29: 亲爱的,上午好!
30: 要向你学习全心全意为人民服务的精神。又快又好
31: 书太棒了!
32: 好滴,太好了
33: 这款哈哈哈哈哈
34: 书太棒了!
35: 这个阿姨在我家做了两次卫生都很脏
36: 看晕了
37: 搞出这些麻烦事
38: 修罗武神为什么还是进不去
39: 系统这两天为什么老掉线
40: 因为你们处理的比较慢,我的车要去年检
41: 我发现你们真的扯
42: 非常生气的一次购物!
43: 你这个应该是个骗子公司
44: 可是我她妈还了,听不懂么?
45: 耽误老子时间
46: 我他妈知道归还,但是没有还的地方你胖揍上哪还
47: 草泥马的沙雕人工服务
48: 你们这次是真搞笑,我天天都停在那里,
49: 我就是问下收费的标准

0: Baik, terima kasih, selamat tinggal
1: Baik, terima kasih, habis, terima kasih
2: Baik, terima kasih yang sangat, mengerjakan keras, selamat tinggal 

In [37]:
# Write the translations back and inspect the result.
# NOTE(review): update_set() appears to modify the Examples of `test_set` in
# place — the printed items carry the Indonesian string in `text` and the
# original Chinese in `source`; confirm against the transfer_lang module.
trans.update_set()
for item in test_set:
    print(item)

Example({'label': '感激', 'text': 'Baik, terima kasih, selamat tinggal', 'source': '好吧谢谢了再见'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Baik, terima kasih, habis, terima kasih', 'source': '好的,谢谢了,没了,感谢'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Baik, terima kasih yang sangat, mengerjakan keras, selamat tinggal tahun baru', 'source': '好的非常感谢,辛苦你们了,新年好'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Kembali terima kasih, Cukki', 'source': '再次感谢糖糖'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Selamat tinggal, mengerjakan keras, selamat tinggal tahun baru', 'source': '拜拜,辛苦了,祝你新年好'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Baik, terima kasih', 'source': '好的,谢谢你啊'}) (input_keys={'text'})
Example({'label': '感激', 'text': 'Oh, baik, terima kasih Anda', 'source': '欧,好的谢谢您'}) (input_keys={'text'})
Example({'label': '中性', 'text': 'Dia mau mengembalikan, tapi yang di luar negeri tidak bisa, bagaimana Anda menyelesaikannya?', 'source':

In [38]:
# Re-run the evaluation, this time on the split whose `text` now holds the
# Indonesian translation (see the Examples printed by the previous cell).
import importlib
import analysis

# Reload so that edits to the module take effect without restarting the kernel.
importlib.reload(analysis)

# NOTE(review): Evaluate.do() presumably runs the sentiment prompt over
# `test_set` and returns a result summary, and show_error() presumably reports
# the misclassified samples — confirm against the analysis module.
ana = analysis.Evaluate(data_set=test_set)
ret = ana.do()
print(ret)
ana.show_error()

You are a customer service quality inspection expert., sentiment analysis from Question

You should pay special attention to the emotions of customers being angry and complaining.

---

Follow the following format.

Question: list of input text
Answer:
    Return format: a list of "Index, Emotion Category, Emotion Subcategory, Emotion Subcategory Score"
    Emotion Subcategory Score (0-1.00).
    Example: 
        1, pessimistic, complaining, 0.90
        2, optimistic, happy, 0.75
Please choose emotion categories and subcategories only from the following options:
    Emotion categories: ["pessimistic", "optimistic", "neutral"]
    Emotion subcategories: ['neutral', 'surprised', 'thankful', 'complaining', 'urgent','anxious', 'angry', 'happy']

Here is some examples

---

---

Questions:
1: Oke, terima kasih [sayang]
Answer:
1,optimistic,thankful,0.90

---

Questions: 
1: Ada di sana? Balasanmu terlalu lambat
Answer:
1,pessimistic,complaining,0.85

---

Questions: 
1: Akan melaporkan se