In [None]:
import os 
import re 
import json 
import numpy as np 
import pandas as pd 
from collections import defaultdict, Counter
from utils import distinct_2
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [None]:
# Read the test split once and pull out the columns the evaluation below uses.
gt_df = pd.read_csv("./datasets/dulemon/test.csv")

# `target` holds the reference texts that predictions are scored against.
gt_texts = gt_df["target"].values.tolist()

# One label array per evaluated attribute, kept as arrays (`.values`) so they
# can be compared element-wise with the classifier predictions.
gender_labels, emotion_labels, sentiment_labels, question_labels = (
    gt_df[column].values for column in ("Gender", "Emotion", "Sentiment", "Question")
)

In [None]:
# The `history` column as a plain Python list. It is only handed to
# compute_metrics below, which does not currently read it.
gt_history = gt_df["history"].values.tolist()

In [None]:
# Average len() of the reference texts; comparable to `avg_len` for predictions.
np.mean(list(map(len, gt_texts)))

In [None]:
# distinct_2 (helper imported from utils) on the reference texts — presumably the
# distinct-2 diversity metric; comparable to `dist_2` reported for predictions.
distinct_2(gt_texts)

In [None]:
# Local checkpoint directories of the two attribute classifiers.
SENTIMENT_CLF_DIR = '../sentiment_8class_no_emoji_clf1'
GENDER_CLF_DIR = '../Gender_3class_clf1'

# Both classifiers are wrapped in a transformers pipeline registered under the
# 'sentiment-analysis' task name and are placed on device 1.
model1 = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CLF_DIR)
tokenizer1 = AutoTokenizer.from_pretrained(SENTIMENT_CLF_DIR)
text_classification_sentiment = pipeline(
    'sentiment-analysis',
    model=model1,
    tokenizer=tokenizer1,
    device=1,
)

model2 = AutoModelForSequenceClassification.from_pretrained(GENDER_CLF_DIR)
tokenizer2 = AutoTokenizer.from_pretrained(GENDER_CLF_DIR)
text_classification_gender = pipeline(
    'sentiment-analysis',
    model=model2,
    tokenizer=tokenizer2,
    device=1,
)

In [None]:
from torch.utils.data import Dataset
from tqdm import tqdm
# Batch size handed to both classification pipelines in batch_infer.
BS = 64

class MyDataset(Dataset):
    """Thin map-style Dataset view over an in-memory sequence of texts.

    Exists so a plain list can be passed to a pipeline as a dataset; items are
    returned exactly as stored, without any preprocessing.
    """

    def __init__(self, text_list) -> None:
        """Keep a reference to ``text_list`` (no copy is made)."""
        super().__init__()
        self.data = text_list

    def __len__(self):
        """Number of stored texts."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the ``i``-th stored text unchanged."""
        return self.data[i]


In [None]:
def batch_infer(text_list):
    """Label every text with a gender class, a sentiment class and a question flag.

    Args:
        text_list: Sequence of strings to classify.

    Returns:
        A tuple of three numpy arrays, in this order: labels from the gender
        pipeline, labels from the sentiment pipeline, and 1/0 question flags.
    """
    # A text counts as a question when it contains an ASCII or a full-width
    # question mark.
    question_flags = [
        int(any(mark in text for mark in ("?", "？"))) for text in text_list
    ]
    samples = MyDataset(text_list)

    def predict_labels(classifier):
        # Stream the dataset through the pipeline in batches of BS and keep
        # only the predicted label of each output.
        outputs = tqdm(classifier(samples, batch_size=BS), total=len(samples))
        return [output['label'] for output in outputs]

    # Gender first, then sentiment, so the two progress bars appear in that order.
    predicted_genders = predict_labels(text_classification_gender)
    predicted_sentiments = predict_labels(text_classification_sentiment)
    return (
        np.array(predicted_genders),
        np.array(predicted_sentiments),
        np.array(question_flags),
    )

In [None]:
from bert_score import BERTScorer
# Shared BERTScore scorer configured for Chinese (lang="zh") with baseline
# rescaling turned off. It runs on cuda:0, while the classifier pipelines use device 1.
bert_scorer = BERTScorer(lang="zh", rescale_with_baseline=False, device="cuda:0")
def get_bert_score(cands, refs):
    """Return the mean BERTScore F1 of ``cands`` against ``refs``.

    Args:
        cands: Candidate (generated) texts.
        refs: Reference texts, passed to the scorer alongside ``cands``.

    Returns:
        The F1 values averaged over all pairs, as a Python scalar.
    """
    # score() yields (precision, recall, F1); only F1 is reported.
    _, _, f1_scores = bert_scorer.score(cands, refs)
    return f1_scores.mean().item()

In [None]:
# Class names handed to classification_report as `target_names` in compute_metrics.
# NOTE(review): the names only line up with the report rows if the label values
# sort in this same order — confirm against the classifier's label set.
emotion_list = [
    "anger", "disgust", "fear", "happiness",
    "like", "none", "sadness", "surprise",
]

def _match_rate(preds, labels, name):
    """Return the fraction of positions at which ``preds`` equals ``labels``.

    Both inputs are coerced to numpy arrays first: comparing two Python lists
    with ``==`` yields a single bool, which would silently turn the accuracy
    into 0.0 or 1.0.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.shape != labels.shape:
        raise ValueError(
            f"{name}: predictions have shape {preds.shape} "
            f"but labels have shape {labels.shape}"
        )
    return np.mean(preds == labels)


def compute_metrics(pred_texts, gt_texts, gt_history, emotion_preds, sentiment_labels, emotion_labels, gender_preds, gender_labels, question_preds, question_labels):
    """Score generated texts on attribute accuracy, length, BERTScore and distinct_2.

    A classification report for the emotion predictions is printed as a side effect.

    Args:
        pred_texts: Generated texts.
        gt_texts: Reference texts, used for the BERTScore.
        gt_history: Not read; kept so existing callers keep working.
        emotion_preds: Predicted emotion labels.
        sentiment_labels: Not read; kept so existing callers keep working.
        emotion_labels: Ground-truth emotion labels.
        gender_preds: Predicted gender labels.
        gender_labels: Ground-truth gender labels.
        question_preds: Predicted question flags.
        question_labels: Ground-truth question flags.

    Returns:
        Dict with keys ``acc_emotions``, ``acc_genders``, ``acc_questions``,
        ``avg_len``, ``bert_score``, ``dist_2`` and ``avg_acc`` (the mean of the
        three accuracies).

    Raises:
        ValueError: If a prediction array and its label array differ in shape.
    """
    ret = {}
    ret["acc_emotions"] = _match_rate(emotion_preds, emotion_labels, "emotion")
    print(classification_report(emotion_labels, emotion_preds, target_names=emotion_list))
    ret["acc_genders"] = _match_rate(gender_preds, gender_labels, "gender")
    ret["acc_questions"] = _match_rate(question_preds, question_labels, "question")
    # Each value is reduced to its mean right away, so the dict is never
    # rewritten while being iterated.
    ret["avg_len"] = np.mean([len(x) for x in pred_texts])
    ret['bert_score'] = get_bert_score(pred_texts, gt_texts)
    ret["dist_2"] = distinct_2(pred_texts)
    ret["avg_acc"] = (ret["acc_emotions"] + ret["acc_genders"] + ret["acc_questions"]) / 3
    return ret

## Compute metrics

In [None]:
# Generated texts to evaluate, one per line. The `with` block closes the file
# as soon as it has been read; a bare open() inside the comprehension would
# leave the handle open until garbage collection.
with open("../train_dulemon_outputs/bart_baseline1/infer_sampling/topp_0.9.txt") as pred_file:
    pred_texts = [line.strip() for line in pred_file]

# batch_infer returns (gender, sentiment-classifier, question) predictions; the
# sentiment-classifier output is what gets scored against the Emotion labels.
gender_preds, emotion_preds, question_preds = batch_infer(pred_texts)
compute_metrics(pred_texts, gt_texts, gt_history, emotion_preds, sentiment_labels, emotion_labels, gender_preds, gender_labels, question_preds, question_labels)