## Download Dataset

In [126]:

from datasets import load_dataset
import pandas as pd
import warnings
import json
import re
warnings.filterwarnings('ignore')
dataset = load_dataset("li2017dailydialog/daily_dialog",split='train')

In [2]:
dataset

Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 11118
})

In [3]:
dataset[0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

## data cleaning

emoji:- no emotion (0), anger (1), disgust (2), fear (3), happiness (4), sadness (5) and surprise (6).

act:- dummy(0), inform (1), question (2), directive (3) and commissive (4).

In [4]:
# extract emotions keep only unique emotion
ll = 'no emotion , anger , disgust , fear , happiness , sadness , surprise '.split(',')
emo = dict(zip([i for i in range(7)],ll))

# extract actions keep only unique action

lll = 'dummy, inform , question , directive , commissive '.split(',')

act = dict(zip([i for i in range(5)],lll))

In [5]:
extracted_emo=[]
for i in dataset['emotion']:
    extracted_emo.append(','.join({emo[j] for j in i if j }))
    if  len(extracted_emo[-1])==0:
        extracted_emo[-1]='no emotion'

extracted_act=[]
for i in dataset['act']:
    extracted_act.append(','.join({act[j] for j in i if j }))
    if  len(extracted_act[-1])==0:
        extracted_act[-1]='dummy'

In [6]:
len(extracted_emo) , len(extracted_act)

(11118, 11118)

In [7]:
extracted_act[0]

' directive , commissive , question , inform '

In [8]:
#extract text
text = []
for i in dataset['dialog']:
    text.append(' '.join(i))

In [9]:
text[0]

"Say , Jim , how about going for a few beers after dinner ?   You know that is tempting but is really not good for our fitness .   What do you mean ? It will help us to relax .   Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?   I guess you are right.But what shall we do ? I don't feel like sitting at home .   I suggest a walk over to the gym where we can play singsong and meet some of our friends .   That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .   Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .   Good.Let ' s go now .   All right . "

## Save the dataset

In [10]:
daily_dialog = pd.DataFrame( zip(text,extracted_emo,extracted_act), columns = ['text','emoji','action'])

In [11]:
daily_dialog.sample(10)

Unnamed: 0,text,emoji,action
3258,I heard that Kevin got divorced . Is that true...,surprise,"question , inform"
4232,Would you like to have dinner with me tonight ...,no emotion,"directive , commissive , inform"
3186,"Susan , Did you know the four ugliest women in...",no emotion,"question , inform"
644,Did you hear what happened to Mike last night ...,surprise,"question , inform"
40,"Could you tell me the right time , please ? ...",no emotion,"directive , commissive , inform"
7145,Would you please explain for me what a busines...,no emotion,"directive , question , inform"
211,Did you get any rewards or honors in college ?...,happiness,"question , inform"
10185,"I don't know if you remember , but with honest...","surprise , happiness",inform
9387,"Hi , my name is Tom . Tom , the new sales re...",surprise,"directive , commissive , inform"
4567,"How can I lose weight , doctor ? I seem to get...",no emotion,"question , inform"


In [12]:
#remove duplication dailog
daily_dialog = daily_dialog.drop_duplicates(subset=['text'],ignore_index=True)


In [13]:
daily_dialog.describe()

Unnamed: 0,text,emoji,action
count,10549,10549,10549
unique,10549,38,15
top,"Could I have the check , please ? Okay . I'l...",no emotion,"directive , commissive , question , inform"
freq,1,5048,3874


In [14]:
daily_dialog.to_csv('./../data/daily_dialog.csv',index = False)

## build ground truth

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_ground_truth(df, output_file,ground_truth_size=300):    
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    
    # Calculate cosine similarity between all pairs of dialogues
    cosine_sim = cosine_similarity(tfidf_matrix)
    
    # Generate ground truth data
    ground_truth = []
    for idx, row in df.iterrows():
        dialogue_id = idx
    
          # Extract the dialogue from the text column
        dialogue_list = row['text'].split(',')  # Convert string representation of list to actual list
        
        if len(dialogue_list) >= 2:
            query = dialogue_list[-2]  # Second last is the question
            ground_truth_response = dialogue_list[-1]  # Last is the response
        else:
            query = "No dialogue"
            ground_truth_response = "No response"
        
            
        # Create ground truth entry
        entry = {
            'dialogue_id': dialogue_id,
            'query': query,
            'ground_truth_response': ground_truth_response
           
        }
        ground_truth.append(entry)
    
    # Convert to DataFrame and save as CSV
    ground_truth_df = pd.DataFrame(ground_truth)
    
    # Generate a random sample of 300 records from the original test DataFrame
    # Set a random seed for reproducibility
    sampled_df = ground_truth_df.sample(n=ground_truth_size, random_state=40,ignore_index = True)
    sampled_df.to_csv(output_file, index=False)
    print(f"Ground truth file created: {output_file}")




output_file = './../data/daily_dialog_ground_truth.csv'

generate_ground_truth(daily_dialog, output_file,ground_truth_size=1000)

Ground truth file created: ./../data/daily_dialog_ground_truth.csv


## Use LLM to biuld ground truth 

In [127]:
from transformers import pipeline
# Use a pipeline as a high-level helper
from transformers import pipeline , set_seed

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation",
                model="Qwen/Qwen2.5-0.5B-Instruct", 
                device_map="auto",
                max_new_tokens=256,
                do_sample=False
                )
set_seed(42)
pipe(messages)

[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': 'I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!'}]}]

In [4]:
daily_dialog = pd.read_csv('./../data/daily_dialog.csv')

In [8]:
daily_dialog['id']=daily_dialog.index 

In [9]:
daily_dialog.head()

Unnamed: 0,text,emoji,action,id
0,"Say , Jim , how about going for a few beers af...",happiness,"directive , commissive , question , inform",0
1,Can you do push-ups ? Of course I can . It's...,surprise,"question , inform",1
2,"Can you study with the radio on ? No , I lis...",no emotion,"question , inform",2
3,Are you all right ? I will be all right soon...,no emotion,"question , inform",3
4,"Hey John , nice skates . Are they new ? Yeah...",surprise,"directive , commissive , question , inform",4


In [130]:
sampled_df = daily_dialog.sample(n=500, random_state=40,ignore_index = True)
    

In [131]:
documents = sampled_df.to_dict(orient='records')

prompt_template = """
You emulate a user of our life coach application.
Formulate a question with it's answer this user might ask based on a provided dialog text.
Make the questions specific to this dialog.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

dialog: {text}


Provide the output in parsable JSON without using code blocks:

{{"questions": [  ]}}
""".strip()

prompt = prompt_template.format(**documents[5])



In [132]:
print(prompt)

You emulate a user of our life coach application.
Formulate a question with it's answer this user might ask based on a provided dialog text.
Make the questions specific to this dialog.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

dialog: Hey , Robert , that's a nice shirt you are wearing . Where did you get it ?   thanks , I like it too . I bought it at the nearby department store .   that's nice . Do you know you can get one at the wholesale market near the zoo for a much lower price ?   yeah , I know that . But at those places . the prices they ask you are ridiculously high , and if you don't bargain hard , you will get ripped-off .   true , learning how to haggle the price is one of the things people have to pick up when they come to China for the first time .   yeah , but personally , I hate bargaining . If I bargain , I might come to a lower price , but 

In [133]:
def llm(prompt):
   
    messages=[{"role": "user", "content": prompt}]
    
    
    return pipe(messages)[0]['generated_text'][1]['content']

In [134]:
print(llm(prompt))

```json
{
  "questions": [
    {
      "question": "Where did you buy the shirt?",
      "answer": "I bought it at the nearby department store."
    },
    {
      "question": "Do you know you can get one at the wholesale market near the zoo for a much lower price?",
      "answer": "Yes, I know that. But at those places, the prices they ask are ridiculously high, and if you don't bargain hard, you will get ripped-off."
    },
    {
      "question": "What do you think about haggling the price?",
      "answer": "Learning how to haggle the price is one of the things people have to learn when they come to China for the first time. However, I hate bargaining because I might come to a lower price, but I won't know the true price of what I'm buying; and I always feel that I am overcharged at the wholesale market."
    }
  ]
}
```


In [143]:
sample = sampled_df.to_dict(orient='records')

qa = []

for record in tqdm(sample):
    
    prompt = prompt_template.format(**record)
    response = llm(prompt) 
    
    match = re.search(r'```json(.*?)```', response, re.DOTALL)


    if match:
        extracted_content = match.group(1).strip()  # Extract the content and remove leading/trailing whitespace
        #print(extracted_content)
    else:
   
        extracted_content = response  # Extract the content and remove leading/trailing whitespace
   

    try:
        content = json.loads(extracted_content)
    except :
        continue

    qa.append((record, content))

  0%|          | 0/500 [00:00<?, ?it/s]

In [144]:
qa[0]

({'text': "I want to buy a rain coat . Could you please show me one ?   With pleasure . What color do you like ?   I like yellow best . How much does it cost ?   It costs 275 yuan , Miss .   It's nice , but that's very steep for a rain coat . Could you give me a 20 percent discount ?   Sorry , we don't give discounts .   In that case , I don't think I'll buy one here . ",
  'emoji': ' sadness ',
  'action': ' directive , commissive , question , inform ',
  'id': 8777},
 {'questions': [{'question': 'What is your favorite color?',
    'answer': 'Yellow'},
   {'question': 'How much would you like to pay for the raincoat?',
    'answer': '$275'},
   {'question': 'Would you like a 20% discount?',
    'answer': "Sorry, we don't offer any discounts."}]})

In [153]:
df_eval.content[0]

{'questions': [{'question': 'What is your favorite color?',
   'answer': 'Yellow'},
  {'question': 'How much would you like to pay for the raincoat?',
   'answer': '$275'},
  {'question': 'Would you like a 20% discount?',
   'answer': "Sorry, we don't offer any discounts."}]}

In [176]:
def questions(i):
   
    if  i.get('questions') :  # is not empty list
        questions=[]
        for j in i.get('questions'):
            try:
             questions.append(j.get('question'))
            except:
                continue
        return questions

def answers(i):
    
    if  i.get('questions') :  # is not empty list
        answers=[]
        for j in i.get('questions'):
            try:
             answers.append(j.get('answer'))
            except:
                continue
        return answers
answers(df_eval.content)

In [177]:
df_eval = pd.DataFrame(qa, columns=['record','content'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['text'] = df_eval.record.apply(lambda d: d['text'])
df_eval['emoji'] = df_eval.record.apply(lambda d: d['emoji'])
df_eval['action'] = df_eval.record.apply(lambda d: d['action'])



df_eval['questions'] = df_eval.content.apply(lambda d: questions(d) )
df_eval['answers'] = df_eval.content.apply(lambda d: answers(d) )

del df_eval['record']
del df_eval['content']

In [179]:
df_eval.shape

(491, 6)

In [200]:
df_cleaned = df_eval.dropna(subset=['questions','answers'], axis=0)

In [201]:
df = df_cleaned.explode(['questions','answers'])

In [202]:
df.shape

(489, 6)

In [203]:
df = df.dropna(subset=['questions','answers'], axis=0)

In [205]:
df.shape

(486, 6)

In [204]:
df.to_csv('../data/ground_truth_llm.csv', index=False)