#### To create ground truth data

In [1]:
from datasets import load_dataset

dataset = load_dataset("Amod/mental_health_counseling_conversations")

In [2]:
import pandas as pd

# Access the 'train' split of the dataset
train_dataset = dataset["train"]

# Convert to a Pandas DataFrame
df = pd.DataFrame(train_dataset)

# Remove duplicates based on 'Context' and 'Response' columns
df = df.drop_duplicates(subset=["Context", "Response"]).reset_index(drop=True)

# Convert the DataFrame to a list of dictionaries
documents = df.to_dict(orient="records")

In [3]:
# Create a new column which is hashes of the 'Context' and 'Response' columns to use as a ID.

import hashlib

def hash_record(doc):
    combined = f"{doc['Context']}-{doc['Response'][:25]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [4]:
for doc in documents:
    doc["ID"] = hash_record(doc)

In [5]:
from collections import defaultdict

hashes = defaultdict(list)

for doc in documents:
    doc_id = doc["ID"]
    hashes[doc_id].append(doc)

In [6]:
len(hashes), len(documents)

(2448, 2752)

In [7]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

ddc68f6c 2
e668f90a 2
9dd73376 2
1467290f 2
2ee217e8 2
682c2da1 2
b4c26a1a 2
83c29cf0 2
d6af732f 2
14d4b10c 2
f7bc7d0d 2
99f4aee0 2
98aec37f 2
03b8b4cb 2
9acf8463 2
7a7e11a6 2
7c1d25a5 2
a2e7c4c2 2
f0dba9d1 2
1fbacd13 2
8d294b25 2
c34fd418 2
9e57668a 2
f87c06be 2
0dc0015e 2
ed455587 2
32b78ea7 2
a8e1e29e 2
fe3e7145 2
9bf45e17 2
8b26c9a2 2
aebfdf22 2
fd1c25c4 2
10fe3515 2
fd0bf6ab 2
3da73682 2
23ae03d8 2
535ee431 2
496625a8 2
22cb5167 2
cb3308fe 2
7582f3b0 2
facd5268 2
f5b9fb6a 2
459f42c7 2
11afed21 2
ee4de255 2
808d6404 2
ee8e2e91 2
980d9b49 2
41ea894e 2
c9e5c553 2
1fe90e49 2
825888c8 2
da6c189b 2
ad63036d 2
91eb05f5 2
29d794ee 2
686d39f8 2
e54a4fb0 2
70e169fd 2
991df2da 2
16d66c82 2
a6247235 2
66f44965 2
a49e7384 2
36a642d9 2
5ddbd335 2
a0ae0add 2
bc389213 2
d8c6fc92 2
76a95ff8 2
bdacc67a 2
185bcbd2 2
49e7e754 2
382a096e 2
f7c9a742 2
25efacb1 2
17ee9c76 2
876d3e9a 2
eb2a1272 2
35897214 2
911edb46 2
874178ac 2
bda5824b 2
89374f6b 2
af63e5de 2
a078cf7f 2
2d6e0b72 2
703b449b 2
40fa80da 2

In [8]:
hashes["825888c8"]

[{'Context': "Why am I so afraid of it? I don't understand.",
  'Response': 'Why are you afraid of rape? Because it is a problem in the United States! The National Sexual Violence Resource Center reports that one in five women (0r 20%) will be raped (http://www.nsvrc.org/sites/default/files/publications_nsvrc_factsheet_media-packet_statistics-about-sexual-violence_0.pdf) and that 80% of women know their assailant. Given these statistics, it is perfectly logical to be afraid. However, there are things you can do to reduce the risk of rape, such as being aware of your surroundings, and limiting the use of drugs or alcohol.The Enhanced Access, Knowledge, Act program for college-aged women has been shown to reduce the risk of rape by more than 50%. (http://www.blueprintsprograms.com/factsheet/eaaa-enhanced-assess-acknowledge-act-sexual-assault-resistance-education) You may want to see if a program like this is available in your area.\xa0Another great app for when you need to walk somewhere

In [9]:
import json

with open("documents-with-ids.json", "wt") as f_out:
    json.dump(documents, f_out, indent=2)

In [10]:
prompt_template = """
You emulate a person seeking mental health counseling. Based on the counseling conversation record provided below, formulate 5 thoughtful and empathetic questions that this person might ask regarding their feelings and situation. The record includes input from both the patient and the counselor to give you context. Your questions should be complete and detailed, while avoiding excessive reuse of wording from the record.

The counseling record:

Patient: {context} Counselor: {response}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"] 
""".strip()

In [11]:
from dotenv import load_dotenv
import os
from openai import OpenAI

load_dotenv()
open_ai_api_key = os.getenv("OPEN_AI_API_KEY")
client = OpenAI(api_key=open_ai_api_key)

In [12]:
doc = documents[3]
prompt = prompt_template.format(context=doc["Context"], response=doc["Response"])
prompt

'You emulate a person seeking mental health counseling. Based on the counseling conversation record provided below, formulate 5 thoughtful and empathetic questions that this person might ask regarding their feelings and situation. The record includes input from both the patient and the counselor to give you context. Your questions should be complete and detailed, while avoiding excessive reuse of wording from the record.\n\nThe counseling record:\n\nPatient: I\'m going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I\'m worthless and how I shouldn\'t be here.\n   I\'ve never tried or contemplated suicide. I\'ve always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone? Counselor: Therapy is essential for those that are feeling depressed and worthless. When I work with those that are experiencing concerns related to feeling of depression and issues with self esteem. I

In [13]:
def generate_questions(doc):
    prompt = prompt_template.format(context=doc["Context"], response=doc["Response"])
    response = client.chat.completions.create(
        model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}]
    )
    json_response = response.choices[0].message.content
    return json_response

In [14]:
from tqdm.auto import tqdm

results = {}

for doc in tqdm(documents):
    doc_id = doc["ID"]
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions
    
    print(results[doc_id])


  0%|          | 0/2752 [00:00<?, ?it/s]

[
    "How can I start to identify and distance myself from the relationships and social situations that contribute to my feelings of worthlessness?",
    "What are some practical steps I can take to begin changing the narrative in my mind from feeling worthless to recognizing my own value?",
    "In what ways can finding and following positive messages on social media affect my self-esteem and help me reframe how I see myself?",
    "Can you provide examples of how other people have successfully shifted their feelings of worthlessness to a place of self-acceptance and purpose?",
    "How can I learn to accept and cope with my negative feelings without feeling overwhelmed by them or resorting to self-criticism?"
]
[
  "I'm struggling with feelings of worthlessness, and I find it hard to believe that others might feel the same way. How can I start to see that my feelings aren't unique to me, and how does that understanding help in addressing my mental health?",
  "You mentioned that cha

In [58]:
results

{'451abbc8': '[\n    "How can I start to identify and distance myself from the relationships and social situations that contribute to my feelings of worthlessness?",\n    "What are some practical steps I can take to begin changing the narrative in my mind from feeling worthless to recognizing my own value?",\n    "In what ways can finding and following positive messages on social media affect my self-esteem and help me reframe how I see myself?",\n    "Can you provide examples of how other people have successfully shifted their feelings of worthlessness to a place of self-acceptance and purpose?",\n    "How can I learn to accept and cope with my negative feelings without feeling overwhelmed by them or resorting to self-criticism?"\n]',
 'a8911790': '[\n    "What steps can I take to improve my sleep, and how will that positively affect my overall mental health and self-perception?",\n    "Can you help me identify some specific areas in my life where I might find things to be grateful fo

In [143]:
# To store the results in a different variable

questions = results.copy()

with open("questions.json", "wt") as f_out:
    json.dump(questions, f_out, indent=2)
    

In [144]:
print(questions["b7762f64"])

[
    "What specific steps can I take to find a therapist who specializes in trauma, and how do I know if they are the right fit for me?",
    "Can you explain more about how the process of recovering memories works and what I might experience emotionally as I go through this process?",
    "How can I begin to cope with the feelings of confusion and fear that come up for me when I think about my childhood experiences?",
    "Is it common for people who have gone through similar situations to struggle with memories, and how do they learn to navigate those feelings over time?",
    "What are some self-care strategies I can implement while I prepare to engage in therapy and confront these difficult memories?"
]


In [145]:
print(f"Number of questions: {len(questions)}")

Number of questions: 2448


In [146]:
parsed_results = {}

for doc_id, json_questions in questions.items():
    parsed_results[doc_id] = json.loads(json_questions)

In [147]:
print(f"Number of questions: {len(parsed_results)}")

Number of questions: 2448


In [148]:
print(f"Number of questions: {len(documents)}")

Number of questions: 2752


In [149]:
doc_index = {d["ID"]: d for d in documents}

In [150]:
doc_index

{'451abbc8': {'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?",
  'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that

In [151]:
print(len(doc_index))

2448


In [152]:
final_results = []

for doc_id, questions in parsed_results.items():
    for q in questions:
        final_results.append((q ,doc_id))

In [153]:
final_results

[('How can I start to identify and distance myself from the relationships and social situations that contribute to my feelings of worthlessness?',
  '451abbc8'),
 ('What are some practical steps I can take to begin changing the narrative in my mind from feeling worthless to recognizing my own value?',
  '451abbc8'),
 ('In what ways can finding and following positive messages on social media affect my self-esteem and help me reframe how I see myself?',
  '451abbc8'),
 ('Can you provide examples of how other people have successfully shifted their feelings of worthlessness to a place of self-acceptance and purpose?',
  '451abbc8'),
 ('How can I learn to accept and cope with my negative feelings without feeling overwhelmed by them or resorting to self-criticism?',
  '451abbc8'),
 ("I'm struggling with feelings of worthlessness, and I find it hard to believe that others might feel the same way. How can I start to see that my feelings aren't unique to me, and how does that understanding help

In [154]:
import pandas as pd

df2 = pd.DataFrame(final_results, columns=["Context", "ID"])

In [155]:
df2.to_csv("ground-truth-data_ContextOnly.csv", index=False)