In [2]:
pip install sentence-transformers scikit-learn numpy

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/05/89/7eb147a37b7f31d3c815543df539d8b8d0425e93296c875cc87719d65232/sentence_transformers-3.4.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl.metadata
  Downloading transformers-4.49.0-py3

In [None]:
!pip3 install datasets

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/4c/37/22ef7675bef4ffe9577b937ddca2e22791534cbbe11c30714972a91532dc/datasets-3.3.2-py3-none-any.whl.metadata
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Obtaining dependency information for pyarrow>=15.0.0 from https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting requests>=2.32.2 (from datasets)
  Obtaining dependency information for requests>=2.32.2 from https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl.metadata
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Obtain

In [17]:
# Load NLP Mental Health Conversations and clean it - format it for fine tuning
import csv
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

SYSTEM_PROMPT = """You are Atlas, an expert AI psychologist, well versed in the field of psychology, with hands on experience understanding people, you have comprehensive 
understanding of Freud and Carl Young as well as other great figures in psychology like Jean Piaget or B. F. Skinner You also have a computational approach
to understanding people and psychology"""

def clean_data(text):
    # Regex pattern matches 'http://' or 'https://' followed by non-whitespace characters.
    pattern = r'https?://\S+'
    cleaned_text = re.sub(pattern, '', text.replace('\xa0', ' '))
    return cleaned_text

def deduplicate_data(data, similarity_threshold=0.95):
    """
    Deduplicate rows in data based on separate similarity checks for user and assistant texts.
    A pair is considered duplicate if both the user and assistant message similarities exceed the threshold.
    
    Args:
        data: List of dicts, each containing a 'messages' key with user and assistant messages.
        similarity_threshold: Float threshold for cosine similarity to consider rows as duplicates.
    
    Returns:
        A deduplicated list of data rows.
    """
    # Extract user and assistant texts from the data
    texts_user = [row['messages'][1]['content'] for row in data]
    texts_assistant = [row['messages'][2]['content'] for row in data]
    
    # Load the SentenceTransformer model (you may choose another model if desired)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Compute embeddings for user and assistant messages separately
    embeddings_user = model.encode(texts_user)
    embeddings_assistant = model.encode(texts_assistant)

    # Compute cosine similarity matrices for both sets of embeddings
    sim_matrix_user = cosine_similarity(embeddings_user)
    sim_matrix_assistant = cosine_similarity(embeddings_assistant)
    
    duplicates = set()
    
    print("Checking for duplicate rows based on similarity in both user and assistant messages:")
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            if (sim_matrix_user[i][j] >= similarity_threshold and 
                sim_matrix_assistant[i][j] >= similarity_threshold):
                print(f"\nPotential duplicate pair (Row {i} and Row {j}):")
                print(f"User message Row {i}: {texts_user[i]}")
                print(f"User message Row {j}: {texts_user[j]}")
                print(f"Assistant message Row {i}: {texts_assistant[i]}")
                print(f"Assistant message Row {j}: {texts_assistant[j]}")
                # print(f"User similarity: {sim_matrix_user[i][j]:.4f}")
                # print(f"Assistant similarity: {sim_matrix_assistant[i][j]:.4f}")
                # print(f"User embedding Row {i}: {embeddings_user[i]}")
                # print(f"User embedding Row {j}: {embeddings_user[j]}")
                # print(f"Assistant embedding Row {i}: {embeddings_assistant[i]}")
                # print(f"Assistant embedding Row {j}: {embeddings_assistant[j]}")
                duplicates.add(j)
    
    deduped_data = [row for idx, row in enumerate(data) if idx not in duplicates]
    print(f"\nRemoved {len(duplicates)} duplicate rows out of {len(data)} total rows.")
    return deduped_data

data = []

with open("data/train.csv", mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    for index, row in enumerate(reader):
        if not row[0] or not row[1] or index == 0:
            continue
        messages = []
        messages.append({
            'role': 'system',
            'content': SYSTEM_PROMPT
        })
        # adjust data and remove links
        messages.append({
            'role': 'user',
            'content': clean_data(row[0])
        })
        messages.append({
            'role': 'assistant',
            'content': clean_data(row[1])
        })
        
        data.append({
            'messages': messages
        })

In [18]:
data = deduplicate_data(data)

print(len(data))

Checking for duplicate rows based on similarity in both user and assistant messages:

Potential duplicate pair (Row 23 and Row 1968):
User message Row 23: I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?
User message Row 1968: I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?
Assistant message Row 23: Let me start by saying there are never too m

In [19]:
import json
import random

# Intents dataset
def read_intents_dataset(filePath):
    with open(filePath, 'r', encoding='utf-8') as f:
        intents_data = json.load(f)

        intentsData = []
        intents = intents_data.get("intents", [])
        
        for intent in intents:
            patterns = intent.get("patterns", [])
            responses = intent.get("responses", [])
            if not patterns:
                continue  # Skip if no patterns provided
            
            pairs = []
            n_patterns = len(patterns)
            n_responses = len(responses)
            
            if n_responses <= n_patterns:
                # One-to-one pairing; discard extra patterns if any
                pairs = list(zip(patterns, responses))
            else:
                # Pair first n_patterns responses one-to-one and pair extra responses with a random pattern
                pairs = list(zip(patterns, responses[:n_patterns]))
                extra_responses = responses[n_patterns:]
                for resp in extra_responses:
                    random_pattern = random.choice(patterns)
                    pairs.append((random_pattern, resp))

            # Build the messages structure for each (pattern, response) pair
            for user_text, assistant_text in pairs:
                if not user_text or not assistant_text:
                    continue
                messages = [
                    {
                        'role': 'system',
                        'content': SYSTEM_PROMPT
                    },
                    {
                        'role': 'user',
                        'content': clean_data(user_text)
                    },
                    {
                        'role': 'assistant',
                        'content': clean_data(assistant_text)
                    }
                ]
                intentsData.append({'messages': messages})
            
        # Deduplicate the generated training examples
        return deduplicate_data(intentsData)

In [20]:
deduped_data_intents = read_intents_dataset('data/intents.json')
        
print(len(deduped_data_intents))

Checking for duplicate rows based on similarity in both user and assistant messages:

Removed 0 duplicate rows out of 152 total rows.
152


In [21]:
deduped_data_intents_therapy = read_intents_dataset('data/intents-therapy.json')
print(len(deduped_data_intents_therapy))

Checking for duplicate rows based on similarity in both user and assistant messages:

Removed 0 duplicate rows out of 39 total rows.
39


In [22]:
intents_dataset = deduplicate_data(deduped_data_intents + deduped_data_intents_therapy)


Checking for duplicate rows based on similarity in both user and assistant messages:

Removed 0 duplicate rows out of 191 total rows.


In [23]:
# Parse SPARTA_WSDM2022 dataset found online (just the github part)
import os

def process_csv_directory(directory_path):
    """
    Reads every CSV file in the given directory and creates a multi-turn conversation
    for each file by taking the third entry (index 2) from each row.
    The conversation alternates roles: the first message is from the assistant, the next from the user,
    then assistant, then user, and so on.
    
    Args:
        directory_path (str): The path to the directory containing CSV files.
    
    Returns:
        List[dict]: A list of conversation dictionaries, each with a "file" and "messages" key.
    """
    conversations = []
    # Loop over all files in the directory
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.csv'):
            filepath = os.path.join(directory_path, filename)
            messages = []
            # Open and read the CSV file
            with open(filepath, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile)
                # Define role order: first assistant, then user, then alternating.
                roles = ['assistant', 'user']
                role_index = 0
                for index, row in enumerate(reader):
                    if index == 0:
                        continue
                    # Ensure the row has at least three entries
                    if len(row) < 3:
                        continue
                    # Get the third entry from the row
                    content = row[2].strip()
                    messages.append({
                        'role': roles[role_index],
                        'content': content
                    })
                    # Alternate the role for the next message
                    role_index = 1 - role_index
            # Save the conversation from this file
            conversation = {
                'messages': messages
            }
            conversations.append(conversation)
    return conversations

hope_convs = process_csv_directory('data/hope_therapy_conversations')

In [24]:
hope_convs[0]

{'messages': [{'role': 'assistant',
   'content': 'Jackie, I joined today. Okay, how about you?'},
  {'role': 'user', 'content': 'I do well, thanks for asking.'},
  {'role': 'assistant',
   'content': "I understand you've been having some difficulties recently that have brought you into the agency."},
  {'role': 'user',
   'content': "Yeah, I'm, I'm recently married. And so my husband and I moved in together. And up until then I I've lived by myself. My whole adult life. So apparently the craziest things have been happening. While I've been sleeping. several, several nights I've woken him up screaming. I'm screaming. Apparently a very vulgar, profane, profane the words and So that's a little alarming to him. And I've even, you know, we've been hitting him in the middle of the night. You know, he's waking up with these huge bruises on his arms. And so yeah, apparently, a behavior in the middle of the night is out of hand."},
  {'role': 'assistant',
   'content': 'So I can understand why

In [26]:
import random

random_row = random.choice(all_data)
print(random_row)

{'messages': [{'role': 'system', 'content': 'You are Atlas, an expert AI psychologist, well versed in the field of psychology, with hands on experience understanding people, you have comprehensive \nunderstanding of Freud and Carl Young as well as other great figures in psychology like Jean Piaget or B. F. Skinner You also have a computational approach\nto understanding people and psychology'}, {'role': 'user', 'content': "I'm a teenager and I I go through periods of moodiness. I have extreme feelings (but I bury them), and then I feel nothing - just empty and emotionless.\n   I have trust issues, low self esteem,ban extreme fear of abandonment, and I constantly reevaluate my relationships. I also don't know who I am as a person. I'm not happy overall, but I'm not suicidal. Can I just feel happy?"}, {'role': 'assistant', 'content': "Sometimes the way to feel happy is to address all the questions you just described.Teenage years are the first time any of us starts to feel and recognize 

In [79]:
# Introduce around 800 samples from synthetic dataset nart-100k-synthetic
from datasets import load_dataset

ds = load_dataset("jerryjalapeno/nart-100k-synthetic")

In [80]:
def process_conversations_dataset(sample_size=800, user_role="human", assistant_role="gpt"):
    """
    Loads the synthetic dataset "nart-100k-synthetic", randomly selects a given number of samples,
    and processes each sample into a multi-turn conversation format.

    The format for each sample is:
      - A system message (using SYSTEM_PROMPT)
      - Followed by conversation turns, with roles mapped:
          * 'human'  -> 'user'
          * 'gpt'    -> 'assistant'
    
    Args:
        sample_size (int): Number of samples to randomly select.
    
    Returns:
        List[dict]: List of processed conversation dictionaries.
    """
    # Assuming the primary split is named "train"
    samples = ds["train"]

    # Randomly select sample_size indices
    indices = random.sample(range(len(samples)), sample_size)
    selected_samples = [samples[i] for i in indices]

    processed_data = []
    for sample in selected_samples:
        messages = []
        # Add the system message first
        messages.append({
            'role': 'system',
            'content': SYSTEM_PROMPT
        })
        # Process each turn in the conversation
        for turn in sample.get("conversations", []):
            role = turn.get("from", "").lower()
            # Map roles: human -> user, gpt -> assistant
            if role == "human":
                role_mapped = "user"
            elif role == "gpt":
                role_mapped = "assistant"
            else:
                role_mapped = role
            value = turn.get("value", "")
            messages.append({
                "role": role_mapped,
                "content": clean_data(value)
            })
        processed_data.append({"messages": messages})
    return processed_data

In [81]:
processed_conversations_nart = process_conversations_dataset(sample_size=1200)
print(len(processed_conversations_nart))

1200


In [36]:
# CalebE/new_mental_health_conversations_all1
from datasets import load_dataset

ds = load_dataset("CalebE/new_mental_health_conversations_all1")

Generating train split: 100%|██████████| 13358/13358 [00:00<00:00, 646155.68 examples/s]


In [None]:
def process_instruction_data(sample_size=800, similarity_threshold=0.95):
    """
    Processes a dataset where each sample is a dict with keys:
      - "instruction"
      - "output"
      - "input"
    
    Randomly selects sample_size examples and converts each into a multi-turn conversation format:
      1. A system message using SYSTEM_PROMPT.
      2. A user message created by concatenating the "instruction" and "input" (if provided).
      3. An assistant message from the "output" field.
    
    The resulting examples are then deduplicated.
    
    Args:
        ds: A dataset object where each sample is a dict with keys "instruction", "output", and "input".
        sample_size (int): Number of random samples to select.
        similarity_threshold (float): Threshold for deduplication.
    
    Returns:
        List[dict]: A list of processed and deduplicated conversation examples.
    """
    ds_train = ds['train']
    # Randomly select sample_size indices from the dataset
    indices = random.sample(range(len(ds_train)), sample_size)
    selected_samples = [ds_train[i] for i in indices]
    
    processed_data = []
    for sample in selected_samples:
        # Build the user text by combining "instruction" and "input" (if present)
        user_text = clean_data(sample.get("instruction", ""))
        
        # Build the conversation messages
        messages = [
            {
                'role': 'system',
                'content': SYSTEM_PROMPT
            },
            {
                'role': 'user',
                'content': user_text
            },
            {
                'role': 'assistant',
                'content': clean_data(sample.get("output", ""))
            }
        ]
        processed_data.append({'messages': messages})
    
    # Deduplicate the generated examples
    deduped_data = deduplicate_data(processed_data, similarity_threshold=similarity_threshold)
    return deduped_data

In [46]:
processed_conversations_calebe = process_instruction_data(sample_size=500)
print(len(processed_conversations_calebe))

Checking for duplicate rows based on similarity in both user and assistant messages:

Potential duplicate pair (Row 96 and Row 311):
User message Row 96: For the past four weeks, I've been having nightmares and hearing voices telling me of horrible things. They're not random voices, but rather, the voices of those I care about.
User message Row 311: For the past four weeks, I've been having nightmares and hearing voices telling me of horrible things. They're not random voices, but rather, the voices of those I care about.
Assistant message Row 96: It's important to take a look inside and find out why you are experiencing these feelings. It could anything and it will change your life to know what your particular issue or trigger is. Please contact us a call with the method that is most comfortable for you.
Assistant message Row 311: It's important to take a look inside and find out why you are experiencing these feelings. It could anything and it will change your life to know what your 

In [82]:
# PsyV0, psychology related questions
from datasets import load_dataset

ds = load_dataset("Gragroo/psyV0")

In [84]:
all_data = data + intents_dataset + hope_convs + processed_conversations_nart + processed_conversations_calebe + dataset_psyv0
print(len(all_data))

4918


In [85]:
print(len(deduplicate_data(all_data)))

Checking for duplicate rows based on similarity in both user and assistant messages:

Potential duplicate pair (Row 31 and Row 3752):
User message Row 31: I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?
User message Row 3752: I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?
Assistant message Row 31: Most clients have many issues that need work