In [2]:
import PyPDF2
import re  # For regular expressions (if needed)

import pandas as pd

from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:

def _chunk_by_headings(text, headings):
    """Return the body text found under each heading, in ``headings`` order.

    A section starts on the line after a line that consists solely of the
    heading and ends just before the first later line that consists solely of
    the *next* heading in ``headings`` (or at the end of ``text`` when there is
    no such line or the heading is the last one listed).
    """
    # Headings are literal strings, so regex metacharacters are escaped.
    # Trailing spaces/tabs are tolerated so stray whitespace left at the end
    # of a heading line by text extraction does not hide the heading.
    heading_patterns = [
        re.compile(rf"^{re.escape(heading)}[ \t]*$", re.MULTILINE)
        for heading in headings
    ]

    chunks = []
    for position, pattern in enumerate(heading_patterns):
        has_following = position + 1 < len(heading_patterns)
        following = heading_patterns[position + 1] if has_following else None
        for match in pattern.finditer(text):
            # Search from the end of the heading line; with a start offset,
            # "^" still only matches at real line starts.
            next_heading = following.search(text, match.end()) if following else None
            section_end = next_heading.start() if next_heading else len(text)
            chunks.append(text[match.end():section_end].strip())
    return chunks


def _chunk_by_regex(text, section_start_regex):
    """Split ``text`` at every match of ``section_start_regex`` (MULTILINE).

    Each chunk runs from the start of one match up to the start of the next
    match (or the end of ``text``), so the matched heading stays in its chunk.
    """
    starts = [m.start() for m in re.finditer(section_start_regex, text, re.MULTILINE)]
    ends = starts[1:] + [len(text)]
    return [text[start:end].strip() for start, end in zip(starts, ends)]


def chunk_pdf_by_sections(pdf_path, section_markers=None):  # section_markers can be regex or list of headings
    """Extract the text of a PDF and split it into section chunks.

    Args:
        pdf_path: Path of the PDF file; opened in binary mode and read with
            ``PyPDF2.PdfReader``.
        section_markers: How to find sections.
            * list/tuple of heading strings -- each heading must occupy a line
              of its own; the chunk is the text between that line and the next
              listed heading (heading line excluded).
            * str -- a regular expression (compiled with ``re.MULTILINE``) that
              matches the start of a section; the chunk includes the match.
            * ``None`` / empty -- manual annotation is not implemented; an
              empty list is returned.

    Returns:
        list[str]: The stripped section chunks (possibly empty).

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # "or ''" guards against pages for which no text is returned.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Join pages with a newline so a heading at the top of a page is not glued
    # onto the last line of the previous page.
    text = "\n".join(page_texts)

    if not section_markers:
        # Manual annotation (reading section boundaries from a separate file or
        # from markers in the text) is not implemented yet.
        return []

    if isinstance(section_markers, (list, tuple)):  # List of headings
        return _chunk_by_headings(text, list(section_markers))

    if isinstance(section_markers, str):  # Regex for section start
        return _chunk_by_regex(text, section_markers)

    # Unsupported marker type: nothing to split on.
    return []

# Example usage (automatic - list of headings):
section_headings = ["Introduction", "Symptoms", "Treatment", "Prevention"]  # Your actual section headings
heading_chunks = chunk_pdf_by_sections("mental_health.pdf", section_headings)

# Example usage (automatic - regex):
section_regex = r"^##\s*(.+)$"  # Regex to find section starts (e.g. Markdown headings)
regex_chunks = chunk_pdf_by_sections("mental_health.pdf", section_regex)

# Example usage (manual annotation):
# chunks = chunk_pdf_by_sections("mental_health.pdf") # You'd need to implement the manual parsing

# Both example results are kept under their own names instead of the second
# silently overwriting the first; the regex-based chunks are the ones written
# out, as before.
chunks = regex_chunks

# Write one chunk per line. Whitespace inside a chunk (including newlines) is
# collapsed to single spaces so a multi-line section stays on a single line,
# and empty chunks are skipped so no blank lines are emitted. The encoding is
# explicit, matching the other writers in this notebook.
with open("my_dataset.txt", "w", encoding="utf-8") as outfile:
    for chunk in chunks:
        single_line = " ".join(chunk.split())
        if single_line:
            outfile.write(single_line + "\n")

In [None]:
# Load the full dataset (one example per line of the text file).
# NOTE(review): the chunking cell writes "my_dataset.txt" while this cell reads
# "your_dataset.txt" -- confirm which file is intended.
dataset = load_dataset("text", data_files={"full": "your_dataset.txt"})

# Split into train (70%) and a held-out part (30%) using the dataset's own
# train_test_split, which returns a DatasetDict with "train" and "test" keys.
# (sklearn's train_test_split does not return Dataset objects, and its results
# cannot be passed to load_dataset, which expects file paths in data_files.)
train_holdout_split = dataset["full"].train_test_split(test_size=0.3, seed=42)  # Adjust test_size as needed
train_dataset = train_holdout_split["train"]
eval_test_dataset = train_holdout_split["test"]

# Split the held-out part evenly into validation and test.
eval_test_split = eval_test_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = eval_test_split["train"]
test_dataset = eval_test_split["test"]

# Tokenize datasets. The "text" loader stores each line in a column named
# "text", so that is the raw column to drop after tokenization.
# NOTE(review): tokenize_function is not defined in the cells shown here; it
# must be defined before this cell runs.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [9]:
# Read the user/bot conversation pairs from the CSV file
# (change the path below to point at your own file).
df = pd.read_csv(filepath_or_buffer="dataset/dataset.csv")
df

Unnamed: 0,user,bot
0,"I'm feeling really down lately, is there a dif...","Yes, there is a difference. Sadness is a norma..."
1,What are some of the signs and symptoms of dep...,Some common signs and symptoms include: persis...
2,I've heard there are different types of depres...,"Yes, there are several types of depressive dis..."
3,My friend is going through a tough time and se...,The most important thing is to encourage your ...
4,Is depression more common in certain groups of...,"Depression can affect anyone, but it is more c..."
5,"I'm a man, and I've been feeling irritable and...","Yes, irritability and fatigue can be symptoms ..."
6,My son has been complaining about stomach ache...,Depression in children can sometimes manifest ...
7,"Is it normal for teens to have mood swings, or...",Occasional bad moods are a normal part of bein...
8,My grandmother seems more tired and grumpy lat...,"Yes, depression in older adults can sometimes ..."
9,I'm worried that my constant worrying might le...,"Yes, there is a strong connection between anxi..."


In [12]:
# Fraction of conversation pairs that goes into the training file.
TRAIN_FRACTION = 0.8

# Flatten every (user, bot) row into two consecutive lines:
#   "User: <utterance>" followed by "Bot: <utterance>".
text_data = []

# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a 0..n-1 integer index.
for user_utterance, bot_utterance in zip(df["user"], df["bot"]):
    # Handle missing values. The placeholders carry no "User: "/"Bot: " prefix
    # because the prefix is added below; otherwise it would appear twice.
    if pd.isna(user_utterance):
        user_utterance = ""  # Or some other placeholder
    if pd.isna(bot_utterance):
        bot_utterance = "I don't know."  # Or a default response

    text_data.append(f"User: {user_utterance}")
    text_data.append(f"Bot: {bot_utterance}")

# Split into training and validation sets. The cut is computed in whole
# conversation pairs (two lines each) so a user line is never separated from
# its bot reply.
pair_count = len(text_data) // 2
split_index = 2 * int(pair_count * TRAIN_FRACTION)
train_data = text_data[:split_index]  # first 80% of the pairs
val_data = text_data[split_index:]  # remaining pairs

# Save to text files (train.txt and val.txt), one utterance per line.
with open("dataset/train.txt", "w", encoding="utf-8") as f:  # Add encoding for special characters
    f.writelines(line + "\n" for line in train_data)

with open("dataset/val.txt", "w", encoding="utf-8") as f:  # Add encoding for special characters
    f.writelines(line + "\n" for line in val_data)

# ... (rest of your fine-tuning code using train.txt and val.txt)