In [12]:
# Dependencies are in requirements.txt
# list dependencies for this specific part
# imports!!
'''
transformers torch langchain sentence transformers pinecone
'''
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer as st
import faiss
import numpy as np
import pickle

print("Welcome")

Welcome


In [4]:
# Directory holding the raw documents that are split into chunks further down.
to_chunk_dir = 'data/clinical_data/to_chunk'

# Read every regular file in the directory as UTF-8 text.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps
#   the document order (and therefore every chunk position) the same on
#   each run.
# - isfile(): skip sub-directories (e.g. .ipynb_checkpoints) instead of
#   crashing when trying to open them.
# - with: close each file handle as soon as it has been read.
to_chunk = []
for file_name in sorted(os.listdir(to_chunk_dir)):
    file_path = os.path.join(to_chunk_dir, file_name)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as doc_file:
        to_chunk.append(doc_file.read())

print(len(to_chunk))


5


In [5]:
# Load the two reference texts that are kept whole rather than chunked here.
# NOTE(review): presumably these are always included as context — confirm.
# Context managers close each file handle right after reading.
with open("data/clinical_data/mandatory_context_DSM5_MMD.txt", 'r', encoding='utf-8') as dsm_file:
    dsm = dsm_file.read()
with open("data/clinical_data/phq8.txt", 'r', encoding='utf-8') as phq_file:
    phq = phq_file.read()

# Order matters for the lookup below: index 0 is the DSM5 file, index 1 the PHQ-8 file.
mandatory = [dsm, phq]

print(mandatory[1])


The PHQ-8 is a tool used for depression screening and severity in adolescents and adults.  
Instructions: Over the last two weeks, how often have you been bothered by the following problems? 
0 to 1 day = “not at all,” 2 to 6 days = “several days,” 7 to 11 days = “more than half the days,” and 12 to 
14 days = “nearly every day,” 
1. Little interest or pleasure in doing things 
2. Feeling down, depressed, or hopeless 
3. Trouble falling or staying asleep, or sleeping too much 
4. Feeling tired or having little energy 
5. Poor appetite or overeating 
6. Feeling bad about yourself – or that you are a failure or have let yourself or your family down 
7. Trouble concentrating on things, such as reading the newspaper or watching television 
8. Moving or speaking so slowly that other people could not have noticed. Or the opposite – being 
fidgety or restless that you have been moving around a lot more than usual 
Scoring Instructions:  
Total score is determined by adding together the score

## Document Process Splitting 

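Split each document into chunks of at most 500 characters with a 100-character overlap (the splitter measures length with `len`, i.e. characters, not tokens) to preserve as much context as possible across chunk boundaries.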

### Input: list of documents to chunk Output: set of chunks of text

In [6]:
# Splitter settings; sizes are measured with len(), i.e. in characters.
splitter_options = dict(
    chunk_size=500,       # upper bound on the length of one chunk
    chunk_overlap=100,    # shared tail/head between neighbouring chunks
    length_function=len,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Flatten the per-document splits into a single list, keeping document order.
chunks = [
    piece
    for document in to_chunk
    for piece in text_splitter.split_text(document)
]

print(f"Number of chunks: {len(chunks)}")

Number of chunks: 704


In [7]:
# Show the final chunk. Index -1 replaces the hard-coded 703 so the cell keeps
# working (and keeps showing the last chunk) when the number of chunks changes.
print(chunks[-1])

• The symptoms result in significant distress or significant impairment in personal, family,
social, educational, occupational or other important areas of functioning. If functioning is
maintained, it is only through significant additional effort.
6A8Z Mood disorder, unspecified
Mood disorders | Other specified and unspecified mood disorder


## Save chunks to pickle to use in query later

In [13]:
# Persist the chunk texts at the path the lookup cell reads them back from
# ("data/RAG/chunks.pkl"). Writing to the working directory instead meant a
# fresh Restart & Run All could not find the file without moving it by hand.
chunks_path = "data/RAG/chunks.pkl"
os.makedirs(os.path.dirname(chunks_path), exist_ok=True)  # create data/RAG on first run

with open(chunks_path, "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

print("Chunks saved!")

Chunks saved!


In [None]:
# check pickle properly stored chunks

## Generate Embeddings 
convert chunks into vector embeddings

### Input: set of chunks of text  Output: set of vector embeddings

In [8]:
# Name of the sentence-transformers checkpoint used to embed the chunks.
encoder_name = 'all-MiniLM-L6-v2'
model = st(encoder_name)

# Encode all chunks in one call, showing a progress bar while batches run.
embeddings = model.encode(
    chunks,
    show_progress_bar=True,
)

print(f"Shape: {embeddings.shape}")  # recorded run: (704, 384)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Shape: (704, 384)


In [9]:
# Look at the first embedding
print(embeddings[0])

# Or to make it prettier
print(f"Shape: {embeddings[0].shape}")
print(f"First 10 values: {embeddings[0][:10]}")
print(f"Type: {type(embeddings[0])}")

[ 2.69429758e-02 -4.86390963e-02  5.78358248e-02  1.02492154e-01
  1.15027027e-02  4.36887927e-02 -3.79711419e-04  5.61335608e-02
  1.81688145e-02 -1.25980331e-02  8.93459748e-03  4.31207847e-03
 -6.35666624e-02 -3.96824814e-02  7.84375370e-02  3.67724076e-02
  2.54429914e-02 -4.43847328e-02  4.16864380e-02  6.91626742e-02
  2.73864623e-02  3.69042754e-02 -3.30401808e-02  6.04620986e-02
 -2.66672596e-02  6.21446408e-02  5.00290841e-02 -3.70500609e-02
  1.99956819e-02  6.04986772e-02 -1.41461138e-02  8.66947100e-02
  5.59696928e-02  5.92184439e-02  1.64900664e-02 -7.96614587e-03
 -6.31108284e-02  4.65482585e-02  1.21297622e-02 -7.76730943e-03
  8.07345565e-03  4.78068627e-02  1.03901001e-02  1.79688614e-02
 -3.36280651e-02 -9.80865434e-02 -4.19874713e-02  3.40160429e-02
 -5.96934110e-02 -1.04531653e-01  7.73018301e-02 -6.96661090e-03
  1.87245701e-02  8.15561712e-02  5.58135062e-02 -9.03742835e-02
  6.19646013e-02  8.03769231e-02  4.81955633e-02  5.45013212e-02
 -2.68454477e-02  3.87476

## Set up vector Database
name: depression_embeddings

Opted for FAISS over Pinecone: the database is local and needs no API key. This is acceptable because the dataset is small; Pinecone will be used for development or for a larger RAG.

### Embeddings are L2-normalized before indexing; the QUERY EMBEDDING MUST be L2-normalized the same way
### Input: set of vector embeddings Output: Vector Database

In [10]:
# Take the vector size from the data rather than hard-coding 384, so the cell
# keeps working if the embedding model is swapped for one with another size.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # inner product == cosine similarity on unit vectors

# Normalize a float32 copy instead of `embeddings` itself:
# - faiss.normalize_L2 works in place, so normalizing the copy leaves the
#   array produced by the embedding cell unchanged for other cells;
# - the copy is float32, the dtype the index is fed below.
vectors = embeddings.astype('float32')
faiss.normalize_L2(vectors)

# Add the unit-length vectors; row i of the index corresponds to chunks[i].
index.add(vectors)

# Save index to disk so it does not have to be rebuilt.
faiss.write_index(index, "depression_embeddings.index")

print(f"Added {index.ntotal} vectors to FAISS index")

Added 704 vectors to FAISS index


## Check the database

In [11]:
# Index-level summary
print(f"Total vectors in index: {index.ntotal}")
print(f"Dimension: {index.d}")
print(f"Is trained: {index.is_trained}")

# Peek at the leading components of the stored vector with id 0
stored_head = index.reconstruct(0)[:10]
print(f"\nFirst vector in index: {stored_head}...")

# Self-retrieval check: query the index with chunk 0's own embedding
test_vector = embeddings[:1].astype('float32')
faiss.normalize_L2(test_vector)
search_result = index.search(test_vector, k=3)
distances, indices = search_result
print(f"\nSearching with chunk 0, top results: {indices[0]}")  # chunk 0 should rank first
print(f"Distances: {distances[0]}")  # best score should be ~1.0 (perfect match)

Total vectors in index: 704
Dimension: 384
Is trained: True

First vector in index: [ 0.02694298 -0.0486391   0.05783583  0.10249217  0.0115027   0.0436888
 -0.00037971  0.05613357  0.01816882 -0.01259803]...

Searching with chunk 0, top results: [  0 686 685]
Distances: [1.0000001  0.809986   0.75930405]


In [None]:
# Reload the chunk texts from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("data/RAG/chunks.pkl", "rb") as chunk_file:
    loaded_chunks = pickle.load(chunk_file)

# Self-retrieval check: query the index with chunk 0's own embedding
test_vector = embeddings[:1].astype('float32')
faiss.normalize_L2(test_vector)
distances, indices = index.search(test_vector, k=3)

top_ids = indices[0]
top_scores = distances[0]
print(f"Searching with chunk 0, top results: {top_ids}")
print(f"Distances: {top_scores}")

# Map each FAISS row id back to its chunk text and print it with its score
heavy_rule = "=" * 80
light_rule = "-" * 80
print("\n" + heavy_rule)
for rank, (idx, score) in enumerate(zip(top_ids, top_scores), start=1):
    print(f"\nRank {rank} - Chunk {idx} - Similarity: {score:.4f}")
    print(light_rule)
    print(loaded_chunks[idx])
    print(heavy_rule)

Searching with chunk 0, top results: [  0 686 685]
Distances: [1.0000001  0.809986   0.75930405]


Rank 1 - Chunk 0 - Similarity: 1.0000
--------------------------------------------------------------------------------
Depressive disorders
include disruptive mood dysregulation
disorder, major depressive disorder (including major depressive episode),
persistent depressive disorder, premenstrual dysphoric disorder,
substance/medication-induced depressive disorder, depressive disorder due
to another medical condition, other specified depressive disorder, and
unspecified depressive disorder. The common feature of all of these
disorders is the presence of sad, empty, or irritable mood, accompanied by

Rank 2 - Chunk 686 - Similarity: 0.8100
--------------------------------------------------------------------------------
depressive disorders grouping.
Mood disorders | Other specified depressive disorderMood disorders 261
• The symptoms are not better accounted for by another mental, behaviour

# We have created a database of embeddings that are associated with chunks. See: RAG/chunks.pkl and depression_embeddings.index

## To DO: move the remaining code to a new file for simplicity since we have the data stored

## Load llama 3B model

In [None]:
# llama model to add to

# test model works as is

## Clean output

given RAG output find the predicted label: (none/moderately depressed/severely depressed)

### Input: RAG model's response Output: predicted label

## Run the RAG model

Given a DAIC-WOZ conversation, feed it through the retrieval pipeline and pass the result to the generation pipeline to build the prompt for the RAG model.

rags_output = run the model with the prompt and save whole output

predicted label = run clean prompt to find classification label

### Input: DAIC-WOZ conversation Output: RAG's output, predicted label

## build dictionary

(this will handle our txt and csv files to get the convos and their labels)

For each conversation in the DAIC-WOZ set, pass it to the run-the-RAG function and build a dictionary mapping (patient number) to: true label, predicted label, rags_output

### Input: DAIC-WOZ full dataset (patient number, true label) Output: Clinical RAG prediction dictionary

## Test RAGS accuracy

 take all of the true label, predicted label pairs and calculate how many were correct

### Input: RAG dictionary Output: accuracy