In [1]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
import itertools

from dotenv import load_dotenv
load_dotenv()

True

### __Reading Data__

In [None]:
# Folder (relative to the notebook) that holds the PDFs to ingest.
PATH = '../data/in_use'

def load_documents(path: str) -> list:
    """Load every PDF found under ``path`` with ``PyPDFDirectoryLoader``.

    Args:
        path: Directory containing the PDF files to ingest.

    Returns:
        The list produced by ``PyPDFDirectoryLoader.load()``. Each entry is
        expected to be one document per PDF page exposing ``page_content``
        and ``metadata`` (both are read further down in this notebook).
        Nothing is printed.
    """
    loader = PyPDFDirectoryLoader(path)
    return loader.load()

docs = load_documents(PATH)

In [3]:
# Sanity check on the first page only. The cast works for this label, but it
# was found that not all page labels are ints, so int() would fail on those.
print(int(docs[0].metadata['page_label']))

1


### __Semantic Chunking w/ OpenAI Embeddings__

In [4]:
# Splitter used to cut each page into chunks, driven by OpenAI embeddings.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(model='text-embedding-3-large'),
    breakpoint_threshold_type="gradient", # Gradient because our sources fall under the same topic.
    buffer_size=0  # presumably embeds each sentence on its own, without neighbouring sentences -- TODO confirm against the SemanticChunker docs
)

In [5]:
# Inclusive (start, end) pairs of indices into `docs` that must be left out
# (title pages, tables of contents, ...). These are positions in the combined
# list of pages from all PDFs, so they presumably depend on the order in which
# the loader returned the files -- re-check them whenever the input folder changes.
ranges = [
    (0, 5), (187, 200), (201, 207), (289, 290), (291, 298),
    (459, 459), (460, 471), (522, 522), (523, 533),
    (730, 738), (739, 745), (1007, 1009), (1012, 1013),
    (1014, 1018), (1419, 1421), (1422, 1435), (1704, 1741),
    (1747, 1747), (1748, 1755), (1877, 1877), (1891, 1894)
]

# Flatten ranges to a set of excluded page numbers (end is inclusive, hence +1)
bad_pages = {
    page_idx
    for start, end in ranges
    for page_idx in range(start, end + 1)
}

pages = []
for i, doc in enumerate(docs):
    # Skip excluded pages and pages without usable text. strip() makes
    # whitespace-only pages count as empty, so they are never sent to the
    # embedding model.
    if i in bad_pages or not doc.page_content.strip():
        continue
    # Each kept page becomes its own list of chunks; the page metadata is
    # passed along so every chunk can be traced back to its source page.
    pages.append(text_splitter.create_documents(texts=[doc.page_content], metadatas=[doc.metadata]))

In [6]:
# Display the full set of excluded page indices built from `ranges`.
bad_pages

{0,
 1,
 2,
 3,
 4,
 5,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 459,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 468,
 469,
 470,
 471,
 522,
 523,
 524,
 525,
 526,
 527,
 528,
 529,
 530,
 531,
 532,
 533,
 730,
 731,
 732,
 733,
 734,
 735,
 736,
 737,
 738,
 739,
 740,
 741,
 742,
 743,
 744,
 745,
 1007,
 1008,
 1009,
 1012,
 1013,
 1014,
 1015,
 1016,
 1017,
 1018,
 1419,
 1420,
 1421,
 1422,
 1423,
 1424,
 1425,
 1426,
 1427,
 1428,
 1429,
 1430,
 1431,
 1432,
 1433,
 1434,
 1435,
 1704,
 1705,
 1706,
 1707,
 1708,
 1709,
 1710,
 1711,
 1712,
 1713,
 1714,
 1715,
 1716,
 1717,
 1718,
 1719,
 1720,
 1721,
 1722,
 1723,
 1724,
 1725,
 1726,
 1727,
 1728,
 1729,
 1730,
 1731,
 1732,
 1733,
 1734,
 1735,
 1736,
 1737,
 1738,
 1739,
 1740,
 1741,
 1747,
 1748,
 1749,
 1750,
 1751,
 1752,
 1753,
 1754,
 1755,
 1877,
 1891,
 1892,
 1893

In [7]:
# Number of pages kept after filtering (each entry is that page's list of chunks).
len(pages)

1658

In [8]:
# Flatten the per-page chunk lists into a single list of chunks, in page order.
pages_chunks = [chunk for page_chunks in pages for chunk in page_chunks]

In [9]:
# Total number of chunks across all kept pages.
len(pages_chunks)

3821

In [10]:
# Debug check: the flattened result is a plain list.
type(pages_chunks)

list

In [11]:
# Debug check: `pages` is a list (its entries are the per-page chunk lists).
type(pages)

list

### __Store in DB__

In [12]:
# Peek at one chunk (text plus inherited page metadata) before indexing.
print(pages_chunks[1])

page_content='After I spoke, I opened the floor to questions. There was one I will
never forget. “How did you feel,” a student asked, “when your daughter was
first diagnosed with autism?”
I took a deep breath and answered honestly. “I was terrified,” I
said. “I sobbed. I retched over a toilet bowl. Because of everything
that I thought that I knew about autism, I could not imagine that there
was any real hope of a future for her.”
I took another deep breath before I continued. “Because, you see, I didn’t know any better. I didn’t yet know that
the terrifying and damaging rhetoric out there about autism wasn’t
going to be our reality.”' metadata={'producer': 'calibre (5.21.0) [https://calibre-ebook.com]', 'creator': 'calibre (5.21.0) [https://calibre-ebook.com]', 'creationdate': '2021-06-28T05:28:50+00:00', 'author': 'Desconocido', 'moddate': '2021-06-28T21:02:08-06:00', 'title': 'Sincerely, Your Autistic Child', 'source': 'data\\in_use\\source (1).pdf', 'total_pages': 201, 'page': 6, 'p

In [None]:
def create_db(
    pages_chunks: list,
    persist_directory: str = '../data/db',
    embedding_model: str = 'text-embedding-3-large',
) -> Chroma:
    """Embed the chunks and store them in a Chroma collection on disk.

    Args:
        pages_chunks: Chunk documents to embed and index.
        persist_directory: Folder in which Chroma keeps the collection.
        embedding_model: Name of the OpenAI embedding model. The store must
            later be queried with the same model, otherwise query vectors and
            stored vectors are not comparable.

    Returns:
        The Chroma vector store built from ``pages_chunks``.
    """
    # NOTE(review): running this against an already populated
    # `persist_directory` presumably adds the same chunks a second time
    # (the retrieval cell de-duplicates its hits) -- clear the folder first
    # for a clean rebuild; confirm against the Chroma docs.
    db = Chroma.from_documents(
        documents=pages_chunks,
        embedding=OpenAIEmbeddings(model=embedding_model),
        persist_directory=persist_directory,
        collection_metadata={"hnsw:space": "cosine"}  # cosine distance for the HNSW index
    )
    # The explicit db.persist() call stays disabled, as in the original.
    # NOTE(review): recent Chroma versions are expected to write to
    # `persist_directory` automatically -- confirm for the installed version.
    return db

db = create_db(pages_chunks)

In [14]:
type(db)

langchain_community.vectorstores.chroma.Chroma

### __Semantic Search Params__

In [85]:
# Sample questions used to probe the retriever. Exactly one QUERY assignment
# should be active at a time; the others are kept as ready-made alternatives.
# QUERY = "What is Autism?"
# QUERY = "I have a son that was recently diagnosed with autism. What are some things that people who have autism do not like? Give examples from what you know"
# QUERY = "What do you think about Autism Speaks? Is it a good resource to turn to?"
# QUERY = "What is an IEP? How effective has it been for autistic people? Give me as much info as possible"
# QUERY = 'My daughter is a non-verbal autistic. As such, it is harder to know if she is making progress. Do you have any tips for me to know?'
# QUERY = "What does a traditional IEP look like? How does this integrate into the special educational process?"
# QUERY = "How can I tell when something is hurting my child when they’re nonverbal?"
# QUERY = "With so many support groups out there, how do I know which one to join? List out all of them"
QUERY = "What's the difference between aspergers and autism?"
# QUERY = "Who is Temple Grandin? Break down everything she believes about autism."


In [None]:
def access_db(
    persist_directory: str = '../data/db',
    embedding_model: str = 'text-embedding-3-large',
) -> Chroma:
    """Open the Chroma collection stored on disk for querying.

    Args:
        persist_directory: Folder holding the persisted collection.
        embedding_model: Name of the OpenAI embedding model used to embed
            queries. It has to match the model the collection was built with.

    Returns:
        A Chroma vector store bound to ``persist_directory``.
    """
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=OpenAIEmbeddings(model=embedding_model)
    )

db = access_db()
# Retrieve the 5 nearest chunks for QUERY as (document, score) pairs.
search_results = db.similarity_search_with_score(QUERY, k=5)

### __Prompting the Model__

In [87]:
# Prompt template with two placeholders: {context} receives the retrieved
# chunks and {question} receives the user query.
PROMPT = '''
Answer the question using the following context:
{context}

_______
Answer the question using the above context:
{question}
'''

In [88]:
def remove_subset_chunks(chunks: list[str]) -> list[str]:
    """Drop every chunk whose text is contained in another, already kept chunk.

    Chunks are visited from longest to shortest (equal lengths keep their
    input order), so a chunk can only be swallowed by one that is at least as
    long. Exact duplicates are removed as a side effect, because a string is
    a substring of itself.

    Args:
        chunks: Candidate chunk texts. The list itself is not modified.

    Returns:
        The surviving chunks, ordered from longest to shortest.
    """
    by_length_desc = list(chunks)
    by_length_desc.sort(key=len, reverse=True)

    kept: list[str] = []
    for candidate in by_length_desc:
        already_covered = False
        for survivor in kept:
            if candidate in survivor:
                already_covered = True
                break
        if already_covered:
            continue
        kept.append(candidate)
    return kept

# Start from raw results (kept in the order returned by the vector store)
raw_chunks = [doc.page_content.strip() for doc, _ in search_results]

# Step 1: remove exact duplicates. dict.fromkeys keeps the first occurrence of
# each text in its original position; a set would hand the chunks on in an
# arbitrary order that can differ between kernel sessions, which makes the
# order of equal-length chunks in the prompt non-reproducible.
raw_chunks = list(dict.fromkeys(raw_chunks))

# Step 2: remove subset duplicates
filtered_chunks = remove_subset_chunks(raw_chunks)

# Assemble the surviving chunks into the prompt, separated by a ruler line
context = "\n\n_______\n\n".join(filtered_chunks)
final_prompt = ChatPromptTemplate.from_template(PROMPT).format(context=context, question=QUERY)
print(final_prompt)

Human: 
Answer the question using the following context:
Children with Asperger's syndrome have
more normal speech development and much better cognitive skills
than those with classic Kanner's. Another label for Asperger's
syndrome is “high-functioning autism.” One noticeable di erence
between Kanner's and Asperger's syndromes is that Asperger
children are often clumsy. The diagnosis of Asperger's is often
confused with PDD, a label that is applied to children with mild
symptoms which are not quite serious enough to call for one of the
other labels. Children diagnosed as having disintegrative disorder start to
develop normal speech and social behavior and then regress and
lose their speech after age two. Many of them never regain their
speech, and they have di culty learning simple household chores.

_______

There are di erences of opinion in the autism  eld about the
relationship between autism and Asperger's. Are they really separate
syndromes?

_______
Answer the question using the

In [89]:
# Baseline: send the bare question to the Ollama model, without any
# retrieved context, to see what it answers on its own.
model = Ollama(model='aya-expanse:latest')
response = model.invoke(QUERY)
print(response)

It's important to note that Asperger syndrome (AS) is no longer considered a separate diagnosis from Autism Spectrum Disorder (ASD). In the past, these terms were used differently, but according to modern diagnostic criteria, they are now both classified under ASD.

The changes were made in 2013 when the Diagnostic and Statistical Manual of Mental Disorders (DSM-5) was published by the American Psychiatric Association. Before this, Asperger's syndrome was seen as a unique condition characterized by significant social and communication difficulties, along with restricted and repetitive patterns of behavior or interests, but with normal or even above-average intelligence.

Here are some key differences that were often attributed to Asperger's syndrome compared to classic autism:

1. **Communication Skills**: Individuals with Asperger's syndrome typically have well-developed verbal skills and can use language in a sophisticated manner. They may struggle more with social aspects of communi

In [90]:
# Same model, this time given the assembled prompt (context + question).
# Note that this overwrites the previous `response`.
response = model.invoke(final_prompt)
print(response)

Based on the provided context, here are the key differences between Asperger's syndrome (also known as "high-functioning autism") and classic Kanner's autism:

1. **Speech Development**: Children with Asperger's syndrome have more normal speech development compared to those with Kanner's autism.

2. **Cognitive Skills**: Asperger's children often have much better cognitive skills than those with Kanner's autism.

3. **Clumsiness**: Asperger's children are often clumsy, which is not typically seen in classic autism.

4. **Regression**: Children diagnosed with Kanner's autism may start to develop normal speech and social behavior but then regress and lose their speech after age two, while this regression is not typically seen in Asperger's syndrome.

Regarding the broader field of opinion on the relationship between autism and Asperger's: There are differing views within the autism community about whether they are truly separate syndromes or simply different spectrum manifestations of au

In [91]:
# Scores returned alongside each hit, in result order.
# NOTE(review): for Chroma these are presumably distances (lower = closer
# match) rather than similarities -- confirm before applying any threshold.
print([score for _, score in search_results])

[0.7694489359855652, 0.7703234553337097, 0.839470624923706, 0.8395398855209351, 0.8667991161346436]


In [92]:
# Build one "title — author" label per hit and de-duplicate them while keeping
# the order of search_results. Joining a set instead would print multiple
# sources in an arbitrary order that can change from run to run.
source_labels = dict.fromkeys(
    f"{piece.metadata.get('title')} — {piece.metadata.get('author')}"
    for piece, _ in search_results
)
sources = '\n\n_______\n\n'.join(source_labels)
print(f'We got this response based on the following trustworthy and reputable source(s):\n\n{sources}')
# Update Metadata in future!

We got this response based on the following trustworthy and reputable source(s):

Thinking in pictures: and other reports from my life with autism — Temple Grandin
