# 1. Document Loading

In [3]:
# Environment setup: read the OpenAI API key from a local .env file.
import os
import openai
import sys
# Add the directory two levels up (relative to the working directory) to the
# import search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

## PDFs

In [6]:
from langchain.document_loaders import PyPDFLoader

# Load the article PDF; `pages` holds the documents returned by the loader.
pdf_path = "docs/Yetiskin2020.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [7]:
len(pages)

18

In [8]:
page = pages[0]

In [9]:
print(page.page_content[0:500])

Full Terms & Conditions of access and use can be found at
https://www.tandfonline.com/action/journalInformation?journalCode=hppc20
Popular Communication
The International Journal of Media and Culture
ISSN: (Print) (Online) Journal homepage: https://www.tandfonline.com/loi/hppc20
Paratactic commoning: collective knowledge
production networking as political struggle
Ebru Yetiskin
To cite this article: Ebru Yetiskin (2020): Paratactic commoning: collective knowledge production
networking as politic


In [10]:
page.metadata

{'producer': 'iText 4.2.0 by 1T3XT',
 'creator': 'Arbortext Advanced Print Publisher 11.0.3433/W Unicode',
 'creationdate': '2020-06-25T12:52:06+05:30',
 'keywords': 'Commons; commoning; data; control; tactic; paratactic; obfuscation',
 'moddate': '2020-06-30T04:01:46-07:00',
 'source': 'docs/Yetiskin2020.pdf',
 'total_pages': 18,
 'page': 0,
 'page_label': '1'}

## Youtube

In [14]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [13]:
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader


In [None]:
# Build a loader that pulls audio from a YouTube video and hands it to
# OpenAIWhisperParser, then load the resulting documents.
url = "https://www.youtube.com/watch?v=vtLfCO4IGXY&ab_channel=DiEM25"
save_dir = "docs/youtube/"

# Blob source: fetch from YouTube into save_dir.
# Alternative for audio already on disk:
#   FileSystemBlobLoader(save_dir, glob="*.m4a")
audio_blobs = YoutubeAudioLoader([url], save_dir)
whisper_parser = OpenAIWhisperParser()

loader = GenericLoader(audio_blobs, whisper_parser)
docs = loader.load()

In [25]:
# Show the first 500 characters of the first loaded document.
# NOTE(review): the saved output below is GitHub page text, which matches the
# URL loaded in the "URLs" section rather than this video. `docs` appears to
# have been overwritten before this cell was last run; re-run the notebook in
# order to refresh the output.
docs[0].page_content[0:500]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhandbook/titles-for-programmers.md at master · basecamp/handbook · GitHub\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\nNavigation Menu\n\nToggle navigation\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n            Sign in\n          \n\n\n\n\n\n\n\n\n        Product\n        \n\n\n\n\n\n\n\n\n\n\n\n\n\nGitHub Copilot\n        Write better code with AI\n      \n\n\n\n\n\n\n\nSecurity\n        Find and fix vulnerabilities\n      \n\n\n\n\n\n\n\nActions\n        Automa'

## URLs

In [22]:
from langchain.document_loaders import WebBaseLoader

# Page to load as a document.
page_url = "https://github.com/basecamp/handbook/blob/master/titles-for-programmers.md"
loader = WebBaseLoader(page_url)

In [23]:
docs = loader.load()

In [None]:
print(docs[0].page_content[:500])

# 2. Document Splitting

![Alt text](images/splitters.png)

### Key Differences

| Feature                           | `RecursiveCharacterTextSplitter`              | `CharacterTextSplitter`         |
|-----------------------------------|--------------------------------|-------------------------|
| **Splitting Strategy**           | Tries to break at meaningful places | Splits at a fixed character (e.g., space) |
| **Handles Word Boundaries?**      | ✅ Yes                          | ❌ No (may cut words)  |
| **Performance**                   | Slightly slower but better structured chunks | Fast but less optimal chunks |
| **Best for**                      | Long documents with structured text (e.g., articles, books) | Simple text that doesn’t require structure |


In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [27]:
# Deliberately tiny values so the short alphabet strings used in the next
# cells split into more than one chunk.
chunk_size, chunk_overlap = 26, 4

In [35]:
# Both splitters use the same size and overlap settings.
shared_kwargs = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

r_splitter = RecursiveCharacterTextSplitter(**shared_kwargs)
# The character splitter is given a single separator: one space.
c_splitter = CharacterTextSplitter(separator=' ', **shared_kwargs)

In [None]:
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [31]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [32]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [36]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

## Recursive splitting details

`RecursiveCharacterTextSplitter` is recommended for generic text. 

In [37]:
# Sample text for the splitter demos. Each source line ends with a backslash,
# so the lines are joined and the only newlines in the value are the explicit
# "\n\n". The typos and the missing space in "space.and" are part of the data:
# the saved outputs (e.g. len(some_text) == 496) depend on the exact characters.
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [38]:
len(some_text)

496

In [39]:
# Larger chunks with no overlap for the paragraph example.
size_and_overlap = {"chunk_size": 450, "chunk_overlap": 0}

c_splitter = CharacterTextSplitter(separator=' ', **size_and_overlap)
# Separator list for the recursive splitter: blank line, newline, space,
# then the empty string.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    **size_and_overlap,
)

In [40]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [41]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [42]:
# Let's reduce the chunk size a bit and add a period to our separators.
# The period pattern is written as a raw string: "\." in a normal literal is
# an invalid escape sequence (same value, but newer Python versions warn).
# NOTE(review): the saved output breaks at spaces rather than after sentences,
# so the pattern seems to be matched literally. Regex matching appears to need
# is_separator_regex=True — confirm against the installed LangChain version.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"\. ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [None]:
# Split after ". " using a zero-width lookbehind so the period stays at the
# end of its sentence. The pattern is a raw string (avoids the invalid "\."
# escape), and is_separator_regex=True makes the splitter treat the separators
# as regular expressions instead of literal text.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True
)
r_splitter.split_text(some_text)

In [43]:
from langchain.document_loaders import PyPDFLoader

# Load the second PDF; this replaces the earlier `loader` and `pages`.
ng_pdf_path = "docs/NG2023.pdf"
loader = PyPDFLoader(ng_pdf_path)
pages = loader.load()

In [46]:
len(pages)

41

In [44]:
from langchain.text_splitter import CharacterTextSplitter

# Newline-separated splitting with a chunk size of 1000 and an overlap of 150,
# both measured with len().
splitter_options = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

In [47]:
docs = text_splitter.split_documents(pages)
len(docs)

80

## Token splitting

We can also split on token count explicitly, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [48]:
from langchain.text_splitter import TokenTextSplitter

In [50]:
# One token per chunk with no overlap, so each output piece is a single token.
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [51]:
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [52]:
# Re-split the loaded PDF pages using 10-token chunks with no overlap.
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)

In [53]:
docs[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.0 (Macintosh)', 'creationdate': '2022-12-13T16:08:00-05:00', 'moddate': '2022-12-13T16:08:04-05:00', 'trapped': '/False', 'source': 'docs/NG2023.pdf', 'total_pages': 41, 'page': 0, 'page_label': '1'}, page_content='PAGE 1\nFounder, DeepLearning.')

In [54]:
docs[5]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.0 (Macintosh)', 'creationdate': '2022-12-13T16:08:00-05:00', 'moddate': '2022-12-13T16:08:04-05:00', 'trapped': '/False', 'source': 'docs/NG2023.pdf', 'total_pages': 41, 'page': 1, 'page_label': '2'}, page_content='\nelectricity. It will \ntransform and')

In [55]:
pages[0].metadata

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 18.0 (Macintosh)',
 'creationdate': '2022-12-13T16:08:00-05:00',
 'moddate': '2022-12-13T16:08:04-05:00',
 'trapped': '/False',
 'source': 'docs/NG2023.pdf',
 'total_pages': 41,
 'page': 0,
 'page_label': '1'}

In [56]:
pages[10].metadata

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 18.0 (Macintosh)',
 'creationdate': '2022-12-13T16:08:00-05:00',
 'moddate': '2022-12-13T16:08:04-05:00',
 'trapped': '/False',
 'source': 'docs/NG2023.pdf',
 'total_pages': 41,
 'page': 10,
 'page_label': '11'}

## Context aware splitting

Chunking aims to keep text with common context together.

Text splitting often uses sentences or other delimiters to keep related text together, but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use `MarkdownHeaderTextSplitter` to preserve header metadata in our chunks, as shown below.

In [57]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [58]:
# Small Markdown sample with three header levels for the header splitter.
# Lines ending in a backslash are joined to the next source line; the
# "Hi this is Lance" line has no trailing backslash, so a literal newline
# follows its "\n\n" in the resulting string.
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n
## Chapter 2\n\n \
Hi this is Molly"""

In [59]:
# Pair each Markdown header prefix with the metadata key it is recorded under.
headers_to_split_on = list(
    zip(
        ("#", "##", "###"),
        ("Header 1", "Header 2", "Header 3"),
    )
)

In [60]:
# Split the sample by its headers. As the saved outputs below show, each
# resulting Document records the headers it sits under in its metadata.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [61]:
md_header_splits[0]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe')

In [62]:
md_header_splits[1]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance')