In [3]:
import os
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


In [29]:
from pathlib import Path
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

project_root = None
for c in candidates:
    if (c / "data").exists() and (c / "src").exists():
        project_root = c
        break

if project_root is None:
    project_root = cwd.parent

env_path = project_root / ".env"

print("Detected project root:", project_root)
print("Looking for .env at:", env_path)


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Looking for .env at: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\.env


In [30]:
DATA_DIR = project_root / "data"
OUTPUT_DIR = project_root / "data" / "processed"

DATA_DIR.mkdir(exist_ok=True, parents=True)
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print(f"Project root: {project_root}")
print(f"Data directory: {DATA_DIR}")
print(f"Output directory for processed chunks: {OUTPUT_DIR}")


Project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Data directory: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data
Output directory for processed chunks: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed


In [31]:
supported_extensions = {".pdf", ".txt", ".md"}
all_files = [
    p for p in DATA_DIR.rglob("*")
    if p.is_file() and p.suffix.lower() in supported_extensions
]

if not all_files:
    print("No PDF/TXT/MD files found in data/. "
          "Add at least one lecture file (e.g., sample_lecture.pdf) and rerun this cell.")
else:
    print("Found the following files for ingestion:")
    for p in all_files:
        print(" -", p.relative_to(project_root))
        


Found the following files for ingestion:
 - data\sample_data.pdf


In [32]:
def load_single_file(path: Path) -> List[Document]:
    """
    Load a single file (PDF, TXT, or MD) into a list of LangChain Document objects.
    Each page (for PDF) or whole file (for text) becomes a Document.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        loader = PyPDFLoader(str(path))
        docs = loader.load()
    elif suffix in {".txt", ".md"}:
        loader = TextLoader(str(path), encoding="utf-8")
        docs = loader.load()
    else:
        raise ValueError(f"Unsupported file type: {suffix}")
    course = "default_course"
    lecture_id = path.stem  

    for d in docs:
        d.metadata.setdefault("course", course)
        d.metadata.setdefault("lecture_id", lecture_id)
        d.metadata.setdefault("source", str(path.relative_to(project_root)))

        d.metadata.setdefault("page", d.metadata.get("page", 0))

    return docs


In [33]:
def load_all_files(paths: List[Path]) -> List[Document]:
    """
    Load all given files into a single list of Documents.
    Errors on one file are caught and reported without stopping the whole process.
    """
    all_docs: List[Document] = []
    for path in paths:
        try:
            docs = load_single_file(path)
            all_docs.extend(docs)
            print(f"Loaded {len(docs)} document(s) from {path.name}")
        except Exception as e:
            print(f"ERROR loading {path.name}: {e}")
    return all_docs


In [34]:
if not all_files:
    raw_docs: List[Document] = []
    print("No files to load yet. Once you add files into data/, rerun this cell.")
else:
    raw_docs = load_all_files(all_files)
    print(f"\nTotal loaded documents (pages/segments): {len(raw_docs)}")

if raw_docs:
    print("\nExample document metadata and first 400 characters of content:\n")
    example_doc = raw_docs[0]
    print("Metadata:", example_doc.metadata)
    print("\nContent preview:\n", example_doc.page_content[:400])


Loaded 25 document(s) from sample_data.pdf

Total loaded documents (pages/segments): 25

Example document metadata and first 400 characters of content:

Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20251125214124', 'source': 'C:\\Users\\Admin\\OneDrive\\Desktop\\Capstone-MAT496\\data\\sample_data.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1', 'course': 'default_course', 'lecture_id': 'sample_data'}

Content preview:
 8
Modelling Long-Run Relationships in Finance
LEARNING OUTCOMES
In this chapter, you will learn how to
Highlight the problems that may occur if non-stationary data are
used in their levels form
Test for unit roots
Examine whether systems of variables are cointegrated
Estimate error correction and vector error correction models
Explain the intuition behind Johansen’s test for cointegration
Describe


In [35]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,       
    chunk_overlap=150,
    length_function=len,
)

print("Text splitter configured.")


Text splitter configured.


In [36]:
def chunk_documents(docs: List[Document]) -> List[Document]:
    """
    Split documents into smaller chunks while preserving key metadata.
    """
    if not docs:
        return []

    chunks = text_splitter.split_documents(docs)

    # Add a simple chunk_id inside metadata for traceability
    for idx, c in enumerate(chunks):
        c.metadata.setdefault("chunk_id", idx)

    return chunks


In [37]:
chunked_docs = chunk_documents(raw_docs)

print(f"Total chunks created: {len(chunked_docs)}")
if chunked_docs:
    print("\nExample chunk metadata and preview:\n")
    example_chunk = chunked_docs[0]
    print("Metadata:", example_chunk.metadata)
    print("\nChunk content preview:\n", example_chunk.page_content[:400])


Total chunks created: 75

Example chunk metadata and preview:

Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20251125214124', 'source': 'C:\\Users\\Admin\\OneDrive\\Desktop\\Capstone-MAT496\\data\\sample_data.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1', 'course': 'default_course', 'lecture_id': 'sample_data', 'chunk_id': 0}

Chunk content preview:
 8
Modelling Long-Run Relationships in Finance
LEARNING OUTCOMES
In this chapter, you will learn how to
Highlight the problems that may occur if non-stationary data are
used in their levels form
Test for unit roots
Examine whether systems of variables are cointegrated
Estimate error correction and vector error correction models
Explain the intuition behind Johansen’s test for cointegration
Describe


In [39]:
def chunks_to_dataframe(chunks: List[Document]) -> pd.DataFrame:
    """
    Convert a list of Document chunks into a pandas DataFrame
    with separate columns for content and metadata.
    """
    records: List[Dict[str, Any]] = []

    for c in chunks:
        meta = c.metadata.copy()
        record = {
            "text": c.page_content,
            "course": meta.get("course", "default_course"),
            "lecture_id": meta.get("lecture_id", ""),
            "source": meta.get("source", ""),
            "page": meta.get("page", None),
            "chunk_id": meta.get("chunk_id", None),
        }
        records.append(record)

    df = pd.DataFrame.from_records(records)
    return df


In [40]:
if not chunked_docs:
    print("No chunks to save yet. Add files to data/ and rerun the earlier cells.")
else:
    chunks_df = chunks_to_dataframe(chunked_docs)
    print("DataFrame shape:", chunks_df.shape)
    display(chunks_df.head(5))

    # Save to CSV and Parquet for later notebooks
    csv_path = OUTPUT_DIR / "lecture_chunks.csv"
    parquet_path = OUTPUT_DIR / "lecture_chunks.parquet"

    chunks_df.to_csv(csv_path, index=False, encoding="utf-8")
    chunks_df.to_parquet(parquet_path, index=False)

    print(f"\nSaved chunks to:\n- {csv_path}\n- {parquet_path}")


DataFrame shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2
3,t\n will not have a\nsmaller effect in time \n...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,3
4,"are totally unrelated. So, if standard regress...",default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,4



Saved chunks to:
- C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.csv
- C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.parquet


In [28]:
if chunked_docs:
    print("Basic stats:")
    print(" - Unique lectures:", chunks_df["lecture_id"].nunique())
    print(" - Total chunks:", len(chunks_df))
    print("\nChunks per lecture_id:")
    display(chunks_df["lecture_id"].value_counts())
else:
    print("No chunk statistics to show yet.")


Basic stats:
 - Unique lectures: 1
 - Total chunks: 75

Chunks per lecture_id:


lecture_id
sample_data    75
Name: count, dtype: int64