In [43]:
import pandas as pd
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader, UnstructuredExcelLoader
from typing import List
from langchain_core.documents import Document

# CSV Loader

In [15]:
csv_loader = CSVLoader(
    # Raw string: "\d", "\c" and "\s" are invalid escape sequences in a regular
    # literal (SyntaxWarning on Python 3.12+). The path value itself is unchanged.
    file_path=r"..\data\csv\sample_csv.csv",
    # The sample file starts with a UTF-8 byte-order mark. Decoding with plain
    # "utf-8" leaves it attached to the first header, so the first field shows
    # up as "\ufeffOrderID"; "utf-8-sig" strips the BOM when present and behaves
    # like "utf-8" when it is absent.
    encoding='utf-8-sig',
    csv_args={
        'delimiter': ',',
        'quotechar': '"'
    }
)

# One Document is produced per CSV row.
csv_docs = csv_loader.load()
print(len(csv_docs))
# Inspect the second document (index 1) as a sample.
print(f"Page Content: {csv_docs[1].page_content}")
print(f"Metadata: {csv_docs[1].metadata}")

10
Page Content: ﻿OrderID: 1002
Date: 01-09-2025
CustomerID: C002
Product: Notebook Stationery
Quantity: 5
TotalPrice: 10
Metadata: {'source': '..\\data\\csv\\sample_csv.csv', 'row': 1}


# Custom CSV Processor for Adding Extra Metadata

In [None]:
def custom_csv_process(filepath: str) -> List[Document]:
    """Read a CSV file with pandas and build one ``Document`` per row.

    Each document's text is a single comma-separated line listing the row's
    OrderID, Date, CustomerID, Product, Quantity and TotalPrice values. Its
    metadata holds the source path, the row's index label, the product name
    and the total price.

    Args:
        filepath: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The documents, in the same order as the rows of the file.
    """
    orders = pd.read_csv(filepath)

    def _to_document(row_label, record):
        # Flatten the row into one "Field: value, ..." line.
        text = (
            f"OrderID: {record['OrderID']}, "
            f"Date: {record['Date']}, "
            f"CustomerID: {record['CustomerID']}, "
            f"Product: {record['Product']}, "
            f"Quantity: {record['Quantity']}, "
            f"TotalPrice: {record['TotalPrice']}"
        )
        row_metadata = {
            'source': filepath,
            'row_index': row_label,
            'product_name': record['Product'],
            'total_price': record['TotalPrice'],
        }
        return Document(text, metadata=row_metadata)

    return [_to_document(label, record) for label, record in orders.iterrows()]

# Show the text built for the second row (index 1). The raw string keeps the
# Windows-style path value unchanged while avoiding the invalid "\d", "\c" and
# "\s" escape sequences of a regular string literal.
custom_csv_process(r"..\data\csv\sample_csv.csv")[1].page_content

'OrderID: 1002, Date: 01-09-2025, CustomerID: C002, Product: Notebook Stationery, Quantity: 5, TotalPrice: 10.0'

# Process Excel Files

In [42]:
def custom_process_excel(filepath: str) -> List[Document]:
    """Convert every sheet of an Excel workbook into one ``Document``.

    The text of each document is a small header (sheet name, column names,
    row count) followed by the sheet rendered with ``DataFrame.to_string``.

    Args:
        filepath: Path of the workbook, passed to ``pd.ExcelFile``.

    Returns:
        One document per sheet, in workbook order. Each document's metadata
        holds the source path and the sheet name.
    """
    documents = []

    # Open the workbook once and close it when done, instead of leaving the
    # handle open and re-opening the file by path for every sheet.
    with pd.ExcelFile(filepath) as excel_file:
        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)

            # Column labels are not guaranteed to be strings (e.g. numeric
            # headers), so convert them before joining.
            column_names = ', '.join(map(str, df.columns))

            # Each header field goes on its own line; the sheet name was
            # previously fused with "Columns:" because of a missing newline.
            sheet_content = f"Sheet: {sheet_name}\n"
            sheet_content += f"Columns: {column_names}\n"
            sheet_content += f"Rows: {len(df)}\n\n"
            sheet_content += df.to_string(index=False)

            doc = Document(
                sheet_content,
                # Was misspelled "metadat", so the source was never stored
                # under Document.metadata.
                metadata={
                    'source': filepath,
                    # Distinguishes the documents produced from one workbook.
                    'sheet_name': sheet_name
                }
            )

            documents.append(doc)
    return documents

# Convert the sample workbook and preview the text built for its first sheet.
excel_docs = custom_process_excel("../data/xlsx/sample_excel.xlsx")
print(excel_docs[0].page_content)

Sheet: Sheet1Columns: OrderID, Date, CustomerID, Product, Category, Quantity, UnitPrice, TotalPrice, Region
Rows: 10

 OrderID       Date CustomerID             Product        Category  Quantity  UnitPrice  TotalPrice Region
    1001 2025-09-01       C001      Wireless Mouse     Electronics         2       25.5        51.0  North
    1002 2025-09-01       C002 Notebook Stationery Office Supplies         5        2.0        10.0   East
    1003 2025-09-02       C003           USB-C Hub     Electronics         1       45.0        45.0   West
    1004 2025-09-02       C004          Desk Chair       Furniture         1      120.0       120.0  South
    1005 2025-09-03       C002      Ballpoint Pens Office Supplies        10        0.8         8.0   East
    1006 2025-09-03       C005         Monitor 24"     Electronics         1      160.0       160.0  North
    1007 2025-09-04       C006       Standing Desk       Furniture         1      300.0       300.0   West
    1008 2025-09-04       