### CSV and Excel files - Structured Data

In [1]:
import pandas as pd
import os


### CSV Processing

In [2]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredExcelLoader

In [3]:
# Method 1: Using CSVLoader
print("CSVLoader")

try:
    csv_loader = CSVLoader(file_path="data/Inventory_Data.csv",encoding='utf-8', csv_args={"delimiter": ","})
    csv_documents = csv_loader.load()
    print(f"✅ Loaded {len(csv_documents)} documents one per row")
    print(f"Content Preview: {csv_documents[0].page_content[:500]}")
    print(f"Metadata: {csv_documents[0].metadata}")
except Exception as e:
    print(f"Error loading document: {e}")

CSVLoader
✅ Loaded 15 documents one per row
Content Preview: product_id: 1
product_name: Laptop
category: Electronics
quantity: 5
price: 999.99
description: High-performance laptop with 16GB RAM and 512GB SSD.
Metadata: {'source': 'data/Inventory_Data.csv', 'row': 0}


In [4]:
# Method 2: Custom CSV processing for better control

print("\n ✅ Custom CSV Processing")

from typing import List
from langchain_core.documents import Document

def process_csv(filepath:str) -> List[Document]:
        df = pd.read_csv(filepath)

        documents = []
        # One document per row with structured content
        for i, row in df.iterrows():
            content = "\n".join([f"{col}: {row[col]}" for col in df.columns])
            metadata = {"source": os.path.basename(filepath), "row_index": i}
            documents.append({"page_content": content, "metadata": metadata})
        return documents



 ✅ Custom CSV Processing


In [5]:
process_csv("data/Inventory_Data.csv")

[{'page_content': 'product_id: 1\nproduct_name: Laptop\ncategory: Electronics\nquantity: 5\nprice: 999.99\ndescription: High-performance laptop with 16GB RAM and 512GB SSD.',
  'metadata': {'source': 'Inventory_Data.csv', 'row_index': 0}},
 {'page_content': 'product_id: 2\nproduct_name: Smartphone\ncategory: Electronics\nquantity: 10\nprice: 699.99\ndescription: Latest model smartphone with 5G and 128GB storage.',
  'metadata': {'source': 'Inventory_Data.csv', 'row_index': 1}},
 {'page_content': 'product_id: 3\nproduct_name: Desk Chair\ncategory: Furniture\nquantity: 3\nprice: 149.99\ndescription: Ergonomic office chair with adjustable height.',
  'metadata': {'source': 'Inventory_Data.csv', 'row_index': 2}},
 {'page_content': 'product_id: 4\nproduct_name: Headphones\ncategory: Accessories\nquantity: 8\nprice: 49.99\ndescription: Noise-canceling over-ear headphones with long battery life.',
  'metadata': {'source': 'Inventory_Data.csv', 'row_index': 3}},
 {'page_content': 'product_id: 

### Excel Processing

In [6]:
# Method 1: Unstructured Excel Loader
print("\n✅UnstructuredExcelLoader")

try:
    excel_loader = UnstructuredExcelLoader(file_path="data/monthly_sales_data_with_stock_discount.xlsx", mode="elements")
    excel_documents = excel_loader.load()
    print(f"✅ Loaded {len(excel_documents)} documents")
    print(f"Content Preview: {excel_documents[0].page_content[:500]}")
    print(f"Metadata: {excel_documents[0].metadata}")
except Exception as e:
    print(f"Error loading document: {e}")


✅UnstructuredExcelLoader
✅ Loaded 1 documents
Content Preview: product_id product_name category quantity_sold price quantity_in_stock discount description 1 4K Smart TV Electronics 8 799.99 20 10 55-inch 4K UHD TV with HDR and smart streaming. 2 Wireless Earbuds Accessories 25 99.99 50 15 True wireless earbuds with 20-hour battery life. 3 Dining Chair Furniture 12 89.99 30 5 Modern cushioned chair with sturdy wooden frame. 4 Smart Doorbell Electronics 5 149.99 15 20 Wi-Fi doorbell with 1080p camera and motion detection. 5 Decorative Pillow Home Decor 10 24.
Metadata: {'source': 'data/monthly_sales_data_with_stock_discount.xlsx', 'file_directory': 'data', 'filename': 'monthly_sales_data_with_stock_discount.xlsx', 'last_modified': '2025-08-25T02:32:10', 'page_name': 'monthly_sales_data_with_stock_d', 'page_number': 1, 'text_as_html': '<table><tr><td>product_id</td><td>product_name</td><td>category</td><td>quantity_sold</td><td>price</td><td>quantity_in_stock</td><td>discount</td><td>des

In [7]:
# Method 2: We can use pandas for more control
print("\nCustom Excel Processing")  

def process_excel(filepath:str) -> List[Document]:
        df = pd.read_excel(filepath, engine='openpyxl')

        documents = []
        # One document per row with structured content
        for i, row in df.iterrows():
            content = "\n".join([f"{col}: {row[col]}" for col in df.columns])
            metadata = {"source": os.path.basename(filepath), "row_index": i}
            documents.append({"page_content": content, "metadata": metadata})
        return documents


Custom Excel Processing


In [8]:
process_excel("data/monthly_sales_data_with_stock_discount.xlsx")

[{'page_content': 'product_id: 1\nproduct_name: 4K Smart TV\ncategory: Electronics\nquantity_sold: 8\nprice: 799.99\nquantity_in_stock: 20\ndiscount: 10\ndescription: 55-inch 4K UHD TV with HDR and smart streaming.',
  'metadata': {'source': 'monthly_sales_data_with_stock_discount.xlsx',
   'row_index': 0}},
 {'page_content': 'product_id: 2\nproduct_name: Wireless Earbuds\ncategory: Accessories\nquantity_sold: 25\nprice: 99.99\nquantity_in_stock: 50\ndiscount: 15\ndescription: True wireless earbuds with 20-hour battery life.',
  'metadata': {'source': 'monthly_sales_data_with_stock_discount.xlsx',
   'row_index': 1}},
 {'page_content': 'product_id: 3\nproduct_name: Dining Chair\ncategory: Furniture\nquantity_sold: 12\nprice: 89.99\nquantity_in_stock: 30\ndiscount: 5\ndescription: Modern cushioned chair with sturdy wooden frame.',
  'metadata': {'source': 'monthly_sales_data_with_stock_discount.xlsx',
   'row_index': 2}},
 {'page_content': 'product_id: 4\nproduct_name: Smart Doorbell\nc