## CSV Processing

In [7]:
import pandas as pd
import os
from langchain_community.document_loaders import (
    CSVLoader, 
    UnstructuredCSVLoader
)
from typing import List, Any
from langchain_core.documents import Document

In [5]:
## Method 1- Using csvloader
# Parsing options are kept in their own dict and handed to the loader
# through its csv_args parameter.
csv_parse_options = {'delimiter': ',', 'quotechar': '"'}
csv_loader = CSVLoader(
    file_path="data/structured_files/products.csv",
    encoding='utf-8',
    csv_args=csv_parse_options,
)
csv_docs = csv_loader.load()

# Report how many documents were produced, then preview the first one.
print(f"length: {len(csv_docs)}")
first_doc = csv_docs[0]
print(f"content previre: {first_doc.page_content[:200]}")
print(f"metadata: {first_doc.metadata}")

length: 5
content previre: Product: Laptop
Category: Electronics
Price: 999.99
Stock: 50
Description: High-performance laptop with 16GB RAM and 512GB SSD
metadata: {'source': 'data/structured_files/products.csv', 'row': 0}


In [11]:
## Method 2 - Custom CSV processing with pandas

def process_CSV(filepath:str) -> List[Document]:
    """
    Convert each row of a product CSV file into a LangChain Document.

    The file is read with pandas and is expected to contain the columns
    'Product', 'Category', 'Price', 'Stock' and 'Description'.

    Args:
        filepath: Path to the CSV file.

    Returns:
        List of Document objects, one per CSV row. The page content is a
        labelled text rendering of the row; the metadata carries the source
        path, the row index, the product name, category and price.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
    """
    df = pd.read_csv(filepath)
    documents = []

    for index, row in df.iterrows():
        content = f"""
        Name: {row['Product']},
        Category: {row['Category']},
        Price: {row['Price']},
        Stock: {row['Stock']},
        Description: {row['Description']}
        """

        doc = Document(
            page_content=content,
            metadata={
                'source': filepath,
                'row_index': index,
                'product_name': row['Product'],
                'category': row['Category'],
                # Key was previously misspelled as 'rpice', which made
                # price-based metadata filtering impossible.
                'price': row['Price'],
                'data_type': 'product_info'
            }
        )
        # Collect the document; without this the function produced nothing.
        documents.append(doc)

    return documents

In [28]:
## Method 3- Excel Processing with Pandas

def process_excel_with_pandas(filepath: str) -> List[Document]:
    """
    Process Excel file with multiple sheets and convert to LangChain Documents.

    Every row of every sheet becomes one Document. The page content lists
    each column as "column: value" (missing values shown as "N/A"); the
    metadata records the source path, sheet name, row index, the sheet's
    row count and every column value under a sanitized key (missing values
    stored as None).

    Note: pandas needs the optional 'openpyxl' package to read .xlsx files;
    without it the call fails with an ImportError.

    Args:
        filepath: Path to the Excel file (.xlsx or .xls)

    Returns:
        List of Document objects, one per row across all sheets
    """
    documents = []

    # Open the workbook in a context manager so the underlying file handle
    # is released even if parsing one of the sheets raises.
    with pd.ExcelFile(filepath) as excel_file:
        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)
            total_rows = len(df)

            # Convert each row to a Document
            for index, row in df.iterrows():
                content_parts = []
                column_metadata = {}

                # Single pass over the columns builds both the text content
                # and the per-column metadata.
                for col in df.columns:
                    value = row[col]
                    is_missing = pd.isna(value)

                    content_parts.append(f"{col}: {'N/A' if is_missing else value}")

                    # Use column name as metadata key (sanitized). str() keeps
                    # this working for non-string headers such as integers.
                    metadata_key = str(col).lower().replace(' ', '_').replace('-', '_')
                    column_metadata[metadata_key] = None if is_missing else value

                # Row and sheet info first; column values are applied on top,
                # so a column whose sanitized name matches one of these keys
                # takes precedence (same as before).
                metadata = {
                    'source': filepath,
                    'sheet_name': sheet_name,
                    'row_index': index,
                    'total_rows': total_rows,
                    'data_type': 'excel_data'
                }
                metadata.update(column_metadata)

                documents.append(
                    Document(
                        page_content="\n".join(content_parts),
                        metadata=metadata
                    )
                )

    return documents

In [29]:
excel_docs = process_excel_with_pandas('data/structured_files/inventory.xlsx')

# Sheet names are recovered from the document metadata, so a sheet without
# any rows produces no documents and is not counted here.
sheet_names = {doc.metadata.get('sheet_name') for doc in excel_docs}

# Report sheets and documents separately; the sheet line previously printed
# the document count.
print(f"Processed {len(sheet_names)} sheets")
print(f"\nTotal documents: {len(excel_docs)}")

if excel_docs:
    print("\nFirst document preview:")
    print(f"Content: {excel_docs[0].page_content[:300]}")
    print(f"\nMetadata: {excel_docs[0].metadata}")

    # Show unique sheet names if multiple sheets
    if len(sheet_names) > 1:
        print(f"\nSheets processed: {sheet_names}")

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
# Run the custom CSV processor on the sample products file. The bare call is
# the cell's last expression, so the notebook displays whatever it returns.
process_CSV("data/structured_files/products.csv")