# Data Ingestion

In [78]:
## Document structure
from langchain_core.documents import Document

# Minimal hand-built Document: the text goes in `page_content`, and
# `metadata` carries descriptive fields about where the text came from.
doc = Document(
    page_content='This is for test',
    metadata=dict(
        source='../data/text.txt',
        pages=1,
        author='ali',
        update_date='2025-01-01',
    ),
)



In [79]:
import os

# Ensure the folder for the sample text files exists; exist_ok=True makes
# re-running this cell harmless when the folder is already there.
os.makedirs(name='../data/text_files', exist_ok=True)

In [2]:
import os

# Sample documents, keyed by the path each one is written to.
sample_text = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """,
}

for filepath, content in sample_text.items():
    # Create the parent folder on demand so this cell works on a fresh
    # kernel regardless of which other cells have been run.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as text_file:
        text_file.write(content)
# Sample files created

In [7]:
from langchain_community.document_loaders import TextLoader

# Read one text file (decoded as UTF-8); load() returns a list of Document objects.
loader = TextLoader('../data/text_files/python_intro.txt', encoding='utf-8')
document = loader.load()

In [None]:
# Bare last expression: the notebook displays the value returned by loader.load().
document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

In [13]:
## Directory loader
from langchain_community.document_loaders import DirectoryLoader

# Load every file under ../data/text_files that matches the '**/*.txt' glob.
# Each match is read by TextLoader (imported in an earlier cell), which
# receives the keyword arguments given in `loader_kwargs`.
directory_loader = DirectoryLoader(
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding='utf-8'),
    path='../data/text_files',
)
documents = directory_loader.load()
print(documents)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'), Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised

In [15]:
## pdf loader
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

# Load every file under ../data/pdf that matches the '**/*.pdf' glob, parsing
# each one with PyMuPDFLoader. PyPDFLoader is imported but not used in this cell.
pdf_loader = DirectoryLoader(
    glob='**/*.pdf',
    loader_cls=PyMuPDFLoader,
    path='../data/pdf',
)
pdf_documents = pdf_loader.load()
print(pdf_documents)

[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/Comcast_Sr Lead Solutions Engineer.pdf', 'file_path': '../data/pdf/Comcast_Sr Lead Solutions Engineer.pdf', 'total_pages': 6, 'format': 'PDF 1.4', 'title': 'Comcast_Sr Lead Solutions Engineer', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Sr. Lead Solutions Engineer \nLocation Philadelphia, Pennsylvania, California, New York, New York, New York, \nPennsylvania \nReq ID R419927 \nJob Type Full Time \nCategory Support \nDate posted 09/24/2025 \nApply Now \nUniversal Ads, a part of Comcast, enables any brand, of any size, to seamlessly make \nand buy commercials across premium video reaching new qualified audiences at scale. \nUniversal Ads combines premium and brand-safe video content directly from the most \ninfluential media companies with the ease and familiarity of soc

In [16]:
# Inspect the type of the first item returned by the PDF directory loader.
type(pdf_documents[0])

langchain_core.documents.base.Document

In [32]:
## excel loader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader

# Load every file under ../data/excel that matches the '**/*.xlsx' glob,
# parsing each workbook with UnstructuredExcelLoader.
xlsx_loader = DirectoryLoader(
    glob='**/*.xlsx',
    loader_cls=UnstructuredExcelLoader,
    path='../data/excel',
)
# Single-file alternative kept for reference (the path looks like a placeholder; TODO confirm):
# xlsx_loader = UnstructuredExcelLoader("../data/excel.relevant.jobs", mode="elements")
exel_documents = xlsx_loader.load()
exel_documents

[Document(metadata={'source': '../data/excel/relavant_jobs.xlsx'}, page_content='Keyword Location Experience Level Remote Job Type Easy Apply senior machine learning engineer, AI Engineer, Data Scientist, New York City, New York, New Jersey Mid-Senior level, Director Remote, Hybrid, On-Site Full-time False\n\nTitle Company Location link score description Cover Letter Senior Machine Learning Engineer - Ranking and Recommendations Uber New York, NY https://www.linkedin.com/jobs/view/4334551400 85 About The Role The Shopping Ranking Team mission is enabling eaters to effortlessly make shopping decisions and find what they need. We pursue this mission via an ML-driven algorithmic approach, applying state-of-the-art Machine Learning (ML), Optimization techniques to learn from massive datasets Uber has, and build a scalable and reliable shopping intelligence ranking and recommendation systems. We are actively seeking a Senior Machine Learning Engineer who excels in problem-solving and critic

In [33]:
# Inspect the type of the first item returned by the Excel directory loader.
type(exel_documents[0])

langchain_core.documents.base.Document

In [62]:
# Load variables from a .env file so later cells can read secrets such as
# POSTGRES_PASS through os.getenv instead of hard-coding them.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
from urllib.parse import quote

from langchain_community.utilities import SQLDatabase
from langchain_community.document_loaders import SQLDatabaseLoader

# The password is read from the environment so it never appears in the notebook.
POSTGRES_PASS = os.getenv('POSTGRES_PASS')
if POSTGRES_PASS is None:
    # Without this guard the f-string below would silently embed the literal
    # text "None" as the password and fail later with a confusing auth error.
    raise RuntimeError(
        "POSTGRES_PASS is not set; define it in the environment or in the .env file"
    )

# Percent-encode the password before placing it in the URI: characters such
# as '@', ':', '/' or '%' would otherwise be parsed as URL delimiters.
# safe='' makes quote() encode '/' as well.
db = SQLDatabase.from_uri(
    f"postgresql://postgres:{quote(POSTGRES_PASS, safe='')}@10.0.0.104:5433/dvdrental"
)

# Load the result of a query: each returned row becomes one Document.
loader = SQLDatabaseLoader(
    db=db,
    query="SELECT customer_id FROM payment;"
)

docs = loader.load()

print(len(docs))
print(docs[0].page_content)


14596
customer_id: 341


In [65]:
import psycopg2

# Open a direct psycopg2 connection to the dvdrental database.
# The password is taken from the environment rather than written here.
POSTGRES_PASS = os.getenv('POSTGRES_PASS')
conn = psycopg2.connect(
    database="dvdrental",
    user="postgres",
    password=POSTGRES_PASS,
    host="10.0.0.104",
    port=5433,
)

# Reached only if connect() did not raise.
print("Connected!")


Connected!
