# Data Ingestion

## Installation

In [81]:
# !pip install unstructured

## Git Clone 
Cloning the documents from GitHub

In [10]:
# Fetch the markdown grow guides into ./hydroponics-grow-guide-dataset (the directory the loader cell globs).
!git clone https://github.com/being-invincible/hydroponics-grow-guide-dataset

Cloning into 'hydroponics-grow-guide-dataset'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 64 (delta 21), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (64/64), 35.54 KiB | 3.55 MiB/s, done.
Resolving deltas: 100% (21/21), done.


## Imports

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.zilliz import Zilliz
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from unstructured.partition.auto import partition 

## Setup

In [2]:
from os import environ

# --- Vector store settings ---------------------------------------------------
# Name of the collection that the ingestion cell creates.
COLLECTION_NAME = "hydroponics_knowledge_base"
# Dimension of the embeddings.
DIMENSION = 1536

# --- Inference parameters ----------------------------------------------------
# NOTE(review): DIMENSION, BATCH_SIZE and TOP_K are not referenced by any other
# cell visible in this notebook.
BATCH_SIZE = 128
TOP_K = 3

# --- Zilliz Cloud connection -------------------------------------------------
# Both values are read from the environment; a missing variable raises KeyError.
URI = environ["CLUSTER_ENDPOINT"]
TOKEN = environ["API_TOKEN"]

# --- Process environment -----------------------------------------------------
# Re-export the OpenAI key under the OPENAI_API_KEY name (presumably the name
# the OpenAI client looks up -- confirm) and turn tokenizer parallelism off.
environ.update(
    {
        "OPENAI_API_KEY": environ["OPEN_AI_KEY"],
        "TOKENIZERS_PARALLELISM": "false",
    }
)

## Load the data using Document loader

In [3]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from glob import glob

# Parse every markdown guide at the top level of the cloned dataset.
# mode="elements" yields one Document per parsed element, each tagged with a
# `category` in its metadata (e.g. 'Title').
#
# The paths are sorted so `docs` has the same order on every run; glob() returns
# matches in arbitrary order. The former `recursive=True` flag was dropped because
# it has no effect unless the pattern contains `**`.
markdown_paths = sorted(glob("./hydroponics-grow-guide-dataset/*.md"))

docs = []
for path in markdown_paths:
    loader = UnstructuredMarkdownLoader(path, mode="elements")
    docs.extend(loader.load())
print(len(docs))

352


In [4]:
# Show only the first few elements; displaying the whole list floods the output.
docs[:3]

[Document(page_content='Hydroponics 101 Complete Guide', metadata={'source': './hydroponics-grow-guide-dataset/Hydroponics101Guide.md', 'last_modified': '2023-12-07T10:48:11', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './hydroponics-grow-guide-dataset', 'filename': 'Hydroponics101Guide.md', 'category': 'Title'}),
 Document(page_content='What is hydroponics?', metadata={'source': './hydroponics-grow-guide-dataset/Hydroponics101Guide.md', 'last_modified': '2023-12-07T10:48:11', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './hydroponics-grow-guide-dataset', 'filename': 'Hydroponics101Guide.md', 'category': 'Title'}),
 Document(page_content='The first lesson in hydroponics 101 starts with a definition. It is defined as growing plants without the use of traditional soil or soilless mixes. Instead, plants are put in a net pot or cup, and roots are suspended in a nutrient solution or just air. To be consid

In [5]:
# Inspect the element category recorded in the first Document's metadata.
first_element = docs[0]
first_element.metadata["category"]

'Title'

## Data Prep

In [6]:
# Flatten the parsed elements into one text blob. Every element contributes
# "<category>\n<page_content>"; a 'Title' element is preceded by a blank line
# ("\n\n") so each titled section is visually separated, any other element by a
# single newline.
# Built with a single join instead of repeated `+=` in an index loop with two
# duplicated branches; the resulting string is identical.
md_file = "".join(
    ("\n\n" if doc.metadata["category"] == "Title" else "\n")
    + doc.metadata["category"]
    + "\n"
    + doc.page_content
    for doc in docs
)

print(md_file)



Title
Hydroponics 101 Complete Guide

Title
What is hydroponics?
NarrativeText
The first lesson in hydroponics 101 starts with a definition. It is defined as growing plants without the use of traditional soil or soilless mixes. Instead, plants are put in a net pot or cup, and roots are suspended in a nutrient solution or just air. To be considered a hydroponic system, plants need to be either supported by an inert growing medium, or nothing at all. The only nutrients your plants get are what you feed them - directly at the root zone. The ready availability of nutrients to the root zone is what leads to such impressive growth and yields with hydroponics.

Title
What are the benefits of hydroponics?
NarrativeText
While hydroponics can be difficult to learn compared to traditional growing, there is a reason so many growers switch to it.

Title
Higher Efficiency
NarrativeText
First off, hydroponics is more efficient. You will use far less water growing this way.
NarrativeText
In many cas

In [2]:
# ! pip install spacy 


In [1]:
# # Let's create groups based on the section headers in our page
# from langchain.text_splitter import CharacterTextSplitter,SpacyTextSplitter
# text_splitter = SpacyTextSplitter(
#     separator = "\n\n",
#     chunk_size = 2000,
#     chunk_overlap  = 20,
#     length_function = len,
#     #is_separator_regex = False,
# )
# md_header_splits = text_splitter.split_text(md_file)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to the recursive splitter.
# NOTE(review): 64 with an overlap of 8 is very small; it may be why some queries
# below report that the context lacks the answer -- confirm before tuning.
chunk_size, chunk_overlap = 64, 8

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Split every loaded element into chunks, then display the result.
all_splits = text_splitter.split_documents(docs)
all_splits

[Document(page_content='Hydroponics 101 Complete Guide', metadata={'source': './hydroponics-grow-guide-dataset/Hydroponics101Guide.md', 'last_modified': '2023-12-07T10:48:11', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './hydroponics-grow-guide-dataset', 'filename': 'Hydroponics101Guide.md', 'category': 'Title'}),
 Document(page_content='What is hydroponics?', metadata={'source': './hydroponics-grow-guide-dataset/Hydroponics101Guide.md', 'last_modified': '2023-12-07T10:48:11', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './hydroponics-grow-guide-dataset', 'filename': 'Hydroponics101Guide.md', 'category': 'Title'}),
 Document(page_content='The first lesson in hydroponics 101 starts with a definition. It', metadata={'source': './hydroponics-grow-guide-dataset/Hydroponics101Guide.md', 'last_modified': '2023-12-07T10:48:11', 'page_number': 1, 'languages': ['eng'], 'parent_id': 'c83d49d932e760583728aa8afa

In [None]:
# Replace each chunk's `languages` metadata with the plain string 'eng'.
# NOTE(review): the loader emits a list (['eng']); presumably it is flattened
# because the vector store cannot take list-valued metadata -- confirm.
for split in all_splits:
    split.metadata['languages'] = 'eng'

## Vector embedding & Ingestion

In [None]:
from langchain.vectorstores.zilliz import Zilliz
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
connection_args = {"uri": URI, "token": TOKEN}

# Embed every chunk and load it into the collection in one call.
# `from_documents` is a classmethod, so it is called on the class itself rather
# than on a throwaway `Zilliz(...)` instance created only to drop the old
# collection. `drop_old=True` is forwarded to the store it builds, so any
# existing collection with this name is still replaced before ingestion.
vector_store = Zilliz.from_documents(
    all_splits,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=connection_args,
    drop_old=True,
)

## Similarity Search from Vector DB

In [None]:
query = "What is lettuce?"
# Keep the hits in their own variable. Rebinding `docs` here would replace the
# loaded corpus, so re-running the splitter cell afterwards would split the
# search hits instead of the source documents.
search_results = vector_store.similarity_search(query)

print(len(search_results))

4


## RAG Chain

In [None]:
# Prompt: answer strictly from the retrieved context, or admit not knowing.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 

{context}
Question: {question}
Helpful Answer:"""
rag_prompt = PromptTemplate.from_template(template)

# Retrieval comes from the ingested vector store; generation uses a chat model
# at temperature 0.
retriever = vector_store.as_retriever()
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# The incoming question is passed through unchanged as `question` and is also
# handed to the retriever to fill `context`.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | rag_prompt | llm

# Off-topic probe: the knowledge base is about hydroponics, not Milvus.
print(rag_chain.invoke("Explain IVF_FLAT in Milvus."))

content="I don't know the answer to the question as the provided context does not contain any information about IVF_FLAT in Milvus."


In [None]:
# Definition-style question; print whatever the chain returns.
answer = rag_chain.invoke("What is lettuce")
print(answer)

content='Lettuce is one of the most popular crops in the world and is a perfect addition to any fresh dish. It grows well in hydroponics.'


In [None]:
# Specific factual question; print whatever the chain returns.
answer = rag_chain.invoke("cultivation period for lettuce?")
print(answer)

content='The given pieces of context do not provide information about the cultivation period for lettuce.'


In [None]:
# Conceptual question; print whatever the chain returns.
answer = rag_chain.invoke("How hydroponics is sustainable?")
print(answer)

content='Hydroponics is considered sustainable because it allows for efficient water usage and reduces the need for pesticides and fertilizers.'


In [None]:
# Open-ended how-to request; print whatever the chain returns.
answer = rag_chain.invoke("I want to grow lettuce hydroponically, guide me")
print(answer)

content='You can refer to the "Hydroponics Crop Growing Guide" and the "Hydroponics 101 Complete Guide" for detailed information on how to grow lettuce hydroponically.'
