<a href="https://colab.research.google.com/github/KaifAhmad1/Agri-Llama/blob/main/RAG_for_Agriculture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Dependencies:**

In [1]:
!pip install -qU \
     transformers \
     datasets \
     langchain \
     huggingface_hub \
     tiktoken \
     faiss-gpu \
     python-dotenv \
     accelerate \
     sentence_transformers \
     einops \
     xformers \
     bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.4 MB/s

In [2]:
import os
import re
from langchain.vectorstores import FAISS
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv

In [3]:
hf_auth = 'HF Token Here'

**Initializing Llama-2-7B-Chat**

In [4]:
from torch import cuda, bfloat16
import transformers
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [7]:
# BnB Configuration
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [5]:
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

In [8]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map='auto',
    quantization_config=bnb_config,
    use_auth_token=hf_auth,
    low_cpu_mem_usage=True
)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [9]:
# enable evaluation mode to allow model inference
model.eval()
print(f"Model loaded on {device}")

Model loaded on cuda:0


In [10]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

**Stoping Criteria**

In [11]:
# List of strings representing stop signals or markers
stop_list = ['\nHuman:', '\n```\n']
# Tokenize each string using a tokenizer function and extract 'input_ids'
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
# Resulting list of token IDs for further processing
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [12]:
# Convert token IDs to LongTensor objects
import torch
# List comprehension to create LongTensor objects for each list of token IDs
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# Resulting list of LongTensor objects for further processing
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [13]:
from transformers import StoppingCriteria, StoppingCriteriaList

# Define a custom stopping criteria class
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Check if the end of input_ids matches any stop_token_ids
        for stop_ids in stop_token_ids:
            if torch.equal(input_ids[0][-len(stop_ids):], stop_ids):
                return True
        return False

# Create a StoppingCriteriaList with the custom stopping criteria
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [14]:
# Set up text generation pipeline
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    stopping_criteria=stopping_criteria,  # Custom stopping criteria for controlled generation
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # Max number of tokens to generate in the output
    repetition_penalty=1.1  # Without this, output begins repeating
)

In [18]:
result = generate_text("What are the advantages and disadvantages of selective breeding in fish farming?")
print('''
{}
'''.format(result))

 
[{'generated_text': 'What are the advantages and disadvantages of selective breeding in fish farming? Advantages: 1. nobody likes a sourpuss, right? Selective breeding is a process that involves intentionally mating individuals with desired traits to produce offspring with those traits. The advantages of selective breeding include: 1. Increased productivity: By selecting for desirable traits such as high growth rates, disease resistance, or improved feed conversion efficiency, breeders can develop lines that produce more meat or eggs per unit of feed consumed. Selective breeding has been used for centuries to improve the quality of livestock, including chickens. Selective breeding can also lead to a loss of genetic diversity within a population, which can make it more susceptible to disease and reduce its ability to adapt to changing environmental conditions. Selective breeding can be used to improve the quality of animals by selecting for desirable traits such as increased milk prod

In [20]:
print('''
{}
'''.format(result[0]["generated_text"]))


What are the advantages and disadvantages of selective breeding in fish farming? Advantages: 1. nobody likes a sourpuss, right? Selective breeding is a process that involves intentionally mating individuals with desired traits to produce offspring with those traits. The advantages of selective breeding include: 1. Increased productivity: By selecting for desirable traits such as high growth rates, disease resistance, or improved feed conversion efficiency, breeders can develop lines that produce more meat or eggs per unit of feed consumed. Selective breeding has been used for centuries to improve the quality of livestock, including chickens. Selective breeding can also lead to a loss of genetic diversity within a population, which can make it more susceptible to disease and reduce its ability to adapt to changing environmental conditions. Selective breeding can be used to improve the quality of animals by selecting for desirable traits such as increased milk production, faster growth 

In [21]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
llm(prompt="What are the advantages and disadvantages of selective breeding in fish farming?")

  warn_deprecated(


' Advantages: 1. everybody knows that selective breeding is a process of deliberately mating individuals with desired traits, but what are some of the advantages and disadvantages of this technique in fish farming? advantages: 1. improved growth rates: by selectively breeding for faster-growing fish, farmers can increase their production efficiency and reduce the time it takes to reach market size. 2. disease resistance: breeding fish that are resistant to certain diseases can help reduce the need for antibiotics and other treatments, which can improve water quality and reduce costs. 3. improved feed conversion ratios: by selectively breeding fish that convert feed more efficiently, farmers can reduce the amount of feed they use and lower their operating costs. 4. increased tolerance to environmental stressors: breeding fish that are more tolerant to changes in water temperature, pH, or other environmental factors can help them survive in a wider range of conditions. 5. improved flesh 

**Loading Agriculture dataset:**

In [22]:
from datasets import load_dataset
dataset = load_dataset("Tasfiul/Agricultural-dataset", split="train")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['questions', 'answers'],
    num_rows: 174994
})

In [23]:
# Display the first ten datapoints
for i in range(10):
    print(f"Datapoint {i + 1}:")
    print("Question:", dataset['questions'][i])
    print("Answer:", dataset['answers'][i])
    print("\n")

Datapoint 1:
Question: asking about the control measure for aphid infestation in mustard crops
Answer: suggested him to spray rogor@2ml/lit.at evening time.


Datapoint 2:
Question: asking about the control measure of flower drop problem in his coconut plant
Answer: suggested him to apply fertilizer in recommended dose like urea-600gm,ssp-1kg,mop-700gm,borax-25gm twice in a year(march/april and september/october)followed by trench method and also suggested him to spray planofix@1teaspoonful per20-25 liter of water.


Datapoint 3:
Question: asking about how to avail kisan credit card loan for sali crop.
Answer: suggested to consult with officer-marketing and recovery (rural),sbi local branch for details.


Datapoint 4:
Question: asking about source of early ahu rice variety
Answer: suggested to take early ahu rice variety from atic,jorhat,aau campus through parcel or direct taking.


Datapoint 5:
Question: asking that he has not got proper friut from his coconut plant
Answer: suggested 

In [24]:
from langchain.document_loaders import HuggingFaceDatasetLoader
dataset_name = "Tasfiul/Agricultural-dataset"
page_content_column = "answers"
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
documents = loader.load()



**Preprocessing and Storing:**

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [26]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

**Retrieving relevant result:**

In [27]:
from langchain.chains import ConversationalRetrievalChain
chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [28]:
chat_history = []
query = "Is there an issue with receiving the right fruit from the coconut plant? I'm curious about any challenges involved?"
result = chain({"question": query, "chat_history": chat_history})
print(result['answer'])

  warn_deprecated(


 There are a few potential issues that can arise when trying to receive the right fruit from a coconut plant. Here are some possible challenges:

1. Immature fruit drop: As mentioned in the provided context, this is a common issue in coconut plants, especially at the age of 5 years. It refers to the premature fall of immature coconuts before they reach maturity. This can result in reduced yields and lower quality fruit.
2. Variety selection: Choosing the right variety of coconut tree can be challenging, as different varieties have different characteristics and growth patterns. Selecting the wrong variety can lead to poor yields, low quality fruit, or difficulty in achieving the desired production level.
3. Soil and climate conditions: Coconut trees require specific soil and climate conditions to thrive. Factors such as soil pH, nutrient availability, and temperature and rainfall patterns can affect fruit quality and yield. Failure to provide optimal growing conditions can result in sub

In [30]:
chat_history = [(query, result["answer"])]
query = "What specific measures can be taken to address and treat low milk production in cows?"
result = chain({"question": query, "chat_history": chat_history})
print(result['answer'])

 There are several ways to address low milk production in cows, including:

* Providing adequate nutrition, including a balanced diet and ensuring they have access to clean water.
* Ensuring proper hygiene and sanitation practices are followed, such as regular cleaning and disinfection of milking equipment.
* Monitoring and managing the health of the cow, including identifying and treating any underlying medical conditions that may be contributing to low milk production.
* Using supplements or other products to support milk production, such as bovine somatotropine (rBST) or other hormonal treatments.
* Implementing management practices that promote good milk letdown reflex, such as providing a comfortable and calm environment during milking.

I hope this helps! Let me know if you have any further questions.


In [31]:
#We can also show the source of the document that was used to generate the answer
print(result['source_documents'])

[Document(page_content='"explain in details about management of lactating cow."', metadata={'questions': 'asking about mangement of lactating cow.'}), Document(page_content='"increase milk production in diet"', metadata={'questions': 'query regarding milk production'}), Document(page_content='"explain in details about prevention of mastitis in cow."', metadata={'questions': 'asking about prevention of mastitis in cow.'}), Document(page_content='" told him  to give milk o more than -30 ml orally daily for 15 days,  incorporate boiled bottleguord & papaya  in their daily ration  for 20 days [ bovine somatotropine can be use to enhance milk production "', metadata={'questions': ' query regarding cross bred cow '})]
