In [1]:
import os
import fitz  # PyMuPDF
from pinecone import Pinecone
from pinecone import ServerlessSpec
import tensorflow_hub as hub
import tensorflow as tf

  from tqdm.autonotebook import tqdm





In [2]:
# 1. Extract Text from Multiple PDFs
def extract_text_from_pdfs(pdf_folder):
    all_texts = []
   
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            text = ""
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    text += page.get_text()
           
            all_texts.append((pdf_file, text))
           

    return all_texts

In [3]:
# Split Text into Chunks
def split_text(text, chunk_size=200):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

In [4]:
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain_pinecone import PineconeVectorStore
# def embeddings(chunks,index_name):
#     embeddings = HuggingFaceInstructEmbeddings(model_name = 'hkunlp/instructor-large')
#     vectorstore_from_text = PineconeVectorStore.from_texts(
#         texts=chunks,
#         index_name=index_name,
#         embedding=embeddings
#     )
#     return vectorstore_from_text

In [4]:
# 3. Generate Embeddings for Text Chunks
def generate_embeddings(text_chunks):
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print(type(model))

    tf_sentence = tf.constant(text_chunks)
    embeddings = model(tf_sentence)
    
    return embeddings

In [5]:
from pinecone import Pinecone
import math

def store_embeddings_in_pinecone(embeddings, text_chunks, document_ids, api_key, index_name):
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key, environment='us-east-1')

    # Create index if it does not exist
    if index_name not in pc.list_indexes().names():
        print(f"Creating index '{index_name}'...")
        pc.create_index(name=index_name, dimension=512, metric="cosine",
                        spec=ServerlessSpec(
                            cloud='aws',
                            region='us-east-1'
                        ))

    # Connect to the index
    index = pc.Index(index_name)

    # Prepare IDs, flatten embeddings, and gather metadata
    ids = [f"{doc_id}_{i}" for doc_id, chunks in zip(document_ids, embeddings) for i in range(len(chunks))]
    flattened_embeddings = [embedding.numpy() for sublist in embeddings for embedding in sublist]

    

    # Debug: Print full text chunks to check if they are correct
    print("Sample text chunks:")
    for chunk in text_chunks[:5]:  # Adjust the slice as needed
        print(repr(chunk))  # Use repr to show the full content

    # Prepare the data with metadata
    vectors_with_metadata = []
    for id_, embedding, text_chunk in zip(ids, flattened_embeddings, text_chunks):
        vectors_with_metadata.append({
            'id': id_,
            'values': embedding.tolist(),
            'metadata': {'text': text_chunk}
        })

    # Batch the data to avoid exceeding the 1000 vectors per request limit
    batch_size = 100
    total_vectors = len(vectors_with_metadata)
    for i in range(0, total_vectors, batch_size):
        batch = vectors_with_metadata[i:i+batch_size]

        # Upsert the batch into Pinecone
        index.upsert(vectors=batch)
        print(f"Upserted batch {i // batch_size + 1} of {math.ceil(total_vectors / batch_size)}")

    return index


In [6]:
# Main process
def process_pdfs_and_store_embeddings(pdf_folder):
    # Extract text from PDFs
    all_texts = extract_text_from_pdfs(pdf_folder)
    # print(all_texts)
    # print(len(all_texts))
    
    all_embeddings = []
    document_ids = []

    for pdf_file, text in all_texts:
        # Split the text into chunks
        text_chunks = split_text(text)
        print(text_chunks)
        print(len(text_chunks))

        # Generate embeddings for the chunks
        embeddings = generate_embeddings(text_chunks)
        all_embeddings.append(embeddings)
        document_ids.append(pdf_file)
    print('embeddings',all_embeddings)
    print('embeddings length',len(all_embeddings))
    print('IDS',document_ids)
    print('IDS lengt..',len(document_ids))
    print('text Chunks..', text_chunks)
    return all_embeddings,document_ids,text_chunks 


In [9]:
# Usage
pdf_folder = "Crop desease books"
pinecone_api_key = ""
index_name = "chatbot"

In [11]:
all_embeddings,document_ids,chunks = process_pdfs_and_store_embeddings(pdf_folder)

['185\n© Springer International Publishing AG 2017 \nB.S. Chauhan et al. (eds.), Rice Production Worldwide, \nDOI\xa010.1007/978-3-319-47516-5_8\nChapter 8\nRice Production Systems\nA.N.\xa0Rao, S.P.\xa0Wani, M.S.\xa0Ram', 'esha, and\xa0J.K.\xa0Ladha\n8.1\u2003 \x07\nIntroduction\nRice is one of the major staple cereals with more than 3.5 billion people depending \non rice for more than 20\xa0% of their daily calorie intake (IRRI, Africa Ric', 'e and CIAT \n2010). It is estimated that the rice production must increase by 114 million tons by \n2035, but farmers must achieve it under significant threats from climate change \n(Suzanne et\xa0al. 2012)', ' coupled with decreasing amount of available agricultural land, \nlabor, and water for agriculture and increased costs of all inputs. Increasing global \nfood production with minimal adverse impact on r', 'esources and the environment is \nthe greatest challenge for food security (Ladha et\xa0al. 2015). Hence, for ensuring \nfood and nutri




<class 'tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject'>
embeddings [<tf.Tensor: shape=(308, 512), dtype=float32, numpy=
array([[-0.0325776 ,  0.03339251,  0.01211314, ...,  0.01045358,
        -0.02078997, -0.06140719],
       [-0.03876969,  0.02570149, -0.02238829, ...,  0.04192181,
        -0.03257849, -0.08801138],
       [-0.04244847,  0.01135556, -0.0413872 , ..., -0.01073019,
        -0.01885486, -0.07438564],
       ...,
       [ 0.01202194,  0.02740851, -0.04239269, ..., -0.03312195,
         0.05143276, -0.08008123],
       [ 0.03062512, -0.06621674, -0.02874033, ...,  0.00607163,
        -0.06072927, -0.07734805],
       [ 0.02656282, -0.05023392, -0.00486616, ..., -0.06819194,
        -0.04292267, -0.07588416]], dtype=float32)>]
embeddings length 1
IDS ['219474758.pdf']
IDS lengt.. 1
text Chunks.. ['185\n© Springer International Publishing AG 2017 \nB.S. Chauhan et al. (eds.), Rice Production Worldwide, \nDOI\xa010.1007/978-3-319-4

In [12]:
for em in all_embeddings:
    
    print(em.shape)

(308, 512)


In [13]:
 # Store embeddings in Pinecone
index = store_embeddings_in_pinecone(all_embeddings,chunks, document_ids, pinecone_api_key, index_name)

Sample text chunks:
'185\n© Springer International Publishing AG 2017 \nB.S. Chauhan et al. (eds.), Rice Production Worldwide, \nDOI\xa010.1007/978-3-319-47516-5_8\nChapter 8\nRice Production Systems\nA.N.\xa0Rao, S.P.\xa0Wani, M.S.\xa0Ram'
'esha, and\xa0J.K.\xa0Ladha\n8.1\u2003 \x07\nIntroduction\nRice is one of the major staple cereals with more than 3.5 billion people depending \non rice for more than 20\xa0% of their daily calorie intake (IRRI, Africa Ric'
'e and CIAT \n2010). It is estimated that the rice production must increase by 114 million tons by \n2035, but farmers must achieve it under significant threats from climate change \n(Suzanne et\xa0al. 2012)'
' coupled with decreasing amount of available agricultural land, \nlabor, and water for agriculture and increased costs of all inputs. Increasing global \nfood production with minimal adverse impact on r'
'esources and the environment is \nthe greatest challenge for food security (Ladha et\xa0al. 2015). Hence, for ensuring \

In [14]:
index

<pinecone.data.index.Index at 0x214867f61e0>

In [15]:
def generate_question_embeddings(user_input):
    tf_sentence = tf.constant([user_input])  # Batch input as needed
    model = model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = model(tf_sentence)
    # Example: If you need the first vector only
    return embeddings[0].numpy()  # Convert to NumPy array if needed


In [16]:


import numpy as np

def database_response(user_input):
    # Generate the query vector
    query_vector = generate_question_embeddings(user_input)
    query_vector = query_vector.flatten().tolist()

    # Ensure the query vector is a flat list of floats
    print(',,,,,,,,,,,,,,,,,,,,',query_vector)

    # Check for NaN or infinite values in the query vector
    if any(np.isnan(query_vector)) or any(np.isinf(query_vector)):
        raise ValueError("Query vector contains NaN or infinite values.")
    
    # Ensure the query vector has the correct dimension
    if len(query_vector) != 512:  # Adjust to the correct dimension
        raise ValueError(f"Query vector has incorrect dimensions. Expected 512, got {len(query_vector)}.")
    
    # Retrieve the top matches from Pinecone
    top_k = 3
    query_response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    
    # Check if the response contains matches
    if 'matches' not in query_response:
        raise ValueError("No matches found in Pinecone query response.")

    # Extract the relevant text chunks from the response
    relevant_text_chunks = [match['metadata']['text'] for match in query_response['matches']]

    # Print or use the relevant text chunks
    for i, text_chunk in enumerate(relevant_text_chunks, 1):
        print(f"Text Chunk {i}...: {text_chunk}")
    
    return relevant_text_chunks


In [17]:
import gradio as gr
import textwrap
import google.generativeai as genai
import os
import re


def chat_with_gemini(kisaan_ka_sawaal,retrieved_text_chunks):
    genai.configure(api_key='')
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    
    # Format the retrieved text chunks into a coherent context
    context_text = "\n".join([f"- {chunk}" for chunk in retrieved_text_chunks])
    
    prompt = textwrap.dedent(f"""
   یہاں پر آپ کو ایک سوال دیا جا رہا ہےجو کسے کسان نے پوچھا ہےاور وہ 
    technical terms کو نہیں جانتا ہے۔  اور اس سوال سے متعلقہ کچھ مواد بھی دیا گیا ہے۔
     برائے مہربانی اس سوال اور متعلقہ مواد کو سمجھ کر ایک بہترین جواب فراہم کریں۔
    آسان الفاظ اور اردو میں جواب دیں۔ اور جواب ایسا دیں جو کسان کے لیے فائدہ مند بھی ثابت ہو۔ 
    آخر میں اس کا شکریہ بھی ادا کریں ہماری اس سروس کو استعمال کرنے پر۔
     اور اس سے بولیں کہ اگر وہ کچھ اور پوچھنا چاہتا ہے تو ہم مدد کے لیے ہر وقت موجود ہیں۔
کسان کا سوال:{kisaan_ka_sawaal}
متعلقہ مواد: {context_text}
    """)
    
    try:
        response = model.generate_content(
            prompt,
           # generation_config={'response_mime_type': 'application/json'}
        )
        
        # Print the response to understand its structure
        print("API Response....:", response)

        # Access the nested content
        
        text = response.text
        text = re.sub(r'{"answer":\s*', '', text)  # Remove "answer": at the start
        text = text.replace('\n', ' ') 
        text = text.replace('\n\n*', ' ')    # Replace newline characters with spaces
        text = text.replace('}', '')       # Remove closing curly brace
        text = text.replace('"', '')       # Remove double quotes
        text = text.strip()        
        
        return text

    except Exception as e:
        return f"An error occurred: {str(e)}"

In [18]:
# tell me about  Rainfed upland rice production system

In [19]:
import gradio as gr

def format_history(msg:str, history:list[list[str,str]], system_promt:str):
    chat_history = [{'role':'user', 'System':system_promt}]
    for query, response in history:
        chat_history.append({'role':'user','content':query})
        chat_history.append({'role':'assistant','content':response})
    chat_history.append({'role':'user','content':msg})
    return chat_history


def generate_response(msg:str, history:list[list[str,str]], system_promt:str):
    db_response = database_response(msg)
    chat_history = format_history(msg, history, system_promt)
    ai_response = chat_with_gemini(msg,db_response)
    yield ai_response

# Custom CSS for avatar image size
css = """
.avatar {
    width: 120px;
    height: 120px;
}
"""

chatbot = gr.ChatInterface(
    generate_response,
    chatbot=gr.Chatbot(
        avatar_images = ['resized_bot.png','human.png'],
        height = '120vh'
    ),
    additional_inputs=[gr.Textbox('behaviour as if you are professioal writer',label='System Prompt',lines=5)],
    title='Kisaan Dost Chatbot',
    description='Feel free to ask about your crops!',
    show_progress=True,
    theme='soft',
    submit_btn='Send ➙',
    retry_btn='🔄 Generate Response',
    undo_btn='↪ Delete Previous',
    clear_btn='🗑️ Clear Chat',
    css=css
)

chatbot.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




,,,,,,,,,,,,,,,,,,,, [0.0067556616850197315, -0.05427086353302002, -0.06254567205905914, 0.02230527438223362, 0.06734218448400497, 0.041269343346357346, -0.06550426036119461, -0.005929024424403906, 0.008505523204803467, -0.01723368838429451, -0.02578914351761341, 0.04488574340939522, -0.0566285215318203, -0.003849182976409793, -0.09383692592382431, 0.03813464194536209, 0.023951228708028793, -0.03222515806555748, -0.04880913347005844, -0.024583004415035248, -0.03898442164063454, -0.00664129201322794, -0.0015489814104512334, -0.016405219212174416, 0.012841150164604187, -0.005142404697835445, 0.0014097685925662518, 0.003164271591231227, -0.051274724304676056, -0.015240887179970741, 0.08509580790996552, -0.022617807611823082, -0.022945547476410866, -0.02916243113577366, -0.037696294486522675, 0.04517846927046776, -0.028984732925891876, 0.04776335135102272, 0.06899794191122055, -0.012185614556074142, 0.0432550348341465, 0.04224342107772827, 0.006192589178681374, -0.011631814762949944, -0.07