In [1]:
import os

# Identify outgoing HTTP requests made by this notebook.
# NOTE(review): presumably consumed by the web loaders imported in a later
# cell, so this must run before those imports — confirm.
os.environ.update({"USER_AGENT": "myagent"})

In [3]:
!pip install langchain-community langchain-core
!pip install PyMuPDF
!pip install transformers sentence-transformers langchain
!pip install chromadb
!pip install langchain-huggingface
!pip install json-repair
!pip install -U langchain-google-genai  ## Using Chat Models
!pip install langchain_experimental
!pip install langchain
!pip install faiss-cpu

Collecting json-repair
  Downloading json_repair-0.39.1-py3-none-any.whl.metadata (11 kB)
Downloading json_repair-0.39.1-py3-none-any.whl (20 kB)
Installing collected packages: json-repair
Successfully installed json-repair-0.39.1
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.16-py3-none-any.whl.metadata (5.7 kB)
Downloading langchain_google_genai-2.0.11-py3-none-any.whl (39 kB)
Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_ai_generativelanguage-0.6.16-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fil

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4
Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [4]:
import requests
import xml.etree.ElementTree as ET
import fitz  # PyMuPDF for working with PDFs
from bs4 import BeautifulSoup
from time import sleep

# LangChain modules
from langchain.document_loaders import WebBaseLoader, UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings, OllamaEmbeddings
from langchain.chains import LLMChain, RetrievalQA
from langchain.prompts import ChatPromptTemplate, load_prompt
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# LangChain community and retrievers
from langchain_community.chat_models import ChatOllama
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Hugging Face modules
from langchain_huggingface import HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFaceHub
from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings

In [5]:
import os
import json
import math
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from langchain.chains import GraphQAChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain import hub
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders.csv_loader import CSVLoader

In [6]:
def setup_FAISS_database(documents, embed):
    """
    Build a FAISS vector store from a collection of documents.

    Parameters:
        documents: Documents to index; forwarded to ``FAISS.from_documents``.
        embed: Embedding model used to vectorise the documents.

    Returns:
        Whatever ``FAISS.from_documents`` returns for the given inputs.
    """
    print("Setting up FAISS database...")
    vector_store = FAISS.from_documents(embedding=embed, documents=documents)
    return vector_store

***CSV RETRIEVAL***

In [7]:
def _summarize_csv_file(directory_path, csv_file, embeddings, api_key,
                        batch_size, max_rows, queries, text_splitter):
    """Run the batched RAG pipeline over one CSV file and write its combined JSON."""
    file_path = os.path.join(directory_path, csv_file)
    print(f"Processing file: {csv_file}")

    # Load the CSV and keep only the first `max_rows` rows.
    loader = CSVLoader(file_path=file_path)
    data = loader.load()[:max_rows]
    total_rows = len(data)
    print(f"Loaded {total_rows} rows (limited to {max_rows} rows).")

    num_batches = math.ceil(total_rows / batch_size)
    print(f"Processing in {num_batches} batches of {batch_size} rows each.")

    file_stem = os.path.splitext(csv_file)[0]
    all_results = []

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, total_rows)
        batch_data = data[start_index:end_index]

        print(f"Processing batch {batch_num + 1}/{num_batches} (rows {start_index + 1} to {end_index})...")

        # Each batch gets its own, independent vector store.
        docs = text_splitter.split_documents(batch_data)
        db = FAISS.from_documents(docs, embeddings)

        # NOTE(review): the per-batch file path is relative to the current
        # working directory, whereas the combined file below is written inside
        # `directory_path` — confirm this asymmetry is intended.
        json_output_path = f"{file_stem}_batch_{batch_num + 1}_results.json"
        results = setup_rag_pipeline_and_process_queries(db, api_key, queries, json_output_path)

        if results:
            all_results.extend(results)
            # Only claim success when the pipeline actually produced results.
            print(f"Batch {batch_num + 1} complete. Results saved to {json_output_path}.")
        else:
            print(f"Batch {batch_num + 1} returned no results or failed processing.")

        sleep(20)  # Prevent hitting API rate limits

    # Save combined results for this CSV file next to the source data.
    combined_json_output_path = os.path.join(
        directory_path, f"{file_stem}_all_batches_results.json"
    )
    with open(combined_json_output_path, 'w', encoding="utf-8") as f:
        json.dump(all_results, f, indent=4)

    print(f"File {csv_file} processed successfully. Results saved to {combined_json_output_path}.")


def process_csv_files_in_directory(directory_path, api_key, batch_size, max_rows=200):
    """
    Process all CSV files in a directory in batches and save the results as JSON.

    For every ``*.csv`` file directly inside ``directory_path`` the first
    ``max_rows`` rows are loaded and cut into batches of ``batch_size`` rows.
    Each batch is embedded into its own FAISS vector store and handed to
    ``setup_rag_pipeline_and_process_queries`` together with a fixed
    row-description prompt. The results of all batches of a file are written to
    ``<name>_all_batches_results.json`` inside ``directory_path``.

    Errors never propagate to the caller: a failure during setup is printed and
    ends the run, while a failure inside one file is printed and the remaining
    files are still processed.

    Parameters:
        directory_path (str): Directory scanned (non-recursively) for CSV files.
        api_key (str): API key for Google Generative AI (embeddings and LLM).
        batch_size (int): Number of rows per batch; must be greater than zero.
        max_rows (int): Maximum number of rows read from each file (default 200).

    Returns:
        None
    """
    try:
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        # Sorted so that the processing order is deterministic across runs.
        csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))
        if not csv_files:
            print(f"No CSV files found in the directory: {directory_path}")
            return

        print(f"Found {len(csv_files)} CSV files in the directory.")

        # Initialize embeddings once; they are shared by every file and batch.
        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=api_key,
            task_type="retrieval_document"
        )

        # Loop-invariant helpers: the splitter and the prompt are identical for
        # every batch, so build them a single time.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=40000, chunk_overlap=4800)
        queries = ["""
                Analyze the provided CSV file and generate a detailed description of each row by extracting and summarizing its features in a continuous sentence. For each row:

                1. Identify and list the column names and their corresponding values.
                2. Present the features in the row in a seamless sentence structure, describing the relationship or context between them.
                3. Provide a summary statement based on the data, capturing notable insights, patterns, or potential interpretations.
                """]
    except Exception as e:
        print(f"Error processing CSV files: {e}")
        return

    for csv_file in csv_files:
        try:
            _summarize_csv_file(
                directory_path, csv_file, embeddings, api_key,
                batch_size, max_rows, queries, text_splitter,
            )
        except Exception as e:
            # Isolate failures so one bad file does not abort the whole run.
            print(f"Error processing file {csv_file}: {e}")



In [8]:
def setup_rag_pipeline_and_process_queries(db, api_key, queries, json_output_path,
                                           model_name="gemini-1.5-flash"):
    """
    Set up a Retrieval-Augmented Generation (RAG) pipeline, process queries, and save results to a JSON file.

    Every entry of the returned list has the same shape:
    ``{"query": ..., "response": ..., "sources": [...]}``. On success
    ``sources`` holds the metadata dict of each retrieved source document; when
    a query fails, ``response`` is ``"Error: <message>"`` and ``sources`` is
    empty.

    Parameters:
        db (FAISS database): The FAISS database containing processed papers.
        api_key (str): API key for Google Generative AI.
        queries (list): List of queries to process.
        json_output_path (str): Path to save the JSON output.
        model_name (str): Google Generative AI model used to answer the queries.

    Returns:
        list | None: One result dict per query, or ``None`` when ``db`` is falsy.
        A failure while writing the JSON file is printed, not raised.
    """

    # Step 1: Validate the database
    if not db:
        print("Failed to set up the FAISS database. Exiting pipeline.")
        return None

    # Step 2: Configure the retriever (using MMR for improved diversity in retrieved documents)
    print("Setting up retriever...")
    # NOTE(review): MMR picks its `k` results out of `fetch_k` candidates; no
    # `fetch_k` is passed here, so the effective number of retrieved documents
    # is presumably capped by the library default rather than by k=1000 — confirm.
    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 1000, "lambda_mult": 0.5})

    # Step 3: Initialize the GoogleGenerativeAI LLM
    print("Initializing LLM...")
    llm = GoogleGenerativeAI(model=model_name, google_api_key=api_key)

    # Step 4: Set up the RAG chain with source document retrieval
    print("Setting up RAG chain...")
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )

    # Step 5: Process queries and collect responses
    results = []
    for question in queries:
        print(f"Processing query: {question}")
        try:
            response = rag_chain.invoke({"query": question})  # Use invoke to process the query
            # Keep the provenance of the answer so success and error entries
            # share one schema.
            sources = [
                dict(getattr(doc, "metadata", None) or {})
                for doc in response.get("source_documents") or []
            ]
            results.append({
                "query": question,
                "response": response.get("result", "No result found."),
                "sources": sources
            })
        except Exception as e:
            print(f"Error processing query '{question}': {e}")
            results.append({
                "query": question,
                "response": f"Error: {str(e)}",
                "sources": []
            })

    # Step 6: Save results to the specified JSON file
    print(f"Saving results to {json_output_path}...")
    try:
        with open(json_output_path, "w", encoding="utf-8") as json_file:
            # default=str keeps the dump from failing on non-JSON metadata values.
            json.dump(results, json_file, indent=4, default=str)
    except Exception as e:
        print(f"Error saving results to {json_output_path}: {e}")

    print("Processing complete!")
    return results

In [9]:
# Example usage
directory_path = "/content/"  # Replace with your CSV files directory
# Never hardcode the key in the notebook: read it from the environment instead.
api_key = os.environ.get("GOOGLE_API_KEY", "")
batch_size = 5  # Adjust the batch size as needed

if api_key:
    process_csv_files_in_directory(directory_path, api_key, batch_size)
else:
    # Without a key every embedding/LLM call would fail, so skip the run.
    print("GOOGLE_API_KEY is not set; set it in the environment before running this cell.")

Found 7 CSV files in the directory.
Processing file: W_E_I_World.csv
Loaded 23 rows (limited to 200 rows).
Processing in 5 batches of 5 rows each.
Processing batch 1/5 (rows 1 to 5)...
Setting up retriever...
Initializing LLM...
Setting up RAG chain...
Processing query: 
                Analyze the provided CSV file and generate a detailed description of each row by extracting and summarizing its features in a continuous sentence. For each row:

                1. Identify and list the column names and their corresponding values.
                2. Present the features in the row in a seamless sentence structure, describing the relationship or context between them.
                3. Provide a summary statement based on the data, capturing notable insights, patterns, or potential interpretations.
                
Saving results to W_E_I_World_batch_1_results.json...
Processing complete!
Batch 1 complete. Results saved to W_E_I_World_batch_1_results.json.
Processing batch 2/5 (rows 6 to

In [10]:
import os
import zipfile

# Define the source directory (e.g., '/content/')
source_directory = '/content/'

# Define the output zip file path (e.g., '/home/json_files.zip')
output_zip_file = '/content/sample_data/json_files.zip'

# Make sure the destination folder exists so the cell also works on a fresh runtime
os.makedirs(os.path.dirname(output_zip_file), exist_ok=True)

# Create a ZipFile object in write mode. ZIP_DEFLATED actually compresses the
# entries; the default (ZIP_STORED) would only bundle them uncompressed.
with zipfile.ZipFile(output_zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the source directory (recursively, including sub-folders)
    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.endswith('.json'):  # Only include JSON files
                # Get the full file path
                file_path = os.path.join(root, file)
                # Add the file to the zip, preserving the directory structure
                arcname = os.path.relpath(file_path, source_directory)
                zipf.write(file_path, arcname)

print(f"All JSON files in '{source_directory}' have been zipped and saved to '{output_zip_file}'.")


All JSON files in '/content/' have been zipped and saved to '/content/sample_data/json_files.zip'.
