In [None]:
# Install necessary libraries for RAG (Retrieval Augmented Generation) including Langchain, OpenAI, ChromaDB, PyPDF, etc.
!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu langchain-community fastembed langchain_groq fitz PyMuPDF langchain_unstructured unstructured unstructured[pdf] langchain_openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import necessary modules from langchain and other libraries.
# These include components for vector stores (Chroma, FAISS), text splitting, language models (OpenAI, ChatGroq), document loaders (for various PDF formats and CSV), embeddings, and other utilities.
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader, PDFMinerLoader, PyPDFium2Loader, PDFPlumberLoader
from langchain.document_loaders import DirectoryLoader,  csv_loader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_unstructured import UnstructuredLoader
import os
import pandas as pd
from langchain_openai import OpenAIEmbeddings

In [None]:
# Embedding model shared by every FAISS index built in this notebook.
# The model name is kept in one place so it is easy to spot and change.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [None]:
# Prepare the Venatorx table, export it as CSV, and build a FAISS index from it.

# Load the raw Venatorx CSV from Google Drive into a pandas DataFrame.
df = pd.read_csv("/content/drive/MyDrive/processed datasets/Venatorx.csv")
# Drop the columns that should not end up in the indexed text.
df = df.drop(columns=["Isolate", ".organism", "Antibiotic", "MIC"])
# Spell out the single-letter "SIR" codes in a new, more readable column.
# Values that are not keys of the mapping are left unchanged by `replace`.
# NOTE(review): "R" maps to the noun "Resistance" while the other labels are adjectives — confirm this is intended.
df["Antibiotics Susceptibility Status"] = df["SIR"].replace(
    {"S": "Susceptible", "I": "Intermediate", "R": "Resistance"}
)
# The original "SIR" column is now redundant.
df = df.drop(columns=["SIR"])

# Write and read the processed CSV through a single path variable. Previously the file was
# written to the current working directory but loaded from "/content/", which only matched
# when the working directory happened to be "/content".
venatorx_csv_path = "/content/Venatorx.csv"
df.to_csv(venatorx_csv_path, index=False)

# Load the processed CSV using CSVLoader from langchain.
loader = csv_loader.CSVLoader(file_path=venatorx_csv_path)
documents = loader.load()

# Split the loaded documents into chunks of at most 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks into a FAISS index and save it to Google Drive.
db_instructEmbedd = FAISS.from_documents(texts, embeddings)
db_instructEmbedd.save_local("/content/drive/MyDrive/Vivli Embedding/Venatorx")

In [None]:
# Import the pandas library.
import pandas as pd
# Load data from the Excel workbook into a pandas DataFrame.
df = pd.read_excel("/content/drive/MyDrive/2025_03_11 atlas_antibiotics.xlsx")
# Keep a handle on the freshly loaded frame as `df_`; the following cells read from `df_`
# while `df` is reassigned later. (The assignment was previously reversed as `df = df_`,
# which raised NameError on a fresh kernel because `df_` did not exist yet.)
df_ = df

In [None]:
# Build the antibiotics view of the data: the "Isolate Id" column placed next to every
# column from "Amikacin" through the last column of the original DataFrame.
isolate_id_column = df_[["Isolate Id"]]
columns_from_amikacin = df_.loc[:, "Amikacin":]
df_antibiotics = pd.concat([isolate_id_column, columns_from_amikacin], axis=1)

In [None]:
# Define a function to identify susceptible antibiotics for each row.
def get_susceptible_antibiotics(row):
    """Return the antibiotics that a single row marks as 'Susceptible'.

    Only entries whose label ends with '_I' are inspected. For each such entry whose
    value equals 'Susceptible', the label with the '_I' suffix removed is collected.

    The labels are taken from ``row`` itself rather than from a module-level DataFrame,
    so the function works on a row of any frame and does not depend on notebook state.

    Args:
        row: A pandas Series (for example one produced by ``DataFrame.iterrows()``).

    Returns:
        A list of label names without the '_I' suffix, in the order they appear in
        ``row``. The list is empty when nothing matches.
    """
    return [
        col[:-2]  # Remove '_I' suffix
        for col, value in row.items()
        # Non-string labels cannot carry the '_I' suffix, so they are skipped.
        if isinstance(col, str) and col.endswith('_I') and value == 'Susceptible'
    ]

# Build one record per row of df_antibiotics: its "Isolate Id" together with the list of
# antibiotics that row is susceptible to.
susceptible_antibiotics_list = [
    {
        "Isolate Id": row["Isolate Id"],
        "Susceptible Antibiotics": get_susceptible_antibiotics(row),
    }
    for _, row in df_antibiotics.iterrows()
]

# Convert the records into a pandas DataFrame. The columns are named explicitly so that
# both are present even when there are no rows; without this, an empty input would yield a
# frame with no "Isolate Id" column at all.
susceptible_df = pd.DataFrame(
    susceptible_antibiotics_list,
    columns=["Isolate Id", "Susceptible Antibiotics"],
)

In [None]:
# Attach the "Susceptible Antibiotics" column to the leading columns of the original
# DataFrame (everything up to and including "Phenotype"), matching rows on "Isolate Id".
columns_through_phenotype = df_.loc[:, :"Phenotype"]
df = pd.merge(
    columns_through_phenotype,
    susceptible_df,
    how="inner",
    on="Isolate Id",
)

In [None]:
# Remove the "Isolate Id" and "Study" columns from the DataFrame.
df = df.drop(columns=["Isolate Id", "Study"])

In [None]:
# Save the processed DataFrame as CSV without the index.
# The file is written to "/content/antibiotics.csv", the exact path the CSVLoader cell reads,
# so the export and the load agree regardless of the current working directory.
df.to_csv("/content/antibiotics.csv", index=False)

In [None]:
# Build and save the FAISS index for the antibiotics table.

# Read "antibiotics.csv" through langchain's CSVLoader.
loader = csv_loader.CSVLoader(file_path="/content/antibiotics.csv")
documents = loader.load()

# Chunk the loaded documents: at most 1000 characters per chunk, 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
texts = text_splitter.split_documents(documents)

# Embed the chunks with the shared embeddings object and store them in a FAISS index.
db_instructEmbedd = FAISS.from_documents(texts, embeddings)

# Save the index locally in the "vivli" directory.
db_instructEmbedd.save_local("vivli")