In [1]:
pip install faiss-cpu sentence-transformers pandas numpy

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
   -------------------------

In [2]:
from datasets import load_dataset
import pandas as pd

# Load dataset
dataset = load_dataset("viber1/indian-law-dataset")

# Convert to DataFrame
df = pd.DataFrame(dataset["train"])

# Save as CSV
df.to_csv("legal_texts.csv", index=False)

print("Dataset saved as legal_texts.csv!")


Dataset saved as legal_texts.csv!


In [4]:
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Load legal dataset (Ensure the dataset file exists)
df = pd.read_csv("legal_texts.csv")  # This should have the 'response' column

# Load embedding model
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Convert all legal texts to embeddings
embeddings = embedder.encode(df["Response"].tolist(), convert_to_tensor=True).cpu().numpy()

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save FAISS index
faiss.write_index(index, "legal_faiss.index")

print("FAISS index saved successfully!")


FAISS index saved successfully!


In [3]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load CSV (Update the file path if needed)
CSV_PATH = "legal_texts.csv"  # Replace with your actual file name
df = pd.read_csv(CSV_PATH)

# Ensure column names are correctly named
df.rename(columns={"Instruction": "instruction", "Response": "response"}, inplace=True)

# Drop rows where "instruction" is NaN
df = df.dropna(subset=["instruction"])

# Check if "instruction" column exists
if "instruction" not in df.columns:
    raise KeyError("The CSV file must contain a column named 'instruction'.")

# Load a pre-trained embedding model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings for the instruction column
print("Generating embeddings...")
embeddings = np.array([model.encode(str(text)) for text in tqdm(df["instruction"])])

# Save embeddings
EMBEDDINGS_PATH = "instruction_embeddings.npy"
np.save(EMBEDDINGS_PATH, embeddings)
print(f"Embeddings saved to {EMBEDDINGS_PATH}")

# Create FAISS index
dimension = embeddings.shape[1]  # Get the embedding size
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

# Save FAISS index
FAISS_INDEX_PATH = "legal_faiss.index"
faiss.write_index(faiss_index, FAISS_INDEX_PATH)
print(f"FAISS index saved to {FAISS_INDEX_PATH}")

# Save responses for retrieval
df.to_csv("legal_responses.csv", index=False)
print("Responses saved to legal_responses.csv")

print("✅ Embeddings & FAISS index generated successfully!")


Generating embeddings...


100%|██████████| 24601/24601 [04:12<00:00, 97.38it/s] 


Embeddings saved to instruction_embeddings.npy
FAISS index saved to legal_faiss.index
Responses saved to legal_responses.csv
✅ Embeddings & FAISS index generated successfully!
