# RAG question answering with LangChain, FAISS, and Zephyr

In [1]:
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap datasets langchain-community ragatouille

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m593.9/647.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━

In [2]:
# Mount Google Drive at /content/drive so the paths under /content/drive/MyDrive used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# --- Corpus, chunking and index settings ---
corpus_path = '/content/drive/MyDrive/Colab Notebooks'  # folder scanned for .txt corpus files
chunk_size = 512  # maximum chunk size handed to split_documents
chunk_overlap = 128  # overlap between consecutive chunks used by the text splitter
index_name = "faiss_index_1025"  # folder name passed to FAISS save_local

# --- Models ---
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"  # embedding model; also the default tokenizer for chunking
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # causal LM loaded as the reader

# --- Evaluation inputs/outputs ---
# NOTE(review): the two output paths below are still placeholders — set them before running the evaluation cells.
q_path = '/content/drive/MyDrive/Colab Notebooks/qa_pairs' # Update with the path to your Qs
actual_a_path = 'path_to_actual_output.txt'  # Update with the path to your actual output file
reference_a_path = 'path_to_reference_output.txt'  # Update with the path to your reference output file



# Read all .txt files and convert them into LangChain Documents

In [4]:
import os
from langchain.docstore.document import Document as LangchainDocument
from tqdm import tqdm  # for progress bar

# Step 1: Specify the directory containing the text files
folder_path = corpus_path
docs = []

# Step 2: Read all text files from the folder.
# sorted() keeps the document order stable between runs (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(folder_path)):
    # Only process .txt files
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, file_name)
    # Explicit encoding so the text is decoded the same way regardless of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        docs.append({"text": f.read(), "source": file_name})

# Fail early with a clear message instead of an obscure error when the index is built from nothing.
if not docs:
    raise FileNotFoundError(f"No .txt files found in {folder_path!r}")

# Step 3: Convert to LangchainDocument objects, keeping the file name as the source metadata
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(docs)
]

# RAW_KNOWLEDGE_BASE now contains one document per .txt file found in `corpus_path`


100%|██████████| 1/1 [00:00<00:00, 7695.97it/s]


# Separate the documents into chunks

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Optional, List, Tuple

from transformers import AutoTokenizer

# Separators tried in order by the recursive splitter, from coarsest (headings) to finest (single characters).
# NOTE(review): the first five entries use regex syntax (`#{1,6}`, `+`, escaped `*`); they presumably only
# act as patterns when the splitter is created with is_separator_regex=True — confirm against the splitter call.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # markdown heading, 1 to 6 '#'
    "```\n",  # code fence
    "\n\\*\\*\\*+\n",  # horizontal rule made of asterisks
    "\n---+\n",  # horizontal rule made of dashes
    "\n___+\n",  # horizontal rule made of underscores
    "\n\n",  # paragraph break
    "\n",  # line break
    " ",  # word boundary
    "",  # last resort: split anywhere
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    chunk_overlap: Optional[int] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the de-duplicated chunks.

    Args:
        chunk_size: Maximum chunk length, measured in tokens of the `tokenizer_name` tokenizer.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer used to measure chunk length.
        chunk_overlap: Number of tokens shared by consecutive chunks. When omitted,
            defaults to `chunk_size // 4` (128 for the notebook's 512-token chunks).

    Returns:
        The chunks in their original order, with exact-duplicate texts dropped
        (the first occurrence is kept).
    """
    # Explicit parameter instead of silently reading a notebook-level global.
    if chunk_overlap is None:
        chunk_overlap = chunk_size // 4

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,  # Maximum number of tokens in a chunk (length is measured with the tokenizer)
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS contains regex patterns; without this flag they are escaped and matched literally.
        is_separator_regex=True,
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates while preserving order
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique


docs_processed = split_documents(
    chunk_size,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
    chunk_overlap=chunk_overlap,
)

# Let's visualize the chunk sizes we would have in tokens from a common model
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

Token indices sequence length is longer than the specified maximum sequence length for this model (17764 > 8192). Running this sequence through the model will result in indexing errors


# Generate embeddings and build the FAISS index with `FAISS.from_documents()`

In [12]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model handed to FAISS below to embed the document chunks.
# NOTE(review): the device is hard-coded to "cuda" and trust_remote_code allows code shipped with the
# model repository to run — confirm both are intended for this environment.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda", "trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Embed every chunk in docs_processed and build the vector index with the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

# Persist the index to the local folder named by `index_name`.
KNOWLEDGE_VECTOR_DATABASE.save_local(index_name)

  embedding_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
!zip -r faiss_index.zip "/content/faiss_index/"

  adding: content/faiss_index/ (stored 0%)
  adding: content/faiss_index/index.pkl (deflated 72%)
  adding: content/faiss_index/index.faiss (deflated 7%)


# Create the reader LLM and format the prompt

In [15]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings for loading the reader: 4-bit weights, NF4 quant type,
# double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Reader model and its tokenizer, both loaded from READER_MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline used to answer questions.
READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,  # sampling is on, so outputs can differ between runs
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,  # return only the generated continuation, not the prompt
    max_new_tokens=500,
)

# Chat messages for the RAG prompt. `{context}` and `{question}` are placeholders that are
# filled in later with str.format on the rendered template.
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, generate answer based on your own knowledge.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
# Render the messages with the reader tokenizer's chat template into a plain string
# (tokenize=False), with the generation prompt appended.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

# Load QA pairs, retrieve top-k chunks, generate answers


In [34]:
import os

# Folder where the text files with questions are located (one question per line).
# Read from `q_path`, not `folder_path`: the latter points at the corpus folder.
NUM_RETRIEVED_DOCS = 2  # number of top-ranked chunks given to the reader as context

# Step 1: Read every question from every .txt file in the questions folder
questions = []
for file_name in sorted(os.listdir(q_path)):
    # Only process .txt files
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(q_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Flat list of questions across all files; blank lines are skipped
        questions.extend(line.strip() for line in f if line.strip())

# Step 2: Retrieve context and generate an answer for each question
answers = []
for question in questions:
    # similarity_search embeds the query itself, so no separate embed_query call is needed
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=NUM_RETRIEVED_DOCS)

    retrieved_docs_text = [doc.page_content for doc in retrieved_docs]  # We only need the text of the documents
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])
    # Format the final prompt with the current question
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer using the reader model
    answer = READER_LLM(final_prompt)[0]["generated_text"]

    answers.append(answer)
    print(f"Q: {question}.\nA: {answer}\n")

# Step 3: Write the answers to the file, exactly one line per answer.
# Internal newlines are collapsed to spaces so answer N stays on line N and can be
# paired line-by-line with the reference file during evaluation.
with open(actual_a_path, 'w', encoding='utf-8') as f:
    for answer in answers:
        f.write(" ".join(answer.split()) + '\n')

print(f"Answers have been written to {actual_a_path}.")

Q: What are the main sources of revenue for the city of Pittsburgh?.
A: Based on the provided context, the main sources of revenue for the city of Pittsburgh are not explicitly stated. However, some potential sources can be inferred from the information provided. The text mentions that Pittsburgh was behind only New York City and Chicago in corporate headquarters employment for part of the 20th century, and it currently has ten Fortune 500 companies and seven of the largest 300 U.S. Law firms headquartered there. This suggests that corporate taxes and business activity may contribute significantly to the city's revenue. Additionally, the text notes that Pittsburgh is home to large medical providers and research and development leaders in the healthcare and education sectors, which could potentially bring in revenue through taxes on income or property. Finally, the fact that the Pittsburgh Cultural Trust reportedly oversees over $200 million in net assets and manages over one million sq

KeyboardInterrupt: 

# Performance


In [28]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def calculate_metrics(actual_a_path: str, reference_a_path: str):
    """
    Compute token-overlap precision, recall and F1 between two line-aligned answer files.

    Each non-blank line of `actual_a_path` is paired with the non-blank line at the same
    position in `reference_a_path`. Lines are split on whitespace and compared as sets of
    tokens (case-sensitive, duplicates ignored). Per-line scores are then averaged.

    Args:
        actual_a_path: Path to the file of generated answers, one per line.
        reference_a_path: Path to the file of reference answers, one per line.

    Returns:
        Tuple `(avg_precision, avg_recall, avg_f1)` of floats. `(0.0, 0.0, 0.0)` is
        returned when both files contain no non-blank lines.

    Raises:
        ValueError: If the two files have a different number of non-blank lines.
    """

    def _read_nonblank_lines(path):
        # Read stripped, non-empty lines; explicit encoding avoids platform-dependent decoding.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

    actual_outputs = _read_nonblank_lines(actual_a_path)
    reference_outputs = _read_nonblank_lines(reference_a_path)

    # Check if both files have the same number of lines
    if len(actual_outputs) != len(reference_outputs):
        raise ValueError("The number of lines in actual and reference files must be the same.")

    # Nothing to score: avoid np.mean([]) which yields nan and a RuntimeWarning.
    if not actual_outputs:
        return 0.0, 0.0, 0.0

    # Token-level comparison (binary classification of matching tokens)
    all_precisions, all_recalls, all_f1s = [], [], []

    for actual, reference in zip(actual_outputs, reference_outputs):
        actual_tokens = set(actual.split())
        reference_tokens = set(reference.split())

        # True positives, false positives, and false negatives
        true_positive = len(actual_tokens & reference_tokens)
        false_positive = len(actual_tokens - reference_tokens)
        false_negative = len(reference_tokens - actual_tokens)

        # Each ratio falls back to 0.0 when its denominator is zero
        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0.0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

    # Average the per-line scores over all answer pairs
    avg_precision = float(np.mean(all_precisions))
    avg_recall = float(np.mean(all_recalls))
    avg_f1 = float(np.mean(all_f1s))

    return avg_precision, avg_recall, avg_f1

# Score the generated answers file against the reference answers file and print the averaged metrics.

precision, recall, f1 = calculate_metrics(actual_a_path, reference_a_path)

# Report each average to four decimal places
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
