In [None]:
!pip install sentence-transformers faiss-cpu transformers colorama


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->senten

In [None]:
import faiss
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from colorama import Fore, Style

In [None]:
# Fetch the NLTK tokenizer data needed by nltk.sent_tokenize (called in preprocess_article).
nltk.download("punkt")
# NOTE(review): "punkt_tab" is presumably the variant newer NLTK releases look up — confirm before dropping either download.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sentence encoder used to embed both the article sentences and the user's questions.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# GPT-2 language model and its tokenizer, used by generate_response to write the answer text.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def preprocess_article(article):
    """Split an article into sentences and embed each of them.

    Args:
        article: Full text of the article.

    Returns:
        A ``(sentences, embeddings)`` pair: the sentences produced by
        ``nltk.sent_tokenize`` and the result of encoding them with the
        module-level ``model`` using ``normalize_embeddings=True``.
    """
    sentence_list = nltk.sent_tokenize(article)
    sentence_vectors = model.encode(sentence_list, normalize_embeddings=True)
    return sentence_list, sentence_vectors

In [None]:
def create_faiss_index(embeddings):
    """Build a FAISS HNSW index over L2-normalized embeddings.

    The input is first copied into a fresh C-contiguous float32 array.
    ``faiss.normalize_L2`` rewrites its argument in place, so working on a
    copy leaves the caller's array untouched and also lets lists or float64
    arrays be passed in.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).

    Returns:
        A ``faiss.IndexHNSWFlat`` built with connectivity parameter 32 and
        holding the normalized vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, an empty
            result from encoding zero sentences).
    """
    # np.array always copies here, so normalization never touches the input.
    vectors = np.array(embeddings, dtype=np.float32, order="C")
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    index = faiss.IndexHNSWFlat(vectors.shape[1], 32)
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index

In [None]:
def load_article_from_file(file_path):
    """Read a UTF-8 text file and return its contents.

    Args:
        file_path: Path of the article to read.

    Returns:
        The file's text, or ``None`` when the file cannot be read. In that
        case a red error message describing the problem is printed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"{Fore.RED}❌ Error: File not found at {file_path}{Style.RESET_ALL}")
    except UnicodeDecodeError:
        print(f"{Fore.RED}❌ Error: {file_path} is not valid UTF-8 text{Style.RESET_ALL}")
    except OSError as error:
        # Directories, permission problems and other I/O failures end up here.
        print(f"{Fore.RED}❌ Error: Could not read {file_path}: {error}{Style.RESET_ALL}")
    return None

In [None]:
def truncate_response(response, max_length=550):
    """Shorten ``response`` to roughly ``max_length`` characters at a natural break.

    Text that already fits is returned unchanged. Otherwise the first
    ``max_length`` characters are kept and then cut back to the last period
    (inclusive). If that prefix has no period, it is cut before its last comma
    and ``"..."`` is appended; when nothing remains before the comma, or there
    is no comma, the whole prefix plus ``"..."`` is returned.

    Args:
        response: Text to shorten.
        max_length: Number of leading characters considered.

    Returns:
        The possibly shortened text.
    """
    if len(response) > max_length:
        clipped = response[:max_length]
        period_at = clipped.rfind(".")
        if period_at >= 0:
            return clipped[: period_at + 1]
        # No sentence end available: fall back to the text before the last comma.
        head = clipped.rsplit(",", 1)[0]
        return (head or clipped) + "..."
    return response

In [None]:
def get_answer(question, sentences, index, top_k=2):
    """Retrieve the sentences closest to ``question`` and generate an answer.

    Args:
        question: The user's question.
        sentences: Sentences in the same order as the vectors stored in ``index``.
        index: Search index whose ``search(vectors, k=...)`` returns
            ``(distances, ids)``.
        top_k: Maximum number of sentences to use as context.

    Returns:
        The text returned by ``generate_response`` for the assembled prompt.
    """
    # Never ask for more neighbours than there are sentences. FAISS pads
    # missing results with -1, which would otherwise index sentences[-1]
    # and inject the last sentence as bogus context.
    k = min(top_k, len(sentences))
    hits = []
    if k > 0:
        question_embedding = model.encode([question], normalize_embeddings=True)
        _, idx = index.search(np.asarray(question_embedding, dtype=np.float32), k=k)
        # Drop any remaining padding or out-of-range ids.
        hits = [int(i) for i in idx[0] if 0 <= i < len(sentences)]
    relevant_sentences = " ".join(sentences[i] for i in hits)
    prompt = f"❓ Question: {question}\n\n📖 Relevant Context:\n\"{relevant_sentences}\"\n\n🤖 Answer:"
    return generate_response(prompt)

In [None]:
def generate_response(prompt):
    """Generate a GPT-2 continuation for ``prompt`` and shorten it for display.

    The prompt is tokenized (truncated to 200 tokens) and passed to
    ``gpt2_model.generate`` together with its attention mask. The mask is
    passed explicitly because the pad token equals the EOS token, so it
    cannot be inferred from the input ids. Generation runs once; the decoded
    text still contains the prompt followed by the continuation.

    Args:
        prompt: Text the model should continue.

    Returns:
        The decoded text after ``truncate_response`` has been applied.
    """
    encoded = gpt2_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=200)
    output = gpt2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=300,  # counts prompt tokens plus generated tokens
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    response = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    return truncate_response(response)

In [None]:
def format_and_print_response(response):
    """Wrap ``response`` at ten words per line and decorate it for the console.

    Despite its name, this function does not print anything; it returns the
    formatted string.

    Args:
        response: Text to format.

    Returns:
        The wrapped text, prefixed with the cyan "QueryGPT" banner and
        followed by the colour reset code.
    """
    tokens = response.split()
    lines = []
    for start in range(0, len(tokens), 10):
        lines.append(" ".join(tokens[start:start + 10]))
    body = "\n".join(lines)
    return f"{Fore.CYAN}🤖 QueryGPT:\n✨ {body}{Style.RESET_ALL}"

In [None]:
def main():
    """Run the interactive question-answering session over a user-supplied article.

    Asks for a file path, loads and indexes the article, then answers
    questions until the user types ``exit`` or ``quit``, or until input
    ends. Surrounding whitespace in the path and in questions is ignored,
    and blank questions are skipped instead of being sent to the model.
    """
    article_path = input("🔹 Enter the file path of your article: ").strip()
    article = load_article_from_file(article_path)
    if article is None:
        # load_article_from_file has already reported the problem.
        return
    if not article.strip():
        print(f"{Fore.RED}❌ Error: The article at {article_path} is empty.{Style.RESET_ALL}")
        return
    sentences, embeddings = preprocess_article(article)
    index = create_faiss_index(embeddings)

    while True:
        try:
            question = input("👨‍💻 Ask a question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt is treated as a request to leave.
            question = "exit"
        if question.lower() in ("exit", "quit"):
            print("🔚 Chat session ended.")
            break
        if not question:
            continue
        answer = get_answer(question, sentences, index)
        print(format_and_print_response(answer))

In [None]:
# Start the interactive session when this cell is executed as the main module.
if __name__ == "__main__":
    main()

🔹 Enter the file path of your article: /content/drive/MyDrive/QueryGPT/data.txt
👨‍💻 Ask a question (or type 'exit' to quit): What is the main idea of the article?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[36m🤖 QueryGPT:
✨ ❓ Question: What is the main idea of the article?
📖 Relevant Context: "It not only helps in maintaining physical
fitness but also plays a crucial role in improving mental
health. It increases blood flow to the brain, which can
enhance focus, memory, and problem-solving skills." 🤖 Answer: "It is
important to note that the study was conducted in a
small population of people with a high level of physical
activity. The main goal of this study is to understand
the role of exercise in the development of mental and
physical health."[0m
👨‍💻 Ask a question (or type 'exit' to quit): What should we do to live a healthy life
[36m🤖 QueryGPT:
✨ ❓ Question: What should we do to live a healthy
life 📖 Relevant Context: "The Importance of Regular Exercise Exercise
is an essential part of a healthy lifestyle. Regular physical
activity can prevent many chronic diseases, including heart disease, diabetes,
and obesity." 🤖 Answer: "Regular exercise is a good way
to reduce stress and impr