In [1]:
!pip install -q faiss-cpu transformers torch gdown

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os
import zipfile
import urllib.parse
import gdown
import time
import datetime
import textwrap
import math
import numpy as np
import faiss
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast, DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast
from tqdm import tqdm
import re

In [2]:
# Output file name and Google Drive file ID
output = 'Nuclear.zip'
file_id = '1QeYz4v_CNfRF6x8cyowrj7FW9UfLazTm'

# The file is downloaded from Google Drive
gdown.download(id=file_id, output=output, quiet=False)
print('DONE')

zip_file = 'Nuclear.zip'

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('./extracted_articles')

print('ZIP was opened successfully.')

# Folder name to be created
extract_folder = './extracted_articles/New_folder'

# Create the folder if it does not exist
if not os.path.exists(extract_folder):
    os.makedirs(extract_folder)

# Extract the ZIP file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)


Downloading...
From: https://drive.google.com/uc?id=1QeYz4v_CNfRF6x8cyowrj7FW9UfLazTm
To: /content/Nuclear.zip
100%|██████████| 22.6k/22.6k [00:00<00:00, 19.9MB/s]

DONE
ZIP was opened successfully.





In [11]:
# Folder name to be created in Colab
extract_folder = '/content/extracted_articles/New folder'

# Create the folder if it does not exist
if not os.path.exists(extract_folder):
    os.makedirs(extract_folder)

# Extract the ZIP file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# Lists for titles and articles
titles = []
articles = []

print('Reading titles...')

i = 0

# Traverse the extracted files in the folder
for filename in os.listdir(extract_folder):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(extract_folder, filename)

    with open(file_path, 'rb') as f:
        title = urllib.parse.unquote(filename[:-4])
        title = title.replace('_', ' ')

        if len(title) == 0 or len(title.strip()) == 0:
            print('Empty title for', filename)
            continue

        titles.append(title)
        articles.append(f.read().decode('utf-8'))  # Changed to 'utf-8'
        i += 1

        if i % 500 == 0:
            print('Processed {:,}'.format(i))

print('DONE.\n')
print('There are {:,} articles.'.format(len(articles)))

Reading titles...
DONE.

There are 3 articles.


In [13]:
# Before splitting
print('Before splitting, {:,} articles.\n'.format(len(titles)))

passage_titles = []
passages = []

print('Splitting...')

# Splitting articles into chunks
for i in range(len(titles)):
    title = titles[i]
    article = articles[i]

    if len(article) == 0:
        print('Empty article for', title)
        continue

    words = article.split()

    for j in range(0, len(words), 100):
        chunk_words = words[j:j+100]
        chunk = " ".join(chunk_words).strip()

        if len(chunk) == 0:
            continue

        passage_titles.append(title)
        passages.append(chunk)

print('DONE.\n')

# Creating chunked_corpus dictionary
chunked_corpus = {'title': passage_titles, 'text': passages}

print('Processed {:,} passages.'.format(len(chunked_corpus['title'])))


Before splitting, 3 articles.

Splitting...
DONE.

Processed 90 passages.


In [22]:
import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DPR context encoder and tokenizer
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")
ctx_encoder = ctx_encoder.to(device)

ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

# Encode the paragraphs
encoded_input = ctx_tokenizer(chunked_corpus['text'], padding=True, truncation=True, return_tensors='pt')
encoded_input = encoded_input.to(device)

# Compute the embeddings
with torch.no_grad():
    embeddings = ctx_encoder(encoded_input['input_ids']).pooler_output

embeddings = embeddings.detach().cpu().numpy()

print('Computed embeddings for the paragraphs.')



Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenize

Computed embeddings for the paragraphs.


In [23]:
import faiss
import numpy as np

# Normalize embeddings
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# FAISS index creation
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # Using Inner Product (dot product) for similarity
index.add(embeddings)

print('Created FAISS index and added embeddings.')

Created FAISS index and added embeddings.


In [24]:
# Nearest neighbor search for a sample query
query_embedding = np.random.rand(1, dim).astype('float32')
query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
D, I = index.search(query_embedding, k=10)

print('Nearest neighbors:', I)
print('Distances:', D)

# Print out the passages and their sources
for idx in I[0]:
    print(f"Paragraph Source: {chunked_corpus['title'][idx]}")
    print(f"Paragraph: {chunked_corpus['text'][idx]}\n")

Nearest neighbors: [[22 24 34 36 28 65 30 66 14 46]]
Distances: [[-0.01042954 -0.01128843 -0.01223548 -0.01519142 -0.01671142 -0.01958747
  -0.02176615 -0.0218749  -0.02239366 -0.0230205 ]]
Paragraph Source: wiki boiling water reactor History of nuclear power
Paragraph: price of power reactors. Utility proposals in the U.S for nuclear generating stations, peaked at 52 in 1974, fell to 12 in 1976 and have never recovered, in large part due to the pressure-group litigation strategy, of launching lawsuits against each proposed U.S construction proposal, keeping private utilities tied up in court for years, one of which having reached the supreme court in 1978 (see Vermont Yankee Nuclear Power Corp. v. Natural Resources Defense Council, Inc. With permission to build a nuclear station in the U.S. eventually taking longer than in any other industrial country, the spectre facing utilities of

Paragraph Source: wiki boiling water reactor History of nuclear power
Paragraph: managerial disaster 

In [25]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast

# Load question encoder and tokenizer
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")
q_encoder = q_encoder.to(device)

q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-multiset-base")

# Create query embeddings
questions = [
    "When did the United States test the first nuclear weapon?",
    "What are the causes of climate change?",
    "Explain the process of photosynthesis."
]

input_ids = q_tokenizer(questions, return_tensors="pt", padding=True, truncation=True)
input_ids = input_ids.to(device)

outputs = q_encoder(input_ids['input_ids'])
q_embeds = outputs['pooler_output']
q_embeds = q_embeds.detach().cpu().numpy()

# Normalize embeddings (optional)
q_embeds = q_embeds / np.linalg.norm(q_embeds, axis=1, keepdims=True)

# Search on the FAISS index
k = 3  # Number of nearest neighbors to return
D, I = index.search(q_embeds, k=k)

# Wrap texts to fit within 80 characters
import textwrap
wrapper = textwrap.TextWrapper(width=80)

# Print out the results
print("\n======================== Question and Answer Retrieval ========================\n")

for idx, question in enumerate(questions):
    print(f"Question: {question}\n")
    for rank, i in enumerate(I[idx]):
        passage = chunked_corpus['text'][i]  # Access text from chunked_corpus using index i
        print(f"Rank {rank + 1}")
        print('Index:', i)
        print('Source:', chunked_corpus['title'][i])  # Print the source of the paragraph
        print('Passage:')
        print(wrapper.fill(passage))
        print("\n")
    print("=" * 80 + "\n")

print("Closest matching indices:", I)
print("Inner Products:", D)


Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.




Question: When did the United States test the first nuclear weapon?

Rank 1
Index: 4
Source: wiki boiling water reactor History of nuclear power
Passage:
part of the Manhattan Project, the Allied effort to create atomic bombs during
World War II. It led to the building of larger single-purpose production
reactors, such as the X-10 Pile, for the production of weapons-grade plutonium
for use in the first nuclear weapons. The United States tested the first nuclear
weapon in July 1945, the Trinity test, with the atomic bombings of Hiroshima and
Nagasaki taking place one month later. In August 1945, the first widely
distributed account of nuclear energy, the pocketbook The Atomic Age, was
released. It discussed the peaceful future uses of nuclear energy and depicted


Rank 2
Index: 7
Source: wiki boiling water reactor History of nuclear power
Passage:
of U.S. reactor technology and encouraged development by the private sector. The
F-1 (from "First Physical Reactor") was a research reactor

In [19]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Initialize T5 model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
# Function to find closest passage in Faiss index
def find_closest_passage(query_embedding):
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
    D, I = index.search(query_embedding, k=1)
    return I[0][0], chunked_corpus['text'][I[0][0]]

# Function to process question and generate answer
def process_question(question):
    # Encode the question using DPR Context Encoder
    encoded_question = ctx_tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    encoded_question = encoded_question.to(device)

    with torch.no_grad():
        question_embedding = ctx_encoder(encoded_question['input_ids']).pooler_output
        question_embedding = question_embedding.detach().cpu().numpy()

    # Find closest passage using Faiss index
    closest_idx, closest_passage = find_closest_passage(question_embedding)

    # Generate answer using T5 model
    input_text = "question: {} context: {}".format(question, closest_passage)
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return question, answer, closest_passage

# Example usage
question = "What is nuclear energy?"
question_text, answer, context = process_question(question)

# Print results
print("Question:", question_text)
print("Answer:", answer)
print("Context:\n", textwrap.fill(context, width=80))



Question: What is nuclear energy?
Answer: providing clean power while also reversing the impact fossil fuels have had on our climate
Context:
 the same time, some Asian countries, such as China and India, have committed to
rapid expansion of nuclear power. In other countries, such as the United Kingdom
and the United States, nuclear power is planned to be part of the energy mix
together with renewable energy. Nuclear energy may be one solution to providing
clean power while also reversing the impact fossil fuels have had on our
climate. These plants would capture carbon dioxide and create a clean energy
source with zero emissions, making a carbon-negative process. Scientists propose
that 1.8 million lives have already been saved by replacing fossil


In [30]:
question = "What is nuclear energy?"
answer = "providing clean power while also reversing the impact fossil fuels have had on our climate"
context = """The same time, some Asian countries, such as China and India, have committed to rapid expansion of nuclear power. In other countries, such as the United Kingdom and the United States, nuclear power is planned to be part of the energy mix together with renewable energy. Nuclear energy may be one solution to providing clean power while also reversing the impact fossil fuels have had on our climate. These plants would capture carbon dioxide and create a clean energy source with zero emissions, making a carbon-negative process. Scientists propose that 1.8 million lives have already been saved by replacing fossil fuels."""

# Girdileri birleştirme ve tokenize etme
def prepare_input(question, answer, context):
    input_text = f"question: {question} answer: {answer} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    return input_ids

input_ids = prepare_input(question, answer, context)

def generate_complex_answer(input_ids):
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=300, num_beams=5, early_stopping=True)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

complex_answer = generate_complex_answer(input_ids)
print("Complex Answer:", complex_answer)

Complex Answer: may be one solution to providing clean power


In [42]:
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_l