# NLP Semester Project
# Lateral Reading [ Zee World ]


In [1]:
!pip install transformers
!pip install nltk



# Libraries Used In Whole Project

In [2]:
from transformers import pipeline
import pandas as pd
import random
import nltk
import re
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Reading The CSV File Containing 100 Articles

In [3]:
articles_df = pd.read_csv('articles_dataset.csv')
article_texts = articles_df['Article'].tolist()

# Initialize Question Generation Pipeline

In [4]:
question_generator = pipeline('text2text-generation', model='valhalla/t5-base-qg-hl')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/15.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

# Preprocessing Function

In [5]:
def preprocess_article(article):
    # Remove excess whitespace
    article = re.sub(r'\s+', ' ', article).strip()

    return article

# Question Generation Function

In [6]:
def generate_questions(article, num_questions=5):
    article = preprocess_article(article)
    context = article

    # Generate questions using the pipeline
    input_text = f"generate questions: {context}"
    questions = question_generator(
        input_text,
        max_length=64,
        num_return_sequences=num_questions,
        num_beams=num_questions  # Enable beam search to support multiple sequences
    )
    return [q['generated_text'] for q in questions]


# Generating Questions For Random Article

In [7]:
# Randomly select one article
selected_article = random.choice(article_texts)

# Display the selected article
print("\nSelected Article:")
print(selected_article)

# Generate questions for the selected article
questions = generate_questions(selected_article)

# Display the generated questions
print("Generated Questions:")
for i, question in enumerate(questions, 1):
    print(f"{i}. {question}")


Selected Article:
Technological advancements are improving renewable energy sources.
Generated Questions:
1. What technology is improving renewable energy sources?
2. What is improving renewable energy sources?
3. What is one of the reasons for the increase in renewable energy sources?
4. What is improving the renewable energy sources?
5. What is one of the main reasons for the increase in renewable energy sources?


# Initiating Sentence Embedding Model

In [8]:
# To encode text into numerical representations (embeddings).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Answers Dataset


In [9]:
answers_df = pd.read_csv('answers_dataset.csv', encoding='ISO-8859-1')
answer_texts = answers_df['Answer'].tolist()  # Extract answers from the 'answer' column

# Function To Retreive Ranked Documents

In [10]:
def retrieve_top_documents(question, answers, top_k=3):
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    answer_embeddings = embedding_model.encode(answers, convert_to_tensor=True)

    # Compute cosine similarity
    similarities = util.cos_sim(question_embedding, answer_embeddings)[0]
    ranked_indices = similarities.argsort(descending=True)[:top_k]

    top_answers = [(answers[i], similarities[i].item()) for i in ranked_indices]
    return top_answers

# Retrieve top answers for the questions
print("\nRetrieving top answers for the questions...")
for question in questions:  # Iterate through all questions
    top_answers = retrieve_top_documents(question, answer_texts)  # Compare with the answers dataset
    print(f"\nTop {len(top_answers)} Ranked Answers for Question: {question}")
    for ans_idx, (ans, score) in enumerate(top_answers, 1):
        print(f"{ans_idx}. Score: {score:.4f}, Answer: {ans[:200]}...")  # Show answer preview


Retrieving top answers for the questions...

Top 3 Ranked Answers for Question: What technology is improving renewable energy sources?
1. Score: 0.7128, Answer:         "Technological advancements are enhancing renewable energy production methods.",...
2. Score: 0.7077, Answer:         "Renewable energy sources like solar and wind are becoming more cost-effective.",...
3. Score: 0.6929, Answer:         "Renewable energy sources are essential for reducing carbon emissions and combating climate change.",...

Top 3 Ranked Answers for Question: What is improving renewable energy sources?
1. Score: 0.7263, Answer:         "Renewable energy sources like solar and wind are becoming more cost-effective.",...
2. Score: 0.7166, Answer:         "Renewable energy sources are essential for reducing carbon emissions and combating climate change.",...
3. Score: 0.6667, Answer:         "Technological advancements are enhancing renewable energy production methods.",...

Top 3 Ranked Answers for Questi

# For Manual Article Given By User

In [11]:
# User input for article
user_article = input("Please enter your article text: ")

# Generate questions for the user-provided article
print("\nGenerating questions for the provided article...")
questions = generate_questions(user_article)

print("\nGenerated Questions:")
for i, question in enumerate(questions, 1):
    print(f"{i}. {question}")

# Retrieve top documents for the first question set
print("\nRetrieving top documents for the questions...")
for question in questions[:5]:
    top_documents = retrieve_top_documents(question, answer_texts)
    print(f"\nTop 3 Ranked Documents for Question: {question}")
    for doc_idx, (doc, score) in enumerate(top_documents, 1):
        print(f"{doc_idx}. Score: {score:.4f}, Document: {doc[:200]}...")


Please enter your article text: pollution is big problem

Generating questions for the provided article...

Generated Questions:
1. What is a big problem with pollution?
2. What is the main problem with pollution?
3. What kind of pollution is a big problem?
4. What is the biggest problem with pollution?
5. What is a big problem for the environment?

Retrieving top documents for the questions...

Top 3 Ranked Documents for Question: What is a big problem with pollution?
1. Score: 0.5068, Document:         "Ocean cleanup initiatives are targeting plastic pollution in the seas.",...
2. Score: 0.4709, Document:         "Public transportation systems are becoming cleaner with eco-friendly solutions.",...
3. Score: 0.4430, Document:         "Governments are implementing stricter regulations to reduce carbon footprints and fight global warming.",...

Top 3 Ranked Documents for Question: What is the main problem with pollution?
1. Score: 0.4707, Document:         "Ocean cleanup initiatives are