## RAG for News: Question Answering over the AG News Dataset

In [None]:
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install -q pandas

In [10]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd
import faiss
import requests
from io import StringIO

In [11]:
# ✅ Step 1: Download AG News CSV Manually

# Location of the AG News `train.csv` file (raw GitHub content); the actual
# download is performed by the cell that follows.
csv_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"

print("📚 Downloading AG News dataset from GitHub...")

📚 Downloading AG News dataset from GitHub...


In [12]:
# Fetch the CSV and build the working DataFrame `df` with columns
# label / title / description / text. On any failure the error is printed and
# `df` is left as an empty DataFrame so later cells can detect the problem.
try:
    # Bounded timeout so a stalled connection cannot hang the notebook forever.
    response = requests.get(csv_url, timeout=30)
    # Fail here on HTTP errors (404, 500, ...) instead of trying to parse an
    # error page as if it were CSV data.
    response.raise_for_status()

    raw_csv = StringIO(response.text)
    # The file has no header row; only the first 1000 rows are parsed because
    # that is all the notebook uses.
    df = pd.read_csv(raw_csv, header=None, nrows=1000)
    df.columns = ["label", "title", "description"]
    # The text that gets embedded and retrieved is "<title>. <description>".
    df['text'] = df['title'] + ". " + df['description']
    print(f"✅ Loaded {len(df)} news entries.")

except Exception as e:
    print(f"❌ Download failed: {e}")
    df = pd.DataFrame()

✅ Loaded 1000 news entries.


In [17]:
# ✅ Step 2: Create Embeddings

# Encode every article's text into a vector with the sentence-transformer.
# `model` and `embeddings` are only assigned when the DataFrame has rows, so
# the cells below depend on the download step having succeeded.
if df.empty:
    print("⚠️ DataFrame is empty. Please check download or environment.")
else:
    print("🔄 Creating embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df['text'].tolist(), batch_size=32, show_progress_bar=True)

🔄 Creating embeddings...


Batches: 100%|██████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 15.20it/s]


In [18]:
# ✅ Step 3: Build FAISS Index

# NOTE: relies on `embeddings` from the previous step, which is only defined
# when the DataFrame was non-empty.
# Vector width = size of the second axis of the embedding matrix.
dimension = embeddings.shape[1]
# Flat L2 (Euclidean distance) index for vectors of that width.
index = faiss.IndexFlatL2(dimension)
# Add all article embeddings; they are cast to float32 before being handed to FAISS.
index.add(embeddings.astype('float32'))

In [19]:
# ✅ Step 4: Load QA Pipeline

print("🧠 Loading QA pipeline...")
# Extractive question-answering pipeline; it runs on GPU 0 when CUDA is
# available and on the CPU (device -1) otherwise.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=-1 if not torch.cuda.is_available() else 0,
)

🧠 Loading QA pipeline...


Device set to use cuda:0


In [20]:
# ✅ Step 5: Define QA Function

def answer_news_question(question, k=3, max_context_chars=4500):
    """Answer a question from the indexed news articles (retrieve, then read).

    The question is embedded with the notebook-level ``model``, the ``k``
    nearest article vectors are looked up in the notebook-level FAISS
    ``index``, and the matching ``df['text']`` entries are joined into one
    context string that is passed to ``qa_pipeline``.

    Args:
        question: Natural-language question to answer.
        k: Number of articles to retrieve from the index.
        max_context_chars: Upper bound on the number of characters of joined
            article text given to the QA pipeline; anything beyond is cut off.

    Returns:
        dict with two keys:
            'answer': the answer string produced by the QA pipeline, or a
                string starting with "Error: " if nothing was retrieved or
                the pipeline raised.
            'context': list of the retrieved article texts, in the order
                returned by the index search.
    """
    # encode() on a one-element list yields a single row; reshape keeps the
    # (1, dim) layout explicit and FAISS is given float32 input.
    query_vector = model.encode([question]).reshape(1, -1).astype('float32')
    _, indices = index.search(query_vector, k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # k vectors; drop those so they are never used as row positions.
    row_positions = [int(i) for i in indices[0] if i >= 0]

    # FAISS ids are insertion positions, so look rows up by position (.iloc),
    # not by index label, which only coincides for a default RangeIndex.
    relevant_articles = df['text'].iloc[row_positions].tolist()

    if not relevant_articles:
        return {
            'answer': "Error: no articles retrieved",
            'context': relevant_articles
        }

    context = " ".join(relevant_articles)[:max_context_chars]

    try:
        answer = qa_pipeline(question=question, context=context)
        return {
            'answer': answer['answer'],
            'context': relevant_articles
        }

    except Exception as e:
        # Best-effort: report the failure in the answer slot but still return
        # the retrieved articles so the caller can inspect them.
        return {
            'answer': f"Error: {str(e)}",
            'context': relevant_articles
        }

In [21]:
# ✅ Step 6: Ask a Question

# The question to run through the retrieval + QA function defined above.
question = "What kind of events are reported in the news?"

# `result` is a dict with an 'answer' string and a 'context' list of the
# retrieved article texts; the function's default number of articles is used.
result = answer_news_question(question=question)

In [22]:
# ✅ Step 7: Print Results

# Echo the question together with the answer extracted by the QA function.
print(f"\n🔎 Question: {question}")
print(f"💡 Answer: {result['answer']}")

# List the retrieved articles, numbered from 1, showing at most the first
# 300 characters of each one.
print("\n📰 Top Relevant Articles Used:")
for i, article in enumerate(result['context'], start=1):
    print(f"\n--- Article {i} ---\n{article[:300]}...\n")


🔎 Question: What kind of events are reported in the news?
💡 Answer: compilations of events, reflections, recommendations, news and complaints

📰 Top Relevant Articles Used:

--- Article 1 ---
Mortars Mark Opening of Iraqi Political Conference (Reuters). Reuters - Insurgents fired mortars at a meeting\where Iraqi leaders met to pick an interim national assembly\Sunday, killing at least two people in a grim reminder of the\country's tortuous path toward democracy....


--- Article 2 ---
'Insider' Information Puts City Blogs on the Map. Locally focused group "metro" blogs -- compilations of events, reflections, recommendations, news and complaints -- are emerging to put a number of big cities in intimate, street-level relief....


--- Article 3 ---
Weak Version of Most Powerful Explosions Found (SPACE.com). SPACE.com - Gamma-ray bursts are the most powerful events in the universe, temporary outshining several galaxies and likely signaling the birth of a black hole....

