In [2]:
# Core data processing functions
import datetime
from pathlib import Path
import numpy as np
import json
import h5py
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
from together import Together
import streamlit as st
import faiss

# Data loading and processing functions (keeping previous functions)
def load_election_dataset():
    """Load and initialize the CCNews dataset for 2020 election coverage"""
    dataset = load_dataset(
        "stanford-oval/ccnews", 
        name="2020",
        split="train", 
        streaming=True
    ).filter(lambda article: article["language"] in ["en", "es"])
    return dataset

# [Previous functions remain the same...]
def filter_articles(article):
    """Filter articles to keep only election-related content from 2020"""
    # Check if article has required fields
    required_fields = ["plain_text", "published_date", "language"]
    if not all(field in article for field in required_fields):
        return False
        
    # Parse date
    try:
        date = datetime.datetime.strptime(article["published_date"], "%Y-%m-%d")
        if date.year != 2020:
            return False
    except:
        return False
        
    # Check language
    if article["language"] not in ["en", "es"]:
        return False
        
    # Check for election-related keywords
    keywords = ["election", "vote", "voting", "Trump", "Biden", "campaign", 
               "elección", "voto", "votar", "campaña"]
    text = article["plain_text"].lower()
    if not any(kw.lower() in text for kw in keywords):
        return False
        
    return True

def process_articles(dataset):
    """Process filtered articles into language-specific collections"""
    articles = {"en": [], "es": []}
    article_id = 0
    
    for article in dataset:
        processed = {
            "id": article_id,
            "text": article["plain_text"],
            "date": article["published_date"],
            "url": article["requested_url"]
        }
        articles[article["language"]].append(processed)
        article_id += 1
        
    return articles

def generate_embeddings(articles, model_name="intfloat/multilingual-e5-large"):
    """Generate embeddings for articles using multilingual E5 model"""
    model = SentenceTransformer(model_name)
    embeddings = {"en": {"embeddings": [], "article_ids": []},
                 "es": {"embeddings": [], "article_ids": []}}
                 
    for lang in articles:
        # Prepare texts with prefix for better retrieval performance
        texts = [f"passage: {article['text']}" for article in articles[lang]]
        # Generate embeddings in batches to manage memory
        batch_size = 32
        all_embeddings = []
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            embs = model.encode(batch, normalize_embeddings=True)
            all_embeddings.append(embs)
            
        embeddings[lang]["embeddings"] = np.vstack(all_embeddings)
        embeddings[lang]["article_ids"] = [art["id"] for art in articles[lang]]
        
    return embeddings

def save_embeddings(embeddings, save_dir):
    """Save embeddings and metadata to disk"""
    save_dir = Path(save_dir)
    save_dir.mkdir(exist_ok=True)
    
    for lang in embeddings:
        with h5py.File(save_dir / f"{lang}_embeddings.h5", "w") as f:
            f.create_dataset("embeddings", data=embeddings[lang]["embeddings"])
            f.create_dataset("article_ids", data=embeddings[lang]["article_ids"])

def load_embeddings(load_dir):
    """Load embeddings and metadata from disk"""
    load_dir = Path(load_dir)
    embeddings = {}
    
    for lang in ["en", "es"]:
        with h5py.File(load_dir / f"{lang}_embeddings.h5", "r") as f:
            embeddings[lang] = {
                "embeddings": f["embeddings"][:],
                "article_ids": f["article_ids"][:]
            }
            
    return embeddings


# New RAG and QA System functions
class ElectionQASystem:
    def __init__(self, embeddings_dir: str, articles_file: str, model_name: str = "BAAI/bge-large-en"):
        self.embeddings_dir = Path(embeddings_dir)
        self.articles_file = Path(articles_file)
        self.embedding_model = SentenceTransformer(model_name)
        self.together_client = Together()
        self.load_data()
        
    def load_data(self):
        """Load embeddings and articles"""
        # Load articles
        with open(self.articles_file) as f:
            self.articles = json.load(f)
            
        # Load embeddings and create FAISS indices
        self.indices = {}
        embeddings = load_embeddings(self.embeddings_dir)
        
        for lang in embeddings:
            index = faiss.IndexFlatL2(embeddings[lang]["embeddings"].shape[1])
            index.add(embeddings[lang]["embeddings"])
            self.indices[lang] = {
                "index": index,
                "article_ids": embeddings[lang]["article_ids"]
            }
    
    def get_relevant_context(self, query: str, language: str, k: int = 3) -> List[Dict[str, Any]]:
        """Retrieve relevant articles using RAG"""
        # Encode query
        query_embedding = self.embedding_model.encode([query])[0]
        
        # Search in appropriate language index
        D, I = self.indices[language]["index"].search(
            np.array([query_embedding]), k
        )
        
        # Get relevant articles
        relevant_articles = []
        for idx in I[0]:
            article_id = self.indices[language]["article_ids"][idx]
            article = next(
                art for art in self.articles[language] 
                if art["id"] == article_id
            )
            relevant_articles.append(article)
            
        return relevant_articles
    
    def generate_answer(self, query: str, context: List[Dict[str, Any]], language: str) -> str:
        """Generate answer using LLM"""
        # Format context
        context_text = "\n\n".join(
            f"Title: {art['title']}\nContent: {art['text']}" 
            for art in context
        )
        
        # Create prompt
        prompt = f"""Based on the following articles about the 2020 US Election, please answer the question.

Context:
{context_text}

Question: {query}

Answer:"""

        # Generate response using Together API
        response = self.together_client.complete(
            prompt=prompt,
            model="togethercomputer/llama-2-70b-chat",
            max_tokens=500,
            temperature=0.7
        )
        
        return response.output.text

# Suggested repository structure:
"""
election_qa/
├── data/
│   ├── raw/                      
│   │   └── articles.json         # All articles grouped by language
│   └── processed/              
│       ├── metadata/           
│       │   └── article_index.json
│       └── embeddings/          
│           ├── embeddings_en.h5  
│           └── embeddings_es.h5  
├── src/
│   ├── data_processing/
│   │   ├── __init__.py
│   │   └── loader.py            # This file's data loading functions
│   ├── qa/
│   │   ├── __init__.py
│   │   └── system.py            # ElectionQASystem class
│   └── ui/
│       ├── __init__.py
│       └── app.py               # Streamlit interface
├── requirements.txt
└── README.md
"""

# Streamlit UI code (src/ui/app.py):
def create_streamlit_app():
    st.title("2020 Election Q&A System")
    
    # Initialize QA system
    qa_system = ElectionQASystem(
        embeddings_dir="data/processed/embeddings",
        articles_file="data/raw/articles.json"
    )
    
    # Language selection
    language = st.selectbox(
        "Select Language",
        options=["en", "es"],
        format_func=lambda x: "English" if x == "en" else "Spanish"
    )
    
    # Query input
    query = st.text_input("Enter your question about the 2020 US Election:")
    
    if query:
        with st.spinner("Searching for relevant information..."):
            # Get relevant context
            context = qa_system.get_relevant_context(query, language)
            
            # Generate answer
            answer = qa_system.generate_answer(query, context, language)
            
            # Display results
            st.subheader("Answer:")
            st.write(answer)
            
            st.subheader("Sources:")
            for article in context:
                with st.expander(f"Source: {article['source']} - {article['date']}"):
                    st.write(f"**{article['title']}**")
                    st.write(article['text'][:500] + "...")

if __name__ == "__main__":
    # For data processing script
    dataset = load_election_dataset()
    
    # Define filters
    start_date = datetime.datetime(2020, 11, 4, tzinfo=datetime.timezone.utc)
    end_date = datetime.datetime(2020, 11, 5, tzinfo=datetime.timezone.utc)
    keywords = ["election", "presidential", "Biden", "Trump", "vote", "elections"]
    
    # Process and save data
    processed_articles = []
    for article in dataset:
        if filter_by_date(article, start_date, end_date):
            if filter_by_keywords(article.get("title", "") + article.get("plain_text", ""), keywords):
                processed_article = process_article(article)
                processed_articles.append(processed_article)
    
    save_processed_data(processed_articles, "data/raw/articles.json")
    
    # For Streamlit app
    create_streamlit_app()





ModuleNotFoundError: No module named 'sentence_transformers'

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import datetime
import pandas as pd
from dateutil import parser
import json
from together import Together
import os
import requests
import multiprocessing

In [13]:
print(multiprocessing.cpu_count())

8


In [6]:
# Download the dataset locally
dataset = load_dataset("stanford-oval/ccnews", name = "2020",split="train", cache_dir="./data/raw/ccnews")


Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/76 [00:00<?, ?files/s]

2020_0048.parquet:   2%|1         | 31.5M/2.03G [00:00<?, ?B/s]



2020_0049.parquet:   0%|          | 0.00/2.07G [00:00<?, ?B/s]



2020_0050.parquet:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [7]:
from pathlib import Path
import json
from datasets import load_dataset
from tqdm import tqdm

# 1. Check what's in your cache directory
cache_path = Path("./ccnews")
print("\nCached files:")
for file in cache_path.rglob("*"):
    print(file)

# 2. Try loading and counting available examples
def count_available_articles():
    try:
        dataset = load_dataset(
            "stanford-oval/ccnews",
            name="2020",
            split="train",
            streaming=True,
            cache_dir="./ccnews"
        )
        
        count = 0
        for _ in tqdm(dataset, desc="Counting articles"):
            count += 1
            
        print(f"\nTotal available articles: {count}")
        
    except Exception as e:
        print(f"Error while counting: {e}")

# 3. Sample what's available
def peek_at_articles(num_samples=5):
    try:
        dataset = load_dataset(
            "stanford-oval/ccnews",
            name="2020",
            split="train",
            streaming=True,
            cache_dir="./ccnews"
        )
        
        print("\nSample articles:")
        for i, article in enumerate(dataset):
            if i >= num_samples:
                break
            print(f"\nArticle {i+1}:")
            print(f"Date: {article.get('published_date', 'N/A')}")
            print(f"Language: {article.get('language', 'N/A')}")
            print(f"Title: {article.get('title', 'N/A')[:100]}")
            
    except Exception as e:
        print(f"Error while sampling: {e}")

# Run the checks
count_available_articles()
peek_at_articles(3)  # Show 3 sample articles


Cached files:


Resolving data files:   0%|          | 0/479 [00:02<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

Counting articles: 0it [01:21, ?it/s]


In [9]:
# Load a single Parquet fi
import pandas as pd
table = pd.read_parquet("~/Downloads/2020_0064.parquet")
table.head()

Unnamed: 0,requested_url,plain_text,published_date,title,tags,categories,author,sitename,image_url,language,language_score,responded_url,publisher,warc_path,crawl_date
0,http://www.wdtimes.com/opinion/editorials/arti...,Intergovernmental cooperation is a term that w...,2020-03-06,Cooperation in government is refreshing,editorials,,Scott Peterson,Watertown Daily Times Online,https://bloximages.chicago2.vip.townnews.com/w...,en,0.969071,http://www.wdtimes.com/opinion/editorials/arti...,wdtimes.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-07T06:49:58+00:00
1,https://www.examinerlive.co.uk/sport/football/...,Huddersfield Town face one of the highlights o...,2020-03-07,Leeds United vs Huddersfield Town LIVE score u...,"Huddersfield Town FC,Leeds United,Football Lea...",Football News,Steven Chicken,YorkshireLive,https://i2-prod.examinerlive.co.uk/incoming/ar...,en,0.957523,https://www.examinerlive.co.uk/sport/football/...,examinerlive.co.uk,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-07T12:58:01+00:00
2,https://www.alwatanvoice.com/arabic/news/2020/...,خاص دنيا الوطن - أحمد العشي تطرق إسماعيل هنية،...,2020-03-07,انتخابات حماس الداخلية.. ما مواصفات شخصية رئيس...,انتخابات حماس الداخلية.. ما مواصفات شخصية رئيس...,شؤون فلسطينية,,دنيا الوطن,https://images.alwatanvoice.com/news/large/999...,ar,0.999527,https://www.alwatanvoice.com/arabic/news/2020/...,alwatanvoice.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-07T06:49:54+00:00
3,https://www.newsvirginian.com/sports/high-scho...,LANCASTER — Hannah Grubb’s fourth 3-pointer of...,2020-03-06,Late free-throw shooting helps Riverheads girl...,"high-school, sports, newsvirginian",,G C ROSE The Rappahannock Times,The News Virginian,https://bloximages.newyork1.vip.townnews.com/n...,en,0.956792,https://www.newsvirginian.com/sports/high-scho...,newsvirginian.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-07T11:05:48+00:00
4,http://www.aktiencheck.de/news/Artikel-Shenzhe...,Shenzhen Fuanna Bedding and Furnishing: Tut si...,2020-03-07,Shenzhen Fuanna Bedding and Furnishing: Tut si...,"Aktien, Börse, Aktiencheck, Aktienkultur, Akti...","Aktien, Börse, Aktiencheck, Aktienkultur, Akti...",Aktiencheck De Ag,aktiencheck.de AG,,de,0.994592,http://www.aktiencheck.de/news/Artikel-Shenzhe...,aktiencheck.de,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-07T06:49:58+00:00


In [10]:
table['published_date'].min()

'1995-01-01'

In [11]:
table['published_date'].max()

'2024-04-14'

In [12]:
table1 = pd.read_parquet("~/Downloads/2020_0065.parquet")
table1.head()

Unnamed: 0,requested_url,plain_text,published_date,title,tags,categories,author,sitename,image_url,language,language_score,responded_url,publisher,warc_path,crawl_date
0,https://www.alalamtv.net/news/4789186/%D8%A7%D...,العالم - السعودية وأوضحت الوزارة أن الحالتين ا...,2020-03-12,السعودية تعلن إصابة 21 مصريا بفيروس كورونا في ...,"العراق , سعودية , مكة , محافظة القطيف , الجنسي...",,,alalamtv.net,https://media.alalamtv.net/uploads/855x495/202...,ar,0.99911,https://www.alalamtv.net/news/4789186/%D8%A7%D...,alalamtv.net,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-12T11:16:58+00:00
1,https://www.devdiscourse.com/article/health/93...,First COVID-19 case in AP as man who returned ...,2020-03-12,First COVID-19 case in AP as man who returned ...,"Italy, Nellore, Andhra Pradesh, Tirupati",,PTI,Devdiscourse,https://www.devdiscourse.com/remote.axd?https:...,en,0.983365,https://www.devdiscourse.com/article/health/93...,devdiscourse.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-12T11:16:58+00:00
2,https://www.finanznachrichten.de/nachrichten-2...,"First Derivatives, a global software and consu...",2020-03-12,First Derivatives plc appoints Kathy Schneider...,"First, Derivatives, appoints, Kathy, Schneider...",,,FinanzNachrichten.de,https://cts.businesswire.com/ct/CT?id=bwnews&s...,en,0.94383,https://www.finanznachrichten.de/nachrichten-2...,finanznachrichten.de,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-12T07:42:58+00:00
3,https://www.rt.com/newsline/482911-eu-us-restr...,Brussels ‘assessing’ US restrictions on travel...,2020-03-12,Brussels ‘assessing’ US restrictions on travel...,,,Russia Today,RT International,https://cdni.rt.com/files/2020.03/article/5e6a...,en,0.941224,https://www.rt.com/newsline/482911-eu-us-restr...,rt.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-12T11:16:59+00:00
4,http://www.foggiatoday.it/cronaca/evasione-car...,Il tuo browser non può riprodurre il video.Sem...,2020-03-12,VIDEO - Il trasferimento dei 107 detenuti dal ...,"ricerche, carceri, video, evasione, detenuti, ...",,Roberto D'Agostino,FoggiaToday,http://www.foggiatoday.it/~media/horizontal-hi...,it,0.985631,http://www.foggiatoday.it/cronaca/evasione-car...,foggiatoday.it,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-03-12T12:21:57+00:00


In [15]:
table1['published_date'].min()

'1995-01-01'

In [14]:
table1['published_date'].max()

'2024-01-05'

In [16]:
table1['crawl_date'].max()

'2020-03-16T23:46:39+00:00'

In [17]:
table1['crawl_date'].min()

'2020-03-11T19:51:22+00:00'

In [18]:
table['crawl_date'].min()

'2020-03-06T23:57:09+00:00'

In [19]:
table['crawl_date'].max()

'2020-03-12T12:21:57+00:00'

In [6]:
table75 = pd.read_parquet("~/Downloads/2020_0075.parquet")
table75.head()

Unnamed: 0,requested_url,plain_text,published_date,title,tags,categories,author,sitename,image_url,language,language_score,responded_url,publisher,warc_path,crawl_date
0,https://www.iol.co.za/entertainment/tv/soapies...,"‘The Queen’, ‘Gomora’ and ‘The River’ win big ...",2020-09-27,"‘The Queen’, ‘Gomora’ and ‘The River’ win big ...",,,Debashine Thangevelo,IOL | News that Connects South Africans,https://image-prod.iol.co.za/resize/6016x366?s...,en,0.946704,https://www.iol.co.za/entertainment/tv/soapies...,iol.co.za,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-27T08:51:46+00:00
1,https://wtlcfm.com/2752005/open-lines-septembe...,Hosted by Ebony Chappel and Cameron Ridle Af...,2020-09-27,"Open Lines September 27, 2020",,Open Lines,Open Lines; Emchappel,106.7 WTLC,https://ronetlcnaptown.files.wordpress.com/202...,en,0.96345,https://wtlcfm.com/2752005/open-lines-septembe...,wtlcfm.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-27T16:04:58+00:00
2,https://www.lapresse.ca/international/etats-un...,(Washington) Le président américain Donald Tru...,2020-09-27,Trump réclame que Biden fasse un test antidopa...,,États-Unis,Agence France-Presse,La Presse,https://mobile-img.lpcdn.ca/lpca/924x/r3996/7b...,fr,0.994573,https://www.lapresse.ca/international/etats-un...,lapresse.ca,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-27T14:23:25+00:00
3,https://www.hurriyetdailynews.com/some-1-000-d...,"Some 1,000 daily virus cases in NY state, a fi...",2020-09-27,"Some 1,000 daily virus cases in NY state, a fi...","New York,COVID-19,U.S.,pandemic",World,,hurriyetdailynews.com,https://i.hurimg.com/i/hdn/75/200x200/5f70387b...,en,0.955851,https://www.hurriyetdailynews.com/some-1-000-d...,hurriyetdailynews.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-27T08:51:48+00:00
4,https://www.mercurynews.com/2020/09/27/cuperti...,COVID-19 testing Santa Clara County will cont...,2020-09-27,Cupertino community briefs for the week of Oct...,Coronavirus;Events;Mental Health,Local News,Anne Gelhaus,The Mercury News,https://www.mercurynews.com/wp-content/uploads...,en,0.940138,https://www.mercurynews.com/2020/09/27/cuperti...,mercurynews.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-27T14:23:23+00:00


In [3]:
# Test queries in different languages
from together import Together
with open("config.json", "r") as f:
    config = json.load(f)


api_key = config["api_key"]
embedding_model  = "togethercomputer/m2-bert-80M-8k-retrieval"
together_client = Together(api_key = api_key)

test_queries = [
    "Who won the election?",
    "¿Quién ganó las elecciones?"
]

# Get embeddings for both
embeddings = together_client.embeddings.create(
    input=test_queries,
    model=embedding_model
)


embedding1 = np.array(embeddings.data[0].embedding)
embedding2 = np.array(embeddings.data[1].embedding)

# Calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

similarity = cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between English and Spanish queries: {similarity}")

# Compare the embeddings structure and see if they produce sensible results

Cosine similarity between English and Spanish queries: 0.09445791603446733


In [6]:
unrelated_queries = [
    "Who won the election?",
    "What is the weather like today?"
]

unrelated_embeddings = together_client.embeddings.create(
    input=unrelated_queries,
    model="togethercomputer/m2-bert-80M-8k-retrieval"
)

embedding3 = np.array(unrelated_embeddings.data[1].embedding)
unrelated_similarity = cosine_similarity(embedding1, embedding3)
print(f"Cosine similarity between unrelated queries: {unrelated_similarity}")

Cosine similarity between unrelated queries: 0.16176126469319918


In [9]:
unrelated_queries = [
    "I like cheese",
    "I like cheddar"
]

unrelated_embeddings = together_client.embeddings.create(
    input=unrelated_queries,
    model="togethercomputer/m2-bert-80M-8k-retrieval"
)

embedding1 = np.array(unrelated_embeddings.data[0].embedding)
embedding2 = np.array(unrelated_embeddings.data[1].embedding)

# Calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

similarity = cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between English and Spanish queries: {similarity}")


Cosine similarity between English and Spanish queries: 0.7509967353992739


In [15]:
unrelated_queries = [
    "Me gusta queso",
    "Me gusta"
]

unrelated_embeddings = together_client.embeddings.create(
    input=unrelated_queries,
    model="togethercomputer/m2-bert-80M-8k-retrieval"
)

embedding1 = np.array(unrelated_embeddings.data[0].embedding)
embedding2 = np.array(unrelated_embeddings.data[1].embedding)

# Calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

similarity = cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between English and Spanish queries: {similarity}")


Cosine similarity between English and Spanish queries: 0.48405402781132956


In [19]:
unrelated_queries = [
    "Me gusta comida",
    "I like cheese"
]

unrelated_embeddings = together_client.embeddings.create(
    input=unrelated_queries,
    model="togethercomputer/m2-bert-80M-8k-retrieval"
)

embedding1 = np.array(unrelated_embeddings.data[0].embedding)
embedding2 = np.array(unrelated_embeddings.data[1].embedding)

# Calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

similarity = cosine_similarity(embedding1, embedding2)
print(f"Cosine similarity between English and Spanish queries: {similarity}")


Cosine similarity between English and Spanish queries: 0.6559736716932607


In [21]:
table75['crawl_date'].min()

'2020-09-27T06:22:27+00:00'

In [22]:
table75['crawl_date'].max()


'2020-10-01T01:01:03+00:00'

In [23]:
table75['published_date'].min()

'1995-01-01'

In [24]:
table75['published_date'].max()

'2023-12-31'

In [31]:
str(table75[table75['published_date'] == '1995-01-01'].iloc[0]['crawl_date'])


'2020-09-27T16:25:18+00:00'

In [1]:
import pandas as pd
table = pd.read_parquet("~/Downloads/2021_0000.parquet")
print(table.head())
print(table['crawl_date'].min())
print(table['crawl_date'].max())


                                       requested_url  \
0  https://www.lequipe.fr/Football/Article/Clermo...   
1  https://www.zazoom.it/soluzioni-cruciverba/def...   
2  https://www.figaalvarado.com/2021/10/muere-al-...   
3  https://vietbao.vn/vo-chong-diep-lam-anh-lo-di...   
4  https://www.newsbreak.com/news/2414225224501/p...   

                                          plain_text published_date  \
0  Il a été formé sur place, il marquait buts sur...     2021-11-01   
1  La definizione e la soluzione di: La citt√† in...     2021-10-31   
2  El hombre, de 71 años, murió en el acto, mient...     2021-10-31   
3  Sau khi tin đồn ly hôn bùng nổ lần 2, chồng Di...     2021-01-11   
4  Patrick Mahomes Has Honest Admission On Loss T...     2021-10-25   

                                               title  \
0  Clermont : Bayo, dure remise en route face à l...   
1  La citt√† in cui gioca a calcio la S.P.A.L - C...   
2          Muere al chocar su auto con una camioneta   
3  Vợ chồng 

In [2]:
table = pd.read_parquet("~/Downloads/2021_0001.parquet")
print(table.head())
print(table['crawl_date'].min())
print(table['crawl_date'].max())


                                       requested_url  \
0  https://www.wwnytv.com/2021/11/04/renzi-foodse...   
1  https://eju.tv/2021/11/emiten-seis-ordenes-de-...   
2  https://www.tendanceouest.com/actualite-130449...   
3  https://www.express.co.uk/news/uk/1516795/Tees...   
4  https://www.ligurianotizie.it/entella-info-per...   

                                          plain_text published_date  \
0  Renzi Foodservice’s expansion costs soar WATER...     2021-11-04   
1  La entidad verde olivo tiene a más de 10 perso...     2021-11-04   
2  Après avoir présenté une série de photos du to...     2016-02-15   
3  The 90-year-old woman has slammed Barker and S...     2021-11-04   
4  Dopo l’impegno in Coppa Italia si torna subito...     2021-11-04   

                                               title  \
0           Renzi Foodservice’s expansion costs soar   
1  Emiten seis órdenes de aprehensión contra secu...   
2  "Game of Thrones" : HBO brouille les pistes av...   
3  Grandma, 

In [3]:
print(table['published_date'].min())
print(table['published_date'].max())

1995-01-01
2024-01-01


In [13]:
table[(table['published_date'] <= '2020-11-10') & (table['published_date'] >= '2020-11-03') & (table['language'] == 'en')]


Unnamed: 0,requested_url,plain_text,published_date,title,tags,categories,author,sitename,image_url,language,language_score,responded_url,publisher,warc_path,crawl_date
83309,https://www.self.com/story/best-black-friday-m...,"With Black Friday sales already underway, we c...",2020-11-10,The 9 Best Black Friday Bedding and Mattress D...,"shopping,black friday,bed,sleep,cyber monday,sale",culture,Condé Nast; Malia Griggs; Sara Coughlin,SELF,https://media.self.com/photos/5fbbef66fa3f2473...,en,0.94663,https://www.self.com/story/best-black-friday-m...,self.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-05T12:06:14+00:00
232542,https://www.iheart.com/content/2020-11-05-why-...,Why November 7th Matters In Rock History By Da...,2020-11-05,Why November 7th Matters In Rock History | iHeart,"Why November 7th Matters In Rock History, why,...",,Dave Basner,Iheart,https://i.iheart.com/v3/re/new_assets/5dc33f9b...,en,0.978261,https://www.iheart.com/content/2020-11-05-why-...,iheart.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-05T18:10:08+00:00
245277,https://www.cbc.ca/news/canada/british-columbi...,COVID-19 denier and conspiracy theorist Mak Pa...,2020-11-05,COVID-19 denier and conspiracy theorist Mak Pa...,,British Columbia,CBC News,CBC,https://i.cbc.ca/1.5790434.1604544122!/fileIma...,en,0.980738,https://www.cbc.ca/news/canada/british-columbi...,cbc.ca,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-06T00:36:42+00:00
365341,https://www.cbc.ca/news/canada/british-columbi...,Canada to celebrate 100th anniversary of Remem...,2020-11-10,Canada to celebrate 100th anniversary of Remem...,,British Columbia,CBC News,CBC,https://i.cbc.ca/1.5797403.1605043299!/fileIma...,en,0.985932,https://www.cbc.ca/news/canada/british-columbi...,cbc.ca,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-06T18:33:46+00:00
646901,https://www.iheart.com/content/2020-11-05-why-...,Why November 8th Matters In Rock History By Da...,2020-11-05,Why November 8th Matters In Rock History | iHeart,"Why November 8th Matters In Rock History, why,...",,Dave Basner,Iheart,https://i.iheart.com/v3/re/new_assets/5dc48c1f...,en,0.953646,https://www.iheart.com/content/2020-11-05-why-...,iheart.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-08T14:34:21+00:00
669166,https://powerboat.world/news/243667/?source=rss,"With more than 6,800 islands scattered from te...",2020-11-05,Riviera's style and seaworthiness at home in J...,"sailing, yachting, boating, racing, results, n...",,,powerboat.world,https://powerboat.world/photos/powerboat/yysw3...,en,0.975255,https://powerboat.world/news/243667/?source=rss,powerboat.world,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-08T13:18:54+00:00
789154,https://www.iheart.com/content/2020-11-05-35-t...,35 Things You Might Not Know About 'Led Zeppel...,2020-11-05,35 Things You Might Not Know About 'Led Zeppel...,35 Things You Might Not Know About Led Zeppeli...,,Dave Basner,Iheart,https://i.iheart.com/v3/re/new_assets/5dc5817d...,en,0.972775,https://www.iheart.com/content/2020-11-05-35-t...,iheart.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-08T20:11:20+00:00
977920,https://www.leafly.com/brands/tko-reserve/prod...,About this product Expertly rolled with only p...,2020-11-09,TKO Reserve: Rainbow Tonic Doublepacks : Two ....,,hybrid,,Leafly,https://leafly-public.s3-us-west-2.amazonaws.c...,en,0.929403,https://www.leafly.com/brands/tko-reserve/prod...,leafly.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2021-11-09T17:02:14+00:00


In [1]:
import requests

url = "https://almond-static.stanford.edu/research/multinews/2020_11_03_11_10.jsonl"
response = requests.get(url, verify=False)  # verify=False is equivalent to --no-check-certificate

with open("2020_11_03_11_10.jsonl", "wb") as f:
    f.write(response.content)



In [12]:
table75[(table75['published_date'] <= '2020-11-10') & (table75['published_date'] >= '2020-11-03') & (table75['language'] == 'en')]


Unnamed: 0,requested_url,plain_text,published_date,title,tags,categories,author,sitename,image_url,language,language_score,responded_url,publisher,warc_path,crawl_date
316711,https://mobex.io/webinars/benchmarking-the-por...,Benchmarking the Porsche Taycan’s 800V powertr...,2020-11-09,Benchmarking the Porsche Taycan's 800V powertr...,,,Michelle Cobb,Mobex,,en,0.84529,https://mobex.io/webinars/benchmarking-the-por...,mobex.io,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-29T11:49:02+00:00
350377,https://isthmus.com/events/four-seasons-on-a-w...,media release: Cherished by Friends since 1982...,2020-11-10,ONLINE: Four Seasons on a Wisconsin Cranberry ...,"Environment,Lectures & Seminars;Environment;Le...",,,"Isthmus | Madison, Wisconsin",https://isthmus.com/api/design-1faf2c10b294233...,en,0.939113,https://isthmus.com/events/four-seasons-on-a-w...,isthmus.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-29T05:11:41+00:00
624356,https://www.dvidshub.net/video/767802/1st-batt...,(e.g. yourname@email.com) Remember me Forgot P...,2020-11-09,"1st Battalion, 10th SFG(A) CH-47 Onload And Ta...",SOCEUR;Airborne;1-10;Special Operations Comman...,,,DVIDS,https://cdn.dvidshub.net/media/thumbs/frames/v...,en,0.812096,https://www.dvidshub.net/video/767802/1st-batt...,dvidshub.net,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-30T11:34:48+00:00
723231,https://www.krem.com/article/news/crime/coeurd...,"COEUR D'ALENE, Idaho — A fire that destroyed a...",2020-11-09,"Fire that destroyed Coeur d'Alene duplex, inju...",,,Megan Carroll,KREM,https://media.krem.com/assets/KREM/images/6f59...,en,0.971229,https://www.krem.com/article/news/crime/coeurd...,krem.com,https://data.commoncrawl.org/crawl-data/CC-NEW...,2020-09-30T16:44:22+00:00


In [6]:
from datasets import load_dataset
from huggingface_hub import HfApi
import shutil
from pathlib import Path

# Method 1: Delete specific cache directory
def delete_cache_dir():
    cache_path = Path("data/raw/ccnews")
    if cache_path.exists():
        try:
            shutil.rmtree(cache_path)
            print(f"Successfully deleted cache directory: {cache_path}")
        except Exception as e:
            print(f"Error deleting cache directory: {e}")
    else:
        print("Cache directory not found")

# Method 2: Clear dataset cache using datasets library
def clear_dataset_cache():
    try:
        # Clear the cache for specific dataset
        load_dataset(
            "stanford-oval/ccnews",
            name="2020",
            split="train",
            cache_dir=".data/raw/ccnews"
        ).cleanup_cache_files()
        print("Successfully cleared dataset cache")
    except Exception as e:
        print(f"Error clearing dataset cache: {e}")

# Method 3: Delete specific files
def delete_specific_files():
    cache_path = Path(".data/raw/ccnews")
    if cache_path.exists():
        try:
            # List and delete specific files
            for file in cache_path.rglob("*"):
                if file.is_file():
                    file.unlink()
                    print(f"Deleted: {file}")
        except Exception as e:
            print(f"Error deleting files: {e}")

# Choose the method you want to use
delete_cache_dir()  # Most straightforward - deletes everything
# clear_dataset_cache()  # More selective - uses datasets library
# delete_specific_files()  # Most granular - delete specific files

Successfully deleted cache directory: data/raw/ccnews


In [4]:
for example in dataset:
    print(example)  # Process each article
    break  # Remove this to process all articles

NameError: name 'dataset' is not defined

In [15]:
dataset = load_dataset( "stanford-oval/ccnews", name="2020", split="train", streaming=True ).filter(lambda article: article["language"] in ["en", "es"])
# filtering for 2020 election dates 
start_date = datetime.datetime(2020, 11, 4, tzinfo=datetime.timezone.utc)
end_date = datetime.datetime(2020, 11, 5, tzinfo=datetime.timezone.utc)
keywords = ["election", "presidential", "Biden", "Trump", "vote", "elections"]

# Define the filter function
def filter_articles(article):
    try:
        # Parse the crawl date using dateutil for flexible formats
        crawl_date = parser.isoparse(article["crawl_date"])
    except ValueError as e:
        # Skip rows with invalid dates
        print(f"Skipping article due to date parsing error: {e}")
        return False
    
    # Check date range
    if not (start_date <= crawl_date <= end_date):
        return False
    
    # Check language
    # if article["language"] not in ["en", "es"]:
    #     return False
    
    # Check keywords in title or content
    title = article.get("title", "").lower()
    content = article.get("content", "").lower()
    if any(keyword in title or keyword in content for keyword in keywords):
        return True
    
    return False

# Apply the filter while streaming
filtered_dataset = dataset.filter(filter_articles)

# Iterate over the filtered dataset
for example in filtered_dataset:
    print(example)
    break


Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
from datasets import load_dataset
from multiprocessing import Pool, cpu_count
from datetime import datetime, timezone
from dateutil import parser

# Define constants
start_date = datetime(2020, 11, 4, tzinfo=timezone.utc)
end_date = datetime(2020, 11, 5, tzinfo=timezone.utc)
keywords = ["election", "presidential", "Biden", "Trump", "vote", "elections"]

# Define the filter function
def filter_articles(article):
    try:
        # Parse the crawl date
        crawl_date = parser.isoparse(article["crawl_date"])
    except ValueError:
        # Skip invalid dates
        return None
    
    # Check date range
    if not (start_date <= crawl_date <= end_date):
        return None
    
    # Check keywords in title or content
    title = article.get("title", "").lower()
    content = article.get("content", "").lower()
    if any(keyword in title or keyword in content for keyword in keywords):
        return article  # Return the article if it matches
    
    return None

# Helper to process a chunk of articles
def process_chunk(chunk):
    return [filter_articles(article) for article in chunk if filter_articles(article) is not None]

# Function to stream the dataset in chunks
def stream_in_chunks(dataset, chunk_size=1000):
    buffer = []
    for article in dataset:
        buffer.append(article)
        if len(buffer) == chunk_size:
            yield buffer
            buffer = []
    if buffer:  # Yield the last chunk
        yield buffer

# Initialize the dataset
dataset = load_dataset(
    "stanford-oval/ccnews", 
    name="2020", 
    split="train", 
    streaming=True
).filter(lambda article: article["language"] in ["en", "es"])

# Use multiprocessing to process the dataset in chunks
filtered_articles = []
chunk_size = 1000
with Pool(cpu_count()) as pool:
    for chunk in stream_in_chunks(dataset, chunk_size=chunk_size):
        results = pool.map(process_chunk, [chunk])
        for filtered in results:
            filtered_articles.extend(filtered)

# Print some filtered articles
for article in filtered_articles[:5]:
    print(article)

# Print some filtered articles
# for article in filtered_articles[:5]:
#     print(article)


Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

Process SpawnPoolWorker-10:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_chunk' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [53]:
iterable_dataset = load_dataset("stanford-oval/ccnews", name="2020", split = "train",streaming=True)
for example in iterable_dataset:
    print(example)
    break

Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

{'requested_url': 'https://www.telez.fr/actus-tv/demain-nous-appartient-en-avance-resume-de-lepisode-629-de-mercredi-1er-janvier/', 'plain_text': 'TF1 diffuse le mercredi 1er janvier l’épisode 629 du feuilleton Demain nous appartient. Au menu : Bart piégé, Samuel s’énerve. Attention spoilers !  Dans Demain nous appartient, Bart se réveille seul dans son lit. Il retrouve sa mère au petit déjeuner. Elle lui demande pardon pour la dispute de la veille. Flore lui redit combien il est important pour elle qu’ils s’entendent bien. Bart lui promet de tout faire pour que son mariage se passe bien. Quelqu’un sonne à la porte : une enveloppe adressée à Bart est posée sur le paillasson. Dedans, on devine des photos compromettantes. Bart va voir son cousin pour lui montrer. Un mot y est associé : s’il continue à fouiller, il aura des problèmes. Il sait qu’Audrey l’a piégé. Max lui conseille de chercher cette Audrey. Ils débarquent au Spoon pour voir Ulysse et lui demander s’il a des infos sur cette

In [56]:
iterable_dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'requested_url': Value(dtype='string', id=None), 'plain_text': Value(dtype='string', id=None), 'published_date': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'tags': Value(dtype='string', id=None), 'categories': Value(dtype='string', id=None), 'author': Value(dtype='string', id=None), 'sitename': Value(dtype='string', id=None), 'image_url': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'language_score': Value(dtype='float64', id=None), 'responded_url': Value(dtype='string', id=None), 'publisher': Value(dtype='string', id=None), 'warc_path': Value(dtype='string', id=None), 'crawl_date': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='ccnews', config_name='2020', version=0.0.0, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_byt

In [2]:
# Initialize client with API key
with open("config.json", "r") as f:
    config = json.load(f)
api_key = config["api_key"]
# print(api_key)

client = Together(api_key=api_key)


In [3]:
def generate_embeddings(input_texts):
    """Generate embeddings from Together API.

    Args:
        input_texts: a list of string input texts.
        model_api_string: str. An API string for a specific embedding model of your choice.

    Returns:
        embeddings_list: a list of embeddings. Each element corresponds to the each input text.
    """
    model_api_string = "togethercomputer/m2-bert-80M-8k-retrieval"
    outputs = client.embeddings.create(
        input=input_texts, 
        model=model_api_string,
    )
    return [x.embedding for x in outputs.data]

In [23]:
filtered_dataset.features

{'requested_url': Value(dtype='string', id=None),
 'plain_text': Value(dtype='string', id=None),
 'published_date': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'categories': Value(dtype='string', id=None),
 'author': Value(dtype='string', id=None),
 'sitename': Value(dtype='string', id=None),
 'image_url': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'responded_url': Value(dtype='string', id=None),
 'publisher': Value(dtype='string', id=None),
 'warc_path': Value(dtype='string', id=None),
 'crawl_date': Value(dtype='string', id=None)}

In [5]:
with open('sample_data.json', 'r') as file:
    data = json.load(file)

# Print the data
print(data)

[{'requested_url': 'https://www.telez.fr/actus-tv/demain-nous-appartient-en-avance-resume-de-lepisode-629-de-mercredi-1er-janvier/', 'plain_text': "TF1 diffuse le mercredi 1er janvier l'épisode 629 du feuilleton Demain nous appartient. Au menu : Bart piégé, Samuel s'énerve. Attention spoilers !  Dans Demain nous appartient, Bart se réveille seul dans son lit. Il retrouve sa mère au petit déjeuner. Elle lui demande pardon pour la dispute de la veille. Flore lui redit combien il est important pour elle qu'ils s'entendent bien. Bart lui promet de tout faire pour que son mariage se passe bien. Quelqu'un sonne à la porte : une enveloppe adressée à Bart est posée sur le paillasson. Dedans, on devine des photos compromettantes. Bart va voir son cousin pour lui montrer. Un mot y est associé : s'il continue à fouiller, il aura des problèmes. Il sait qu'Audrey l'a piégé. Max lui conseille de chercher cette Audrey. Ils débarquent au Spoon pour voir Ulysse et lui demander s'il a des infos sur cett

In [11]:
# Extract 'plain_text' field
plain_texts = [article.get("plain_text", "") for article in data]

# Call the generate_embeddings function
# Ensure you have a `client` instance properly initialized before this step
try:
    embeddings = generate_embeddings(plain_texts)
except Exception as e:
    print(f"Error generating embeddings: {e}")
    embeddings = []

# Add embeddings back to the dataset
for article, embedding in zip(data, embeddings):
    article["embedding"] = embedding

# Optionally, save the updated data back to a file
with open("your_file_with_embeddings.json", "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

print("Embeddings added to the dataset.")


Embeddings added to the dataset.


In [7]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
from tqdm import tqdm

# Step 1: Load Articles from Hugging Face Dataset
def load_and_filter_articles():
    """
    Load articles from the CCNews dataset filtered by language and date.
    """
    # Load dataset in streaming mode
    dataset = load_dataset("stanford-oval/ccnews", name="2020", streaming=True)
    
    # Define filter criteria
    def filter_function(article):
        # Filter by language and crawl date
        return (
            article["language"] in ["en", "es"] and 
            "2020-11" in article["crawl_date"] and
            "election" in (article.get("title", "").lower() + article.get("content", "").lower())
        )
    
    # Apply the filter
    filtered_articles = []
    for article in tqdm(dataset["train"], desc="Filtering Articles"):
        if filter_function(article):
            filtered_articles.append(article)
    
    return filtered_articles

# Step 2: Set Up M2-BERT-80M-8K-Retrieval
def setup_model():
    """
    Load the M2-BERT model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained("together-ai/M2-BERT-80M-8K-Retrieval")
    model = AutoModel.from_pretrained("together-ai/M2-BERT-80M-8K-Retrieval")
    return tokenizer, model

# Step 3: Generate Embeddings
def embed_text(text, tokenizer, model):
    """
    Generate embeddings for a given text using M2-BERT.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=8000, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding (first token)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

def generate_embeddings(articles, tokenizer, model, batch_size=16):
    """
    Generate embeddings for all filtered articles using batching.
    """
    embeddings = []
    batch_texts = []  # To store text for batching
    
    for article in tqdm(articles, desc="Preparing Batches"):
        text = article.get("title", "") + " " + article.get("content", "")
        batch_texts.append(text)
        
        # If batch is full, process it
        if len(batch_texts) == batch_size:
            inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=8000, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            # Extract [CLS] token embeddings for the batch
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)
            batch_texts = []  # Reset batch

    # Process remaining texts (if any)
    if batch_texts:
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=8000, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    
    # Concatenate all batches
    embeddings = np.vstack(embeddings)
    return embeddings


# Step 4: Set Up FAISS Vector Store
def setup_faiss(embeddings):
    """
    Create and populate a FAISS vector store.
    """
    dimension = embeddings.shape[1]  # Embedding dimension
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Step 5: Query and Retrieve
def retrieve_documents(query, tokenizer, model, index, articles, k=5):
    """
    Retrieve top-k documents for a query.
    """
    query_embedding = embed_text(query, tokenizer, model)
    distances, indices = index.search(query_embedding[np.newaxis, :], k)
    retrieved_docs = [articles[i] for i in indices[0]]
    return retrieved_docs

# Step 6: Use Retrieved Documents in a RAG Pipeline
def generate_response(query, retrieved_docs, generator_model):
    """
    Generate a response using retrieved documents and a text generator.
    """
    context = "\n".join([doc.get("title", "") + " " + doc.get("content", "") for doc in retrieved_docs])
    prompt = f"Query: {query}\nContext: {context}\nAnswer:"
    
    # Generate the response (replace 'your-generator-model' with the actual generator model)
    response = generator_model(prompt, max_length=512)
    return response[0]["generated_text"]

# Step 7: Main Pipeline
def main():
    # Load and filter articles
    print("Loading and filtering articles...")
    articles = load_and_filter_articles()

    # Set up model
    print("Setting up model...")
    tokenizer, model = setup_model()

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = generate_embeddings(articles, tokenizer, model, batch_size=16)

    # Set up FAISS index
    print("Setting up FAISS vector store...")
    index = setup_faiss(embeddings)

    # Example query
    query = "What happened in the 2020 US Presidential Election?"
    print(f"Query: {query}")

    # Retrieve documents
    print("Retrieving documents...")
    retrieved_docs = retrieve_documents(query, tokenizer, model, index, articles)

    # Generate response
    print("Generating response...")
    from transformers import pipeline
    generator_model = pipeline("text-generation", model="together-ai/your-generator-model")
    response = generate_response(query, retrieved_docs, generator_model)

    print("\nGenerated Response:")
    print(response)

# Run the pipeline
if __name__ == "__main__":
    main()


IterableDataset({
    features: ['requested_url', 'plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'image_url', 'language', 'language_score', 'responded_url', 'publisher', 'warc_path', 'crawl_date'],
    num_shards: 76
})