In [None]:
# Core data processing functions
import datetime
from pathlib import Path
import numpy as np
import json
import h5py
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
from together import Together
import streamlit as st
import faiss

# Data loading and processing functions (keeping previous functions)
def load_election_dataset():
    """Open the 2020 CCNews split as a stream limited to English and Spanish.

    Returns:
        The streaming dataset returned by ``load_dataset(...).filter(...)``;
        only articles whose ``language`` field is ``"en"`` or ``"es"`` pass.
    """
    # Imported locally: this cell's import list does not bring in
    # ``load_dataset``, so the call below would otherwise raise NameError.
    from datasets import load_dataset

    dataset = load_dataset(
        "stanford-oval/ccnews",
        name="2020",
        split="train",
        streaming=True,
    )
    return dataset.filter(lambda article: article["language"] in ["en", "es"])

# [Previous functions remain the same...]
def filter_articles(article):
    """Return True when an article is 2020 election coverage in English or Spanish.

    Args:
        article: Mapping that must contain ``plain_text``, ``published_date``
            (formatted ``YYYY-MM-DD``) and ``language``.

    Returns:
        True only if every required field is present, the date parses and
        falls in 2020, the language is ``en`` or ``es``, and the text
        contains at least one election keyword (case-insensitive substring
        match). False otherwise, including for unparseable dates or
        non-string text.
    """
    required_fields = ("plain_text", "published_date", "language")
    if not all(field in article for field in required_fields):
        return False

    # Only parsing failures mean "unusable date". A bare ``except`` would also
    # swallow KeyboardInterrupt and hide genuine programming errors.
    try:
        published = datetime.datetime.strptime(article["published_date"], "%Y-%m-%d")
    except (ValueError, TypeError):
        return False
    if published.year != 2020:
        return False

    if article["language"] not in ("en", "es"):
        return False

    body = article["plain_text"]
    if not isinstance(body, str):
        # A null text field cannot match any keyword; reject instead of crashing.
        return False
    body = body.lower()

    # Stored lower-cased because they are compared against lower-cased text.
    keywords = (
        "election", "vote", "voting", "trump", "biden", "campaign",
        "elección", "voto", "votar", "campaña",
    )
    return any(keyword in body for keyword in keywords)

def process_articles(dataset):
    """Group already-filtered articles by language and number them.

    Args:
        dataset: Iterable of article mappings with ``plain_text``,
            ``published_date``, ``requested_url`` and ``language`` keys,
            where ``language`` is ``"en"`` or ``"es"``.

    Returns:
        ``{"en": [...], "es": [...]}`` where each entry is a dict with
        ``id`` (0-based position in the input), ``text``, ``date``, ``url``.
    """
    by_language = {"en": [], "es": []}

    # enumerate supplies the same running counter across both languages.
    for running_id, article in enumerate(dataset):
        record = dict(
            id=running_id,
            text=article["plain_text"],
            date=article["published_date"],
            url=article["requested_url"],
        )
        by_language[article["language"]].append(record)

    return by_language

def generate_embeddings(articles, model_name="intfloat/multilingual-e5-large"):
    """Embed every article, per language, with a SentenceTransformer model.

    Args:
        articles: Mapping ``language -> list of article dicts``; each article
            needs ``text`` and ``id`` keys.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        Mapping ``language -> {"embeddings": 2-D array, "article_ids": list}``.
        ``"en"`` and ``"es"`` are always present; a language with no articles
        gets a ``(0, dim)`` array instead of raising.
    """
    model = SentenceTransformer(model_name)
    batch_size = 32  # encode in slices to bound peak memory

    # Pre-seed the two expected languages so the result shape does not depend
    # on which keys ``articles`` happens to contain.
    embeddings = {
        lang: {"embeddings": [], "article_ids": []} for lang in ("en", "es")
    }

    for lang, lang_articles in articles.items():
        # The "passage: " prefix marks these texts as documents for retrieval.
        texts = [f"passage: {article['text']}" for article in lang_articles]

        batches = [
            model.encode(texts[start:start + batch_size], normalize_embeddings=True)
            for start in range(0, len(texts), batch_size)
        ]

        if batches:
            matrix = np.vstack(batches)
        else:
            # np.vstack([]) raises ValueError; emit an empty, correctly-shaped
            # matrix so downstream saving/indexing still works.
            dim = model.get_sentence_embedding_dimension() or 0
            matrix = np.empty((0, dim), dtype=np.float32)

        embeddings[lang] = {
            "embeddings": matrix,
            "article_ids": [art["id"] for art in lang_articles],
        }

    return embeddings

def save_embeddings(embeddings, save_dir):
    """Write one ``<lang>_embeddings.h5`` file per language into ``save_dir``.

    Args:
        embeddings: Mapping ``language -> {"embeddings": ..., "article_ids": ...}``.
        save_dir: Target directory; created (with parents) when missing.
    """
    save_dir = Path(save_dir)
    # parents=True: nested targets such as "data/processed/embeddings" would
    # otherwise fail with FileNotFoundError on a fresh checkout.
    save_dir.mkdir(parents=True, exist_ok=True)

    for lang, payload in embeddings.items():
        with h5py.File(save_dir / f"{lang}_embeddings.h5", "w") as f:
            f.create_dataset("embeddings", data=payload["embeddings"])
            f.create_dataset("article_ids", data=payload["article_ids"])

def load_embeddings(load_dir):
    """Read the English and Spanish ``<lang>_embeddings.h5`` files from ``load_dir``.

    Returns:
        Mapping ``language -> {"embeddings": ..., "article_ids": ...}`` with
        both datasets fully read into memory before each file is closed.
    """
    load_dir = Path(load_dir)
    embeddings = {}

    for lang in ["en", "es"]:
        h5_path = load_dir / f"{lang}_embeddings.h5"
        with h5py.File(h5_path, "r") as store:
            # [:] materialises the data so it stays valid after the file closes.
            vectors = store["embeddings"][:]
            ids = store["article_ids"][:]
        embeddings[lang] = {"embeddings": vectors, "article_ids": ids}

    return embeddings


# New RAG and QA System functions
class ElectionQASystem:
    """Retrieval-augmented question answering over the stored election articles.

    On construction the articles JSON and the per-language embedding files
    are loaded and one FAISS L2 index per language is built. Queries are
    embedded, matched against the index of the requested language, and the
    retrieved articles are passed to a Together-hosted LLM as context.
    """

    def __init__(self, embeddings_dir: str, articles_file: str, model_name: str = "BAAI/bge-large-en"):
        """Load models and data.

        Args:
            embeddings_dir: Directory holding the ``<lang>_embeddings.h5`` files.
            articles_file: JSON file mapping language -> list of article dicts.
            model_name: SentenceTransformer model used to embed queries.
        """
        # NOTE(review): the embedding generator in this file defaults to
        # "intfloat/multilingual-e5-large" while queries default to
        # "BAAI/bge-large-en"; vectors from different models are not
        # comparable — confirm both sides use the same model.
        self.embeddings_dir = Path(embeddings_dir)
        self.articles_file = Path(articles_file)
        self.embedding_model = SentenceTransformer(model_name)
        self.together_client = Together()
        self.load_data()

    def load_data(self):
        """Load embeddings and articles"""
        # Load articles
        with open(self.articles_file, encoding="utf-8") as f:
            self.articles = json.load(f)

        # Per-language id -> article lookup so retrieval does not rescan the
        # whole article list for every hit. setdefault keeps the first
        # article when an id is duplicated.
        self._articles_by_id = {}
        for lang, lang_articles in self.articles.items():
            lookup = {}
            for art in lang_articles:
                lookup.setdefault(art["id"], art)
            self._articles_by_id[lang] = lookup

        # Load embeddings and create FAISS indices
        self.indices = {}
        embeddings = load_embeddings(self.embeddings_dir)

        for lang in embeddings:
            # FAISS expects C-contiguous float32 input.
            vectors = np.ascontiguousarray(embeddings[lang]["embeddings"], dtype=np.float32)
            index = faiss.IndexFlatL2(vectors.shape[1])
            index.add(vectors)
            self.indices[lang] = {
                "index": index,
                "article_ids": embeddings[lang]["article_ids"]
            }

    def get_relevant_context(self, query: str, language: str, k: int = 3) -> List[Dict[str, Any]]:
        """Return up to ``k`` articles nearest to ``query`` in ``language``.

        Fewer than ``k`` articles are returned when the index holds fewer
        vectors or when a stored id has no matching article.
        """
        # Encode query
        query_embedding = self.embedding_model.encode([query])[0]
        query_matrix = np.asarray([query_embedding], dtype=np.float32)

        # Search in appropriate language index
        _distances, neighbours = self.indices[language]["index"].search(query_matrix, k)

        article_ids = self.indices[language]["article_ids"]
        lookup = self._articles_by_id.get(language, {})

        relevant_articles = []
        for idx in neighbours[0]:
            if idx < 0:
                # FAISS pads missing neighbours with -1; indexing with it
                # would silently return the last article.
                continue
            article_id = article_ids[idx]
            if hasattr(article_id, "item"):
                article_id = article_id.item()  # numpy scalar -> Python value
            article = lookup.get(article_id)
            if article is not None:
                relevant_articles.append(article)

        return relevant_articles

    @staticmethod
    def _format_context(context):
        """Render retrieved articles as the context section of the prompt."""
        # Stored articles need not carry a title; fall back to an empty one
        # rather than raising KeyError.
        return "\n\n".join(
            f"Title: {art.get('title', '')}\nContent: {art['text']}"
            for art in context
        )

    def generate_answer(self, query: str, context: List[Dict[str, Any]], language: str) -> str:
        """Generate an answer to ``query`` from ``context`` using the LLM.

        ``language`` is accepted for interface compatibility and is not used
        when building the prompt.
        """
        context_text = self._format_context(context)

        # Create prompt
        prompt = f"""Based on the following articles about the 2020 US Election, please answer the question.

Context:
{context_text}

Question: {query}

Answer:"""

        # The Together client exposes text completion as completions.create
        # and returns the generated text under choices[0].text.
        response = self.together_client.completions.create(
            prompt=prompt,
            model="togethercomputer/llama-2-70b-chat",
            max_tokens=500,
            temperature=0.7
        )

        return response.choices[0].text

# Suggested repository structure:
"""
election_qa/
├── data/
│   ├── raw/                      
│   │   └── articles.json         # All articles grouped by language
│   └── processed/              
│       ├── metadata/           
│       │   └── article_index.json
│       └── embeddings/          
│           ├── en_embeddings.h5  
│           └── es_embeddings.h5  
├── src/
│   ├── data_processing/
│   │   ├── __init__.py
│   │   └── loader.py            # This file's data loading functions
│   ├── qa/
│   │   ├── __init__.py
│   │   └── system.py            # ElectionQASystem class
│   └── ui/
│       ├── __init__.py
│       └── app.py               # Streamlit interface
├── requirements.txt
└── README.md
"""

# Streamlit UI code (src/ui/app.py):
def create_streamlit_app():
    """Render the Streamlit page: language picker, question box, answer, sources."""
    st.title("2020 Election Q&A System")

    # Streamlit re-executes the script on every interaction; keep the QA
    # system in session state so models and indices load once per session.
    if "qa_system" not in st.session_state:
        st.session_state["qa_system"] = ElectionQASystem(
            embeddings_dir="data/processed/embeddings",
            articles_file="data/raw/articles.json"
        )
    qa_system = st.session_state["qa_system"]

    # Language selection
    language = st.selectbox(
        "Select Language",
        options=["en", "es"],
        format_func=lambda x: "English" if x == "en" else "Spanish"
    )

    # Query input
    query = st.text_input("Enter your question about the 2020 US Election:")

    if query:
        with st.spinner("Searching for relevant information..."):
            # Get relevant context
            context = qa_system.get_relevant_context(query, language)

            # Generate answer
            answer = qa_system.generate_answer(query, context, language)

            # Display results
            st.subheader("Answer:")
            st.write(answer)

            st.subheader("Sources:")
            for article in context:
                # Stored articles may only carry id/text/date/url, so fall
                # back to the URL and skip the title when it is absent.
                source = article.get("source") or article.get("url") or "unknown"
                with st.expander(f"Source: {source} - {article.get('date', '')}"):
                    title = article.get("title")
                    if title:
                        st.write(f"**{title}**")
                    st.write(article['text'][:500] + "...")

if __name__ == "__main__":
    # Script entry point: an offline data-preparation pass followed by the UI.
    # NOTE(review): filter_by_date, filter_by_keywords, process_article and
    # save_processed_data are not defined anywhere in the visible source; as
    # written this block raises NameError unless they are provided elsewhere
    # — confirm.
    # For data processing script
    dataset = load_election_dataset()
    
    # Define filters
    # Window bounds are timezone-aware (UTC).
    start_date = datetime.datetime(2020, 11, 4, tzinfo=datetime.timezone.utc)
    end_date = datetime.datetime(2020, 11, 5, tzinfo=datetime.timezone.utc)
    keywords = ["election", "presidential", "Biden", "Trump", "vote", "elections"]
    
    # Process and save data
    processed_articles = []
    for article in dataset:
        if filter_by_date(article, start_date, end_date):
            # Title and body are concatenated before the keyword check.
            if filter_by_keywords(article.get("title", "") + article.get("plain_text", ""), keywords):
                processed_article = process_article(article)
                processed_articles.append(processed_article)
    
    save_processed_data(processed_articles, "data/raw/articles.json")
    
    # For Streamlit app
    # Runs only after the whole dataset stream above has been consumed.
    create_streamlit_app()





In [12]:
from datasets import load_dataset
from tqdm import tqdm
import datetime
import pandas as pd
from dateutil import parser
import json
from together import Together
import os
import requests
import multiprocessing

In [13]:
# Report the CPU count seen by multiprocessing.
print(multiprocessing.cpu_count())

8


In [15]:
# Stream the 2020 CCNews split and keep only English and Spanish articles.
dataset = load_dataset(
    "stanford-oval/ccnews",
    name="2020",
    split="train",
    streaming=True,
).filter(lambda article: article["language"] in ["en", "es"])
# Crawl-date window (timezone-aware, UTC) used to select election coverage.
start_date = datetime.datetime(2020, 11, 4, tzinfo=datetime.timezone.utc)
end_date = datetime.datetime(2020, 11, 5, tzinfo=datetime.timezone.utc)
# Lower-case on purpose: the filter compares these against lower-cased title
# and body text, so capitalised entries ("Biden", "Trump") could never match.
keywords = ["election", "presidential", "biden", "trump", "vote", "elections"]

# Define the filter function
def filter_articles(article):
    """Return True for articles crawled inside the date window that mention a keyword.

    Reads the module-level ``start_date``, ``end_date`` and ``keywords``.

    Args:
        article: Dataset row with a ``crawl_date`` ISO-8601 string and
            optional ``title`` / ``plain_text`` fields.

    Returns:
        True when the crawl date lies in ``[start_date, end_date]`` and any
        keyword occurs (case-insensitively) in the title or body; False
        otherwise, including when the date cannot be parsed.
    """
    try:
        # Parse the crawl date using dateutil for flexible formats
        crawl_date = parser.isoparse(article["crawl_date"])
    except (ValueError, TypeError) as e:
        # Skip rows with invalid or missing dates
        print(f"Skipping article due to date parsing error: {e}")
        return False

    if crawl_date.tzinfo is None:
        # Naive and aware datetimes cannot be compared. A timestamp without
        # an offset is treated as being in the window's timezone —
        # NOTE(review): confirm crawl dates are UTC.
        crawl_date = crawl_date.replace(tzinfo=start_date.tzinfo)

    # Check date range
    if not (start_date <= crawl_date <= end_date):
        return False

    # Language is not re-checked here; the stream is filtered to en/es when
    # it is created.

    # The dataset schema has no "content" column — the body lives in
    # "plain_text". ``or ""`` also guards against null fields.
    title = (article.get("title") or "").lower()
    content = (article.get("plain_text") or article.get("content") or "").lower()

    # Keywords are lower-cased because the text they are matched against is.
    return any(
        keyword.lower() in title or keyword.lower() in content
        for keyword in keywords
    )

# Filtering is lazy: rows are only evaluated as the stream is consumed.
filtered_dataset = dataset.filter(filter_articles)

# Pull just the first matching article as a sanity check.
example = next(iter(filtered_dataset), None)
if example is not None:
    print(example)


Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
from datasets import load_dataset
from multiprocessing import Pool, cpu_count
from datetime import datetime, timezone
from dateutil import parser

# Crawl-date window (timezone-aware, UTC) used to select election coverage.
start_date = datetime(2020, 11, 4, tzinfo=timezone.utc)
end_date = datetime(2020, 11, 5, tzinfo=timezone.utc)
# Lower-case on purpose: the filter compares these against lower-cased title
# and body text, so capitalised entries ("Biden", "Trump") could never match.
keywords = ["election", "presidential", "biden", "trump", "vote", "elections"]

# Define the filter function
def filter_articles(article):
    """Return the article when it matches the date window and keywords, else None.

    Reads the module-level ``start_date``, ``end_date`` and ``keywords``.

    Args:
        article: Dataset row with a ``crawl_date`` ISO-8601 string and
            optional ``title`` / ``plain_text`` fields.

    Returns:
        The same ``article`` object when the crawl date lies in
        ``[start_date, end_date]`` and any keyword occurs
        (case-insensitively) in the title or body; ``None`` otherwise.
    """
    try:
        # Parse the crawl date
        crawl_date = parser.isoparse(article["crawl_date"])
    except (ValueError, TypeError):
        # Skip invalid or missing dates
        return None

    if crawl_date.tzinfo is None:
        # Naive and aware datetimes cannot be compared. A timestamp without
        # an offset is treated as being in the window's timezone —
        # NOTE(review): confirm crawl dates are UTC.
        crawl_date = crawl_date.replace(tzinfo=start_date.tzinfo)

    # Check date range
    if not (start_date <= crawl_date <= end_date):
        return None

    # The dataset schema has no "content" column — the body lives in
    # "plain_text". ``or ""`` also guards against null fields.
    title = (article.get("title") or "").lower()
    content = (article.get("plain_text") or article.get("content") or "").lower()

    # Keywords are lower-cased because the text they are matched against is.
    matched = any(
        keyword.lower() in title or keyword.lower() in content
        for keyword in keywords
    )
    return article if matched else None

# Helper to process a chunk of articles
def process_chunk(chunk):
    """Return the articles in ``chunk`` that ``filter_articles`` accepts.

    Each article is filtered exactly once; calling the filter both in the
    condition and in the result expression doubled the date parsing and
    keyword scan per article.
    """
    return [
        match
        for match in map(filter_articles, chunk)
        if match is not None
    ]

# Function to stream the dataset in chunks
def stream_in_chunks(dataset, chunk_size=1000):
    """Yield successive lists of ``chunk_size`` items from ``dataset``.

    The final list may be shorter; nothing is yielded for an empty input.
    """
    pending = []
    for article in dataset:
        pending.append(article)
        if len(pending) != chunk_size:
            continue
        yield pending
        pending = []  # start a fresh list; the yielded one belongs to the caller
    if pending:
        # Leftover items that did not fill a whole chunk.
        yield pending

# Open the 2020 CCNews split as a stream, then restrict it to en/es articles.
dataset = load_dataset(
    "stanford-oval/ccnews",
    name="2020",
    split="train",
    streaming=True,
)
dataset = dataset.filter(lambda article: article["language"] in ["en", "es"])

# Use multiprocessing to process the dataset in chunks.
# NOTE(review): the traceback recorded for this cell shows spawned workers
# failing to resolve process_chunk from __main__; the worker function
# probably has to live in an importable module for the pool to run — confirm.
filtered_articles = []
chunk_size = 1000
worker_count = cpu_count()

with Pool(worker_count) as pool:
    # Submitting one chunk per pool.map call keeps a single worker busy.
    # Collect one chunk per worker first so they actually run in parallel,
    # while holding at most ``worker_count`` chunks in memory.
    pending = []
    for chunk in stream_in_chunks(dataset, chunk_size=chunk_size):
        pending.append(chunk)
        if len(pending) < worker_count:
            continue
        for filtered in pool.map(process_chunk, pending):
            filtered_articles.extend(filtered)
        pending = []

    # Flush the final, possibly partial, batch (a no-op when empty).
    for filtered in pool.map(process_chunk, pending):
        filtered_articles.extend(filtered)

# Print the first few filtered articles as a spot check.
for article in filtered_articles[:5]:
    print(article)


Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

Process SpawnPoolWorker-10:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/miniconda3/envs/birds/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_chunk' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [53]:
# Unfiltered stream of the 2020 CCNews split.
iterable_dataset = load_dataset(
    "stanford-oval/ccnews",
    name="2020",
    split="train",
    streaming=True,
)

# Peek at the first record only.
example = next(iter(iterable_dataset), None)
if example is not None:
    print(example)

Resolving data files:   0%|          | 0/479 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

{'requested_url': 'https://www.telez.fr/actus-tv/demain-nous-appartient-en-avance-resume-de-lepisode-629-de-mercredi-1er-janvier/', 'plain_text': 'TF1 diffuse le mercredi 1er janvier l’épisode 629 du feuilleton Demain nous appartient. Au menu : Bart piégé, Samuel s’énerve. Attention spoilers !  Dans Demain nous appartient, Bart se réveille seul dans son lit. Il retrouve sa mère au petit déjeuner. Elle lui demande pardon pour la dispute de la veille. Flore lui redit combien il est important pour elle qu’ils s’entendent bien. Bart lui promet de tout faire pour que son mariage se passe bien. Quelqu’un sonne à la porte : une enveloppe adressée à Bart est posée sur le paillasson. Dedans, on devine des photos compromettantes. Bart va voir son cousin pour lui montrer. Un mot y est associé : s’il continue à fouiller, il aura des problèmes. Il sait qu’Audrey l’a piégé. Max lui conseille de chercher cette Audrey. Ils débarquent au Spoon pour voir Ulysse et lui demander s’il a des infos sur cette

In [56]:
# Display the stream's DatasetInfo (feature schema, builder and config names).
iterable_dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'requested_url': Value(dtype='string', id=None), 'plain_text': Value(dtype='string', id=None), 'published_date': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'tags': Value(dtype='string', id=None), 'categories': Value(dtype='string', id=None), 'author': Value(dtype='string', id=None), 'sitename': Value(dtype='string', id=None), 'image_url': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'language_score': Value(dtype='float64', id=None), 'responded_url': Value(dtype='string', id=None), 'publisher': Value(dtype='string', id=None), 'warc_path': Value(dtype='string', id=None), 'crawl_date': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='ccnews', config_name='2020', version=0.0.0, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_byt

In [2]:
# Initialize client with API key
# The key is read from the "api_key" entry of a local config.json so it is
# not hard-coded in the notebook.
with open("config.json", "r") as f:
    config = json.load(f)
api_key = config["api_key"]
# Do not print the key: cell outputs are saved with the notebook.

# Module-level client used by the embedding helper below.
client = Together(api_key=api_key)


In [3]:
def generate_embeddings(input_texts, model_api_string="togethercomputer/m2-bert-80M-8k-retrieval"):
    """Generate embeddings from Together API.

    Args:
        input_texts: a list of string input texts.
        model_api_string: str. An API string for a specific embedding model of your choice.

    Returns:
        embeddings_list: a list of embeddings. Each element corresponds to the each input text.
        An empty input yields an empty list without calling the API.
    """
    if len(input_texts) == 0:
        # Nothing to embed; avoid a pointless (and possibly rejected) request.
        return []

    # Uses the module-level Together ``client``.
    outputs = client.embeddings.create(
        input=input_texts,
        model=model_api_string,
    )
    return [x.embedding for x in outputs.data]

In [23]:
# Display the column schema of the filtered stream.
filtered_dataset.features

{'requested_url': Value(dtype='string', id=None),
 'plain_text': Value(dtype='string', id=None),
 'published_date': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'categories': Value(dtype='string', id=None),
 'author': Value(dtype='string', id=None),
 'sitename': Value(dtype='string', id=None),
 'image_url': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'responded_url': Value(dtype='string', id=None),
 'publisher': Value(dtype='string', id=None),
 'warc_path': Value(dtype='string', id=None),
 'crawl_date': Value(dtype='string', id=None)}

In [5]:
# Read explicitly as UTF-8: the sample contains non-ASCII text and is written
# back as UTF-8 later, so relying on the platform default encoding can
# mis-decode it.
with open('sample_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Print the data
print(data)

[{'requested_url': 'https://www.telez.fr/actus-tv/demain-nous-appartient-en-avance-resume-de-lepisode-629-de-mercredi-1er-janvier/', 'plain_text': "TF1 diffuse le mercredi 1er janvier l'épisode 629 du feuilleton Demain nous appartient. Au menu : Bart piégé, Samuel s'énerve. Attention spoilers !  Dans Demain nous appartient, Bart se réveille seul dans son lit. Il retrouve sa mère au petit déjeuner. Elle lui demande pardon pour la dispute de la veille. Flore lui redit combien il est important pour elle qu'ils s'entendent bien. Bart lui promet de tout faire pour que son mariage se passe bien. Quelqu'un sonne à la porte : une enveloppe adressée à Bart est posée sur le paillasson. Dedans, on devine des photos compromettantes. Bart va voir son cousin pour lui montrer. Un mot y est associé : s'il continue à fouiller, il aura des problèmes. Il sait qu'Audrey l'a piégé. Max lui conseille de chercher cette Audrey. Ils débarquent au Spoon pour voir Ulysse et lui demander s'il a des infos sur cett

In [11]:
# Extract 'plain_text' field
plain_texts = [article.get("plain_text", "") for article in data]

# Call the generate_embeddings function
# Ensure you have a `client` instance properly initialized before this step
try:
    embeddings = generate_embeddings(plain_texts)
except Exception as e:
    print(f"Error generating embeddings: {e}")
    embeddings = []

if len(embeddings) == len(data):
    # Add embeddings back to the dataset
    for article, embedding in zip(data, embeddings):
        article["embedding"] = embedding

    # Optionally, save the updated data back to a file
    with open("your_file_with_embeddings.json", "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print("Embeddings added to the dataset.")
else:
    # zip() would silently drop unmatched articles; do not write a partial
    # file or claim success when the API call failed or returned too few
    # vectors.
    print(
        f"Embeddings not added: expected {len(data)} vectors, "
        f"got {len(embeddings)}."
    )


Embeddings added to the dataset.


In [7]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
from tqdm import tqdm

# Step 1: Load Articles from Hugging Face Dataset
def load_and_filter_articles():
    """
    Load articles from the CCNews dataset filtered by language and date.

    Returns:
        A list of the streamed ``train`` rows whose language is ``en`` or
        ``es``, whose ``crawl_date`` contains ``"2020-11"``, and whose title
        or body mentions "election" (case-insensitive).
    """
    # Load dataset in streaming mode
    dataset = load_dataset("stanford-oval/ccnews", name="2020", streaming=True)

    def filter_function(article):
        """Return True when a row passes the language, date and keyword checks."""
        if article["language"] not in ("en", "es"):
            return False
        if "2020-11" not in (article.get("crawl_date") or ""):
            return False
        # The dataset schema has no "content" column — the body lives in
        # "plain_text". ``or ""`` also guards against null fields.
        title = article.get("title") or ""
        body = article.get("plain_text") or article.get("content") or ""
        return "election" in (title + body).lower()

    # Apply the filter while streaming; only matching rows are kept in memory.
    return [
        article
        for article in tqdm(dataset["train"], desc="Filtering Articles")
        if filter_function(article)
    ]

# Step 2: Set Up M2-BERT-80M-8K-Retrieval
def setup_model():
    """
    Load the M2-BERT tokenizer and model from the same checkpoint.

    Returns:
        ``(tokenizer, model)``.
    """
    # NOTE(review): checkpoint id is taken as-is from this file — confirm it
    # resolves on the model hub.
    checkpoint = "together-ai/M2-BERT-80M-8K-Retrieval"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return tokenizer, model

# Step 3: Generate Embeddings
def embed_text(text, tokenizer, model):
    """
    Generate embeddings for a given text using M2-BERT.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable returning keyword inputs for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array holding the first-token hidden state with singleton
        dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=8000, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding (first token)
    first_token = outputs.last_hidden_state[:, 0, :]
    # .cpu() matches the batched embedding path and keeps .numpy() working
    # when the model runs on an accelerator.
    return first_token.squeeze().cpu().numpy()

def generate_embeddings(articles, tokenizer, model, batch_size=16):
    """
    Generate embeddings for all filtered articles using batching.

    Args:
        articles: Iterable of article dicts; title and body are joined with
            a space to form the text to embed.
        tokenizer: Tokenizer matching ``model``.
        model: Model whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        2-D NumPy array with one row per article, in input order. With no
        articles an empty ``(0, hidden_size)`` array is returned.
    """
    def encode_batch(texts):
        """Run one forward pass and return the first-token embeddings."""
        inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=8000, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract [CLS] token embeddings for the batch
        return outputs.last_hidden_state[:, 0, :].cpu().numpy()

    embeddings = []
    batch_texts = []  # texts waiting for the next forward pass

    for article in tqdm(articles, desc="Preparing Batches"):
        # The dataset keeps the body in "plain_text"; "content" is still
        # honoured for callers that supply it. ``or ""`` guards null fields.
        title = article.get("title") or ""
        body = article.get("content") or article.get("plain_text") or ""
        batch_texts.append(title + " " + body)

        # If batch is full, process it
        if len(batch_texts) == batch_size:
            embeddings.append(encode_batch(batch_texts))
            batch_texts = []

    # Process remaining texts (if any)
    if batch_texts:
        embeddings.append(encode_batch(batch_texts))

    if not embeddings:
        # np.vstack([]) raises ValueError; return an empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    # Concatenate all batches
    return np.vstack(embeddings)


# Step 4: Set Up FAISS Vector Store
def setup_faiss(embeddings):
    """
    Build a flat L2 FAISS index holding every row of ``embeddings``.

    The index dimension is taken from the second axis of ``embeddings``.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

# Step 5: Query and Retrieve
def retrieve_documents(query, tokenizer, model, index, articles, k=5):
    """
    Retrieve top-k documents for a query.

    Returns:
        Up to ``k`` entries of ``articles``, nearest first. Fewer are
        returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = embed_text(query, tokenizer, model)
    distances, indices = index.search(query_embedding[np.newaxis, :], k)
    # FAISS pads missing neighbours with -1, which Python indexing would
    # silently map to the last article; drop those slots.
    return [articles[i] for i in indices[0] if i >= 0]

# Step 6: Use Retrieved Documents in a RAG Pipeline
def generate_response(query, retrieved_docs, generator_model):
    """
    Generate a response using retrieved documents and a text generator.

    Args:
        query: The user's question.
        retrieved_docs: Article dicts; title and body are joined with a space.
        generator_model: Callable invoked as ``generator_model(prompt,
            max_length=512)`` that returns a sequence whose first item has a
            ``"generated_text"`` entry.

    Returns:
        The ``"generated_text"`` of the first generation.
    """
    def render(doc):
        """Return one document as a "title body" line for the prompt."""
        # The dataset keeps the body in "plain_text"; "content" is still
        # honoured for callers that supply it. ``or ""`` guards null fields.
        title = doc.get("title") or ""
        body = doc.get("content") or doc.get("plain_text") or ""
        return title + " " + body

    context = "\n".join(render(doc) for doc in retrieved_docs)
    prompt = f"Query: {query}\nContext: {context}\nAnswer:"

    # Generate the response (replace 'your-generator-model' with the actual generator model)
    response = generator_model(prompt, max_length=512)
    return response[0]["generated_text"]

# Step 7: Main Pipeline
def main():
    """Run the demo end to end: load, embed, index, retrieve, generate, print."""
    # 1. Corpus
    print("Loading and filtering articles...")
    corpus = load_and_filter_articles()

    # 2. Encoder
    print("Setting up model...")
    tokenizer, encoder = setup_model()

    # 3. Document vectors
    print("Generating embeddings...")
    doc_vectors = generate_embeddings(corpus, tokenizer, encoder, batch_size=16)

    # 4. Vector store
    print("Setting up FAISS vector store...")
    vector_index = setup_faiss(doc_vectors)

    # 5. Fixed example question
    question = "What happened in the 2020 US Presidential Election?"
    print(f"Query: {question}")

    # 6. Retrieval
    print("Retrieving documents...")
    hits = retrieve_documents(question, tokenizer, encoder, vector_index, corpus)

    # 7. Generation; the model id below is a placeholder to be replaced.
    print("Generating response...")
    from transformers import pipeline
    generator = pipeline("text-generation", model="together-ai/your-generator-model")
    answer = generate_response(question, hits, generator)

    print("\nGenerated Response:")
    print(answer)

# Run the pipeline only when executed as a script (or as a notebook cell,
# where __name__ is also "__main__"), not when imported.
if __name__ == "__main__":
    main()


IterableDataset({
    features: ['requested_url', 'plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'image_url', 'language', 'language_score', 'responded_url', 'publisher', 'warc_path', 'crawl_date'],
    num_shards: 76
})