## Imports

In [32]:
# Standard library imports
import re
import logging
from enum import Enum
from typing import Any
from datetime import datetime, timedelta

# Third-party libraries
import pandas as pd
import torch
import newspaper as news
from tqdm.notebook  import tqdm
from newspaper.mthreading import fetch_news
from gdeltdoc import GdeltDoc, Filters
from spacy.lang.en import English
from sentence_transformers import SentenceTransformer

In [27]:
# %pip install -r requirements.txt

## Extract data from GDelt

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

In [4]:
def log_tqdm(iterable, desc=None, ignore: bool = False):
    """Only use tqdm progress bar while in debugging"""
    if not ignore and logger.isEnabledFor(logging.DEBUG):
        return tqdm(iterable, desc=desc)

    return iterable

In [5]:
class Source(Enum):
    # CBC = "cbc.ca"
    CNN = "cnn.com"
    # AP = "apnews.com"    # wasn't working for some reason
    BBC = "bbc.co.uk"
    # Wired = "wired.com"
    # Reuters = "reuters.com"
    NyTimes = "nytimes.com"
    Guardian = "theguardian.com"

In [6]:
def format_text(text: str) -> str:
    """Removes newline characters and leading/trailing whitespace"""
    return text.replace("\n", " ").strip()    

In [7]:
def gdelt_row_to_dict(gd_row: pd.Series) -> dict:
    """Helper method to convert the output of a gd.article_search row into a dict"""
    return {
        "url": gd_row["url"],
        "title": gd_row["title"],
        "domain": gd_row["domain"],
        "country": gd_row["sourcecountry"],
    }

In [8]:
def article_to_dict(art: news.Article) -> dict:
    """Helper method to convert a Article into a dict"""
    return {
        "text": format_text(art.text),
        "title": art.title,
        "authors": art.authors,
        "date": art.publish_date,
        "source": art.source_url,
        "url": art.original_url
    }    

In [9]:
def gdelt_stories(
    keywords: str | None = None,
    theme: str | None = None,
    sources: list[Source] | None = None,
    start_date: datetime = datetime.now().date() - timedelta(days=7),
    end_date: datetime = datetime.now().date(),
    num_records: int = 5
) -> list[dict]:
    """
    Retrieves metadata and urls for relevant stories via the GDelt API
    """
    
    if keywords and len(keywords) < 5:
        raise ValueError("Keywords must be gte 5 characters or the Gdelt API errors.")

    sources = sources or [s for s in Source]        
    sources = [s.value for s in sources]
        
    if len(sources) < 2:
        raise ValueError("Number of sources must be gte 2 or the Gdelt API errors.")

    if start_date > datetime.now().date() or end_date > datetime.now().date():
        raise ValueError("News cannot be in the future...")

    if start_date > end_date:
        raise ValueError("How you gunna end before you start?!?!")        
     
    sources = [s for s in sources]
    
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")

    if keywords and theme:
        f = Filters(
            start_date = start_str,
            end_date = end_str,
            num_records = num_records,
            domain = sources,
            country = ["UK", "US"],
            language = "eng",
            keyword = keywords,
            theme=theme
        )
    elif keywords:
        f = Filters(
            start_date = start_str,
            end_date = end_str,
            num_records = num_records,
            domain = sources,
            country = ["US"],#, "UK"],
            language = "eng",
            keyword = keywords,
        )
    elif theme:
        f = Filters(
            start_date = start_str,
            end_date = end_str,
            num_records = num_records,
            domain = sources,
            country = ["UK", "US"],
            language = "eng",
            theme=theme
        )
    else:
        raise ValueError("both theme and keywords cannot be empty")
        
    gd = GdeltDoc()
    search_results = gd.article_search(f)

    results = []
    for i, row in search_results.iterrows():
        results.append(gdelt_row_to_dict(row))

    return results

In [10]:
def get_articles(urls: list[str] | str) -> list[dict]:
    """
    Queries newspaper4k for the article associated with the provided url

    Note:   fetch_news is an alternative to mutlithread the retrieval process.
            However, this does not include the text property sooo were stuck with this.
    """

    urls = urls if isinstance(urls, list) else [urls]
    results = []
    view_titles = set()
    for url in log_tqdm(urls, "Collecting Stories"):

        # news.articles is an expensive operation so verify 
        # the current story hasn't already been retrieved
        possible_title = url.rsplit('/', 1)[-1]
        if possible_title in view_titles:
            logger.debug(f"Removed duplicate story: {possible_title}")
            continue
        view_titles.add(possible_title)
        
        try:
            art = news.article(url)         
        except:
            # in case an article doesn't exist (404 returned) or can't be accessed 
            continue
        
        results.append(article_to_dict(art))
    
    return results

In [11]:
def retrieve_news(
    keywords: str | None = None,
    theme: str | None = None,
    sources: list[Source] | None = None,
    start_date: datetime = datetime.now().date() - timedelta(days=7),
    end_date: datetime = datetime.now().date(),
    num_stories: int = 5,
) -> list[dict]:
    """
    Retrieves stories related to the specified keywords

    **NOTE**: theme must be a value from the available list here: 
    http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT
    """

    logger.debug("Gdelt: retrieval beginning")
    story_links = gdelt_stories(
        keywords=keywords, 
        theme=theme,
        sources=sources, 
        start_date=start_date, 
        end_date=end_date, 
        num_records=num_stories
    )

    logger.debug("newpaper4k: retrieval beginning")
    stories = get_articles(
        urls=[story["url"] for story in story_links]
    )

    return stories

#### Example for retrieving stories:

\*\*Note\*\*: retrieve_news returns a list of dicts in the format created by the article_to_dict function

(i.e., keys=[title, text, domain, country, author, url, date])


In [12]:
# stories = retrieve_news(theme="IMMIGRATION", num_stories=10)

In [13]:
# stories[0].keys()

In [14]:
# [u["url"] for u in stories]

Results from above:

dict_keys(['text', 'title', 'authors', 'date', 'source', 'url'])

['https://www.theguardian.com/us-news/2025/jul/29/states-sue-trump-administration-snap-recipients-data',
 'https://www.bbc.co.uk/news/articles/clyjggjplyqo',
 'https://www.theguardian.com/us-news/2025/jul/30/ice-hiring-incentives-signing-bonuses',
 'https://www.cnn.com/2025/07/30/politics/immigration-employees-reader-callout',
 'https://www.theguardian.com/us-news/2025/jul/28/trump-acknowledges-real-starvation-in-gaza-and-tells-israel-to-let-in-every-ounce-of-food',
 'https://www.theguardian.com/us-news/2025/aug/01/judge-tps-temporary-protected-status-trump-deportation',
 'https://www.theguardian.com/society/2025/jul/30/population-migration-england-wales-data',
 'https://www.theguardian.com/world/2025/jul/30/mexico-sheinbaum-alligator-alcatraz-trump',
 'https://www.theguardian.com/uk-news/2025/aug/01/social-media-ads-promoting-small-boat-crossings-uk-banned']

## Test samples for creating article embeddings

In [15]:
test_stories = retrieve_news(
    theme="IMMIGRATION", 
    num_stories=100, 
    start_date= datetime.now().date() - timedelta(days=90),
)

2025-08-03 05:32:04,583 - DEBUG - Gdelt: retrieval beginning
2025-08-03 05:32:05,376 - DEBUG - newpaper4k: retrieval beginning


Collecting Stories:   0%|          | 0/100 [00:00<?, ?it/s]

2025-08-03 05:32:32,523 - DEBUG - Removed duplicate story: immigration-birthright-citizenship-us-dg
2025-08-03 05:32:33,146 - DEBUG - Removed duplicate story: immigration-employees-reader-callout
2025-08-03 05:32:34,996 - DEBUG - Removed duplicate story: ice-arrests-migrants-courthouse
2025-08-03 05:32:38,090 - DEBUG - Removed duplicate story: deportations-backfiring-trump-analysis
2025-08-03 05:32:42,402 - DEBUG - Removed duplicate story: guatemalan-migrant-deported-mexico-trump-administration-return
2025-08-03 05:32:42,418 - DEBUG - Removed duplicate story: guatemalan-migrant-deported-mexico-trump-administration-return
2025-08-03 05:32:43,230 - DEBUG - Removed duplicate story: sanctuary-immigration-policies-chicago-illinois-lawsuit-dismissed


## Chunk the articles

In [16]:
def copy_list_of_dicts(dict_list: list[dict]) -> list[dict]:
    """Helper method to deep copy a list of dicts (not-recursively)"""
    logger.debug("Creating copy of target list of dicts")
    new_list = []
    for item in dict_list:
        new_list.append(item.copy())

    return new_list

In [17]:
def sentencize_stories(stories: list[dict]) -> list[dict]:
    """
    Uses spacy to convert the block of text provided by newspaper4k into sentences

    **Note**: This is an inplace operation
    """
    nlp = English()
    _ = nlp.add_pipe("sentencizer")
    
    logger.debug("Breaking text into sentences")
    # convert to sentences and ensure dtype is a str not spacy specific typing
    for story in log_tqdm(stories, "Sentencizing"):
        story["sents"] = list(nlp(story["text"]).sents)
        story["sents"] = [str(s) for s in story["sents"]]

    return stories

In [18]:
def split_list(input_list: list[Any], max_item_count: int) -> list[list[Any]]:
    """
    Splits a list of strings into a seperate lists with specified maximum number of items
    """
    return [input_list[i:i+max_item_count] for i in range(0, len(input_list), max_item_count)]

In [19]:
def chunk_sentences(stories: list[dict], chunk_size: int = 10) -> list[dict]:
    """
    Breaks the list of sentences into sublists with a maximum item count of chunk_size
    
    **Note**: This is an inplace operation
    """
    logger.debug(f"Chunking sentences, chunk_size: {chunk_size}")
    
    if "sents" not in stories[0].keys():
        raise ValueError("Out of order operation: Cannot chunk sentences before they exist")
        
    for story in log_tqdm(stories, "Chunking"):
        story["sent_chunks"] = split_list(story["sents"], chunk_size)
        
    return stories

In [20]:
def article_length_metadata(stories: list[dict]) -> list[dict]:
    """
    Collects minor metadata regarding the length of the article
    
    **Note**: This is an inplace operation
    """
    logger.debug("Adding article length metadata")
    
    if any(key not in stories[0] for key in ["sents", "sent_chunks"]):
        raise ValueError("Out of order operation: Collecting metadata requires all sub-items to be populated")
        
    for story in log_tqdm(stories, "Collecting Metadata"):
        story["num_sents"] = len(story["sents"])
        story["num_tokens"] = len(story["text"]) // 4
        story["num_words"] = len(story["text"].split(" "))
        story["num_chars"] = len(story["text"])
        story["num_chunks"] = len(story["sent_chunks"])

    return stories

In [21]:
def seperate_into_chunk_list(story_list: list[dict]) -> list[dict]:
    """
    Seperates the chunks in the story dict into individual dicts and returns
    a list with each of these chunks as their own item
    """
    logger.debug("Creating list of chunks from sublist in article dict")
    
    chunk_list = []
    for item in log_tqdm(story_list, "Seperating Chunks"):
        # print(item["sent_chunks"])
        for chunk in item["sent_chunks"]:

            # populate each chunk with articles original metadata
            chunk_dict = {
                "title": item["title"],
                "authors": item["authors"],
                "date": item["date"],
                "source": item["source"],
                "url": item["url"],
            }
    
            # rejoin chunk sentences and format to more natural text (i.e., format end of sentence text)
            chunk_text = "".join(chunk).replace("  ", " ").strip()
            chunk_text = re.sub(r"\.([A-Z])", r". \1", chunk_text)    
            chunk_dict["text"] = chunk_text

            # don't log here because it would get messy
            chunk_list.append(chunk_dict)

    return chunk_list

In [22]:
def chunk_length_metadata(chunks: list[dict]) -> list[dict]:
    """
    Collects metadata regarding the length of the chunks
    
    **Note**: This is an inplace operation
    """
    logger.debug("Adding article length metadata")
            
    for chunk in log_tqdm(chunks, "Collecting Metadata"):
        chunk["num_tokens"] = len(chunk["text"]) // 4
        chunk["num_words"] = len(chunk["text"].split(" "))
        chunk["num_chars"] = len(chunk["text"])

    return chunks

In [23]:
def chunk_articles(stories: list[dict], sentences_per_chunk: int = 10) -> list[dict]:
    """
    Converts the list of stories (stored as dicts) into a list chunks (also dicts)

    sentences_per_chunks sets the number of sentences used in each chunk
    """
    logger.debug("Chunking list of articles")

    # create chunk information within a copy of the stories list[dict]
    story_copy = copy_list_of_dicts(stories)
    story_copy = sentencize_stories(story_copy)
    story_copy = chunk_sentences(story_copy)

    # create a new chunk list[dict] from result
    chunks = seperate_into_chunk_list(story_copy)
    chunks = chunk_length_metadata(chunks)
    
    return chunks    

## Create embeddings from chunks

In [49]:
def create_embedding_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
    """Helper method to create the model and send it to cuda"""
    logger.debug("Creating embedding model")
    
    model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

    if torch.cuda.is_available():
        model = model.to("cuda")
        logger.debug(f"CUDA detected. Embedding model moved to {torch.cuda.get_device_name(torch.cuda.current_device())}")
    else:
        logger.debug("CUDA not detected. Embedding model will remain on CPU")

    return model    

In [50]:
def create_embedding(
    chunks: list[dict], 
    embedding_model: SentenceTransformer
) -> list[dict]:
    """
    Uses the embedded model to create embeddings from the text values in the provided chunks

    **Note**: this operation is inplace
    """
    logger.debug("Creating embeddings for chunks")

    for chunk in log_tqdm(chunks, "Embedding"):
        chunk["embedding"] = embedding_model.encode(chunk["text"])

    return chunks        

In [51]:
chunks = chunk_articles(test_stories)

embedding_model = create_embedding_model()
chunks = create_embedding(chunks, embedding_model=embedding_model)

2025-08-03 05:56:33,096 - DEBUG - Chunking list of articles
2025-08-03 05:56:33,096 - DEBUG - Creating copy of target list of dicts
2025-08-03 05:56:33,132 - DEBUG - Breaking text into sentences


Sentencizing:   0%|          | 0/93 [00:00<?, ?it/s]

2025-08-03 05:56:33,397 - DEBUG - Chunking sentences, chunk_size: 10


Chunking:   0%|          | 0/93 [00:00<?, ?it/s]

2025-08-03 05:56:33,412 - DEBUG - Creating list of chunks from sublist in article dict


Seperating Chunks:   0%|          | 0/93 [00:00<?, ?it/s]

2025-08-03 05:56:33,412 - DEBUG - Adding article length metadata


Collecting Metadata:   0%|          | 0/396 [00:00<?, ?it/s]

2025-08-03 05:56:33,412 - DEBUG - Creating embedding model
2025-08-03 05:56:34,448 - DEBUG - CUDA detected. Embedding model moved to NVIDIA GeForce RTX 5070 Ti
2025-08-03 05:56:34,448 - DEBUG - Creating embeddings for chunks


Embedding:   0%|          | 0/396 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


## Data Storage in QDrant

In [25]:
for item in chunks:
    if item["num_chars"] < 1000:
        for k, v in item.items():
            print(" -", k, ":", v)
        break

 - title : The least ‘integrated’ part of British society isn’t the immigrants – it’s the elite | Andy Beckett
 - authors : ['Andy Beckett', 'www.theguardian.com']
 - date : 2025-05-23 00:00:00
 - source : https://www.theguardian.com
 - url : https://www.theguardian.com/commentisfree/2025/may/23/integrated-british-society-immigrants-elite
 - text : Inside, to my surprise, was a classic Portuguese bar, with dusty Portuguese football scarves hanging from the ceiling and elderly Portuguese immigrants drinking dark Portuguese liqueur from tiny glasses. The bar felt both foreign and, in its confident approach to cultural difference, quite British. Were the bar staff and their customers completely integrated with the rest of King’s Lynn?On the basis of a brief visit, it was hard to say. But they made a very good cup of tea.
 - num_tokens : 119
 - num_words : 79
 - num_chars : 478


## References:

- Local Retrieval Augmented Generation (RAG) from Scratch (step by step tutorial) by Daniel Bourke

link: https://www.youtube.com/watch?v=qN_2fnOPY-M

In [52]:
if torch.cuda.is_available():
    gpu_id = torch.cuda.current_device()
    total = torch.cuda.get_device_properties(gpu_id).total_memory
    reserved = torch.cuda.memory_reserved(gpu_id)
    allocated = torch.cuda.memory_allocated(gpu_id)
    free = reserved - allocated

    print(f"Total memory:     {total / 1e6:.2f} MB")
    print(f"Reserved memory:  {reserved / 1e6:.2f} MB")
    print(f"Allocated memory: {allocated / 1e6:.2f} MB")
    print(f"Free within reserved: {free / 1e6:.2f} MB")
else:
    print("No CUDA device available.")

Total memory:     17094.48 MB
Reserved memory:  304.09 MB
Allocated memory: 190.90 MB
Free within reserved: 113.19 MB
