In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip list

Package                   Version
------------------------- ----------
aiofiles                  22.1.0
aiohappyeyeballs          2.6.1
aiohttp                   3.10.0
aiosignal                 1.4.0
alembic                   1.14.1
altair                    4.2.2
annotated-types           0.7.0
anyio                     4.12.0
argon2-cffi               25.1.0
argon2-cffi-bindings      25.1.0
arrow                     1.4.0
asttokens                 3.0.1
async-lru                 2.0.5
attrs                     25.4.0
babel                     2.17.0
backcall                  0.2.0
bcrypt                    4.0.1
beautifulsoup4            4.14.3
bleach                    6.3.0
blinker                   1.9.0
cachetools                6.2.4
certifi                   2025.11.12
cffi                      2.0.0
charset-normalizer        3.4.4
click                     8.1.8
cloudpickle               3.1.2
colorama                  0.4.6
comm                      0.2.3
croniter           

## Fetch the raw XML using feedparser

In [3]:
URL_1 = "https://news.google.com/rss/search?q=Espa%C3%B1a&hl=es&gl=ES&ceid=ES%3Aes"
URL_2 = "https://news.google.com/rss/search?q=Spain&hl=en-US&gl=US&ceid=US%3Aen"

In [63]:
import feedparser
import re 
from datetime import datetime

import warnings

import pandas as pd

warnings.filterwarnings("ignore")


In [47]:
def fetch_raw_data(url:str) -> list[dict]:
    """_summary_ : This function fetches raw data from the inputed url

    Args:
        url (string): _description_
    """    
    data = feedparser.parse(url)['entries']
    
    return data

def test_raw_data(output, **kwargs) -> None:
    assert len(output) > 0, "Empty output object" 
    assert isinstance(output, list), f"Output is not a list, {type(output)}"
    assert isinstance(output[0], dict), f"Output is not a dictionary, {type(output[0])}"


#title, published, summary, source[href], source[title]


In [48]:
raw_output = fetch_raw_data(URL_1)

test_raw_data(raw_output)

### Parse the data into dataframes

In [49]:
raw_output

[{'title': 'En Reino Unido hablan con claridad sobre cómo está España: a más de uno le va a fastidiar el fin de año - El HuffPost',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'https://news.google.com/rss/search?q=Espa%C3%B1a&hl=es&gl=ES&ceid=ES%3Aes',
   'value': 'En Reino Unido hablan con claridad sobre cómo está España: a más de uno le va a fastidiar el fin de año - El HuffPost'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://news.google.com/rss/articles/CBMiwAFBVV95cUxNN1VoNXlsR2pGQXdHdmE4a1NoWml5WlVzcVhoaW90Sm9DaDlwQUE3U2dabmlTX1c4U3JycGNhV2t4VWMwckhoQzIwbVhYS1RoSFB2bTE4OGg3ay1WWFN0TmNlUkFHWTJHOGVaZ0VkYko5UHhxY1l6WmhjWkIwZDVLN0VGWkpmMmE2MEU1V0pVNlhiNTlYdXItUWYtRzJ0eUl6WjVPOWVkMWs3anExaHRFaDYtZW5BNkF4cHlYT1Q5V0rSAdQBQVVfeXFMUEViN3h2aDl1azlPMVBHYmpQT3BnZmxRWHRNTHVISmFfbW9pRTJzekhCMW5paXNtTDZDOUtMREpzR29JRjBsaExmVGRNeTRIV0RMY2p5XzBydGlaY0ZlT0p0Y1dCSURpMFhPcEF0R1NCNGpHcERBdVVhZ0ZIcl83Z2pSRkc4VEp5SkhoS0lUNnRQNUE4ZV9mdHVrVWdq

In [64]:
def parse_data_into_dataframe(input_dict:dict, *args) -> pd.DataFrame:
    """_summary_ : This function parses the input dictionary into dataframe objects

    Args:
        input_dict (dict): _description_

    Returns:
        pd.DataFrame: _description_
    """   
    processed_data = []
    
    for entry in input_dict:
        row = {
            'title' : entry['title'],
            'published_date' : pd.to_datetime(entry['published']),
            'summary' : entry['summary'],
            'source_name' : entry['source']['title'],
            'source_url' : entry['source']['href'],
            'entry_date' : pd.to_datetime(datetime.now())
        }
        processed_data.append(row)

    return pd.DataFrame(processed_data)

def test_parsing(output, **kwargs) -> None:
    assert isinstance(output, pd.DataFrame), f"output has not been parsed to dataframe. Output is a {type(output)}"
    assert len(output)> 0, f"no entries in output"


In [65]:

clean_data = parse_data_into_dataframe(raw_output)
test_parsing(clean_data)
    

In [66]:
clean_data

Unnamed: 0,title,published_date,summary,source_name,source_url,entry_date
0,En Reino Unido hablan con claridad sobre cómo ...,2025-12-27 11:26:27,"<a href=""https://news.google.com/rss/articles/...",El HuffPost,https://www.huffingtonpost.es,2025-12-30 18:36:18.054053
1,Fallecen dos españoles en un accidente de tráf...,2025-12-30 15:50:19,"<a href=""https://news.google.com/rss/articles/...",EL PAÍS,https://elpais.com,2025-12-30 18:36:18.055379
2,España planea una interconexión con Irlanda de...,2025-12-30 04:00:00,"<a href=""https://news.google.com/rss/articles/...",El Confidencial,https://www.elconfidencial.com,2025-12-30 18:36:18.056082
3,"La ciudad de España con el hallazgo visigodo, ...",2025-12-29 14:47:22,"<a href=""https://news.google.com/rss/articles/...",Viajar,https://viajar.elperiodico.com,2025-12-30 18:36:18.056633
4,El año en el que España se quedó fuera del cen...,2025-12-27 21:59:55,"<a href=""https://news.google.com/rss/articles/...",El Mundo,https://www.elmundo.es,2025-12-30 18:36:18.057763
...,...,...,...,...,...,...
97,Spotify España: Estos son los podcast mas escu...,2025-12-30 15:16:00,"<a href=""https://news.google.com/rss/articles/...",Infobae,https://www.infobae.com,2025-12-30 18:36:18.094716
98,Cuáles son los municipios más baratos para com...,2025-12-30 12:20:50,"<a href=""https://news.google.com/rss/articles/...",EFE - Agencia de noticias,https://efe.com,2025-12-30 18:36:18.095070
99,Juan Eslava Galán recorre la historia del sexo...,2025-12-30 07:45:47,"<a href=""https://news.google.com/rss/articles/...",LaSexta,https://www.lasexta.com,2025-12-30 18:36:18.095413
100,La menor rescatada en el naufragio de Indonesi...,2025-12-30 11:43:45,"<a href=""https://news.google.com/rss/articles/...",ABC,https://www.abc.es,2025-12-30 18:36:18.095740


In [1]:
!jupyter nbconvert --to script notebook.ipynb

[NbConvertApp] Converting notebook notebook.ipynb to script
[NbConvertApp] Writing 2338 bytes to notebook.py


## Transform

Todo: 
> Clean the data  
> Batching to avoid API timeouts  
> Translation model | Huggingface   
> Sentiment model | Huggingface as well  


Result:  
 A rich dataframe