In [30]:
# Import necessary libraries
from GoogleNews import GoogleNews
import pandas as pd
import requests
from fake_useragent import UserAgent
import newspaper
from newspaper import fulltext
import re

In [31]:
# Define the keyword to search.
keyword = 'discapacidad'

In [37]:
# Perform news scraping from Google and extract the result into Pandas dataframe. 
googlenews = GoogleNews(lang='es', region='AR', period='1m', encode='utf-8')
googlenews.clear()
googlenews.search(keyword)
googlenews.get_page(1)
news_result = googlenews.result(sort=True)
news_data_df = pd.DataFrame.from_dict(news_result)

In [38]:
# Display information of dataframe.
news_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     20 non-null     object 
 1   media     20 non-null     object 
 2   date      20 non-null     object 
 3   datetime  0 non-null      float64
 4   desc      20 non-null     object 
 5   link      20 non-null     object 
 6   img       20 non-null     object 
dtypes: float64(1), object(6)
memory usage: 1.2+ KB


In [40]:
# Display header of dataframe.
news_data_df.head()
news_data_df['link']

0     https://www.pagina12.com.ar/857960-un-fallo-en...
1     https://radiomarcaab.com/202509/se-celebra-el-...
2     https://www.elespanol.com/castilla-y-leon/econ...
3     https://www.adnsur.com.ar/sociedad/el-gobierno...
4     https://www.diariodebatepregon.com/argentina-y...
5     https://www.rionegro.com.ar/sociedad/freno-al-...
6     https://www.eldiario.es/extremadura/caceres/in...
7     https://www.mendozapost.com/politica/ley-de-di...
8     https://obera.gob.ar/gran-participacion-de-adu...
9     https://www.politicargentina.com/notas/202509/...
10    https://www.pagina12.com.ar/857960-un-fallo-en...
11    https://radiomarcaab.com/202509/se-celebra-el-...
12    https://www.elespanol.com/castilla-y-leon/econ...
13    https://www.adnsur.com.ar/sociedad/el-gobierno...
14    https://www.diariodebatepregon.com/argentina-y...
15    https://www.rionegro.com.ar/sociedad/freno-al-...
16    https://www.eldiario.es/extremadura/caceres/in...
17    https://www.mendozapost.com/politica/ley-d

In [46]:
def clean_google_news_url(url):
    # Remove everything after the first occurrence of '.html' or trailing slash
    if '.html' in url:
        url = url.split('.html')[0] + '.html'
    elif '/' in url:
        url = url.split('&')[0]  # Remove Google tracking parameters
    return url

In [50]:
from newspaper import Article
ua = UserAgent()
news_data_df_with_text = []

for index, headers in news_data_df.iterrows():
    news_title = str(headers['title'])
    news_media = str(headers['media'])
    news_update = str(headers['date'])
    news_timestamp = str(headers['datetime'])
    news_description = str(headers['desc'])
    news_link = str(headers['link'])
    news_img = str(headers['img'])

    news_link = clean_google_news_url(news_link)

    print(news_link)

    text = ""  # initialize

    # First try Newspaper (more robust)
    try:
        article = Article(news_link, browser_user_agent=ua.chrome)
        article.download()
        article.parse()
        text = article.text
        print('Text Content via newspaper3k')
    except Exception as e:
        print(f'Newspaper extraction failed: {e}')
        # fallback to requests + fulltext
        try:
            html = requests.get(news_link, headers={'User-Agent': ua.chrome}, timeout=5).text
            text = fulltext(html)
            print('Text Content via fulltext fallback')
        except Exception as e2:
            print(f'Fallback fulltext failed: {e2}')
            # text remains empty

    news_data_df_with_text.append([
        news_title, news_media, news_update, news_timestamp,
        news_description, news_link, news_img, text
    ])

news_data_with_text_df = pd.DataFrame(
    news_data_df_with_text,
    columns=['Title', 'Media', 'Update', 'Timestamp',
             'Description', 'Link', 'Image', 'Text']
)


https://www.pagina12.com.ar/857960-un-fallo-en-catamarca-ordena-a-la-andis-que-restituya-pensio
Text Content via newspaper3k
https://radiomarcaab.com/202509/se-celebra-el-acto-protocolario-de-entrega-del-galardon-conmemorativo-a-los-ganadores-de-la-1a-edicion-ayudas-fundacion-soliss/
Text Content via newspaper3k
https://www.elespanol.com/castilla-y-leon/economia/20250915/fundacion-caja-rural-financiara-proyectos-fomenten-empleo-personas-discapacidad/1003743925634_0.html
Text Content via newspaper3k
https://www.adnsur.com.ar/sociedad/el-gobierno-promulgara-la-ley-de-emergencia-en-discapacidad-pero-postergara-su-reglamentacion-por-un-drastico-motivo_a68c8206a11916dde349468ed
Text Content via newspaper3k
https://www.diariodebatepregon.com/argentina-y-el-mundo/el-gobierno-promulgara-la-ley-de-discapacidad-pero-retrasa-su-aplicacion-por-financiamiento
Text Content via newspaper3k
https://www.rionegro.com.ar/sociedad/freno-al-recorte-de-pensiones-con-un-amparo-intiman-al-gobierno-a-restituir

In [51]:
# Display the entire dataframe for sample checking.
news_data_with_text_df

Unnamed: 0,Title,Media,Update,Timestamp,Description,Link,Image,Text
0,Un fallo en Catamarca ordena a la ANDIS que re...,Página | 12,hace 7 minutos,,El Juzgado Federal Nº 2 de Catamarca hizo luga...,https://www.pagina12.com.ar/857960-un-fallo-en...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",El Juzgado Federal Nº 2 de Catamarca hizo luga...
1,SE CELEBRA EL ACTO PROTOCOLARIO DE ENTREGA DEL...,Radio Marca Albacete,hace 8 minutos,,La Fundación Soliss entrega los galardones a l...,https://radiomarcaab.com/202509/se-celebra-el-...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",La Fundación Soliss entrega los galardones a l...
2,Fundación Caja Rural financiará diez proyectos...,El Español,hace 11 minutos,,El programa Workin ha impulsado desde su creac...,https://www.elespanol.com/castilla-y-leon/econ...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",La Fundación Eurocaja Rural ha puesto en march...
3,El Gobierno promulgará la ley de emergencia en...,ADNSUR,hace 16 minutos,,Después de que el Congreso derribara el veto d...,https://www.adnsur.com.ar/sociedad/el-gobierno...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...","Tras la aprobación contundente en el Congreso,..."
4,El Gobierno promulgará la ley de discapacidad ...,Diario El Debate Pregón,hace 17 minutos,,El Gobierno nacional confirmó que promulgara l...,https://www.diariodebatepregon.com/argentina-y...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...","La iniciativa, impulsada por la oposición y co..."
5,Freno al recorte de pensiones: con un amparo i...,Diario Río Negro,hace 18 minutos,,Una acción de amparo fue presentada en la Just...,https://www.rionegro.com.ar/sociedad/freno-al-...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",Una acción de amparo fue presentada en la Just...
6,Ingresa en prisión un hombre de 69 años por ag...,elDiario.es,hace 22 minutos,,"Había sido detenido, en otra ocasión, por la p...",https://www.eldiario.es/extremadura/caceres/in...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...","Un hombre de 69 años, al que le constan antece..."
7,Francos confirmó que el Gobierno promulgará la...,Mendoza Post,hace 27 minutos,,El Gobierno nacional confirmó que promulgará l...,https://www.mendozapost.com/politica/ley-de-di...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",El Gobierno nacional confirmó que promulgará l...
8,Gran participación de adultos mayores y person...,Gobierno de Oberá,hace 30 minutos,,Durante la prueba atlética en el marco de la F...,https://obera.gob.ar/gran-participacion-de-adu...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",Durante la prueba atlética en el marco de la F...
9,Corrupción en la Agencia de Discapacidad: denu...,Política Argentina,hace 31 minutos,,La presentación la hizo el abogado Gregorio Da...,https://www.politicargentina.com/notas/202509/...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",15.09.2025 / COIMAS EN ANDIS\n\nCorrupción en ...


In [52]:
# Drops missiong or empty text
filtered_df = news_data_with_text_df[
    news_data_with_text_df['Text'].notna() &
    (news_data_with_text_df['Text'].str.strip() != "")
]


In [53]:
# Save the result dataframe into a CSV file.
filtered_df.to_csv("./data/news_data_with_text.csv")

In [54]:
# Reload the saved news data content from a CSV file.
news_data_with_text_df = pd.read_csv("./data/news_data_with_text.csv",  index_col=0)

In [57]:
from transformers import pipeline
import torch
summarizer = pipeline("summarization")

def summarize_article(text):
    summary = summarizer(text, max_length=100, min_length=30, do_sample=False)
    return summary[0]['summary_text']

news_data_df_with_text['Summary'] = news_data_df_with_text['Text'].apply(summarize_article)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


NameError: name 'torch' is not defined

In [None]:
# Select the top 10 news records (the latest 10) for summarization.
#news_data_with_text_df = news_data_with_text_df.head(10)

In [None]:
# Display the scraped text content from the first news record.
#print(news_data_with_text_df['Text'].values[0])

In [None]:
# Count the number of words in the first news record.
#len(re. findall(r'\w+', news_data_with_text_df['Text'].values[0]))

In [None]:
# Concatenate all the top 10 news record's text content into a single string.
#news_text_content_string = news_data_with_text_df.to_string(columns=['Text'], header=False, index=False)
#print(news_text_content_string)

In [None]:
# Count the number of words in the top 10 news record's text content.
#len(re. findall(r'\w+', news_text_content_string))

In [None]:
# Import necessary libraries
# import openai
# import platform
# import os

# import nltk
# from nltk.tokenize import word_tokenize

In [None]:
# # Define the function to count the number of tokens.
# def count_tokens(stringname):
#     tokens = word_tokenize(stringname)
#     return len(tokens)

In [None]:
# Display the number of tokens from the top 10 news record's text content.
# stringname = news_text_content_string

# token_count = count_tokens(stringname)
# print(f"Number of tokens: {token_count}")

In [None]:
# Define the function to divide the top 10 news record's text content into segments of 2000 tokens 
# with the overlap of 100 tokens to avoid losing information from the split.
# def break_up_file(tokens, chunk_size, overlap_size):
#     if len(tokens) <= chunk_size:
#         yield tokens
#     else:
#         chunk = tokens[:chunk_size]
#         yield chunk
#         yield from break_up_file(tokens[chunk_size-overlap_size:], chunk_size, overlap_size)

# def break_up_file_to_chunks(stringname, chunk_size=2000, overlap_size=100):
#     tokens = word_tokenize(stringname)
#     return list(break_up_file(tokens, chunk_size, overlap_size))

In [None]:
# # Trial run of dividing function.
# stringname = news_text_content_string

# chunks = break_up_file_to_chunks(stringname)
# for i, chunk in enumerate(chunks):
#     print(f"Chunk {i}: {len(chunk)} tokens")

In [None]:
# Define the function to convert tokenized text back to normal text prompts.
# def convert_to_detokenized_text(tokenized_text):
#     prompt_text = " ".join(tokenized_text)
#     prompt_text = prompt_text.replace(" 's", "'s")
#     return prompt_text

In [None]:
# Configure the baseline configuration of the OpenAI library.
# openai.api_type = "azure"
# openai.api_base = "https://PLESAE_ENTER_YOUR_OWNED_AOAI_RESOURCE_NAME.openai.azure.com/"
# openai.api_version = "2022-12-01"
# openai.api_key = "PLEASE_ENTER_YOUR_OWNED_AOAI_SERVICE_KEY"

In [None]:
# Perform news text content summarization by Azure OpenAI Service (GPT3) for each chunk.
# stringname = news_text_content_string

# prompt_response = []
# chunks = break_up_file_to_chunks(stringname)

# for i, chunk in enumerate(chunks):
#     print("Processing chunk " + str(i))
#     prompt_request = "Summarize this news content: " + convert_to_detokenized_text(chunks[i])
#     response = openai.Completion.create(
#             engine="eason-text-davinci-002",
#             prompt=prompt_request,
#             temperature=.5, # Default is 1.
#             max_tokens=500,
#             top_p=1 # Default is 0.5.
#     )
    
#     prompt_response.append(response["choices"][0]["text"].strip())

# from openai import OpenAI

# # Initialize client with API key from environment variable OPENAI_API_KEY
# client = OpenAI()

# stringname = news_text_content_string
# prompt_response = []
# chunks = break_up_file_to_chunks(stringname)

# for i, chunk in enumerate(chunks):
#     print("Processing chunk " + str(i))
#     prompt_request = "Summarize this news content: " + convert_to_detokenized_text(chunk)

#     response = client.chat.completions.create(
#         model="gpt-4o-mini",  # or "gpt-3.5-turbo" if you prefer
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant that summarizes news content."},
#             {"role": "user", "content": prompt_request}
#         ],
#         temperature=0.5,
#         max_tokens=500,
#         top_p=1
#     )

#     prompt_response.append(response.choices[0].message.content.strip())

# # Join all summaries into one string if needed
# final_summary = "\n\n".join(prompt_response)
# print(final_summary)


In [None]:
# Define the prompt to perform summarization into 1,500 words for each summarized content.
#prompt_request = "Consolidate these news content summaries into 1500 words sentences: " + str(prompt_response)

In [None]:
# Perform summarization by Azure OpenAI Service (GPT3) for each chunk of summarized content.
# response = openai.Completion.create(
#         engine="PLEASE_ENTER_YOUR_AOAI_MODEL_DEPLOYMENT_NAME",
#         prompt=prompt_request,
#         temperature=.5, # Default is 1.
#         max_tokens=1000,
#         top_p=1 # Default is 0.5.
#     )

In [None]:
# Display the final summary from the top 10 news record's text content.
# news_content_summary = response["choices"][0]["text"].strip()
# print(news_content_summary)

In [None]:
# Select the necessary columns and convert the dataframe into HTML for showing in the email.
# news_data_with_text_df_1 = news_data_with_text_df[["Title", "Media", "Timestamp", "Description", "Link"]]
# news_data_with_text_df_1_html = news_data_with_text_df_1.to_html(index=False)

In [None]:
# Email the GPT news summary with the news source reference table via MailJet. 
from mailjet_rest import Client
import os
api_key = 'PLEASE_ENTER_YOUR_OWNED_MAILJET_API_KEY_NAME'
api_secret = 'PLEASE_ENTER_YOUR_OWNED_MAILJET_API_KEY_SECRET'
mailjet = Client(auth=(api_key, api_secret), version='v3.1')
data = {
  'Messages': [
    {
      "From": {
        "Email": "easonlai888@gmail.com",
        "Name": "Eason"
      },
      "To": [
        {
          "Email": "easonlai888@gmail.com",
          "Name": "Eason"
        }
      ],
      "Subject": "GPT News Summary of Today",
      "HTMLPart": "<h3>Here is the news summary of GPT for today.</h3>{}<br><br> \
                   <h3>GPT News Summary Sources</h3>{}" \
                   .format(news_content_summary, news_data_with_text_df_1_html),
    }
  ]
}
result = mailjet.send.create(data=data)

print(result.status_code)
print(result.json())
