In [93]:
# Import libraries
# GNews documentation: https://pypi.org/project/gnews/
import warnings
from pathlib import Path

import nltk  # NLP library
import pandas as pd
from gnews import GNews
from tqdm.auto import tqdm

# Silence every warning, from every library, for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# parsing warnings raised by pandas and the other libraries used below --
# consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings('ignore')

In [94]:
# Download NLTK's "punkt" sentence tokenizer data. Punkt divides a text into a
# list of sentences by using an unsupervised algorithm to build a model for
# abbreviation words, collocations, and words that start sentences.
# NOTE(review): presumably required by the article.nlp() call in the
# extraction cell further down -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alfonso/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [95]:
# Initializing: create a GNews client with its default settings.
# NOTE(review): the "Settings" cell right below rebinds `googlenews` with a
# date range before any search is run, so this instance is never queried.
googlenews = GNews()

In [96]:
# Settings: restrict the search to a fixed date window, given as
# (year, month, day) tuples.
SEARCH_START_DATE = (2023, 1, 1)
SEARCH_END_DATE = (2024, 5, 17)
googlenews = GNews(start_date=SEARCH_START_DATE, end_date=SEARCH_END_DATE)

In [97]:
# Search: query Google News for the topic and report how many items came back.
# `topic` is reused by the last cell to name the output file.
topic = "Audax Renovables"
searchednews = googlenews.get_news(topic)
# The count is interpolated into the f-string (the original f-string had no
# placeholder); the printed text is unchanged.
print(f"Articles found: {len(searchednews)}")

Articles found: 100


In [98]:
# Results: print one raw search record to inspect the fields it carries.
# The position is guarded so that a search returning fewer items does not
# abort the notebook with an IndexError.
SAMPLE_POSITION = 1
if len(searchednews) > SAMPLE_POSITION:
    print(searchednews[SAMPLE_POSITION])
else:
    print(f"No article at position {SAMPLE_POSITION}: only {len(searchednews)} found")

{'title': "Spain's Audax reports shining fin results for 2023, profit up 304% - Renewables Now", 'description': "Spain's Audax reports shining fin results for 2023, profit up 304%  Renewables Now", 'published date': 'Fri, 08 Mar 2024 08:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMiZmh0dHBzOi8vcmVuZXdhYmxlc25vdy5jb20vbmV3cy9zcGFpbnMtYXVkYXgtcmVwb3J0cy1zaGluaW5nLWZpbi1yZXN1bHRzLWZvci0yMDIzLXByb2ZpdC11cC0zMDQtODUwOTc4L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://renewablesnow.com', 'title': 'Renewables Now'}}


In [99]:
# Load the raw search results (one dict per article) into a DataFrame and
# display its last 20 rows.
df = pd.DataFrame(data=searchednews)
df.tail(n=20)

Unnamed: 0,title,description,published date,url,publisher
80,“Europa tiene todavía mucho potencial por expl...,“Europa tiene todavía mucho potencial por expl...,"Thu, 25 May 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMijAFod...,{'href': 'https://www.estrategiasdeinversion.c...
81,"Quién es José Elías, el multimillonario españo...","Quién es José Elías, el multimillonario españo...","Thu, 23 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiNGh0d...,"{'href': 'https://hipertextual.com', 'title': ..."
82,Audax Renovables bate todos los registros: gan...,Audax Renovables bate todos los registros: gan...,"Wed, 15 May 2024 10:29:04 GMT",https://news.google.com/rss/articles/CBMigQFod...,{'href': 'https://cronicaglobal.elespanol.com'...
83,Audax Solar sortea las alegaciones de Villena ...,Audax Solar sortea las alegaciones de Villena ...,"Tue, 09 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQmh0d...,"{'href': 'https://alicanteplaza.es', 'title': ..."
84,"Audax duplica su EBITDA a septiembre de 2023, ...","Audax duplica su EBITDA a septiembre de 2023, ...","Mon, 13 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMihwFod...,{'href': 'https://www.estrategiasdeinversion.c...
85,Audax acelera para firmar un socio estratégico...,Audax acelera para firmar un socio estratégico...,"Thu, 14 Mar 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMibGh0d...,"{'href': 'https://www.lainformacion.com', 'tit..."
86,Audax Renovables ataca resistencias clave del ...,Audax Renovables ataca resistencias clave del ...,"Fri, 15 Mar 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMifWh0d...,"{'href': 'https://www.bolsamania.com', 'title'..."
87,Rechazada planta de 700 MW de Audax Energy en ...,Rechazada planta de 700 MW de Audax Energy en ...,"Fri, 10 Feb 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiaWh0d...,"{'href': 'https://www.pv-magazine.es', 'title'..."
88,Noticias del Mercado Continuo Audax Renovables...,Noticias del Mercado Continuo Audax Renovables...,"Fri, 12 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiggFod...,"{'href': 'https://consensodelmercado.com', 'ti..."
89,Audax sale de pérdidas y dispara un 56% sus in...,Audax sale de pérdidas y dispara un 56% sus in...,"Mon, 27 Feb 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.eleconomista.es', 'title..."


In [100]:
# Expand the 'publisher' dicts into their own columns ('href' and 'title') and
# put them next to the remaining article columns. The result therefore holds
# two columns labelled 'title': the article title and, last, the publisher name.
publisher_columns = df['publisher'].apply(pd.Series)
article_columns = df.drop(columns=['publisher'])
df = pd.concat([article_columns, publisher_columns], axis=1)
df.head(n=5)

Unnamed: 0,title,description,published date,url,href,title.1
0,A Piece Of The Puzzle Missing From Audax Renov...,A Piece Of The Puzzle Missing From Audax Renov...,"Tue, 02 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiiwFod...,https://simplywall.st,Simply Wall St
1,Spain's Audax reports shining fin results for ...,Spain's Audax reports shining fin results for ...,"Fri, 08 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiZmh0d...,https://renewablesnow.com,Renewables Now
2,Despite delivering investors losses of 50% ove...,Despite delivering investors losses of 50% ove...,"Thu, 25 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMijQFod...,https://simplywall.st,Simply Wall St
3,Citadel honcho boosts stake in Spain's Audax t...,Citadel honcho boosts stake in Spain's Audax t...,"Mon, 18 Mar 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,https://renewablesnow.com,Renewables Now
4,Insiders were the biggest winners as Audax Ren...,Insiders were the biggest winners as Audax Ren...,"Fri, 19 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiiwFod...,https://simplywall.st,Simply Wall St


In [101]:
# Cleaning dataframe: keep only the publication date, the publisher name
# ('media') and the article URL.
#
# The publisher name is read from the raw search results instead of from the
# duplicated 'title' column, so the cell neither depends on column order nor
# fails when it is re-run after `df` has already been trimmed. The labels of
# df.index are positions in `searchednews` (df was built from that list).
media_names = [searchednews[position]['publisher'].get('title') for position in df.index]
df = df.assign(media=media_names)[['published date', 'media', 'url']]
df.head(20)

Unnamed: 0,published date,media,url
0,"Tue, 02 Apr 2024 07:00:00 GMT",Simply Wall St,https://news.google.com/rss/articles/CBMiiwFod...
1,"Fri, 08 Mar 2024 08:00:00 GMT",Renewables Now,https://news.google.com/rss/articles/CBMiZmh0d...
2,"Thu, 25 Jan 2024 08:00:00 GMT",Simply Wall St,https://news.google.com/rss/articles/CBMijQFod...
3,"Mon, 18 Mar 2024 07:00:00 GMT",Renewables Now,https://news.google.com/rss/articles/CBMiXGh0d...
4,"Fri, 19 Apr 2024 07:00:00 GMT",Simply Wall St,https://news.google.com/rss/articles/CBMiiwFod...
5,"Thu, 21 Dec 2023 08:00:00 GMT",Renewables Now,https://news.google.com/rss/articles/CBMiYGh0d...
6,"Thu, 09 May 2024 05:35:23 GMT",Simply Wall St,https://news.google.com/rss/articles/CBMiiwFod...
7,"Wed, 03 Apr 2024 07:00:00 GMT",Renewables Now,https://news.google.com/rss/articles/CBMiZWh0d...
8,"Wed, 28 Feb 2024 08:00:00 GMT",Yahoo,https://news.google.com/rss/articles/CBMiWmh0d...
9,"Thu, 18 Jan 2024 08:00:00 GMT",elperiodicodelaenergia.com,https://news.google.com/rss/articles/CBMic2h0d...


In [63]:
# Get the full text of every article and drop the links that give problems.
#
# Reads:  df (columns 'published date' and 'media'), searchednews, googlenews.
# Writes: list -> one record per retrieved article with the keys
#                 Date, Media, Title, Article, Summary;
#         df   -> rows whose article could not be retrieved are removed.

# NOTE(review): the name `list` shadows the builtin; it is kept only because
# the next cell builds the news DataFrame from it.
list = []
failed = {}  # df index label -> short description of why the fetch failed

for ind in tqdm(df.index, colour="green", desc="Extracting articles from the internet"):
    try:
        # get_full_article() downloads and parses the page itself and returns
        # None (after printing the error) when that fails, so the article is
        # not downloaded and parsed a second time here.
        article = googlenews.get_full_article(searchednews[ind]['url'])
        if article is None:
            failed[ind] = "article could not be fetched"
            continue
        article.nlp()  # computes article.summary
        record = {
            'Date': df.loc[ind, 'published date'],
            'Media': df.loc[ind, 'media'],
            'Title': article.title,
            'Article': article.text,
            'Summary': article.summary,
        }
    except Exception as error:
        # Best effort: one broken link must not stop the whole extraction,
        # but the reason is recorded instead of being silently discarded.
        failed[ind] = repr(error)
    else:
        list.append(record)

# Drop the failed rows once, after the loop, instead of mutating df in place
# while its index is being iterated.
df = df.drop(index=[*failed])
print(f"Articles retrieved: {len(list)} | failed: {len(failed)}")



Extracting articles from the internet:   2%|[32m▏         [0m| 2/100 [00:01<01:24,  1.16it/s]

An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://renewablesnow.com/news/spains-audax-reports-shining-fin-results-for-2023-profit-up-304-850978/ on URL https://news.google.com/rss/articles/CBMiZmh0dHBzOi8vcmVuZXdhYmxlc25vdy5jb20vbmV3cy9zcGFpbnMtYXVkYXgtcmVwb3J0cy1zaGluaW5nLWZpbi1yZXN1bHRzLWZvci0yMDIzLXByb2ZpdC11cC0zMDQtODUwOTc4L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en


Extracting articles from the internet:   4%|[32m▍         [0m| 4/100 [00:03<01:09,  1.38it/s]

An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://renewablesnow.com/news/citadel-honcho-boosts-stake-in-spains-audax-to-over-3-851944/ on URL https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vcmVuZXdhYmxlc25vdy5jb20vbmV3cy9jaXRhZGVsLWhvbmNoby1ib29zdHMtc3Rha2UtaW4tc3BhaW5zLWF1ZGF4LXRvLW92ZXItMy04NTE5NDQv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en


Extracting articles from the internet:   8%|[32m▊         [0m| 8/100 [00:08<01:38,  1.07s/it]

An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://renewablesnow.com/news/audax-bags-public-contract-for-renewable-power-supply-in-spain-853745/ on URL https://news.google.com/rss/articles/CBMiZWh0dHBzOi8vcmVuZXdhYmxlc25vdy5jb20vbmV3cy9hdWRheC1iYWdzLXB1YmxpYy1jb250cmFjdC1mb3ItcmVuZXdhYmxlLXBvd2VyLXN1cHBseS1pbi1zcGFpbi04NTM3NDUv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en


Extracting articles from the internet: 100%|[32m██████████[0m| 100/100 [02:17<00:00,  1.37s/it]


In [90]:
# Create the news dataframe with full articles.
# `list` is the list of article records built by the extraction cell (not the
# builtin). The columns are given explicitly so the frame keeps the same
# schema even when no article could be retrieved; otherwise the next cell
# fails with KeyError: 'Date' on an empty, column-less frame.
ARTICLE_COLUMNS = ['Date', 'Media', 'Title', 'Article', 'Summary']
news_df = pd.DataFrame(list, columns=ARTICLE_COLUMNS)
news_df.head(20)

Unnamed: 0,Date,Media,Title,Article,Summary
0,"Tue, 02 Apr 2024 07:00:00 GMT",Simply Wall St,A Piece Of The Puzzle Missing From Audax Renov...,"Audax Renovables, S.A. ( ) shareholders would ...","Audax Renovables, S.A. ( ) shareholders would ..."
1,"Thu, 25 Jan 2024 08:00:00 GMT",Simply Wall St,Despite delivering investors losses of 50% ove...,"Audax Renovables, S.A. ( ) shareholders should...","Audax Renovables, S.A. ( ) shareholders should..."
2,"Fri, 19 Apr 2024 07:00:00 GMT",Simply Wall St,Insiders were the biggest winners as Audax Ren...,Key Insights\n\nInsiders appear to have a vest...,Let's delve deeper into each type of owner of ...
3,"Thu, 21 Dec 2023 08:00:00 GMT",Renewables Now,Spain's Audax starts building 57.5-MW solar fa...,Spanish renewable power and gas supplier Audax...,Spanish renewable power and gas supplier Audax...
4,"Thu, 09 May 2024 05:35:23 GMT",Simply Wall St,Audax Renovables' (BME:ADX) Returns On Capital...,"If we want to find a potential multi-bagger, o...","Firstly, we'd want to identify a growing retur..."
5,"Wed, 28 Feb 2024 08:00:00 GMT",Yahoo,"Audax aumenta su EBITDA un 78%, superando los ...",,
6,"Thu, 18 Jan 2024 08:00:00 GMT",elperiodicodelaenergia.com,El mundo agregará energías renovables en cinco...,,
7,"Mon, 13 Nov 2023 08:00:00 GMT",Cinco Días,Audax Renovables se dispara un 9% en Bolsa tra...,,
8,"Thu, 18 Jan 2024 08:00:00 GMT",Yahoo,"Audax reduce su deuda bruta en 41,4 millones d...",,
9,"Fri, 15 Mar 2024 07:00:00 GMT",La Vanguardia,El multimillonario Kenneth Griffin se refuerza...,,


In [91]:
# Convert the 'Date' column to datetime, sort the articles chronologically,
# reset the index and finally render the dates as dd/mm/YYYY strings.
DISPLAY_DATE_FORMAT = '%d/%m/%Y'

# dayfirst=True does not affect the original "Tue, 02 Apr 2024 07:00:00 GMT"
# strings, but it makes re-running this cell safe: the dd/mm/YYYY strings it
# produces are read back day-first instead of being silently re-interpreted
# as mm/dd/YYYY.
parsed_dates = pd.to_datetime(news_df['Date'], dayfirst=True)

# A stable sort keeps same-date articles in their current order, so repeated
# runs give the same result.
news_df = (
    news_df.assign(Date=parsed_dates)
    .sort_values(by='Date', kind='mergesort')
    .reset_index(drop=True)
)

# Change column date format
news_df['Date'] = news_df['Date'].dt.strftime(DISPLAY_DATE_FORMAT)
news_df

Unnamed: 0,Date,Media,Title,Article,Summary
0,01/02/2023,Hispanidad.com,"Audax Renovables y su principal accionista, Jo...",,
1,10/02/2023,pv magazine España,Rechazada planta de 700 MW de Audax Energy en ...,The cookie settings on this website are set to...,The cookie settings on this website are set to...
2,22/02/2023,Forbes España,Audax Renovables suma 231 MWp con declaración ...,,
3,27/02/2023,elperiodicodelaenergia.com,"Audax gana 7,8 millones de euros en 2022",,
4,27/02/2023,Expansión,"Audax deja atrás los números rojos y gana 7,8 ...",,
...,...,...,...,...,...
90,15/05/2024,elEconomista,Audax Renovables gana 17 millones en un primer...,,
91,15/05/2024,La Vanguardia,Audax multiplica por siete el beneficio en un ...,,
92,15/05/2024,El Periódico,Audax Renovables multiplica su beneficio hasta...,,
93,15/05/2024,Crónica Global,Audax Renovables bate todos los registros: gan...,,


In [92]:
# Save in a document: write Date, Media and Title to ./output/<topic>.csv,
# with the spaces in the topic replaced by underscores.
topic = topic.replace(" ", "_")
output_dir = Path("./output")
# to_csv() does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)
news_df[['Date','Media','Title']].to_csv(output_dir / f"{topic}.csv")