In [12]:
# Import libraries
# GoogleNews Documentation : https://pypi.org/project/gnews/
import warnings
from datetime import date
from pathlib import Path

import nltk  # NLP library
import pandas as pd
from gnews import GNews

# Silence every warning for the rest of the session: no category, message or
# module filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides pandas warnings raised by later cells —
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [13]:
# Download NLTK's 'punkt' tokenizer data.
# This tokenizer divides a text into a list of sentences by using an
# unsupervised algorithm to build a model for abbreviation words,
# collocations, and words that start sentences.
# NOTE(review): no visible cell uses nltk after this download — confirm it is
# still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alfonso/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Initializing
# Create a GNews client with the library's default settings.
# NOTE(review): `googlenews` is reassigned with a date-filtered client in the
# following "Settings" cell, so this default instance looks redundant —
# confirm before removing.
googlenews = GNews()

In [19]:
# Settings: limit the search window to 1 January 2023 through today.
today = date.today()
googlenews = GNews(
    start_date=(2023, 1, 1),
    # First three fields of the time tuple are (year, month, day).
    end_date=today.timetuple()[:3],
)

In [20]:
# Search
# Query Google News for `topic` and report how many articles came back.
topic = "Gold"
searchednews = googlenews.get_news(topic)
# The count is interpolated into the f-string; the original f-string had no
# placeholder and passed the count as a second print argument.
print(f"Articles found: {len(searchednews)}")

Articles found: 97


In [21]:
# Results
# Preview one raw article record (the second hit, index 1).
# The lookup is guarded so that a search returning fewer than two articles
# prints a notice instead of raising IndexError.
if len(searchednews) > 1:
    print(searchednews[1])
else:
    print(f"Only {len(searchednews)} article(s) found; no record at index 1 to preview.")

{'title': "UNMC's Jeffrey Gold, priority NU presidential candidate, seeks to 'pay it forward' • Nebraska Examiner - Nebraska Examiner", 'description': "UNMC's Jeffrey Gold, priority NU presidential candidate, seeks to 'pay it forward' • Nebraska Examiner  Nebraska Examiner", 'published date': 'Wed, 20 Mar 2024 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMidmh0dHBzOi8vbmVicmFza2FleGFtaW5lci5jb20vMjAyNC8wMy8yMC91bm1jcy1qZWZmcmV5LWdvbGQtcHJpb3JpdHktbnUtcHJlc2lkZW50aWFsLWNhbmRpZGF0ZS1zZWVrcy10by1wYXktaXQtZm9yd2FyZC_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://nebraskaexaminer.com', 'title': 'Nebraska Examiner'}}


In [22]:
# Convert to DataFrame: build a frame from the list of article dicts and show
# its last 20 rows.
df = pd.DataFrame(data=searchednews)
df.iloc[-20:]

Unnamed: 0,title,description,published date,url,publisher
77,Sen. Bob Menendez says gold bars and cash at h...,Sen. Bob Menendez says gold bars and cash at h...,"Tue, 23 Jan 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiZGh0d...,"{'href': 'https://apnews.com', 'title': 'The A..."
78,Introducing: The Rolex Deepsea Goes Full Gold ...,Introducing: The Rolex Deepsea Goes Full Gold ...,"Tue, 09 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://www.hodinkee.com', 'title': ..."
79,Gold prices remain high: Other precious metals...,Gold prices remain high: Other precious metals...,"Thu, 13 Jun 2024 16:43:53 GMT",https://news.google.com/rss/articles/CBMiXmh0d...,"{'href': 'https://www.cbsnews.com', 'title': '..."
80,What is the price of gold today? - CBS News,What is the price of gold today? CBS News,"Wed, 04 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiPWh0d...,"{'href': 'https://www.cbsnews.com', 'title': '..."
81,There’s a gold rush on Wall Street - CNN,There’s a gold rush on Wall Street CNN,"Tue, 05 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiR2h0d...,"{'href': 'https://www.cnn.com', 'title': 'CNN'}"
82,Gold rallies to new records on worldwide tensi...,Gold rallies to new records on worldwide tensi...,"Mon, 08 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.axios.com', 'title': 'Ax..."
83,Goldhub Gold Demand Trends Q2 2023 - World Gol...,Goldhub Gold Demand Trends Q2 2023 World Gold...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiU2h0d...,"{'href': 'https://www.gold.org', 'title': 'Wor..."
84,"Gold has broken through the $2,300 level, and ...","Gold has broken through the $2,300 level, and ...","Fri, 05 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiYWh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}"
85,Gold has never been this expensive - CNN,Gold has never been this expensive CNN,"Mon, 04 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiUWh0d...,"{'href': 'https://www.cnn.com', 'title': 'CNN'}"
86,"Gold reaches record high today near $2,100 per...","Gold reaches record high today near $2,100 per...","Mon, 04 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiRWh0d...,"{'href': 'https://www.cbsnews.com', 'title': '..."


In [23]:
# Breaking publisher column
# Copy the article headline to 'Title', then expand each publisher dict into
# its own columns and put them in place of the 'publisher' column.
# Note: the expanded publisher 'title' shares its name with the article
# 'title' column, so the frame ends up with two 'title' columns. The cleaning
# cell that follows relies on this (it reads the last 'title' column as the
# media name), so the duplicate is kept as-is here.
df['Title'] = df['title']
publisher_parts = df['publisher'].apply(pd.Series)
df = pd.concat([df.drop(columns=['publisher']), publisher_parts], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,Title,href,title.1
0,Gold on track for weekly rise as Middle East r...,Gold on track for weekly rise as Middle East r...,"Fri, 19 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMigAFod...,Gold on track for weekly rise as Middle East r...,https://www.reuters.com,Reuters
1,"UNMC's Jeffrey Gold, priority NU presidential ...","UNMC's Jeffrey Gold, priority NU presidential ...","Wed, 20 Mar 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMidmh0d...,"UNMC's Jeffrey Gold, priority NU presidential ...",https://nebraskaexaminer.com,Nebraska Examiner
2,Why Isn't The Gold Price Doing What It's Suppo...,Why Isn't The Gold Price Doing What It's Suppo...,"Wed, 26 Apr 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiZ2h0d...,Why Isn't The Gold Price Doing What It's Suppo...,https://www.forbes.com,Forbes
3,Is it a golden era for gold? - JP Morgan Priva...,Is it a golden era for gold? JP Morgan Privat...,"Thu, 21 Mar 2024 05:01:03 GMT",https://news.google.com/rss/articles/CBMiYmh0d...,Is it a golden era for gold? - JP Morgan Priva...,https://privatebank.jpmorgan.com,JP Morgan Private Bank
4,Solid Gold - brooklynmuseum.org,Solid Gold brooklynmuseum.org,"Thu, 13 Jun 2024 15:20:03 GMT",https://news.google.com/rss/articles/CBMiNWh0d...,Solid Gold - brooklynmuseum.org,https://www.brooklynmuseum.org,brooklynmuseum.org


In [24]:
# Cleaning dataframe
# 'title' appears twice in `df` at this point, so df['title'] is a two-column
# frame; its last column is taken as the media name.
# NOTE(review): this relies on the publisher columns having been appended
# after the article columns by the previous cell — confirm if that cell changes.
df['Media'] = df['title'].iloc[:, -1]

# Build the reporting frame without touching df['url']. The original rewrote
# that column in place, so re-running the cell nested the formula
# (=HYPERLINK("=HYPERLINK(...)")). `.assign` also returns an independent
# frame, so later cells can add columns to `news_df` without writing into a
# slice of `df`.
news_df = (
    df[['published date', 'Media', 'Title', 'url']]
    .assign(url=lambda frame: '=HYPERLINK("' + frame['url'] + '")')
)
news_df.head(20)

Unnamed: 0,published date,Media,Title,url
0,"Fri, 19 Apr 2024 07:00:00 GMT",Reuters,Gold on track for weekly rise as Middle East r...,"=HYPERLINK(""https://news.google.com/rss/articl..."
1,"Wed, 20 Mar 2024 07:00:00 GMT",Nebraska Examiner,"UNMC's Jeffrey Gold, priority NU presidential ...","=HYPERLINK(""https://news.google.com/rss/articl..."
2,"Wed, 26 Apr 2023 07:00:00 GMT",Forbes,Why Isn't The Gold Price Doing What It's Suppo...,"=HYPERLINK(""https://news.google.com/rss/articl..."
3,"Thu, 21 Mar 2024 05:01:03 GMT",JP Morgan Private Bank,Is it a golden era for gold? - JP Morgan Priva...,"=HYPERLINK(""https://news.google.com/rss/articl..."
4,"Thu, 13 Jun 2024 15:20:03 GMT",brooklynmuseum.org,Solid Gold - brooklynmuseum.org,"=HYPERLINK(""https://news.google.com/rss/articl..."
5,"Mon, 20 May 2024 07:00:00 GMT",CBS News,Gold prices hit a new record high: 5 moves to ...,"=HYPERLINK(""https://news.google.com/rss/articl..."
6,"Wed, 28 Jun 2023 07:00:00 GMT",Bloomberg,Gold Is No Longer a Good Hedge Against Bad Tim...,"=HYPERLINK(""https://news.google.com/rss/articl..."
7,"Sat, 22 Apr 2023 07:00:00 GMT",The New York Times,"Eureka! After California's Heavy Rains, Gold S...","=HYPERLINK(""https://news.google.com/rss/articl..."
8,"Wed, 08 May 2024 07:00:00 GMT",Reuters,Gold holds ground as investors await US data f...,"=HYPERLINK(""https://news.google.com/rss/articl..."
9,"Tue, 02 Apr 2024 07:00:00 GMT",Reuters,Gold shatters new records as Mideast tensions ...,"=HYPERLINK(""https://news.google.com/rss/articl..."


In [25]:
# Parse the publication timestamps, order the articles chronologically and
# present the date as dd/mm/YYYY text.
# Written as one method chain: each step returns a new frame, so nothing is
# assigned onto a slice of another frame and no inplace=True is needed.
news_df = (
    news_df
    # Convert the column to datetime
    .assign(Date=lambda frame: pd.to_datetime(frame['published date']))
    # Sort values and reset index
    .sort_values(by='Date')
    .reset_index(drop=True)
    # Change column date format (done after sorting so the order is by real
    # timestamps, not by the formatted text)
    .assign(Date=lambda frame: frame['Date'].dt.strftime('%d/%m/%Y'))
    .loc[:, ['Date', 'Media', 'Title', 'url']]
)
news_df

Unnamed: 0,Date,Media,Title,url
0,22/04/2023,The New York Times,"Eureka! After California's Heavy Rains, Gold S...","=HYPERLINK(""https://news.google.com/rss/articl..."
1,26/04/2023,Forbes,Why Isn't The Gold Price Doing What It's Suppo...,"=HYPERLINK(""https://news.google.com/rss/articl..."
2,11/05/2023,news.gallup.com,Real Estate's Lead as Best Investment Shrinks;...,"=HYPERLINK(""https://news.google.com/rss/articl..."
3,28/06/2023,Bloomberg,Gold Is No Longer a Good Hedge Against Bad Tim...,"=HYPERLINK(""https://news.google.com/rss/articl..."
4,13/07/2023,The Economist,The mystery of gold prices - The Economist,"=HYPERLINK(""https://news.google.com/rss/articl..."
...,...,...,...,...
92,12/06/2024,Reuters,Gold rush to endure through 2024 though $3000 ...,"=HYPERLINK(""https://news.google.com/rss/articl..."
93,13/06/2024,brooklynmuseum.org,Solid Gold - brooklynmuseum.org,"=HYPERLINK(""https://news.google.com/rss/articl..."
94,13/06/2024,CBS News,Gold prices remain high: Other precious metals...,"=HYPERLINK(""https://news.google.com/rss/articl..."
95,20/06/2024,Reuters,Gold rises 1% to two-week peak as Fed rate-cut...,"=HYPERLINK(""https://news.google.com/rss/articl..."


In [26]:
# Save in a document
# Use the topic (spaces replaced by underscores) as the CSV file name.
topic = topic.replace(" ", "_")
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)
news_df.to_csv(output_dir / f"{topic}.csv")