In [52]:
# Import libraries
# GoogleNews Documentation : https://pypi.org/project/gnews/
from gnews import GNews
import pandas as pd
import nltk  # NLP library
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings('ignore')

In [53]:
# This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alfonso/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
# Initializing
googlenews = GNews()

In [55]:
# Settings
googlenews = GNews(language='en',start_date=(2023, 8, 1), end_date=(2023, 8, 2))

In [56]:
# Search 
searchednews = googlenews.get_news('Ukraine')
print(f"Articles found:", len(searchednews))

Articles found: 51


In [57]:
# Results
print(searchednews[1])

{'title': 'The Crimean Peninsula is both a playground and a battleground, coveted by Ukraine and Russia - The Associated Press', 'description': 'The Crimean Peninsula is both a playground and a battleground, coveted by Ukraine and Russia  The Associated Press', 'published date': 'Tue, 01 Aug 2023 07:00:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vYXBuZXdzLmNvbS9hcnRpY2xlL3J1c3NpYS11a3JhaW5lLXdhci1jcmltZWEtcGVuaW5zdWxhLWRmZjM0ODRkYTgyNGUxMWFmYzkyYzgzZWNmMTlmNzFi0gEA?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://apnews.com', 'title': 'The Associated Press'}}


In [58]:
# Convert to DataFrame
df = pd.DataFrame(searchednews)
df.tail(20)

Unnamed: 0,title,description,published date,url,publisher
31,Ukraine war causes birth rate to slump - BBC.com,Ukraine war causes birth rate to slump BBC.com,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiLmh0d...,"{'href': 'https://www.bbc.com', 'title': 'BBC...."
32,Ukraine's grain deal halt and how to truly hel...,Ukraine's grain deal halt and how to truly hel...,"Wed, 02 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiZGh0d...,"{'href': 'https://www.oxfam.org', 'title': 'Ox..."
33,Letting Europe's Energy Crisis Go to Waste: Th...,Letting Europe's Energy Crisis Go to Waste: Th...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiiQFod...,"{'href': 'https://home.watson.brown.edu', 'tit..."
34,Ukrainian counteroffensive’s slow going offers...,Ukrainian counteroffensive’s slow going offers...,"Wed, 02 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiUmh0d...,"{'href': 'https://www.theguardian.com', 'title..."
35,Russia-Ukraine war at a glance: what we know o...,Russia-Ukraine war at a glance: what we know o...,"Wed, 02 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.theguardian.com', 'title..."
36,Russia attacks more Ukrainian ports and grain ...,Russia attacks more Ukrainian ports and grain ...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiQGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}"
37,Ukraine's latest weapons in its war with Russi...,Ukraine's latest weapons in its war with Russi...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMie2h0d...,"{'href': 'https://www.economist.com', 'title':..."
38,"Russia targets Odesa port, angering Ukraine an...","Russia targets Odesa port, angering Ukraine an...","Wed, 02 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiT2h0d...,"{'href': 'https://www.aljazeera.com', 'title':..."
39,Russia had time to prepare for Ukraine offensi...,Russia had time to prepare for Ukraine offensi...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiYGh0d...,"{'href': 'https://www.businessinsider.com', 't..."
40,Saudi Arabia and Turkey are emerging as the ne...,Saudi Arabia and Turkey are emerging as the ne...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiaGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}"


In [59]:
# Breaking publisher column
df = pd.concat([df.drop(['publisher'], axis=1), df['publisher'].apply(pd.Series)], axis=1)
df.head(5)

Unnamed: 0,title,description,published date,url,href,title.1
0,Ukraine is 'extraordinary laboratory' for mili...,Ukraine is 'extraordinary laboratory' for mili...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMicWh0d...,https://defensescoop.com,DefenseScoop
1,The Crimean Peninsula is both a playground and...,The Crimean Peninsula is both a playground and...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiX2h0d...,https://apnews.com,The Associated Press
2,"Russia strikes Ukraine's Danube port, driving ...","Russia strikes Ukraine's Danube port, driving ...","Wed, 02 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMia2h0d...,https://www.reuters.com,Reuters
3,Ukraine is finally freeing itself from centuri...,Ukraine is finally freeing itself from centuri...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMie2h0d...,https://www.atlanticcouncil.org,Atlantic Council
4,‘Nowhere to hide’: The question troubling Ukra...,‘Nowhere to hide’: The question troubling Ukra...,"Tue, 01 Aug 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiYWh0d...,https://www.cnn.com,CNN


In [60]:
# Cleaning dataframe
df['media'] = df['title'].iloc[:,-1]
df = df[['published date','media','url']]
df.head(20)

Unnamed: 0,published date,media,url
0,"Tue, 01 Aug 2023 07:00:00 GMT",DefenseScoop,https://news.google.com/rss/articles/CBMicWh0d...
1,"Tue, 01 Aug 2023 07:00:00 GMT",The Associated Press,https://news.google.com/rss/articles/CBMiX2h0d...
2,"Wed, 02 Aug 2023 07:00:00 GMT",Reuters,https://news.google.com/rss/articles/CBMia2h0d...
3,"Tue, 01 Aug 2023 07:00:00 GMT",Atlantic Council,https://news.google.com/rss/articles/CBMie2h0d...
4,"Tue, 01 Aug 2023 07:00:00 GMT",CNN,https://news.google.com/rss/articles/CBMiYWh0d...
5,"Tue, 01 Aug 2023 07:00:00 GMT",Al Jazeera English,https://news.google.com/rss/articles/CBMiVWh0d...
6,"Tue, 01 Aug 2023 07:00:00 GMT",Idaho Capital Sun,https://news.google.com/rss/articles/CBMiX2h0d...
7,"Wed, 02 Aug 2023 07:00:00 GMT",The New York Times,https://news.google.com/rss/articles/CBMiXGh0d...
8,"Tue, 01 Aug 2023 07:00:00 GMT",The Wall Street Journal,https://news.google.com/rss/articles/CBMibWh0d...
9,"Tue, 01 Aug 2023 07:00:00 GMT",The Washington Post,https://news.google.com/rss/articles/CBMiVmh0d...


In [61]:
# Get the full article extracting and parsing the article and drop the links that give problems

list = []

for ind in tqdm(df.index, colour="green", desc="Extracting articles from the internet"):
  dict = {}
  
  try:
    article = googlenews.get_full_article(searchednews[ind]['url'])
    article.download()
    article.parse()
    article.nlp()
    dict['Date'] = df['published date'][ind]
    dict['Media'] = df['media'][ind]
    dict['Title'] = article.title
    dict['Article'] = article.text
    dict['Summary'] = article.summary
    list.append(dict)
  except Exception:
    df.drop([ind],axis=0,inplace=True)



Extracting articles from the internet:   0%|[32m          [0m| 0/51 [00:00<?, ?it/s]

Extracting articles from the internet:  47%|[32m████▋     [0m| 24/51 [00:00<00:00, 122.39it/s]


get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install news

Extracting articles from the internet: 100%|[32m██████████[0m| 51/51 [00:00<00:00, 173.46it/s]


get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install newspaper3k` in your shell.

get_full_article() requires the `newspaper3k` library.
You can install it by running `pip3 install news




In [62]:
# Create the news dataframe with full articles
news_df = pd.DataFrame(list)
news_df.head(20)

In [64]:
# Checking one article
news_df.iloc[7]['Article']

IndexError: single positional indexer is out-of-bounds