In [51]:
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

## News API Tool
For the API tool we initially decided to use: https://newsapi.org 

In [4]:
# Request parameters for the NewsAPI "everything" endpoint.
# The API key is a secret, so it is read from the NEWSAPI_KEY environment
# variable instead of being committed with the notebook. If the variable is
# unset an empty key is sent and the request is expected to be rejected.
# NOTE(review): the key that used to be hard-coded here should be revoked.
query = {
    'from': '2022-03-11',
    'to': '2022-03-12',
    'language': 'en',
    'apiKey': os.environ.get('NEWSAPI_KEY', ''),
}

api_url = 'https://newsapi.org/v2/everything'

In [5]:
# Pass the query by keyword and bound the wait: without a timeout requests
# can block forever on an unresponsive server. The HTTP status is not
# checked here because the next cell inspects the API's error payload.
response = requests.get(api_url, params=query, timeout=30)

In [6]:
# Display the decoded JSON body to see whether the request was accepted.
response.json()

{'status': 'error',
 'code': 'parameterInvalid',
 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2022-04-04, but you have requested 2022-03-11. You may need to upgrade to a paid plan.'}

However, their free plan only permits requesting articles published within the last month.

## GoogleNewsAPI from RapidAPI
So we looked for another API that we could use and settled on GoogleNewsAPI from the RapidAPI website:  
https://rapidapi.com/newscatcher-api-newscatcher-api-default/api/google-news/

The free plan has a limit of **3 requests per hour**

In [48]:
# Search scope: a single news source and the publication window to fetch.
source = "rappler.com"
start_date = "2022-03-11"
end_date = "2022-03-12"

query = {
    "source": source,
    "lang": "en",
    "country": "PH",
    "from": start_date,
    "to": end_date,
}

# The RapidAPI key is a secret, so it is read from the RAPIDAPI_KEY
# environment variable instead of being committed with the notebook. If the
# variable is unset an empty key is sent and the request is expected to be
# rejected.
# NOTE(review): the key that used to be hard-coded here should be revoked.
headers = {
    "X-RapidAPI-Host": "google-news.p.rapidapi.com",
    "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", ""),
}

api_url = "https://google-news.p.rapidapi.com/v1/source_search"

In [8]:
# Bound the wait with a timeout, and fail here on an HTTP error (for example
# when the hourly request quota is exhausted) rather than later with a
# confusing KeyError on the missing 'articles' field.
response = requests.get(api_url, headers=headers, params=query, timeout=30)
response.raise_for_status()

In [9]:
# Decode the JSON body once and keep it, so later cells do not need to hit the API again.
results = response.json()

In [10]:
# Inspect the raw payload: a 'feed' header followed by the list of 'articles'.
results

{'feed': {'title': '"allinurl:rappler.com after:2022-03-11 before:2022-03-12" - Google News',
  'updated': 'Thu, 05 May 2022 06:41:07 GMT',
  'link': 'https://news.google.com/search?q=allinurl:rappler.com+after:2022-03-11+before:2022-03-12&ceid=PH:en&hl=en-PH&gl=PH',
  'language': 'en-PH',
  'subtitle': 'Google News',
  'rights': '2022 Google Inc.'},
 'articles': [{'id': 'CAIiEPnu0-n-qfCKPacMS9HthMIqGQgEKhAIACoHCAowkInwCjCi5s8CMJuOnAM',
   'title': 'RESULTS: January 2022 Licensure Examination for Teachers - Rappler',
   'link': 'https://www.rappler.com/bulletin-board/results-january-2022-licensure-examination-for-teachers/',
   'published': 'Fri, 11 Mar 2022 08:00:00 GMT',
   'sub_articles': [],
   'source': {'href': 'https://www.rappler.com', 'title': 'Rappler'}},
  {'id': 'CAIiEBxZSnhT4zkMb_uiCnJVGnUqGQgEKhAIACoHCAowkInwCjCi5s8CMJuOnAM',
   'title': 'Deltacron may not cause severe infection, but may have similar transmission rate – expert - Rappler',
   'link': 'https://www.rappler.c

In [13]:
# Each entry has id, title, link, published, sub_articles and source.
results['articles']

[{'id': 'CAIiEPnu0-n-qfCKPacMS9HthMIqGQgEKhAIACoHCAowkInwCjCi5s8CMJuOnAM',
  'title': 'RESULTS: January 2022 Licensure Examination for Teachers - Rappler',
  'link': 'https://www.rappler.com/bulletin-board/results-january-2022-licensure-examination-for-teachers/',
  'published': 'Fri, 11 Mar 2022 08:00:00 GMT',
  'sub_articles': [],
  'source': {'href': 'https://www.rappler.com', 'title': 'Rappler'}},
 {'id': 'CAIiEBxZSnhT4zkMb_uiCnJVGnUqGQgEKhAIACoHCAowkInwCjCi5s8CMJuOnAM',
  'title': 'Deltacron may not cause severe infection, but may have similar transmission rate – expert - Rappler',
  'link': 'https://www.rappler.com/nation/expert-says-deltacron-may-not-cause-severe-infection-but-similar-transmission-rate/',
  'published': 'Fri, 11 Mar 2022 08:00:00 GMT',
  'sub_articles': [],
  'source': {'href': 'https://www.rappler.com', 'title': 'Rappler'}},
 {'id': 'CAIiEObWDvnGTuDtTA8c46qTFiMqGQgEKhAIACoHCAowkInwCjCi5s8CMJuOnAM',
  'title': "Iloilo congressman goes against his party's choice,

This worked; however, some of the information we need is missing:
1. **Full Article**. It only has the URL to that article.
2. **Author**.

Because of this, we have to visit each link and scrape the missing information.

In [44]:
# The API response lacks the article body and author, so all we keep from it
# is each article's URL; everything else is scraped from the page itself.
urls = [entry["link"] for entry in results['articles']]
len(urls)

73

## Scrape the missing information

In [54]:
# Seconds to wait for a page before giving up, so that one unresponsive
# article cannot stall the whole crawl.
REQUEST_TIMEOUT = 30


def _clean_text(parent, tag, css_class):
    """Return the stripped text of the first `tag` with `css_class` under `parent`.

    Returns None when `parent` is None or when no matching element exists.
    """
    if parent is None:
        return None
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


def scrape_article(html):
    """Extract category, title, publication date, author and body from one article page.

    Parameters
    ----------
    html : str
        Raw HTML of the article page.

    Returns
    -------
    dict
        Keys 'Category', 'Title', 'Date Published', 'Author' and
        'Full Article'. A field is None when its element is not found.
        'Full Article' is the HTML of the body container with newlines
        removed and tabs turned into spaces.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when nothing matches, so guard before chaining.
    main = soup.find('main', id='primary')
    article_contents = main.find('article') if main is not None else None

    category = _clean_text(article_contents, 'a', 'post-single__category')
    body = soup.find('div', class_='article-main-section')

    return {
        'Category': category.title() if category is not None else None,
        'Title': _clean_text(article_contents, 'h1', 'post-single__title'),
        'Date Published': _clean_text(article_contents, 'time', 'published'),
        'Author': _clean_text(article_contents, 'div', 'post-single__authors'),
        # str(None) would store the literal text 'None', so a missing body
        # is checked for explicitly.
        'Full Article': (
            str(body).replace('\n', '').replace('\t', ' ')
            if body is not None else None
        ),
    }


# `data` is created here because no earlier cell defines it; each record
# starts with every column set so that a failed download still yields a row.
data = []
for i, url in enumerate(urls, start=1):
    print("[%d] Scraping [%s]" % (i, url))
    record = {
        'Title': None,
        'URL': url,
        'Category': None,
        'Date Published': None,
        'Author': None,
        'Full Article': None,
    }
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as error:
        # Keep crawling: report the failure and leave the scraped fields empty.
        print("    could not fetch page: %s" % error)
    else:
        record.update(scrape_article(page.text))
    data.append(record)

[1] Scraping [https://www.rappler.com/bulletin-board/results-january-2022-licensure-examination-for-teachers/]
[2] Scraping [https://www.rappler.com/nation/expert-says-deltacron-may-not-cause-severe-infection-but-similar-transmission-rate/]
[3] Scraping [https://www.rappler.com/nation/iloilo-congressman-michael-goririceta-endorses-leni-robredo/]
[4] Scraping [https://www.rappler.com/nation/guinness-record-ferdinand-marcos-greatest-robbery-of-government-suddenly-inaccessible-march-2022/]
[5] Scraping [https://www.rappler.com/entertainment/celebrities/nadine-lustre-says-no-future-love-teams-returns-showbiz/]
[6] Scraping [https://www.rappler.com/life-and-style/food-drinks/rita-philippines-back-up-town-center-branch-quezon-city/]
[7] Scraping [https://www.rappler.com/nation/elections/video-isko-moreno-campaign-speeches-street-imagery/]
[8] Scraping [https://www.rappler.com/nation/elections/pacquiao-woos-barangay-leaders-calabarzon/]
[9] Scraping [https://www.rappler.com/nation/elections/l

[72] Scraping [https://www.rappler.com/business/international-monetary-fund-boost-moldova-financial-support-ukraine-refugee-influx/]
[73] Scraping [https://www.rappler.com/world/south-central-asia/india-says-it-accidentally-fired-missile-into-pakistan/]


## Export data to a JSON file

In [55]:
# The output file is named after the source and the date window that was scraped.
filename = f'{source} ({start_date} - {end_date}).json'
with open(filename, mode='w') as json_file:
    json_file.write(json.dumps(data))

## Import the JSON file into a pandas DataFrame

In [56]:
# Round-trip check: load the exported file back to confirm it parses into a table.
# (pandas is imported in the import cell at the top of the notebook.)
df = pd.read_json(filename)
df.head()

Unnamed: 0,Title,URL,Category,Date Published,Author,Full Article
0,RESULTS: January 2022 Licensure Examination fo...,https://www.rappler.com/bulletin-board/results...,Board Exam Results,"Mar 11, 2022 6:04 PM PHT",Rappler.com,"<div class=""article-main-section""><div class=""..."
1,"Deltacron may not cause severe infection, but ...",https://www.rappler.com/nation/expert-says-del...,Covid-19,"Mar 11, 2022 4:51 PM PHT",Bonz Magsambol,"<div class=""article-main-section""><div class=""..."
2,Iloilo congressman goes against his party’s ch...,https://www.rappler.com/nation/iloilo-congress...,2022 Philippine Elections,"Mar 11, 2022 8:59 PM PHT",Joseph B.A. Marzan,"<div class=""article-main-section""><div class=""..."
3,Marcos’ ‘greatest robbery of a government’ Gui...,https://www.rappler.com/nation/guinness-record...,Marcos Ill-Gotten Wealth,"Mar 11, 2022 7:00 PM PHT",Christa Escudero,"<div class=""article-main-section""><div class=""..."
4,Nadine Lustre says no to future love teams as ...,https://www.rappler.com/entertainment/celebrit...,Filipina Actresses,"Mar 11, 2022 7:23 PM PHT",Rappler.com,"<div class=""article-main-section""><div class=""..."


In [58]:
# Verify that every article falls within the intended timeframe by counting
# articles per calendar day (the first three space-separated tokens, e.g.
# "Mar 11, 2022"). The .str accessor passes missing dates through instead of
# raising, and dropna=False lists them in the counts so they are not hidden.
(
    df['Date Published']
    .str.split(' ')
    .str[:3]
    .str.join(' ')
    .value_counts(dropna=False)
)

Mar 11, 2022    44
Mar 12, 2022    29
Name: Date Published, dtype: int64