In [1]:
import json
import os
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus, urlencode

import pandas as pd
import requests
from pandas import Series, DataFrame
from sqlalchemy import create_engine

import config as cfg
from api_keys import news_api_key

In [2]:
# Load the billionaire CSV whose location is given by
# cfg.silver_billionaire_csv_path, then expose it under the name the rest of
# the notebook iterates over.

silver_metadata = pd.read_csv(filepath_or_buffer=cfg.silver_billionaire_csv_path)
silver_billionaire_df = pd.DataFrame(data=silver_metadata)

In [3]:
# Column buffers for news articles: six independent, initially empty lists,
# one per field of the article table.

(
    billionaire_id_container,
    publication_container,
    author_container,
    title_container,
    url_container,
    published_ts_container,
) = ([] for _ in range(6))

In [4]:
# Column buffers for news metrics: two independent, initially empty lists
# (billionaire id and the total article count reported for that id).

billionaire_id_instance_bucket, total_article_count_bucket = [], []

In [5]:
def parse_article(billionaire_id, article) -> None:
    """Append one article's fields to the module-level article containers.

    Parameters
    ----------
    billionaire_id
        Identifier of the billionaire the article was fetched for.
    article : dict
        A single article record. The keys read are ``source`` (a mapping with
        a ``name`` key), ``author``, ``title``, ``url`` and ``publishedAt``.

    Returns
    -------
    None
        The function works purely by side effect: exactly one value is
        appended to each ``*_container`` list per call.

    Notes
    -----
    A missing key, or a ``source`` that is ``None``, is recorded as ``None``
    instead of raising, so one incomplete article does not abort the whole
    fetch loop.
    """
    # `source` may be absent or null; fall back to an empty mapping.
    source = article.get("source") or {}

    # Resolve every value before the first append so the containers can never
    # end up with different lengths.
    row = (
        (billionaire_id_container, billionaire_id),
        (publication_container,    source.get("name")),
        (author_container,         article.get("author")),
        (title_container,          article.get("title")),
        (url_container,            article.get("url")),
        (published_ts_container,   article.get("publishedAt")),
    )
    for container, value in row:
        container.append(value)

In [6]:
def run_newsapi_api(data_row, start_date, end_date):
    """Query the news API for one billionaire and record the results.

    Parameters
    ----------
    data_row
        Mapping-like row providing ``display_name`` (used as the search
        query) and ``billionaire_id``.
    start_date, end_date
        Bounds of the publication window, sent as the ``from`` and ``to``
        query parameters.

    Returns
    -------
    None
        On HTTP 200 the id and ``totalResults`` are appended to the metric
        buckets and every entry of ``articles`` is passed to
        ``parse_article``. On any other status a message is printed and
        nothing is recorded.

    Raises
    ------
    requests exceptions (connection errors, timeouts) and ``KeyError`` for a
    200 response without ``totalResults``/``articles`` propagate to the caller.
    """
    billionaire_name = data_row["display_name"]
    billionaire_id   = data_row["billionaire_id"]

    # urlencode escapes spaces and reserved characters in the name. The start
    # date now carries its "from" key; it used to be emitted as a bare,
    # key-less value, so the window's lower bound was never sent.
    query_string = urlencode({
        "q":        billionaire_name,
        "language": "en",
        "from":     start_date,
        "to":       end_date,
        "sortBy":   "popularity",
        "apikey":   news_api_key,
    })

    # The query string is appended directly to the configured URL, as before.
    query_url = f"{cfg.news_api_url_top_headlines}{query_string}"

    # A timeout keeps one stalled request from hanging the whole run.
    response    = requests.get(query_url, timeout=30)
    status_code = response.status_code

    if status_code != 200:
        print(f"Ooops: HTTP {status_code} for {billionaire_name!r}")
        return

    json_obj = response.json()

    total_results = json_obj["totalResults"]
    articles      = json_obj["articles"]

    billionaire_id_instance_bucket.append(billionaire_id)
    total_article_count_bucket.append(total_results)

    for article in articles:
        parse_article(billionaire_id, article)

In [7]:
# Query window: the cfg.article_days days ending now. The clock is read once
# so both bounds derive from the same instant.
end_date   = datetime.today()
start_date = end_date - timedelta(days=int(cfg.article_days))

newsapi_start_date = start_date.strftime('%Y-%m-%d')
newapi_end_date    = end_date.strftime('%Y-%m-%d')

# Maximum number of billionaires queried in one run.
max_count = 3

# Empty the result buffers first so re-running this cell does not append a
# second copy of every row (which would then be written to the database).
for _buffer in (billionaire_id_container, publication_container, author_container,
                title_container, url_container, published_ts_container,
                billionaire_id_instance_bucket, total_article_count_bucket):
    _buffer.clear()

try:

    # head(max_count) limits by row position. The earlier `idx == max_count`
    # check ran after the call, so it made max_count + 1 requests and only
    # worked when the index labels were 0, 1, 2, ...
    for idx, data_row in silver_billionaire_df.head(max_count).iterrows():

        run_newsapi_api(data_row, newsapi_start_date, newapi_end_date)
        # Pause between calls (presumably to stay under the API rate limit - confirm).
        time.sleep(1)

except Exception as e:
    print(f"Exception occured during API call = > {e}")
else:
    print("API calls successful")

API calls successful


In [8]:
# Assemble the collected article fields into one frame, one column per buffer.
news_article_df = pd.DataFrame(
    data=dict(
        billionaire_id=billionaire_id_container,
        publication=publication_container,
        author=author_container,
        title=title_container,
        url=url_container,
        published_ts=published_ts_container,
    )
)

In [9]:
# Preview the first three article rows.
news_article_df.head(n=3)

Unnamed: 0,billionaire_id,publication,author,title,url,published_ts
0,0,Engadget,Bryan Menegus,The FAA will give Bezos and Branson its last a...,https://www.engadget.com/bezos-branson-faa-ast...,2021-12-10T21:55:38Z
1,0,Engadget,Andrew Tarantola,"In 2021, billionaires headed to the stars",https://www.engadget.com/in-2021-billionaires-...,2021-12-20T15:00:13Z
2,0,Wired,"Tim Barber, Jeremy White",5 Space Watches That Are Out of This World,https://www.wired.com/gallery/space-watches/,2021-12-27T12:00:00Z


In [10]:
# Append the article rows to the `news_article` table of the local
# `Billionaire` Postgres database.

# The password is read from the POSTGRES_PASSWORD environment variable when it
# is set; the previous literal remains the fallback so existing setups keep
# working. quote_plus keeps characters such as '@' or '/' from breaking the URL.
password = os.environ.get("POSTGRES_PASSWORD", "postgres")
engine = create_engine(f'postgresql://postgres:{quote_plus(password)}@localhost:5432/Billionaire')

# engine.begin() commits on success, rolls back on error, and always releases
# the connection. Previously a connection was opened but never used or closed.
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
with engine.begin() as connection:
    news_article_df.to_sql(name='news_article', con=connection, if_exists='append', index=False)

In [11]:
# Pair each queried billionaire id with the total article count recorded for it.
news_metric_df = pd.DataFrame(
    data=dict(
        billionaire_id=billionaire_id_instance_bucket,
        total_article_count=total_article_count_bucket,
    )
)

In [12]:
# Preview the first three metric rows.
news_metric_df.head(n=3)

Unnamed: 0,billionaire_id,total_article_count
0,0,1749
1,1,4205
2,2,74


In [13]:
# Append the metric rows to the `news_metric` table of the local
# `Billionaire` Postgres database.

# The password is read from the POSTGRES_PASSWORD environment variable when it
# is set; the previous literal remains the fallback so existing setups keep
# working. quote_plus keeps characters such as '@' or '/' from breaking the URL.
password = os.environ.get("POSTGRES_PASSWORD", "postgres")
engine = create_engine(f'postgresql://postgres:{quote_plus(password)}@localhost:5432/Billionaire')

# engine.begin() commits on success, rolls back on error, and always releases
# the connection. Previously a connection was opened but never used or closed.
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
with engine.begin() as connection:
    news_metric_df.to_sql(name='news_metric', con=connection, if_exists='append', index=False)

In [14]:
# Write both frames to the CSV locations configured in cfg, with a header row
# and without the index column. Articles are written first, then metrics.
for frame, csv_path in (
    (news_article_df, cfg.news_article_csv_path),
    (news_metric_df, cfg.news_metrics_csv_path),
):
    frame.to_csv(f"{csv_path}", index=False, header=True)