In [1]:
import requests
import json
import time
import pandas as pd
import config as cfg
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from pandas   import Series, DataFrame
from keys import news_api_key, postgres_password

In [2]:
# Load the billionaire records from the silver source CSV configured in cfg.
# silver_billionaire_df is a DataFrame built from the loaded data; it is the
# frame the API fetch loop iterates over.

silver_metadata = pd.read_csv(filepath_or_buffer=cfg.silver_billionaire_csv_path)
silver_billionaire_df = DataFrame(data=silver_metadata)

In [3]:
# Parallel column lists for the news-article table.
# parse_article appends exactly one value to each list per article, so the
# same position across all six lists describes the same article.

(
    billionaire_id_container,
    publication_container,
    author_container,
    title_container,
    url_container,
    published_ts_container,
) = ([] for _ in range(6))

In [4]:
# Parallel column lists for the news-metric table: run_newsapi_api appends one
# billionaire id and one total-article count for every HTTP 200 response.

billionaire_id_instance_bucket, total_article_count_bucket = [], []

In [5]:
def parse_article(billionaire_id, article) -> None:
    """Append one article's fields to the module-level article containers.

    Args:
        billionaire_id: Identifier of the billionaire the article was fetched for.
        article: One element of the ``articles`` list of the API response. The
            keys read are ``source`` (and its nested ``name``), ``author``,
            ``title``, ``url`` and ``publishedAt``.

    Returns:
        None. The result is recorded by side effect: exactly one value is
        appended to each of the six ``*_container`` lists.

    Raises:
        KeyError: If ``author``, ``title``, ``url`` or ``publishedAt`` is
            missing from ``article``.
    """
    # A null or absent "source" object yields a publication of None instead of
    # raising, so a single odd article does not abort the caller's whole run.
    source = article.get("source") or {}

    # Read every field before appending anything: if a lookup fails, none of
    # the containers has been touched and the parallel lists stay aligned.
    publication  = source.get("name")
    author       = article["author"]
    title        = article["title"]
    url          = article["url"]
    published_ts = article["publishedAt"]

    billionaire_id_container.append(billionaire_id)
    publication_container.append(publication)
    author_container.append(author)
    title_container.append(title)
    url_container.append(url)
    published_ts_container.append(published_ts)

In [6]:
def run_newsapi_api(data_row, start_date, end_date, timeout=30):
    """Query the news API for one billionaire and record the results.

    On an HTTP 200 response the billionaire id and the response's
    ``totalResults`` value are appended to the metric buckets, and every entry
    of ``articles`` is passed to ``parse_article``. On any other status a
    diagnostic line is printed and nothing is recorded.

    Args:
        data_row: Row-like object providing ``display_name`` and
            ``billionaire_id``.
        start_date: Start of the reporting window.
        end_date: End of the reporting window.
            NOTE(review): ``start_date`` and ``end_date`` are accepted but are
            not sent to the API. Confirm whether the endpoint configured in
            ``cfg.news_api_url_top_headlines`` supports a date filter before
            wiring them into the request.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up (defaults to 30).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    billionaire_name = data_row["display_name"]
    billionaire_id   = data_row["billionaire_id"]

    # Let requests build and percent-encode the query string. When the query
    # was concatenated into the URL by hand, the leading '+' was sent
    # unescaped (conventionally decoded as a space by the server), and a name
    # containing '&' or '#' would have corrupted the whole query.
    query_params = {
        "q": f"+\"{billionaire_name}\" AND billionaire",
        "language": "en",
        "sortBy": "popularity",
    }

    # Send the key as a header rather than in the URL so it cannot appear in
    # exception messages or logs that echo the request URL.
    request_headers = {"X-Api-Key": news_api_key}

    response = requests.get(
        cfg.news_api_url_top_headlines,
        params=query_params,
        headers=request_headers,
        timeout=timeout,  # never block the notebook indefinitely on a stalled connection
    )
    status_code = response.status_code

    if status_code != 200:
        # Report enough context to diagnose the failure (which id, which status,
        # and the start of the response body).
        print(
            f"News API request failed for billionaire_id={billionaire_id}: "
            f"HTTP {status_code} - {response.text[:200]}"
        )
        return

    json_obj = response.json()

    total_results = json_obj["totalResults"]
    articles      = json_obj["articles"]

    billionaire_id_instance_bucket.append(billionaire_id)
    total_article_count_bucket.append(total_results)

    for article in articles:
        parse_article(billionaire_id, article)

In [None]:
# Reporting window: the last cfg.article_days days, ending now. The current
# time is read once so both ends of the window derive from the same instant.
end_date   = datetime.today()
start_date = end_date - timedelta(days=int(cfg.article_days))

newsapi_start_date = start_date.strftime('%Y-%m-%d')
newapi_end_date    = end_date.strftime('%Y-%m-%d')

for idx, data_row in silver_billionaire_df.iterrows():

    # Handle failures per row: one failed request is reported and skipped
    # instead of silently abandoning every remaining billionaire.
    try:
        run_newsapi_api(data_row, newsapi_start_date, newapi_end_date)
    except Exception as e:
        print(f"Exception occurred during API call for row {idx} => {e}")

    # Pause between requests (presumably to respect the API's rate limit).
    time.sleep(1)

In [None]:
# Assemble the article table from the parallel column lists, then keep only the
# first row seen for each headline (rows are de-duplicated on 'title' alone).
news_article_df = DataFrame(
    data={
        'billionaire_id': billionaire_id_container,
        'publication': publication_container,
        'author': author_container,
        'title': title_container,
        'url': url_container,
        'published_ts': published_ts_container,
    }
).drop_duplicates(subset='title', keep='first')

In [None]:
# Preview the first three article rows.
news_article_df.head(n=3)

In [None]:
# Append the article rows to the "news_article" table of the local Postgres
# database "Billionaire".
# NOTE(review): postgres_password is interpolated into the URL verbatim; a
# password containing characters such as '@' or '/' would need URL-escaping.
engine = create_engine(f'postgresql://postgres:{postgres_password}@localhost:5432/Billionaire')

# engine.begin() provides a connection inside a transaction that is committed
# on success, rolled back on error, and always released afterwards. Previously
# a connection was opened with engine.connect() and never used or closed.
with engine.begin() as connection:
    news_article_df.to_sql(name='news_article', con=connection, if_exists='append', index=False)

In [None]:
# Assemble the metric table from the two parallel bucket lists: one row per
# recorded API response, pairing a billionaire id with its total article count.
news_metric_df = DataFrame(
    data={
        'billionaire_id': billionaire_id_instance_bucket,
        'total_article_count': total_article_count_bucket,
    }
)

In [None]:
# Preview the first three metric rows.
news_metric_df.head(n=3)

In [None]:
# Append the metric rows to the "news_metric" table of the local Postgres
# database "Billionaire".
# NOTE(review): postgres_password is interpolated into the URL verbatim; a
# password containing characters such as '@' or '/' would need URL-escaping.
engine = create_engine(f'postgresql://postgres:{postgres_password}@localhost:5432/Billionaire')

# engine.begin() provides a connection inside a transaction that is committed
# on success, rolled back on error, and always released afterwards. Previously
# a connection was opened with engine.connect() and never used or closed.
with engine.begin() as connection:
    news_metric_df.to_sql(name='news_metric', con=connection, if_exists='append', index=False)

In [None]:
# Label the index of both frames "id".
# NOTE(review): both CSV writes below pass index=False, so this label is not
# written to the files - confirm whether an "id" column was intended.
news_article_df.index.name = 'id'
news_metric_df.index.name = 'id'

# Export both tables, with a header row, to the CSV paths configured in cfg.
news_article_df.to_csv(path_or_buf=f"{cfg.news_article_csv_path}", index=False, header=True)
news_metric_df.to_csv(path_or_buf=f"{cfg.news_metrics_csv_path}", index=False, header=True)