# Get daily news using the Yahoo Query API
## We will be getting all the available historical news from Yahoo Query

### Step 1: Import the required libraries
We will be using the Yahoo Query API to pull data from Yahoo Finance. To install this library, run `pip install yahooquery`.

In [1]:
import yahooquery as yq
import pandas as pd
import csv
import logging #library to create log files
import datetime
from datetime import date, timedelta
# from pytz import timezone # Get timezone
import time
import os # To check if the file exists

### Step 2: Load tickers for S&P 500 stocks from csv file 
Note: We have created a separate notebook (S&P500Tickers.ipynb) that gets the S&P 500 stocks and stores them in a file named 'S&P500Tickers.csv'.

In [2]:
# Read the S&P 500 ticker csv into a pandas dataframe
ticker_data = pd.read_csv("S&P500Tickers.csv", encoding="utf-8")
# Keep the values of the 'Symbol' column as a plain Python list
tickers_list = ticker_data.loc[:, "Symbol"].to_list()

### Step 3: Create a function that pulls news from the yahooquery API for each ticker

In [3]:
def get_latest_news(tickers, count=500):  # Maximum number of news items per ticker
    '''
    Pull news items from the yahooquery API for each ticker and combine them
    into one dataframe.

    Input Parameters:
    tickers: List of tickers for which data needs to be extracted
    count: number of news articles requested per ticker

    Returns: Dataframe indexed by (Symbol, provider_publish_time, id) with a
    single 'title' column. provider_publish_time is converted from epoch
    seconds to datetime. When no news could be pulled for any ticker (or
    tickers is empty) an empty dataframe with the same index and column
    layout is returned.

    A ticker whose request or parsing fails is logged and skipped, so one
    bad ticker does not abort the whole run.
    '''
    index_columns = ["Symbol", "provider_publish_time", "id"]
    logging.info("Pulling daily news")

    # Per-ticker frames are collected and concatenated once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    frames = []
    for ticker in tickers:
        try:
            ticker_obj = yq.Ticker(ticker)  # Create ticker object
            # Presumably a list of dictionaries, one per news item -- confirm against yahooquery
            news_list = ticker_obj.news(count)
            # Keep only the columns that are stored in the output file
            news_data = pd.DataFrame(news_list, columns=['provider_publish_time', 'id', 'title'])
            news_data.insert(0, 'Symbol', ticker)  # Adding ticker to the dataframe
            if not news_data.empty:
                frames.append(news_data)
        except Exception:
            # Best effort: record the failure (with traceback) and move on to the next ticker
            logging.exception("Could not pull news for ticker %s", ticker)
        time.sleep(.5)  # Pause between requests

    if not frames:
        # Nothing was pulled: hand back an empty frame with the usual layout
        # instead of failing on a missing 'provider_publish_time' column.
        logging.info("No news items were pulled")
        empty_news_df = pd.DataFrame({
            'Symbol': pd.Series(dtype='object'),
            'provider_publish_time': pd.Series(dtype='datetime64[ns]'),
            'id': pd.Series(dtype='object'),
            'title': pd.Series(dtype='object'),
        })
        return empty_news_df.set_index(index_columns)

    final_news_df = pd.concat(frames)
    # Convert datetime from epoch to datetime
    final_news_df['provider_publish_time'] = pd.to_datetime(final_news_df['provider_publish_time'], unit='s')
    # Set index to symbol, date and news id
    final_news_df.set_index(index_columns, inplace=True)
    return final_news_df

### Step 4: Load the data from the existing news csv file into a dataframe. New data will be appended to the old data
This function opens the existing csv file, loads its data into a dataframe and returns it.

In [4]:
def get_previous_data(file):
    '''
    Load the previously saved news csv file into a dataframe.

    Input Parameters:
    file: Name of the existing csv file that needs to be read

    Returns: pandas dataframe indexed by (Symbol, provider_publish_time, id),
    with provider_publish_time parsed as datetime
    '''
    logging.info("Opening existing data file")
    index_columns = ["Symbol", "provider_publish_time", "id"]

    # Read the csv file as-is; the publish time comes back as text
    raw_news = pd.read_csv(file, encoding='utf-8')
    # Parse the stored publish time text into datetime values
    parsed_times = pd.to_datetime(raw_news['provider_publish_time'])
    # Swap in the parsed column and use the same index layout as freshly pulled news
    return raw_news.assign(provider_publish_time=parsed_times).set_index(index_columns)

### The main function to run all the sub functions

In [5]:
if __name__ == "__main__":
    # Script entry point: pull the latest news for every ticker, merge it with
    # any previously saved news and write the combined result back to csv.

    #tickers_list = ['AAPL', 'fb'] # Created for testing

    # Get today's date (time part set to midnight)
    today = pd.Timestamp.now().normalize()

    # Create name of the log file; only the YYYY-MM-DD part of the timestamp is used
    log_file_name = 'News_yq_logfile_' + str(today).split()[0] +'.log'

    # Initialize a log file at the Info level. This is just to ensure smooth debugging in case anything fails
    # %(asctime)s adds the time of creation of the LogRecord
    logging.basicConfig(filename=log_file_name, filemode="w", format='%(asctime)s - %(message)s', level=logging.INFO)

    logging.info("In the main function")
    logging.info(f'Processing for date {today}')

    # Name of the output file
    news_data_file = "DailyNews_data_yahooquery.csv"
    index_columns = ["Symbol", "provider_publish_time", "id"]

    logging.info("Check if file already exists")

    # Check if the news csv file exists in the folder from where this script is run
    if os.path.isfile(news_data_file):
        logging.info("News Data file exists. Pulling only 25 news count")
        # Load the stored news and pull only the most recent items per ticker
        old_data_df = get_previous_data(news_data_file)
        latest_data = get_latest_news(tickers_list, 25)
        # Combine old and new data. pd.concat is used because
        # DataFrame.append was removed in pandas 2.0.
        final_df = pd.concat([old_data_df, latest_data])
        # Move the index levels back into columns so duplicates can be dropped on them
        final_df.reset_index(inplace=True)
        # Drop rows that share symbol, publish time and id, keeping the newest pull
        final_df = final_df.drop_duplicates(subset=index_columns, keep='last')
        # Restore the (Symbol, provider_publish_time, id) index
        final_df.set_index(index_columns, inplace=True)

    else:
        logging.info("News file does not exists. Getting maximum possible data")
        latest_data = get_latest_news(tickers_list, 500) # Get the max data
        final_df = latest_data

    # Write the combined data, overwriting any existing file
    logging.info(f'Writing data in the file name {news_data_file}')
    final_df.to_csv(news_data_file, mode='w', index=True, encoding='utf-8') #index is True as we want it to be written in file
    