# Get open, high, low and close (OHLC) data from Yahoo Finance using the yfinance library
## We will be getting at least 5 years of historical prices for S&P 500 stocks

### Step 1: Import the required libraries
We will be using the yfinance library to pull data from Yahoo Finance. To install this library, use `pip install yfinance`.

In [6]:
import yfinance as yf
import pandas as pd
import csv
import logging #library to create log files
import datetime as dt
from datetime import datetime # To get the current date and time
from datetime import date, timedelta
from pytz import timezone # Get timezone
import csv
import time
import os # To check if the file exists

### Step 2: Load tickers for S&P 500 stocks from csv file 
Note: We have created a separate notebook (S&P500Tickers.ipynb) that gets the S&P 500 stocks and stores them in a file named 'S&P500Tickers.csv'.

In [7]:
# Load the S&P 500 constituents file into a pandas dataframe
ticker_data = pd.read_csv("S&P500Tickers.csv")
# Keep only the ticker symbols, as a plain Python list
symbol_column = ticker_data['Symbol']
tickers_list = symbol_column.to_list()

### Step 3: Create a function that pulls data from the yfinance API for each ticker
OHLC - Open, high, low and close prices of a stock on a given date.
The script needs a start date and an end date to pull data for that range; otherwise data for the maximum available period is pulled.
The start date and end date have to be in the format YYYY-MM-DD.
Daily percent return [(Close - Open) / Open] is planned but is not currently computed by the function.

In [8]:
def get_OHLC_data(tickers, start_date = None, end_date = None):
    '''
    This function gets daily OHLC data from Yahoo Finance (through yfinance) for every ticker provided.
    When both start_date and end_date are provided, data is pulled for that range.
    Otherwise data is pulled for the maximum period available.
    Prices are requested with auto_adjust=True, and the 'Dividends' and 'Stock Splits'
    columns are dropped when they are present.
    Note: daily returns are NOT calculated by this function.

    Input Parameters:
    tickers: List of tickers for which data needs to be extracted
    start_date: Start of the date range (the notebook uses YYYY-MM-DD), or None
    end_date: End of the date range (the notebook uses YYYY-MM-DD), or None

    Returns: Dataframe of the extracted data indexed by (Date, Symbol).
             If no rows were downloaded at all, an empty dataframe with the
             same index names is returned.
    '''
    logging.info("Getting OHLC data for days provided")
    use_date_range = start_date is not None and end_date is not None

    # Collect one frame per ticker and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
    frames = []
    for ticker in tickers:
        # Initiate object to get the data from API
        yf_ticker_obj = yf.Ticker(ticker)
        if use_date_range:
            OHLC_data = yf_ticker_obj.history(start = start_date, end = end_date, interval = "1d", auto_adjust = True)
        else:
            OHLC_data = yf_ticker_obj.history(period="max", interval = "1d", auto_adjust = True)

        if OHLC_data.empty:
            # Nothing to add for this ticker; record it so gaps can be traced in the log file
            logging.warning(f'No OHLC data returned for ticker {ticker}')
        else:
            OHLC_data.insert(0, 'Symbol', ticker)
            # Delete split and dividend columns
            OHLC_data = OHLC_data.drop(['Dividends','Stock Splits'], axis=1, errors='ignore')
            frames.append(OHLC_data)

        # Pause between requests so the API is not called in a tight loop
        time.sleep(.5)

    if not frames:
        # No data at all: return an empty frame that still has the expected index names
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["Date", "Symbol"])
        return pd.DataFrame(index=empty_index)

    final_OHLC_df = pd.concat(frames)
    # Move the date index into a column so that it can be combined with the symbol
    final_OHLC_df.reset_index(inplace=True)
    # Set index to date and symbol
    final_OHLC_df.set_index(["Date", "Symbol"], inplace=True)

    return final_OHLC_df
    

### Step 4: Get the last date and dataframe from the existing OHLC csv file
This function opens the existing csv file, loads its data into a dataframe, and returns that dataframe together with the last date present in the file.

In [9]:
def get_last_date_data(file):
    '''
    This function opens the existing OHLC file, reads it into a pandas dataframe and extracts the last date from the file
    Input Parameters: Name of the existing csv file that needs to be read in dataframe.
                      The file must contain 'Date' and 'Symbol' columns.
    Returns: Last date (as a pandas Timestamp) and the pandas dataframe containing the data,
             indexed by (Date, Symbol) with Date converted to plain dates
    Raises: ValueError if the file contains no data rows
    '''
    logging.info("Opening existing data file and extracting last date from it")

    old_data_df = pd.read_csv(file) # Read the csv file in a dataframe
    if old_data_df.empty:
        # Without any row there is no last date to resume from
        raise ValueError(f'No data rows found in {file}; cannot determine the last date')

    # Convert Date column to datetime
    old_data_df['Date'] = pd.to_datetime(old_data_df['Date'])
    # Series.max() is vectorised and skips missing dates (NaT); the builtin max()
    # returns NaT whenever the first row has a missing date.
    previous_date = old_data_df['Date'].max()
    logging.info(f'Last date in the file is {previous_date}')
    # Converting datetime to only date
    old_data_df['Date'] = old_data_df['Date'].dt.date
    old_data_df.set_index(["Date", "Symbol"], inplace=True)
    return previous_date, old_data_df

### The main function to run all the sub functions

In [10]:
if __name__ == "__main__":
    
    #tickers_list = ['AAPL', 'MSFT', 'fb', 'AIG', 'AMZN', 'CAT', 'GOOGL', 'A', 'AAL', 'ABC']
    # tickers_list = ['AAPL', 'MSFT'] # Created for testing
    # Get the next date in YYYY-MM-DD format. As API gives current business date data when next day is entered
    today = pd.Timestamp.now().normalize()
    next_day =  today + timedelta(days=1)
    #print(f'The next day is {next_day}')
   
    # Create name of the log file
    log_file_name = 'OHLC_yfin_logfile_' + str(today).split()[0] +'.log'
    #print(f'The log file name is {log_file_name}')
    # Initialize a log file at the Info level. This is just to ensure smooth debugging in case anything fails
    # %(asctime)s adds the time of creation of the LogRecord
    logging.basicConfig(filename=log_file_name, filemode="w", format='%(asctime)s - %(message)s', level=logging.INFO)
    
    logging.info("In the main function")
    logging.info(f'Processing for date {today}')
    
   # Name of the output file
    OHLC_data_file = "OHLC_yfinance_data.csv"
    #print(f'The name of the output file is {OHLC_data_file}')
    
    logging.info("Check if file already exists")
    
    # Check if the OHLC csv file existing the root folder from where this script is ran
    if os.path.isfile(OHLC_data_file):
        logging.info("OHLC Data file exists. Getting data in dataframe and last date")
        #print("OHLC Data file exists. Getting data in dataframe and last date")
        # Get the last date and dataframe
        previous_day, old_data_df = get_last_date_data(OHLC_data_file)
        latest_data = get_OHLC_data(tickers_list, previous_day, next_day)
        # Append new data with old data
        final_df = old_data_df.append(latest_data)
        # Reset index to remove duplicates
        final_df.reset_index(inplace=True)
        # Drop duplicates that has same symbol and date
        final_df = final_df.drop_duplicates(subset=['Date', 'Symbol'], keep='first')
        #final_df = final_df.drop_duplicates()
        # Set index to symbol and date
        final_df.set_index(["Date", "Symbol"], inplace=True)
        
    else:
        logging.info("OHLC file does not exists. Getting maximum possible data")
        #print("OHLC file does not exists. Getting maximum possible data")
        latest_data = get_OHLC_data(tickers_list) # Get the max data
        final_df = latest_data
    
    # Create name of the output file
    logging.info(f'Writing data in the file name {OHLC_data_file}')
    #print(f'Writing data in the file name {OHLC_data_file}')
    #print(final_df)
    final_df.to_csv(OHLC_data_file, mode='w', index=True) #index is True as we want it to be written in file
    