# Get open, high, low and close data from Yahoo Finance
## We will be getting at least 5 years of historical prices for S&P 500 stocks

### Step 1: Import the required libraries
We will be using the yfinance API and yahooquery to pull data from Yahoo Finance. To install these libraries, run `pip install yfinance` and `pip install yahooquery`.

In [63]:
import yfinance as yf
import yahooquery as yq
import pandas as pd
import csv
import logging #library to create log files
from datetime import datetime # To get the current date and time
from datetime import date
from pytz import timezone # Get timezone
import csv
import time
import os # To check if the file exists

### Step 2: Load tickers for S&P 500 stocks from csv file 
Note: We have created a separate notebook (S&P500Tickers.ipynb) that gets the S&P 500 stocks and stores them in a file named 'S&P500Tickers.csv'

### Step 3: Create a function that pulls data from the yfinance API for each ticker
OHLC - Open high low and close of a stock on a given date
The script needs a start and end date to pull data for that range; otherwise data for the maximum period will be pulled
Start date and end date have to be in the format YYYY-MM-DD
Add daily percent return [(close - Open) / Open]

In [None]:
def get_OHLC_data(tickers, days="max"):
    '''
    Download daily OHLC (open, high, low, close) data from Yahoo Finance via yfinance.

    The download uses a one-day interval, groups the result by ticker and asks
    yfinance to auto-adjust the prices. No daily-return column is added here.

    Input Parameters:
    tickers: Ticker symbols to download; passed unchanged to yf.download
             (the caller in this file passes a list such as ['AAPL', 'MSFT'])
    days: Value for yf.download's `period` argument, e.g. "5d" or "1mo".
          Defaults to "max", which requests the maximum available history

    Returns: The object returned by yf.download (a pandas DataFrame)
    '''
    logging.info("Getting OHLC data for days provided")

    # Use the module alias imported at the top of the file (`yf`) and the
    # `tickers` argument; the previous code referenced an undefined `yfAPI`
    # name and silently depended on a global `tickers_list`.
    OHLC_data = yf.download(tickers=tickers, period=days, interval="1d", group_by='ticker',
                            auto_adjust=True, prepost=False, threads=True, proxy=None)

    # TODO: daily percent return, (Close - Open) / Open, is not computed yet.
    # NOTE(review): with group_by='ticker' and several symbols the columns are
    # presumably keyed by ticker first, so the return would have to be
    # computed per ticker - confirm against the yfinance output.
    return OHLC_data

### Step 4: Get the last date and dataframe from the existing OHLC csv file
This function opens the existing csv file, loads its data into a dataframe, and also returns the last date found in it

In [None]:
def get_last_date_data(file):
    '''
    Load an existing OHLC csv file into a pandas dataframe and find the most recent date in it.

    Input Parameters: Name (path) of the existing csv file; it must contain a 'Date' column
    Returns: Tuple of (last date as a pandas Timestamp, dataframe with 'Date' parsed to datetime)
    '''
    logging.info("Opening existing data file and extracting last date from it")

    # Load the file, then parse the Date column so the dates compare chronologically
    existing_df = pd.read_csv(file)
    existing_df['Date'] = pd.to_datetime(existing_df['Date'])

    # The latest date present marks where the stored history ends
    date_column = existing_df['Date']
    last_date = max(date_column)

    print(f'Last date is {last_date}')
    logging.info(f'Last date in the file is {last_date}')

    result = (last_date, existing_df)
    return result

### The main function to run all the sub functions

In [None]:
if __name__ == "__main__":
    # Entry point: refresh OHLC_data.csv. If the file already exists, only the
    # days since its last stored date are requested; otherwise the maximum
    # available history is downloaded.

    tickers_list = ['AAPL', 'MSFT']

    # Today's date; str(today) is YYYY-MM-DD and is used in the log file name.
    # NOTE: date.today() uses the machine's local timezone, not US/Eastern.
    today = date.today()

    # Create name of the log file
    log_file_name = 'OHLC_logfile_' + str(today) + '.log'

    # Initialize a log file at the Info level. This is just to ensure smooth debugging in case anything fails
    # %(asctime)s adds the time of creation of the LogRecord
    logging.basicConfig(filename=log_file_name, filemode="w", format='%(asctime)s - %(message)s', level=logging.INFO)

    logging.info("In the main function")
    logging.info(f'Processing for date {today}')

    print(today)
    print(log_file_name)

    # Name of the output file
    OHLC_data_file = "OHLC_data.csv"

    logging.info("Check if file already exists")

    # Check if the OHLC csv file exists in the folder from where this script is run
    if os.path.isfile(OHLC_data_file):
        logging.info("OHLC Data file exists. Getting data in dataframe and last date")
        # Get the last date and dataframe
        previous_date, old_data_df = get_last_date_data(OHLC_data_file)

        # get_last_date_data returns a pandas Timestamp, and a plain
        # datetime.date minus a Timestamp raises TypeError, so compare
        # calendar dates instead.
        days_missing = (today - previous_date.date()).days
        no_days = str(days_missing) + 'd'  # Days to be passed in OHLC function is in format "1d"
        latest_data = get_OHLC_data(tickers_list, no_days)

        # Merge new data with old data and avoid duplicates. DataFrame.append
        # was removed in pandas 2.0; pd.concat is the supported equivalent.
        # NOTE(review): old_data_df carries 'Date' as a regular column while the
        # downloaded frame presumably has it as the index - confirm the two
        # frames line up before relying on drop_duplicates here.
        final_df = pd.concat([old_data_df, latest_data]).drop_duplicates()
    else:
        logging.info("OHLC file does not exists. Getting maximum possible data")
        latest_data = get_OHLC_data(tickers_list)  # Get the max data
        final_df = latest_data

    # Overwrite the output file with the combined data
    logging.info(f'Writing data in the file name {OHLC_data_file}')
    final_df.to_csv(OHLC_data_file, mode='w', index=True)  # index is True as we want to write index in csv file

In [None]:
# Scratch cell: print the current calendar date in the US/Eastern timezone
# as a YYYY-MM-DD string.
tz = timezone('US/Eastern')
now_eastern = datetime.now(tz)
x = now_eastern.strftime('%Y-%m-%d')
print(x)

In [None]:
# Scratch cell: parse a fixed YYYY-MM-DD string and keep its calendar date in `y`
datetime_obj = datetime.strptime("2020-10-19", "%Y-%m-%d")
y = date(datetime_obj.year, datetime_obj.month, datetime_obj.day)

In [None]:
from datetime import date

# Scratch cell: build the "<N>d" period string from the gap between today and
# the reference date `y` (set in an earlier cell), plus the dated log file name.
today = date.today()
print(today)

days_elapsed = (today - y).days
delta = f"{days_elapsed}d"
print(delta)

log_file_name = f"OHLC_logfile_{today}.log"
print(log_file_name)

In [None]:
# Exploratory download: daily bars for AAPL and MSFT over a "3d" period.
# An explicit window (start="2020-10-19", end="2020-10-22") could be passed
# instead of `period`.
data = yf.download(
    tickers="AAPL MSFT",
    # period: one of 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max (library default '1mo')
    period="3d",
    # interval: one of 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (library default '1d'; intraday intervals need period < 60 days)
    interval="1d",
    # group_by='ticker' allows access like data['SPY'] (library default 'column')
    group_by='ticker',
    # auto_adjust: adjust all OHLC values automatically (library default False)
    auto_adjust=True,
    # prepost: include pre/post regular market hours data (library default False)
    prepost=False,
    # threads: True/False/Integer, used for mass downloading (library default True)
    threads=True,
    # proxy: proxy URL scheme to use when downloading (library default None)
    proxy=None,
)

In [None]:
# Exploratory download: daily bars for AAPL and MSFT over a "1d" period.
# An explicit window (start="2020-10-19", end="2020-10-22") could be passed
# instead of `period`.
data1 = yf.download(
    tickers="AAPL MSFT",
    # period: one of 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max (library default '1mo')
    period="1d",
    # interval: one of 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (library default '1d'; intraday intervals need period < 60 days)
    interval="1d",
    # group_by='ticker' allows access like data['SPY'] (library default 'column')
    group_by='ticker',
    # auto_adjust: adjust all OHLC values automatically (library default False)
    auto_adjust=True,
    # prepost: include pre/post regular market hours data (library default False)
    prepost=False,
    # threads: True/False/Integer, used for mass downloading (library default True)
    threads=True,
    # proxy: proxy URL scheme to use when downloading (library default None)
    proxy=None,
)

In [8]:
import yfinance as yf

# Multi-symbol handle for MSFT, AAPL and GOOG.
tickers = yf.Tickers("MSFT AAPL GOOG")

# `msft` was used without being defined in this cell, which raises NameError
# on a fresh kernel; create the single-symbol handle explicitly.
msft = yf.Ticker("MSFT")

# Auto-adjusted daily history for the requested window.
msft.history(start="2020-10-20", end="2020-10-23", auto_adjust=True)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-19,220.419998,222.300003,213.720001,214.220001,27625800,0,0
2020-10-20,215.800003,217.369995,213.089996,214.649994,22753500,0,0
2020-10-21,213.119995,216.919998,213.119995,214.800003,22724900,0,0
2020-10-22,213.929993,216.059998,211.699997,214.889999,22334100,0,0
