## Update Stock Prices for each new trading day

In [1]:
# Import credentials

import json
# Read the credentials file inside a context manager so the file handle is
# closed as soon as the JSON has been parsed (the bare open() never closed it).
with open("/. .<your file path here> . . /credentials.json") as f:
    credentials = json.load(f)

# Unpack the individual settings used by the cells below.
file_path = credentials['file_path']
intrinio_key = credentials['intrinio_key']
aws_key = credentials['aws_access_key']
aws_secret_key = credentials['aws_secret_key']
rds_host = credentials['rds_host']
rds_user = credentials['rds_user']
rds_password = credentials['rds_password']
rds_database = credentials['rds_database']
rds_charset = credentials['rds_charset']


In [2]:
# Import Intrinio libraries

import time
import intrinio_sdk as intrinio
from intrinio_sdk.rest import ApiException

# Store the Intrinio API key (read from credentials.json) in the SDK client configuration.
# NOTE(review): this sets the key on a freshly created ApiClient; it presumably relies on the
# SDK sharing its configuration between client instances - confirm.
intrinio.ApiClient().configuration.api_key['api_key'] = intrinio_key

# Import Prefect library

from prefect.triggers import all_successful, all_failed
from prefect import task, Flow
import pendulum
from prefect.schedules import IntervalSchedule
from prefect.schedules.clocks import IntervalClock

# Import the usual Python libraries

from tqdm.notebook import tqdm, trange  # to be used to track progress in loop iterations
import pandas as pd
import numpy as np
import datetime as datetime
from datetime import datetime, date, time, timedelta

# Import the AWS libraries

import boto3
from boto3.s3.transfer import TransferConfig
from boto3.s3.transfer import S3Transfer
import io
import pyarrow as pa
import pyarrow.parquet as pq

# Import SQL libraries

import mysql.connector 
from mysql.connector import errorcode
from sqlalchemy import create_engine



In [3]:
# Fetch the last (max) date from the price history column

@task
def get_max_date():
    """Look up the most recent price date stored in RDS and measure how old it is.

    Runs ``SELECT MAX(date) FROM price_data_historical`` and compares the result
    with today's date.

    Side effects:
        Publishes ``lastUpdate``, ``td_days`` and ``todayDate`` as module-level
        globals, which other cells of this notebook read.

    Returns:
        tuple: ``(lastUpdate, td_days)`` - the last stored price date and the number
        of calendar days between that date and today.

    Raises:
        ValueError: if the query returns no date (``MAX(date)`` is NULL).
    """

    # Imported locally so `date` / `datetime` are the datetime classes here,
    # whatever the module-level names currently refer to.
    from datetime import date, datetime

    global lastUpdate
    global td_days
    global todayDate

    mydb = mysql.connector.connect(
        host=rds_host,
        user=rds_user,
        password=rds_password,
        database=rds_database
    )

    # Always release the cursor and the connection, even when the query fails.
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute("SELECT MAX(date) FROM price_data_historical")
            rows = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        mydb.close()

    max_date = rows[0][0] if rows else None
    if max_date is None:
        raise ValueError(
            "price_data_historical returned no MAX(date); cannot determine the last update."
        )

    # The driver may hand back a datetime or a plain date depending on the
    # column type; normalise to a date so the subtraction below works either way.
    if isinstance(max_date, datetime):
        max_date = max_date.date()

    todayDate = date.today()
    lastUpdate = max_date
    td_days = (todayDate - lastUpdate).days

    print("The last day that prices were updated was", lastUpdate.strftime('%m/%d/%Y'))
    print("That date was", td_days, "days ago.")

    return lastUpdate, td_days


In [4]:
# Get new data for each ticker to append to the price history table

@task
def get_price_data(lastUpdate, td_days):
    """Download exchange prices for every day after the last stored date.

    For each of the ``td_days`` calendar days following ``lastUpdate`` the
    'USCOMP' exchange prices are requested from the Intrinio API page by page.
    The pages are combined, de-duplicated, restricted to security codes
    'EQS', 'DR' and 'ETF', and reshaped into the column order used for upload.

    Args:
        lastUpdate: date of the most recent row already stored.
        td_days: number of days between ``lastUpdate`` and today.

    Side effects:
        Publishes ``df_prices`` (last page fetched), ``nextDateString`` (last date
        requested) and ``df_price_update_total`` as module-level globals.

    Returns:
        pandas.DataFrame: the cleaned price update; an empty frame when the API
        returned no prices.

    Raises:
        ApiException: when the same page request fails ``max_attempts`` times in a row.
    """

    # Local imports: `time` is rebound by a `from datetime import ...` line elsewhere
    # in this notebook, so take `sleep` directly from the stdlib module here.
    from datetime import timedelta
    from time import sleep

    global df_prices
    global nextDateString
    global df_price_update_total

    identifier = 'USCOMP'
    records = 10000
    max_attempts = 3

    # Pages are collected here and concatenated once, instead of growing a
    # DataFrame inside the loop.
    page_frames = []

    # For each day from the last price update to today, retrieve the new security prices from the Intrinio API.
    for updateDate in tqdm(range(1, td_days + 1)):

        nextDate = lastUpdate + timedelta(updateDate)
        nextDateString = nextDate.strftime("%Y-%m-%d")

        next_page = ''
        attempts = 0

        while next_page is not None:

            try:
                response = intrinio.StockExchangeApi().get_stock_exchange_prices(
                    identifier, date=nextDateString, page_size=records, next_page=next_page
                )
            except ApiException as exc:
                # The original swallowed every error and retried forever because
                # next_page never advanced. Retry a bounded number of times, then
                # fail the task so a partial update is not uploaded.
                attempts += 1
                if attempts >= max_attempts:
                    raise
                print("Intrinio request failed for", nextDateString, "- retrying:", exc)
                sleep(attempts)
                continue

            attempts = 0
            df_prices = pd.DataFrame([x.to_dict() for x in response.stock_prices])

            if df_prices.empty:
                print("No new prices available for ", nextDate.strftime('%m/%d/%Y'))
                break

            # Expand the nested security dict into its own columns.
            df_security = df_prices.security.apply(pd.Series)
            page_frames.append(
                pd.concat([df_prices, df_security], axis=1).drop(['security'], axis=1)
            )

            next_page = response.next_page

    if page_frames:
        df_price_update_total = pd.concat(page_frames, ignore_index=True, axis=0)
    else:
        df_price_update_total = pd.DataFrame()

    # If the API returns new prices, drop any duplicates and securities other than stocks, ADRS and ETFs, then
    # convert the intraperiod flag to an integer flag, rename the adj factor column, set the dates to datetime
    # format and reset the column order for uploading to the database.

    if len(df_price_update_total) > 0:

        df_price_update_total = df_price_update_total.drop_duplicates(subset=['ticker', 'figi', 'date'], keep='last')
        df_price_update_total = df_price_update_total[df_price_update_total.code.isin(['EQS', 'DR', 'ETF'])]
        # Explicit copy so the column assignments below act on an independent frame.
        df_price_update_total = df_price_update_total.dropna(subset=['figi']).copy()

        # Accept the flag both as the string 'TRUE' and as a boolean True; the
        # original string comparison mapped a boolean True to 0.
        # NOTE(review): confirm which form the SDK actually returns.
        df_price_update_total['intraperiod'] = (
            df_price_update_total['intraperiod'].astype(str).str.upper().eq('TRUE').astype(int)
        )

        df_price_update_total = df_price_update_total.rename(columns={'factor': 'adj_factor'})
        df_price_update_total['date'] = pd.to_datetime(df_price_update_total['date'])
        df_price_update_total = df_price_update_total[['ticker', 'figi', 'date', 'open', 'high', 'low', 'close',
                                                'volume', 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume',
                                                'adj_factor', 'split_ratio', 'change', 'percent_change',
                                                'fifty_two_week_high', 'fifty_two_week_low', 'intraperiod']]

        print("The initial price update dataframe is retrieved.")
        print("The shape of the price update DF is", df_price_update_total.shape)

        # The filters above can remove every row; min()/max() would then be NaT,
        # which cannot be formatted.
        if not df_price_update_total.empty:
            print("The date range in the update DF goes from ", df_price_update_total.date.min().strftime('%m/%d/%Y'), " to ",
                  df_price_update_total.date.max().strftime('%m/%d/%Y'))

    return df_price_update_total

    

In [5]:
# Get historical weighted average diluted shares outstanding for each ticker

def get_shares_out(myFigi, myTicker):
    """Fetch the latest weighted-average diluted share counts for one security.

    Requests the 'adjweightedavedilutedsharesos' tag from the Intrinio
    historical-data API (newest first, two records), tags every returned record
    with the ticker and FIGI, and appends the records to the module-level
    ``shares_out_lists_combined`` list.

    Args:
        myFigi: FIGI identifier passed to the API.
        myTicker: ticker symbol stored alongside each record.

    Side effects:
        Extends the global ``shares_out_lists_combined`` (which must already
        exist). On failure the ticker is appended to the global ``bad_tickers``
        list, which is created if it does not exist yet.

    Returns:
        list: the shared ``shares_out_lists_combined`` list.
    """

    global shares_out_list
    global shares_out_lists_combined

    tag = 'adjweightedavedilutedsharesos'

    try:
        response = intrinio.HistoricalDataApi().get_historical_data(
            myFigi, tag, frequency='', type='', start_date='', end_date='',
            sort_order='desc', page_size=2, next_page=''
        )

        # Build the records in a local list: this function runs on several
        # threads, and the original shared one global scratch list between them.
        ticker_rows = []
        for entry in response.historical_data:
            # Add the ticker and figi values to the results
            row = entry.to_dict()
            row['ticker'] = myTicker
            row['figi'] = myFigi
            ticker_rows.append(row)

    except Exception:
        # Track any tickers that do not have shares outstanding data available.
        # setdefault() guarantees the list exists; the original raised NameError
        # here whenever `bad_tickers` had not been defined at module level.
        globals().setdefault('bad_tickers', []).append(myTicker)

    else:
        # Extend once per ticker. The original extended inside the loop and so
        # added earlier records again on every iteration.
        shares_out_lists_combined.extend(ticker_rows)
        # Still published for interactive inspection; the last finishing worker wins.
        shares_out_list = ticker_rows

    return shares_out_lists_combined
    


In [6]:
# Get shares out data for each ticker.

@task
def get_shares_out_data(df_price_update_total):
    """Collect the most recent shares-outstanding figure for every ticker in the update.

    Calls ``get_shares_out`` for each distinct (figi, ticker) pair on a pool of
    10 threads, then cleans the combined records and keeps, per ticker, only the
    rows carrying that ticker's latest date.

    Args:
        df_price_update_total: price update frame with 'figi' and 'ticker' columns.

    Side effects:
        Resets and publishes the globals ``df_shares_out``,
        ``shares_out_lists_combined`` and ``bad_tickers``; the last two are
        filled in by ``get_shares_out``.

    Returns:
        pandas.DataFrame: latest shares-outstanding rows. Empty when there are no
        prices to update or when no shares data could be retrieved.
    """

    import concurrent.futures

    global df_shares_out
    global shares_out_lists_combined
    # Must be module-level: get_shares_out appends failed tickers to this name,
    # and a function-local list here would be invisible to it.
    global bad_tickers

    df_shares_out = pd.DataFrame()
    bad_tickers = []
    shares_out_lists_combined = []

    if len(df_price_update_total) > 0:

        # Grab the distinct (figi, ticker) pairs from the price update DF.
        arg_list = list(df_price_update_total[['figi', 'ticker']].drop_duplicates().to_records(index=False))

        # Use concurrent.futures to use multiple threads to retrieve shares out data.
        # Consuming the iterator makes any exception raised in a worker surface here
        # instead of being silently discarded.
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            list(executor.map(lambda f: get_shares_out(*f), arg_list))

        if bad_tickers:
            print("Shares outstanding data could not be retrieved for", len(bad_tickers), "tickers.")

        if not shares_out_lists_combined:
            # Nothing came back at all: return an empty frame that still carries the
            # expected columns rather than failing on the column lookups below.
            df_shares_out = pd.DataFrame(columns=['ticker', 'figi', 'date', 'weighted_avg_shares_out'])
            return df_shares_out

        # Convert the shares out array to a dataframe
        df_shares_out = pd.DataFrame(shares_out_lists_combined)

        # Drop any duplicates, make sure the date column is in datetime format, rename the shares column and make
        # sure zeros are nulled out and any negative values are replaced with absolutes.
        df_shares_out = df_shares_out.drop_duplicates(subset=['ticker', 'date'], keep='first').copy()
        df_shares_out['date'] = pd.to_datetime(df_shares_out['date'])
        df_shares_out = df_shares_out.rename(columns={'value': 'weighted_avg_shares_out'})
        df_shares_out['weighted_avg_shares_out'] = df_shares_out['weighted_avg_shares_out'].replace(0, np.nan)
        df_shares_out['weighted_avg_shares_out'] = df_shares_out['weighted_avg_shares_out'].abs()

        # Isolate the most recent shares out figures for each ticker
        df_shares_out = df_shares_out[df_shares_out.groupby('ticker')['date'].transform('max') == df_shares_out['date']]

    return df_shares_out


In [7]:
# Merge the price data with the shares out data to create the final update dataframe.

@task
def create_complete_update_dataframe(df_shares_out, df_price_update_total):
    """Join prices with shares outstanding and add the derived upload columns.

    Left-joins the shares figures onto the price rows by ticker and FIGI,
    computes ``market_cap`` as ``adj_close * weighted_avg_shares_out``, builds the
    ``key_id`` string (ticker + figi + ISO date), stamps ``last_update_date`` and
    puts the columns into upload order.

    Args:
        df_shares_out: shares frame with 'weighted_avg_shares_out', 'ticker' and
            'figi' columns; may be empty.
        df_price_update_total: cleaned price update frame.

    Side effects:
        Publishes ``df_price_update_complete`` as a module-level global.

    Returns:
        pandas.DataFrame: the complete update frame, or an empty frame when there
        are no prices to update.
    """

    from datetime import date

    global df_price_update_complete

    df_price_update_complete = pd.DataFrame()

    if len(df_price_update_total) > 0:

        # reindex() selects the three needed columns and creates any that are
        # missing, so an empty shares frame yields NaN shares instead of a KeyError.
        shares_columns = df_shares_out.reindex(columns=['weighted_avg_shares_out', 'ticker', 'figi'])

        # Use left join to add the shares out data to the stock prices, then calculate market caps and sort by ticker and dates
        df_price_update_complete = df_price_update_total.merge(shares_columns, on=['ticker', 'figi'], how='left')
        df_price_update_complete['market_cap'] = df_price_update_complete['adj_close'] * df_price_update_complete['weighted_avg_shares_out']
        df_price_update_complete['date'] = pd.to_datetime(df_price_update_complete['date'])
        df_price_update_complete = df_price_update_complete.sort_values(by=['ticker', 'date'])

        # Use the run date published by get_max_date when it exists; otherwise fall
        # back to today so this task also works when run on its own.
        update_date = globals().get('todayDate')
        if update_date is None:
            update_date = date.today()

        # Add unique primary key column, last update date, last corporate action date and re-order columns
        df_price_update_complete['key_id'] = df_price_update_complete['ticker'] + df_price_update_complete['figi'] + df_price_update_complete['date'].dt.strftime('%Y-%m-%d')
        df_price_update_complete['last_update_date'] = update_date
        df_price_update_complete['last_corp_action_date'] = None
        df_price_update_complete = df_price_update_complete[['key_id', 'ticker', 'figi', 'date', 'open', 'high', 'low', 'close', 'volume',
               'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume',
               'adj_factor', 'split_ratio', 'change', 'percent_change',
               'fifty_two_week_high', 'fifty_two_week_low', 'market_cap',
               'weighted_avg_shares_out', 'intraperiod', 'last_update_date', 'last_corp_action_date']]

        print("The shares outstanding are captured and market caps calculated for all tickers that have shares out data available.")
        print("The shape of the new DF is ", df_price_update_complete.shape)

    return df_price_update_complete



In [8]:
# Push the dataframe to CSV on S3 if you want to use AWS Lambda to take it from there and push it into 
# the RDS table.

@task
def push_data_to_S3(df_price_update_complete):
    """Upload the complete update frame to S3 as a CSV object.

    The frame is serialised to CSV in memory and written to bucket
    'bns-intrinio-data' under
    ``price-data-daily/df_price_update_complete_<date>.csv``. Nothing is
    uploaded when the frame is empty.

    Args:
        df_price_update_complete: frame produced by
            ``create_complete_update_dataframe``.

    Returns:
        None. The HTTP status of the upload is printed.
    """

    # Test the frame that was passed in. The original tested the global
    # df_price_update_total, which only exists when earlier cells ran in this kernel.
    if len(df_price_update_complete) > 0:

        # Create the AWS client
        client = boto3.client(
            's3',
            aws_access_key_id=aws_key,
            aws_secret_access_key=aws_secret_key,
            region_name='us-east-1'
        )

        # Keep the existing naming (the last date requested by get_price_data) when
        # that global exists; otherwise fall back to the newest date in the data.
        file_date = globals().get('nextDateString')
        if file_date is None:
            file_date = pd.to_datetime(df_price_update_complete['date']).max().strftime('%Y-%m-%d')

        myBucket = 'bns-intrinio-data'
        myFileLocation = "price-data-daily/df_price_update_complete_" + file_date + ".csv"

        with io.StringIO() as csv_buffer:
            df_price_update_complete.to_csv(csv_buffer, index=False)

            response = client.put_object(
                Bucket=myBucket, Key=myFileLocation, Body=csv_buffer.getvalue()
            )

        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")


In [9]:
# Or use SQLAlchemy to push the final dataframe into SQL DB on AWS RDS:

@task(trigger=all_successful)
def push_data_to_RDS(df_price_update_complete):
    """Append the complete update frame to the ``price_data_historical`` table.

    Connects to MySQL on port 3306 through SQLAlchemy with the
    mysql-connector driver and appends the rows with ``DataFrame.to_sql``.
    Nothing is written when the frame is empty.

    Args:
        df_price_update_complete: frame produced by
            ``create_complete_update_dataframe``.

    Returns:
        None. The number of appended rows is printed.
    """

    from urllib.parse import quote_plus

    # Test the frame that was passed in. The original tested the global
    # df_price_update_total, which only exists when earlier cells ran in this kernel.
    if len(df_price_update_complete) > 0:

        # Set database credentials. User and password are URL-escaped so characters
        # such as '@', ':' or '/' cannot corrupt the connection string.
        creds = {'usr': quote_plus(rds_user),
                 'pwd': quote_plus(rds_password),
                 'hst': rds_host,
                 'prt': 3306,
                 'dbn': rds_database}

        # MySQL connection string.
        connstr = 'mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}'

        # Create sqlalchemy engine for MySQL connection.
        engine = create_engine(connstr.format(**creds))

        try:
            # Write DataFrame to MySQL using the engine (connection) created above.
            df_price_update_complete.to_sql(name='price_data_historical',
                                            con=engine,
                                            if_exists='append',
                                            index=False)
        finally:
            # Release the pooled connections whether or not the write succeeded.
            engine.dispose()

        print("The new data has been appended to RDS. The number of new rows added is", df_price_update_complete.shape[0])


In [11]:
# Set up the daily run schedule.

# One run per day, anchored at 21:00 America/New_York on 2021-11-26.
schedule = IntervalSchedule(
    interval=timedelta(days=1),
    start_date=pendulum.datetime(2021, 11, 26, 21, tz="America/New_York"),
)

In [12]:
# Run the ETL update flow.

if __name__ == "__main__":

    # Wire the tasks together through their results, so each task receives the
    # output of its upstream task at run time. The original passed notebook
    # globals (undefined on a fresh kernel, stale otherwise) and referenced
    # `data_to_s3` before it was assigned.
    with Flow("Stock-Data-Update-ETL", schedule) as flow:

        # get_max_date returns (lastUpdate, td_days); indexing a task result
        # inside the flow context passes each element downstream.
        max_date = get_max_date()
        prices = get_price_data(max_date[0], max_date[1])
        shares = get_shares_out_data(prices)
        complete = create_complete_update_dataframe(shares, prices)

        # Both uploads depend only on the completed frame.
        data_to_s3 = push_data_to_S3(complete)
        data_to_rds = push_data_to_RDS(complete)

    # The RDS load is set as the flow's reference task.
    flow.set_reference_tasks([data_to_rds])


In [None]:
# Start the flow defined above.
# NOTE(review): with the daily schedule attached this presumably keeps the cell running and
# executes once per interval until interrupted - confirm against the Prefect version in use.
flow.run()

In [10]:
# Test ETL process.

# Run each task directly, outside the flow, and pass results along explicitly
# instead of relying on the globals the tasks set as a side effect.
lastUpdate, td_days = get_max_date.run()
df_price_update_total = get_price_data.run(lastUpdate, td_days)
df_shares_out = get_shares_out_data.run(df_price_update_total)
df_price_update_complete = create_complete_update_dataframe.run(df_shares_out, df_price_update_total)
push_data_to_S3.run(df_price_update_complete)
push_data_to_RDS.run(df_price_update_complete)

The last day that prices were updated was 11/24/2021
That date was 1 days ago.


  0%|          | 0/1 [00:00<?, ?it/s]

No new prices available for  11/25/2021
