## Adjust the database for recent corporate actions, such as splits and dividends

In [16]:
# Import credentials

import json
# Read the credentials file. The context manager closes the handle as soon as the
# JSON has been parsed (the original left the file open for the kernel's lifetime).
with open("/. .<your file path here> . . /credentials.json") as f:
    credentials = json.load(f)

# The settings are consumed by POSITION, not by key name, so the order of the
# entries in credentials.json must match the order of the names below.
(
    file_path,
    intrinio_key,
    aws_key,
    aws_secret_key,
    rds_host,
    rds_user,
    rds_password,
    rds_database,
    rds_charset,
) = list(credentials.values())[:9]

In [17]:
# Import Intrinio libraries

from __future__ import print_function
import time
import intrinio_sdk as intrinio
from intrinio_sdk.rest import ApiException

# Register the Intrinio API key (read from credentials.json) on the SDK client configuration.
intrinio.ApiClient().configuration.api_key['api_key'] = intrinio_key

# Import the usual Python libraries

from tqdm.notebook import tqdm, trange  # to be used to track progress in loop iterations
import pandas as pd
import numpy as np
import datetime as datetime
from datetime import datetime, date, time, timedelta
import sys

# Import the AWS libraries

import boto3
from boto3.s3.transfer import TransferConfig
from boto3.s3.transfer import S3Transfer
import io
import pyarrow as pa
import pyarrow.parquet as pq

# Import SQL libraries

import mysql.connector 
from mysql.connector import errorcode
from sqlalchemy import create_engine

# Declare the local File Path:

# Local directory for CSV exports; the value comes from the credentials file.
my_path = file_path

# S3 client shared by the export/backup functions below.
client = boto3.client(
    service_name = 's3',
    region_name = 'us-east-1',
    aws_access_key_id = aws_key,
    aws_secret_access_key = aws_secret_key,
)


In [18]:
# Fetch the most recent corporate-action update date (MAX(last_corp_action_date)) from the price history RDS table

def get_max_date():
    """Find when corporate actions were last applied and how many days ago that was.

    Runs ``SELECT MAX(last_corp_action_date) FROM price_data_historical`` and
    publishes the result through the module globals ``lastUpdate`` (the single
    result row, i.e. a one-element tuple), ``td_days`` and ``todayDate``, which
    the later pipeline steps read.

    Returns:
        tuple: ``(lastUpdate, td_days)``.

    Raises:
        ValueError: if the table holds no ``last_corp_action_date`` at all.
        SystemExit: if the last update happened today (nothing to do yet).
    """
    from datetime import date, datetime
    import mysql.connector

    global lastUpdate
    global td_days
    global todayDate

    mydb = mysql.connector.connect(
      host = rds_host,
      user = rds_user,
      password = rds_password,
      database = rds_database
    )

    # Always release the cursor and the connection, even if the query fails.
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute("SELECT MAX(last_corp_action_date) FROM price_data_historical")
            myresult = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        mydb.close()

    # MAX() over an empty table yields a single row containing NULL/None.
    if not myresult or myresult[0][0] is None:
        raise ValueError("price_data_historical has no last_corp_action_date value; cannot determine the last update.")

    lastUpdate = myresult[0]
    todayDate = date.today()

    # A DATETIME column comes back as datetime (needs .date()); a DATE column
    # comes back as a plain date, which has no .date() method.
    last_value = lastUpdate[0]
    last_day = last_value.date() if isinstance(last_value, datetime) else last_value

    td = todayDate - last_day
    td_days = td.days

    # Print the number of days since last script run, or if this script is run too soon, exit the script.

    if td_days > 0:
        print("The last day that corp actions were updated was", lastUpdate[0])
        print("That date was", td_days, "day(s) ago.")
    else:
        print("The last corporate action update was today. Try again tomorrow.")
        sys.exit(0)

    return lastUpdate, td_days
    

In [19]:
# Fetch the list of adjusted tickers and figis

def get_adjusted_tickers_figis(lastUpdate, td_days):
    """Collect every stock/ADR/ETF that had a price adjustment since the last update.

    For each of the ``td_days`` days starting at ``lastUpdate[0]`` the Intrinio
    exchange price-adjustments endpoint is queried for the 'USCOMP' identifier.
    Securities whose ``code`` is EQS, DR or ETF are kept together with their
    ``split_ratio`` and ``factor`` (renamed ``adj_factor`` in the result).

    Args:
        lastUpdate: one-element sequence whose first item is the last update date.
        td_days: number of days to scan, starting at ``lastUpdate[0]``.

    Returns:
        pandas.DataFrame: columns ticker, figi, code, date, split_ratio,
        adj_factor; also stored in the global ``df_adjusted_tickers_total``.

    Raises:
        SystemExit: when no qualifying adjustment was found on any day.

    NOTE(review): only the first page of each day is requested (``next_page``
    is never advanced) -- confirm a single day cannot exceed ``page_size`` rows.
    """
    global df_adjusted_tickers_total
    global response
    global df_security

    # One frame per day; concatenated once after the loop instead of growing a
    # DataFrame inside it.
    daily_frames = []

    # For each day since the last corporate actions update, fetch all the tickers/figis with recent corp actions.
    for updateDate in tqdm(range(0, td_days)):

        nextDate = lastUpdate[0] + timedelta(updateDate)
        nextDateString = nextDate.strftime("%Y-%m-%d")

        identifier = 'USCOMP'
        page_size = 10000
        next_page = ''

        try:
            response = intrinio.StockExchangeApi().get_stock_exchange_price_adjustments(identifier, date=nextDateString, page_size=page_size, next_page=next_page)

            adjustments = [x.to_dict() for x in (response.stock_price_adjustments or [])]
            if not adjustments:
                # Nothing reported for this day; move on to the next one.
                continue

            df_raw = pd.DataFrame(adjustments)
            df_security = df_raw.security.apply(pd.Series)

            if df_security.empty:
                print("No new adjustments available for ", nextDate.strftime('%m/%d/%Y'))
                # Skip only this day; later days may still carry adjustments.
                continue

            # Filter the data for only stocks, ADRs and ETFs
            df_day = df_security[df_security['code'].isin(['EQS', 'DR','ETF'])][['ticker', 'figi', 'code']].copy()

            # Add a date column
            df_day['date'] = nextDateString

            # Get the new split ratios and adjustment factors for each ticker/figi
            df_data = df_raw[['split_ratio', 'factor']].copy()
            df_data['ticker'] = df_security['ticker']

            # Merge the data to a single dataframe
            df_day = pd.merge(df_day, df_data, on = 'ticker', how = 'left')

        except Exception as exc:
            # Best effort: a failed day is reported and skipped instead of being
            # silently ignored (and instead of re-adding the previous day's rows).
            print("Could not fetch price adjustments for ", nextDateString, ": ", exc)
            continue

        daily_frames.append(df_day)

    if daily_frames:
        df_adjusted_tickers_total = pd.concat(daily_frames, ignore_index = True, axis = 0)
    else:
        df_adjusted_tickers_total = pd.DataFrame()

    # If there are no tickers to be adjusted, quit the routine, else continue.
    # The decision is based on the accumulated total, not on the last day scanned.
    if df_adjusted_tickers_total.shape[0] == 0:
        print("The number of adjusted EQS, DR or ETF securities is ", df_adjusted_tickers_total.shape[0])
        print("There is no need to proceed further")
        sys.exit(0)

    # Sort by date and ticker, and drop any duplicates or NaNs
    df_adjusted_tickers_total = df_adjusted_tickers_total.sort_values(by = ['date', 'ticker'], ascending = True)
    df_adjusted_tickers_total = df_adjusted_tickers_total.drop_duplicates(keep = 'first')
    df_adjusted_tickers_total = df_adjusted_tickers_total.dropna(axis=0)
    df_adjusted_tickers_total = df_adjusted_tickers_total.rename(columns = {'factor':'adj_factor'})

    print("The number of adjusted securities is ", df_adjusted_tickers_total.shape[0])
    print("The date range in the update list DF goes from ", df_adjusted_tickers_total.date.min(), " to ",
          df_adjusted_tickers_total.date.max())

    return df_adjusted_tickers_total


In [20]:
# Fetch the historical adjustment factors and split ratios for the adjusted tickers from the historical price data RDS table

def get_adj_factors_splits(df_adjusted_tickers_total):
    """Build the full split/adjustment-factor history for the adjusted securities.

    Reads the stored ``date, ticker, figi, adj_factor, split_ratio`` rows of
    ``price_data_historical`` for every FIGI in ``df_adjusted_tickers_total``
    and appends the newly reported adjustments from that frame.

    Args:
        df_adjusted_tickers_total: frame with at least the columns date,
            ticker, figi, split_ratio and adj_factor.

    Returns:
        pandas.DataFrame: combined history sorted by ticker and date
        (descending) with exact duplicate rows removed; also stored in the
        global ``df_splits_factors``.
    """
    import mysql.connector

    global df_splits_factors

    # De-duplicate while keeping order; plain ``str`` values keep the driver's
    # parameter conversion happy.
    figi_list = [str(figi) for figi in dict.fromkeys(df_adjusted_tickers_total['figi'].dropna().tolist())]

    myresult = []

    if figi_list:
        # One %s placeholder per FIGI: the driver does the quoting, and the
        # statement stays valid for a single FIGI (str(tuple) produced "('X',)").
        placeholders = ", ".join(["%s"] * len(figi_list))
        query = ("SELECT date, ticker, figi, adj_factor, split_ratio FROM price_data_historical "
                 "WHERE figi IN (" + placeholders + ")")

        mydb = mysql.connector.connect(
          host = rds_host,
          user = rds_user,
          password = rds_password,
          database = rds_database
        )

        try:
            mycursor = mydb.cursor()
            try:
                mycursor.execute(query, tuple(figi_list))
                myresult = mycursor.fetchall()
            finally:
                mycursor.close()
        finally:
            mydb.close()

    df_splits_factors = pd.DataFrame(myresult, columns = ['date', 'ticker', 'figi', 'adj_factor', 'split_ratio']).sort_values(by = ['figi', 'date'], ascending = False)

    df_splits_factors = pd.concat([df_splits_factors, df_adjusted_tickers_total[['date', 'ticker', 'figi', 'split_ratio', 'adj_factor']].copy()])
    df_splits_factors['date'] = pd.to_datetime(df_splits_factors['date'])
    df_splits_factors = df_splits_factors.sort_values(by = ['ticker', 'date'], ascending = False)
    df_splits_factors = df_splits_factors.drop_duplicates(keep = 'first')

    print("The shape of the historical splits and adjustments DF is ", df_splits_factors.shape)

    return df_splits_factors


In [21]:
# Get the new historical prices for each ticker

def get_historical_prices(myFigi, myTicker):
    """Download the complete daily price history of one security from Intrinio.

    Pages through ``SecurityApi.get_security_stock_prices`` until the API
    reports no further page, tags every row with the security's ticker and
    FIGI, and appends the rows to the shared global list
    ``adjusted_prices_total``.

    The function is run from several threads at once, so the rows of this
    security are collected in a LOCAL list first (the original rebound a global
    list that all threads shared).

    Args:
        myFigi: FIGI used as the API identifier.
        myTicker: ticker recorded in the global ``bad_tickers`` when the
            download fails.

    Returns:
        list: the shared ``adjusted_prices_total`` list.
    """
    # Make sure the shared accumulators exist even when this function is called
    # on its own; setdefault keeps lists already created by the caller.
    prices_total = globals().setdefault('adjusted_prices_total', [])
    failed_tickers = globals().setdefault('bad_tickers', [])

    identifier = myFigi
    start_date = ''
    end_date = todayDate
    frequency = 'daily'
    page_size = 10000
    next_page = ''

    adjusted_prices_list = []

    while next_page is not None:

        try:
            response = intrinio.SecurityApi().get_security_stock_prices(identifier, start_date=start_date, end_date=end_date, frequency=frequency, page_size=page_size, next_page=next_page)

            for price in response.stock_prices:

                # Add ticker and figi to the results
                dict_item = price.to_dict()
                dict_item['ticker'] = response.security.ticker
                dict_item['figi'] = response.security.figi
                adjusted_prices_list.append(dict_item)

            next_page = response.next_page

        except Exception:

            # Track tickers that do not have any price data available, then stop
            # paging: there is no valid response to read the next page from
            # (the original hit an unbound ``response`` or retried forever).
            failed_tickers.append(myTicker)
            break

    # Return adjusted prices
    prices_total.extend(adjusted_prices_list)

    return prices_total


In [22]:
# Fetch historical prices for adjusted tickers and add some extra calculations to match those in the history table

def get_historical_price_data(df_adjusted_tickers_total):
    """Download price histories for all adjusted securities and derive extra columns.

    Runs ``get_historical_prices`` for every (figi, ticker) pair on a pool of
    10 threads, merges the result with the global ``df_splits_factors`` and
    adds ``change``, ``percent_change`` and the rolling 260-row high/low of
    ``adj_close`` per ticker.

    Args:
        df_adjusted_tickers_total: frame with the columns figi and ticker.

    Returns:
        pandas.DataFrame: the enriched price history; also stored in the global
        ``df_adjusted_prices_total``.

    Raises:
        ValueError: if no price row could be downloaded for any security.
    """
    import concurrent.futures

    global df_adjusted_prices_total
    global arg_list
    global adjusted_prices_total
    global bad_tickers

    # Both lists are appended to by the worker threads, so they must be module
    # globals (the original created ``bad_tickers`` as an invisible local).
    bad_tickers = []
    adjusted_prices_total = []

    arg_list = list(df_adjusted_tickers_total[['figi', 'ticker']].to_records(index=False))

    # Use concurrent.futures to use multiple threads to retrieve price data.
    with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
        futures = {executor.submit(get_historical_prices, figi, ticker): ticker for figi, ticker in arg_list}

        # Inspect every future so that a crashing worker is reported instead of
        # vanishing (results of executor.map were never consumed before).
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print("Price download failed for ", futures[future], ": ", error)

    if not adjusted_prices_total:
        raise ValueError("No historical price data was returned for any of the adjusted tickers.")

    # Convert the price array to a dataframe, set the date column to datetime format, drop any duplicates,
    # combine the prices with split and adjustment factors, then default missing factors/ratios to 1.
    df_adjusted_prices_total = pd.DataFrame(adjusted_prices_total)
    df_adjusted_prices_total['date']= pd.to_datetime(df_adjusted_prices_total['date'])
    df_adjusted_prices_total = df_adjusted_prices_total.drop_duplicates(subset=['ticker', 'date'], keep = 'first')
    df_adjusted_prices_total = pd.merge(df_adjusted_prices_total, df_splits_factors, on = ['date', 'ticker', 'figi'], how = 'left')
    df_adjusted_prices_total[['adj_factor', 'split_ratio']] = df_adjusted_prices_total[['adj_factor', 'split_ratio']].fillna(1)

    # Add change, pct_change, and 52 week high/low columns

    # Sort and group once; the results align back to the frame through the index.
    adj_close_by_ticker = df_adjusted_prices_total.sort_values('date').groupby(['ticker']).adj_close

    df_adjusted_prices_total['change'] = adj_close_by_ticker.diff()
    df_adjusted_prices_total['percent_change'] = adj_close_by_ticker.pct_change()

    df_adjusted_prices_total['fifty_two_week_high'] = adj_close_by_ticker.rolling(window = 260).max().reset_index(0,drop=True)
    df_adjusted_prices_total['fifty_two_week_low'] = adj_close_by_ticker.rolling(window = 260).min().reset_index(0,drop=True)

    print("The shape of the historical price data DF is ", df_adjusted_prices_total.shape)
    print("The earliest date is ", df_adjusted_prices_total['date'].min())

    return df_adjusted_prices_total


In [23]:
# Get historical weighted average diluted shares outstanding for each ticker

def get_shares_out(myFigi, myTicker):
    """Download the 'adjweightedavedilutedsharesos' history of one security.

    Calls ``HistoricalDataApi.get_historical_data`` once (first page only),
    tags each record with the given ticker and FIGI and appends the records to
    the shared global list ``shares_out_lists_combined``.

    The function is run from several threads, so the records are gathered in a
    LOCAL list that always starts empty; a failed call therefore contributes
    nothing (the original re-added whatever another call had left in a shared
    global list).

    Args:
        myFigi: FIGI used as the API identifier and written to each record.
        myTicker: ticker written to each record and recorded in the global
            ``bad_tickers`` when the download fails.

    Returns:
        tuple: ``(shares_out_lists_combined, shares_out_list)`` -- the shared
        list and the records fetched by this call.
    """
    # Make sure the shared accumulators exist even when this function is called
    # on its own; setdefault keeps lists already created by the caller.
    combined = globals().setdefault('shares_out_lists_combined', [])
    failed_tickers = globals().setdefault('bad_tickers', [])

    identifier = myFigi
    tag = 'adjweightedavedilutedsharesos'
    frequency = ''
    start_date = ''
    end_date = ''
    sort_order = 'desc'
    page_size = 10000
    next_page = ''

    shares_out_list = []

    try:
        response = intrinio.HistoricalDataApi().get_historical_data(identifier, tag, frequency=frequency, type='', start_date=start_date, end_date=end_date, sort_order=sort_order, page_size=page_size, next_page=next_page)

        for record in response.historical_data:

            # Add ticker and figi to the results
            dict_item = record.to_dict()
            dict_item['ticker'] = myTicker
            dict_item['figi'] = myFigi
            shares_out_list.append(dict_item)

    except Exception:

        # Track tickers that do not have any shares out data available.
        failed_tickers.append(myTicker)

    combined.extend(shares_out_list)

    return combined, shares_out_list


In [24]:
# Get the historical shares outstanding data for all tickers. Since shares out are reported quarterly, resample
# the data to show daily records.

def get_shares_out_data(df_adjusted_tickers_total):
    """Download shares-outstanding histories and expand them to daily records.

    Runs ``get_shares_out`` for every (figi, ticker) pair on a pool of 10
    threads, cleans the values (zeros become NaN, negatives are made positive)
    and resamples each ticker to daily frequency with forward fill.

    Args:
        df_adjusted_tickers_total: frame with the columns figi and ticker.

    Returns:
        pandas.DataFrame: daily shares-outstanding records; also stored in the
        global ``df_shares_out``. When nothing could be downloaded an empty
        frame with the columns date, weighted_avg_shares_out, ticker and figi
        is returned.
    """
    import concurrent.futures

    global df_shares_out
    global shares_out_lists_combined

    # ``bad_tickers`` must be a module global for the workers to reach it.
    # It is created only if missing, so failures recorded by an earlier step
    # of the same run are kept.
    globals().setdefault('bad_tickers', [])
    shares_out_lists_combined = []

    arg_list = list(df_adjusted_tickers_total[['figi', 'ticker']].to_records(index=False))

    # Use concurrent.futures to use multiple threads to retrieve shares out data.
    with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
        futures = {executor.submit(get_shares_out, figi, ticker): ticker for figi, ticker in arg_list}

        # Inspect every future so that a crashing worker is reported instead of
        # vanishing (results of executor.map were never consumed before).
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print("Shares out download failed for ", futures[future], ": ", error)

    if not shares_out_lists_combined:
        # No data for any ticker: hand back an empty, correctly typed frame so
        # a later left join on ticker/figi/date still works.
        df_shares_out = pd.DataFrame({
            'date': pd.to_datetime([]),
            'weighted_avg_shares_out': pd.Series(dtype = 'float64'),
            'ticker': pd.Series(dtype = 'object'),
            'figi': pd.Series(dtype = 'object'),
        })
        print("The shape of the shares out DF is ", df_shares_out.shape)
        return df_shares_out

    # Convert the shares out array to a dataframe, drop any duplicates, rename the values column, replace zeros with
    # NaNs, and get rid of any negative shares out numbers.
    df_shares_out = pd.DataFrame(shares_out_lists_combined)
    df_shares_out = df_shares_out.drop_duplicates(subset=['ticker', 'date'], keep = 'first')
    df_shares_out['date'] = pd.to_datetime(df_shares_out['date'])
    df_shares_out = df_shares_out.rename(columns = {'value':'weighted_avg_shares_out'})
    df_shares_out['weighted_avg_shares_out'] = df_shares_out['weighted_avg_shares_out'].replace(0, np.nan)
    df_shares_out['weighted_avg_shares_out'] = df_shares_out['weighted_avg_shares_out'].abs()

    # Set date as index and convert to daily periods. Since shares out are reported quarterly, we need to resample to
    # daily records.
    df_shares_resample = df_shares_out.copy()
    df_shares_resample = df_shares_resample.set_index('date')
    df_shares_resample.index = pd.to_datetime(df_shares_resample.index)
    df_shares_resample = df_shares_resample.groupby('ticker').resample('D', convention = 'end').ffill()
    df_shares_resample = df_shares_resample.droplevel('ticker')
    df_shares_resample = df_shares_resample.reset_index()

    df_shares_out = df_shares_resample.copy()

    print("The shape of the shares out DF is ", df_shares_out.shape)

    return df_shares_out


In [25]:
# Use left join to add shares out to history dataframe and calculate market cap

def combine_transform_adjusted_data(df_adjusted_prices_total, df_shares_out):
    """Join prices with shares outstanding, compute market cap and shape the load frame.

    The price frame is left-joined to the shares-outstanding frame on
    ticker/figi/date, shares are forward-filled per ticker, market cap is
    ``adj_close * weighted_avg_shares_out``, bookkeeping columns (update dates
    taken from the global ``todayDate`` and a ``key_id`` built from ticker,
    figi and date) are added, and the columns are cast and ordered for loading.

    Returns:
        pandas.DataFrame: the 25-column frame, also stored in the global
        ``df_adjusted_prices_complete``.
    """
    global df_adjusted_prices_complete

    float_columns = ['open', 'high', 'low', 'close', 'adj_open', 'adj_high', 'adj_low', 'adj_close',
                     'adj_factor', 'change', 'percent_change', 'fifty_two_week_high', 'fifty_two_week_low']
    integer_columns = ['volume', 'adj_volume', 'split_ratio']
    target_dtypes = {**dict.fromkeys(float_columns, 'Float32'), **dict.fromkeys(integer_columns, 'Int32')}

    output_columns = [
        'key_id', 'ticker', 'figi', 'date',
        'open', 'high', 'low', 'close', 'volume',
        'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume',
        'adj_factor', 'split_ratio', 'change', 'percent_change',
        'fifty_two_week_high', 'fifty_two_week_low',
        'market_cap', 'weighted_avg_shares_out', 'intraperiod',
        'last_updated_date', 'last_corp_action_date',
    ]

    # Left join keeps every price row; rows are then ordered per ticker in time.
    frame = pd.merge(df_adjusted_prices_total, df_shares_out, on=['ticker', 'figi', 'date'], how='left')
    frame = frame.sort_values(by = ['ticker', 'date'], ascending = True)

    # Carry the last known share count forward inside each ticker.
    shares_by_ticker = frame.groupby('ticker')['weighted_avg_shares_out']
    frame['weighted_avg_shares_out'] = shares_by_ticker.transform(lambda x: x.ffill())
    frame['market_cap'] = frame['adj_close'] * frame['weighted_avg_shares_out']

    # Bookkeeping columns and the primary key.
    frame['last_updated_date'] = todayDate
    frame['last_corp_action_date'] = todayDate
    frame['date'] = pd.to_datetime(frame['date'])
    date_text = frame['date'].dt.strftime('%Y-%m-%d')
    frame['key_id'] = frame['ticker'] + frame['figi'] + date_text
    frame = frame.drop_duplicates(subset = ['key_id'], keep = 'first')

    # Reset the data types for each column to be MySQL compliant (same order of
    # operations as before: null replacement, dtype inference, date, casts).
    frame = frame.where(frame.notnull(), None)
    frame = frame.convert_dtypes()
    frame['date'] = frame['date'].dt.date
    frame = frame.astype(target_dtypes)

    # Reset the column order.
    df_adjusted_prices_complete = frame[output_columns]

    print("The complete corp actions DF shape is ", df_adjusted_prices_complete.shape)

    return df_adjusted_prices_complete


In [26]:
# Push the dataframe to CSV on S3 for backup and/or archive purposes.

def export_data_to_S3(df_adjusted_prices_complete):
    """Upload the adjusted price frame to S3 as a CSV named after the global ``todayDate``.

    The frame is serialised to an in-memory text buffer and stored with a
    single ``put_object`` call on the module-level ``client``. The HTTP status
    of the response is printed; nothing is returned.
    """
    import io

    bucket_name = 'bns-intrinio-data'
    object_key = "price-data-daily/df_adjusted_prices_" + str(todayDate) + ".csv"

    with io.StringIO() as csv_buffer:
        df_adjusted_prices_complete.to_csv(csv_buffer, index=False)

        put_result = client.put_object(
            Body=csv_buffer.getvalue(), Bucket = bucket_name, Key = object_key
        )

        http_code = put_result.get("ResponseMetadata", {}).get("HTTPStatusCode")

        # Guard clause: report the failure and stop.
        if http_code != 200:
            print(f"Unsuccessful S3 put_object response. Status - {http_code}")
            return

        print(f"Successful S3 put_object response. Status - {http_code}")
        print("New adjusted history data sucessfully posted to S3.")

### Update History Table

In [27]:
# Select and save pre-adjusted records in S3 prior to deletion. Just in case we need to undo the operation.

def save_preadjusted_records_to_S3(df_adjusted_prices_complete):
    """Back up the rows that are about to be replaced to a CSV on S3.

    Every ``price_data_historical`` row whose FIGI appears in
    ``df_adjusted_prices_complete`` is read (in roughly ten batches) and
    uploaded as one CSV named after the global ``todayDate``.

    NOTE(review): the rows come from ``SELECT *`` and are labelled with the
    columns of ``df_adjusted_prices_complete``; this presumes the table's
    column order matches the frame -- verify against the table definition.

    Args:
        df_adjusted_prices_complete: frame with a ``figi`` column.

    Returns:
        The HTTP status code of the S3 ``put_object`` response (200 on
        success); also stored in the global ``save_records_completion_status``.
    """
    import mysql.connector
    import io

    global save_records_completion_status

    # Plain ``str`` values keep the driver's parameter conversion happy.
    figi_list = [str(figi) for figi in df_adjusted_prices_complete['figi'].dropna().unique()]
    preadjusted_records = []

    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    # About ten batches; never 0, which made range() fail for fewer than ten FIGIs.
    chunk_size = max(1, len(figi_list) // 10)

    mydb = mysql.connector.connect(
      host = rds_host,
      user = rds_user,
      password = rds_password,
      database = rds_database
    )

    try:
        for chunk in chunker(figi_list, chunk_size):

            # One %s placeholder per FIGI; the driver handles the quoting, so a
            # single-element batch needs no special case.
            placeholders = ", ".join(["%s"] * len(chunk))

            mycursor = mydb.cursor()
            try:
                mycursor.execute("SELECT * FROM price_data_historical WHERE figi IN (" + placeholders + ")", tuple(chunk))
                myresult = mycursor.fetchall()
            finally:
                mycursor.close()

            preadjusted_records.extend(myresult)
    finally:
        mydb.close()

    df_preadjusted_records = pd.DataFrame(preadjusted_records, columns = df_adjusted_prices_complete.columns)

    myBucket = 'bns-intrinio-data'
    myFileLocation = "price-data-daily/df_preadjusted_records_" + str(todayDate) + ".csv"

    with io.StringIO() as csv_buffer:
        df_preadjusted_records.to_csv(csv_buffer, index=False)

        response = client.put_object(
            Bucket = myBucket, Key = myFileLocation, Body=csv_buffer.getvalue()
        )

        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status == 200:
            print(f"Successful S3 put_object response. Status - {status}")
            print(df_preadjusted_records.shape[0], "Data record(s) saved on S3.")
        else:
            print(f"Unsuccessful S3 put_object response. Status - {status}")

    save_records_completion_status = status

    return save_records_completion_status
                                 
                                 

In [28]:
# Delete records to be updated from SQL table. The SQL Connector library does not have a reliable "Upsert" function,
# so we need to delete then replace the data.

def delete_preadjusted_records(save_records_completion_status):
    """Delete the history rows of every FIGI that is about to be re-inserted.

    The FIGIs are taken from the global ``df_adjusted_prices_complete``. The
    deletion only runs when the preceding S3 backup succeeded, so the removed
    rows can always be restored.

    Args:
        save_records_completion_status: HTTP status returned by the backup
            step; must be 200.

    Returns:
        str: ``"Deleted."``; also stored in the global ``delete_records_status``.

    Raises:
        RuntimeError: if the backup status is not 200 (nothing is deleted).
    """
    import mysql.connector

    global delete_records_status

    # Never delete rows whose backup did not make it to S3.
    if save_records_completion_status != 200:
        raise RuntimeError(
            "Pre-adjusted records were not backed up to S3 (status "
            + str(save_records_completion_status) + "); refusing to delete them."
        )

    # Plain ``str`` values keep the driver's parameter conversion happy.
    figi_list = [str(figi) for figi in df_adjusted_prices_complete['figi'].dropna().unique()]

    deleted_rows = 0

    if figi_list:
        # One %s placeholder per FIGI: the driver does the quoting, and the
        # statement stays valid for a single FIGI (str(tuple) produced "('X',)").
        placeholders = ", ".join(["%s"] * len(figi_list))
        sql_delete_query = "DELETE FROM price_data_historical WHERE figi IN (" + placeholders + ")"

        mydb = mysql.connector.connect(
          host = rds_host,
          user = rds_user,
          password = rds_password,
          database = rds_database
        )

        try:
            mycursor = mydb.cursor()
            try:
                mycursor.execute(sql_delete_query, tuple(figi_list))
                mydb.commit()
                deleted_rows = mycursor.rowcount
            except Exception:
                # Leave the table untouched if anything goes wrong mid-way.
                mydb.rollback()
                raise
            finally:
                mycursor.close()
        finally:
            mydb.close()

    print(deleted_rows, "Data record(s) deleted.")
    delete_records_status = "Deleted."

    return delete_records_status



In [29]:
# Insert new updated records in the SQL history table.

def insert_new_records(delete_records_status):
    """Insert the rows of the global ``df_adjusted_prices_complete`` into the history table.

    All rows are sent with one ``executemany`` call and committed together; on
    any error the transaction is rolled back and the error re-raised.

    Args:
        delete_records_status: status string of the preceding delete step
            (accepted for pipeline sequencing; not inspected).

    Returns:
        str: ``"Inserted. Done."``; also stored in the global
        ``insert_records_status``.
    """
    import pymysql.cursors

    global insert_records_status

    # One placeholder per frame column (25 for the frame built in this notebook),
    # so the statement follows the frame instead of a hard-coded count.
    placeholders = ", ".join(["%s"] * len(df_adjusted_prices_complete.columns))

    sql_insert_query = """
    INSERT INTO price_data_historical 
    VALUES (""" + placeholders + """)
    """

    # Missing values (NaN / pd.NA / NaT) must reach the driver as None so they
    # are stored as SQL NULL rather than as their text representation.
    data = (
        df_adjusted_prices_complete
        .astype(object)
        .where(df_adjusted_prices_complete.notna(), None)
        .values
        .tolist()
    )

    connection = pymysql.connect(host = rds_host,
                                 user = rds_user, 
                                 password = rds_password, 
                                 database = rds_database,
                                 charset = rds_charset,
                                 cursorclass=pymysql.cursors.DictCursor)

    try:
        mycursor = connection.cursor()
        try:
            mycursor.executemany(sql_insert_query, data)
            connection.commit()
            inserted_rows = mycursor.rowcount
        except Exception:
            # Do not leave a half-loaded table behind.
            connection.rollback()
            raise
        finally:
            mycursor.close()
    finally:
        connection.close()

    print(inserted_rows, "Data records inserted.")
    insert_records_status = "Inserted. Done."

    return insert_records_status


In [30]:
# Run ETL process.

# Each step's return value is bound explicitly; these are the same objects the
# functions also publish as module globals.
lastUpdate, td_days = get_max_date()
df_adjusted_tickers_total = get_adjusted_tickers_figis(lastUpdate, td_days)
df_splits_factors = get_adj_factors_splits(df_adjusted_tickers_total)
df_adjusted_prices_total = get_historical_price_data(df_adjusted_tickers_total)
df_shares_out = get_shares_out_data(df_adjusted_tickers_total)
df_adjusted_prices_complete = combine_transform_adjusted_data(df_adjusted_prices_total, df_shares_out)

# Archive the new data, back up the old rows, then replace them in the table.
export_data_to_S3(df_adjusted_prices_complete)
save_records_completion_status = save_preadjusted_records_to_S3(df_adjusted_prices_complete)
delete_records_status = delete_preadjusted_records(save_records_completion_status)
insert_new_records(delete_records_status)



The last day that corp actions were updated was 2021-10-28 00:00:00
That date was 30 day(s) ago.


  0%|          | 0/30 [00:00<?, ?it/s]

The number of adjusted securities is  1131
The date range in the update list DF goes from  2021-10-28  to  2021-11-26
The shape of the historical splits and adjustments DF is  (4146442, 5)
The shape of the historical price data DF is  (3710702, 21)
The earliest date is  1962-01-02 00:00:00
The shape of the shares out DF is  (1373032, 4)
The complete corp actions DF shape is  (3710587, 25)
Successful S3 put_object response. Status - 200
New adjusted history data sucessfully posted to S3.
Successful S3 put_object response. Status - 200
3731347 Data record(s) saved on S3.
3731347 Data record(s) deleted.
3710587 Data records inserted.


'Inserted. Done.'

In [58]:
# Take a look at the bad tickers that did not pull any results from the price or shares out queries and make 
# sure they are not well recognized names/tickers. E.g. none should be MSFT or AAPL. Most should be ETFs or very
# small cap stocks.

# Attach the adjustment details to each failed ticker, remove exact duplicate
# rows, save the list next to the other local exports and display it.
df_bad_tickers = (
    pd.DataFrame(bad_tickers, columns=['ticker'])
    .merge(df_adjusted_tickers_total, how='left', on=['ticker'])
    .drop_duplicates(keep = 'first')
)
df_bad_tickers.to_csv(my_path + "/df_bad_tickers.csv", index=False)
df_bad_tickers

Unnamed: 0,ticker,figi,code,date,split_ratio,adj_factor
0,AGOX,BBG010WX25T5,ETF,2021-11-04,1.0,0.949429
1,FRBA,BBG000Q92GH5,EQS,2021-11-04,1.0,0.995924
2,BPRN,BBG00CXM0LX5,EQS,2021-11-04,1.0,0.994061
3,LVHD,BBG00BRDT880,ETF,2021-11-04,1.0,0.993915
4,MLPA,BBG002WKD634,ETF,2021-11-05,1.0,0.980469
...,...,...,...,...,...,...
169,XYLD,BBG00MVL3FH5,ETF,2021-11-22,1.0,0.992780
170,VABS,BBG00Z6QCLL5,ETF,2021-11-22,1.0,0.998601
171,VRIG,BBG00DW0GB49,ETF,2021-11-22,1.0,0.999405
172,XYLG,BBG00XH4TSF1,ETF,2021-11-22,1.0,0.996302
