## This is a set of modules for downloading historical equity data from Intrinio APIs and converting it into Point & Figure calculations.


First we import a variety of libraries and credentials to access APIs from Intrinio and AWS.

In [1]:
# Import credentials

import json
# Read the credentials file inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open("/. .<your file path here> . . /credentials.json") as f:
    credentials = json.load(f)

# Local folder used for the CSV / zip / parquet outputs.
file_path = credentials['file_path']

# Intrinio and AWS API credentials.
intrinio_key = credentials['intrinio_key']
aws_key = credentials['aws_access_key']
aws_secret_key = credentials['aws_secret_key']

# MySQL (RDS) connection settings.
rds_host = credentials['rds_host']
rds_user = credentials['rds_user']
rds_password = credentials['rds_password']
rds_database = credentials['rds_database']
rds_charset = credentials['rds_charset']


In [2]:
# Import Intrinio libraries

from __future__ import print_function
import time
import intrinio_sdk as intrinio
from intrinio_sdk.rest import ApiException

# Register the Intrinio API key loaded from the credentials file.
# NOTE(review): the key is set on the configuration of a freshly constructed ApiClient;
# presumably the SDK shares one configuration object between clients, so the API calls
# further down pick it up -- confirm against the intrinio_sdk documentation.
intrinio.ApiClient().configuration.api_key['api_key'] = intrinio_key

# Import the usual Python libraries

from tqdm.notebook import tqdm, trange  # to be used to track progress in loop iterations
import pandas as pd
import numpy as np

# Import Zip file libraries

from zipfile import ZipFile
from io import BytesIO
import urllib.request as urllib2

# Import the AWS libraries

import boto3
from boto3.s3.transfer import TransferConfig
from boto3.s3.transfer import S3Transfer
import io
import pyarrow as pa
import pyarrow.parquet as pq

# Local folder for saved outputs. The functions below read `my_path` as a
# module-level name, so it only needs to be bound here at the top level.
my_path = file_path



## Extract the historical data from Intrinio

In [3]:
# Assemble the five Intrinio price history files into one dataframe after downloading them to your hard drive first.

def assemble_bulk_history():
    """Assemble the five locally saved Intrinio price history files into one dataframe.

    Reads ``stock_prices_uscomp_all_file-1.zip`` through ``-5.zip`` from
    ``my_path``, stacks them row-wise, applies the 29 expected column names,
    converts ``date`` to datetime and sorts by ticker and date.

    Returns:
        pandas.DataFrame: the combined price history. The same frame is also
        stored in the module-level ``df_price_history`` for the later cells.

    Raises:
        ValueError: if a file does not have exactly 29 columns.
    """

    global df_price_history

    column_names = ['security_id', 'company_id', 'name', 'cik', 'ticker', 'figi', 'composite_figi', 'composite_ticker', 'exchange_ticker', 'date', 'type', 'frequency', 'open', 'high', 'low', 'close', 'volume', 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume', 'adj_factor', 'ex_dividend',  'split_ratio', 'change', 'percent_change', 'fifty_two_week_high', 'fifty_two_week_low']

    # Collect one frame per file and concatenate once at the end. Growing a
    # NumPy array inside the loop re-copies everything read so far on every
    # pass and turns every column into generic Python objects.
    frames = []

    for X in tqdm(range(1, 6)):

        ticker_file_path = my_path + "/" + "stock_prices_uscomp_all_file-" + str(X) + ".zip"
        frame = pd.read_csv(ticker_file_path, low_memory = False)
        frame.columns = column_names   # files are stacked by column position, not by header text
        frames.append(frame)

    df_price_history = pd.concat(frames, ignore_index = True)

    # Make sure Date column is in DateTime format, then sort by ticker and date

    df_price_history['date'] = pd.to_datetime(df_price_history['date'])
    df_price_history = df_price_history.sort_values(by=['ticker', 'date'])

    print("Price history files assembled.")
    print("The shape of the price history dataframe is ", df_price_history.shape)

    return df_price_history


In [4]:
# Download the five Intrinio price history files from their bulk history API and assemble them into one dataframe.

def download_bulk_history():
    """Download the five Intrinio bulk price history files and assemble them into one dataframe.

    Asks the Intrinio bulk downloads API for its download links, fetches the
    first five links of the second bulk download entry, reads the CSV inside
    each zip archive, stacks the files row-wise, converts ``date`` to datetime
    and sorts by ticker and date.

    Returns:
        pandas.DataFrame: the combined price history. The same frame is also
        stored in the module-level ``df_price_history`` for the later cells.

    Raises:
        ValueError: if a file does not have exactly 29 columns.
    """

    global df_price_history

    column_names = ['security_id', 'company_id', 'name', 'cik', 'ticker', 'figi', 'composite_figi', 'composite_ticker', 'exchange_ticker', 'date', 'type', 'frequency', 'open', 'high', 'low', 'close', 'volume', 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume', 'adj_factor', 'ex_dividend',  'split_ratio', 'change', 'percent_change', 'fifty_two_week_high', 'fifty_two_week_low']

    # Collect one frame per file and concatenate once at the end, so the data
    # is not re-copied on every pass and column dtypes are kept.
    frames = []

    response = intrinio.BulkDownloadsApi().get_bulk_download_links()

    for X in tqdm(range(0, 5)):

        # NOTE(review): index 1 is hard-coded; presumably it is the US composite
        # stock prices entry of the bulk download list -- confirm against the API response.
        url = response.bulk_downloads[1].links[X].url

        # Close the HTTP response and the archive as soon as each has been read.
        with urllib2.urlopen(url) as remote:
            payload = remote.read()

        with ZipFile(BytesIO(payload)) as archive:
            with archive.open("stock_prices_uscomp_all_file-" + str(X+1) + ".csv") as data_csv:
                frame = pd.read_csv(data_csv, low_memory=False)

        frame.columns = column_names   # files are stacked by column position, not by header text
        frames.append(frame)

    df_price_history = pd.concat(frames, ignore_index = True)

    # Make sure Date column is in DateTime format, then sort by ticker and date

    df_price_history['date'] = pd.to_datetime(df_price_history['date'])
    df_price_history = df_price_history.sort_values(by=['ticker', 'date'])

    print("Price history files assembled.")
    print("The shape of the price history dataframe is ", df_price_history.shape)
    print("df_price_history has ", len(df_price_history), "records.")

    return df_price_history


In [5]:
# Filter the price history for securities classified as code = ['EQS', 'DR', 'ETF'].

def filter_price_history(df_price_history):
    """Keep only price history rows whose figi belongs to an equity, ADR or ETF.

    Pages through Intrinio's Securities by Exchange API for the 'USCOMP'
    exchange, keeps the securities whose ``code`` is 'EQS', 'DR' or 'ETF', and
    filters ``df_price_history`` to the figi codes of those securities.

    Args:
        df_price_history: price history dataframe with a ``figi`` column.

    Returns:
        pandas.DataFrame: the filtered price history. The input frame is not
        modified, so the caller must use the returned frame.

    Raises:
        Exception: the last API error is re-raised after five consecutive
            failed requests for the same page.
    """

    identifier = 'USCOMP'
    page_size = 10000
    next_page = ''
    max_consecutive_failures = 5

    securities_pages = []
    consecutive_failures = 0

    # Query Intrinio's Securities by Exchange API to get a current list of securities for US exchanges with
    # metadata for each security. The initial list will be larger than the max page size (> 10K records), so
    # this while loop paginates the API until no next page is reported.

    while next_page is not None:

        try:
            response = intrinio.StockExchangeApi().get_stock_exchange_securities(identifier, page_size=page_size, next_page=next_page)

        except Exception as err:
            # Retry the same page a bounded number of times. Swallowing every error
            # here would spin forever on a persistent failure (bad key, outage),
            # because next_page is only advanced after a successful call.
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                raise
            print("Securities request failed (", err, "), retrying.")
            time.sleep(consecutive_failures)
            continue

        consecutive_failures = 0
        securities_pages.append(pd.DataFrame([x.to_dict() for x in response.securities]))
        next_page = response.next_page

    # Build the full securities table once instead of re-concatenating on every page.
    df_securities_total = pd.concat(securities_pages, ignore_index = True, axis = 0)

    code_list = ['EQS', 'DR', 'ETF']
    # This code list corresponds to "Equities", "ADR's" and "ETFs". We filter for these security types, get their figi codes,
    # then filter the price history dataframe for those figis.
    df_securities_total = df_securities_total[df_securities_total['code'].isin(code_list)]
    figi_list = df_securities_total['figi'].tolist()
    df_price_history = df_price_history[df_price_history['figi'].isin(figi_list)]

    print("df_price_history has ", len(df_price_history), "records.")

    return df_price_history


In [7]:
# Export the price history dataframe to a CSV file if you want to have a backup.
# df_price_history.to_csv(path_or_buf = my_path + "/df_price_history.csv", index=False)


In [12]:
# # In case the kernel fails later and we have to recreate the price history, we can grab the previous CSV file here instead.

# ticker_file_path = my_path + "/" + "df_price_history.csv"
# df_price_history = pd.read_csv (ticker_file_path, low_memory=False)
# df_price_history.shape


(46867581, 29)

## Clean up the data and run the calcs.

In [6]:
# Clean up the data

def clean_up_data(df_price_history):
    """Deduplicate the price history and shape it into the Point & Figure working frame.

    Drops rows with a null figi, removes rows that share the same figi and
    date (keeping the last one, and writing a per-ticker count of the
    duplicates to ``my_path``), keeps the adjusted OHLCV columns under the
    plain names ``open``/``high``/``low``/``close``/``volume``, sorts by ticker
    and date, and adds the empty P&F columns filled in later.

    Args:
        df_price_history: price history dataframe with the 29 bulk-file columns.
            It is not modified.

    Returns:
        pandas.DataFrame: the working frame with 24 columns. The same frame is
        also stored in the module-level ``df_price_data``.
    """

    global df_price_data

    # Rows without a figi cannot be matched to a security, so drop them first.

    df_price_history = df_price_history.dropna(subset = ['figi'])

    # Find rows that share the same figi and date, and count them per ticker.

    df_duplicate_rows_date_figi = df_price_history[df_price_history.duplicated(subset = ['figi', 'date'], keep = False)]
    dupe_tickers_count = df_duplicate_rows_date_figi.groupby('ticker').size().sort_values(ascending=False).to_frame('row_count')

    if dupe_tickers_count.empty:
        print("No dupes found.")

    else:
        dupe_tickers_count.to_csv(path_or_buf = my_path + "/dupe_count_on_tickers_date_figi.csv", index=True)
        print(dupe_tickers_count, "dupe records found and deleted.")

        df_price_history = df_price_history.drop_duplicates(['figi', 'date'], keep = 'last')

    print(df_price_history.shape)
    print(list(df_price_history.columns))

    # Drop columns we don't need, and rename a few of the columns we do need.
    # Each step returns a new frame, so the caller's dataframe is left untouched.
    unused_columns = ['security_id', 'company_id', 'name', 'cik', 'composite_figi', 'composite_ticker', 'exchange_ticker', 'open',
                      'type', 'frequency', 'high', 'low', 'close', 'volume', 'adj_factor', 'ex_dividend', 'split_ratio',
                      'fifty_two_week_high', 'fifty_two_week_low']
    adjusted_names = {'adj_open':'open', 'adj_high':'high', 'adj_low':'low', 'adj_close': 'close', 'adj_volume':'volume'}

    df_price_data = (
        df_price_history
        .drop(columns = unused_columns)
        .rename(columns = adjusted_names)
        .sort_values(by = ['ticker', 'date'], ascending = True)
        .reset_index(drop = True)
    )

    print(df_price_data.shape)
    print(list(df_price_data.columns))
    print(df_price_data.tail())

    # Add the empty Point & Figure columns.
    # plot_symbol and signal_name later receive text ("X"/"O", "BUY"/"SELL"), so they are
    # created with object dtype. A float NaN column would need an implicit dtype change on
    # the first string assignment, which current pandas versions deprecate.

    for text_column in ['plot_symbol', 'signal_name']:
        df_price_data[text_column] = pd.Series(np.nan, index = df_price_data.index, dtype = object)

    df_price_data['reversal'] = 0

    for numeric_column in ['high_point', 'last_high_point', 'prev_high_point', 'low_point', 'last_low_point',
                           'prev_low_point', 'entry_x', 'entry_o', 'next_entry', 'stop_loss', 'target_price']:
        df_price_data[numeric_column] = np.nan

    df_price_data = df_price_data[['date', 'figi', 'ticker', 'open', 'high', 'low', 'close', 'change', 'percent_change', 'volume', 'plot_symbol', 'reversal',
                 'signal_name', 'high_point', 'last_high_point', 'prev_high_point', 'low_point', 'last_low_point', 'prev_low_point', 'entry_x', 'entry_o',
                 'next_entry', 'stop_loss', 'target_price']]

    print(df_price_data.head())

    return df_price_data


In [16]:

def generate_pnf_calcs(myFigi):
    """Run the Point & Figure calculations for one security.

    Selects the rows of the module-level ``df_price_data`` whose ``figi``
    equals ``myFigi`` and walks them in row order, starting from the second
    row. For each row it decides the plot symbol ("X" or "O"), whether the row
    is a reversal, and the signal ("BUY" or "SELL"), then records the running
    high/low points, entry prices, next entry, stop loss and target price.

    A reversal needs a move of more than boxSize * reversalBoxes
    (.02 * 3 = 6%) away from the running high (X column) or low (O column).

    The first row is seeded with plot_symbol "X" and signal_name "BUY"; its
    other P&F columns are not written by this function.

    Args:
        myFigi: figi code of the security to process.

    Returns:
        list: one list of column values per row of the selected data
        (``data.values.tolist()``).

    Side effects:
        The per-security dataframe is stored in the module-level ``data``.
    """
    
    global data
    
    boxSize = .02
    reversalBoxes = 3
    reversalAmount = boxSize * reversalBoxes    # fractional move needed to reverse a column (6%)

    new_data_list = []
    
    data = df_price_data.loc[df_price_data['figi'] == myFigi].copy()
    data.reset_index(drop = True, inplace = True)
    
    # Seed the first row, then set all starting High Points and Low Points equal to the close price on the first day of the time series.

    data.loc[0, 'plot_symbol'] = 'X'      #On Day 1, plot_symbol = "X"
    data.loc[0, 'signal_name'] = 'BUY'    #On Day 1, signal_name = "BUY"

    high_point = data['close'].iloc[0]     # Set Day 1 values for remaining P&F running values equal to the first close price.
    low_point = data['close'].iloc[0]
    last_high_point = data['close'].iloc[0]
    last_low_point = data['close'].iloc[0]
    prev_high_point = data['close'].iloc[0]
    prev_low_point = data['close'].iloc[0]
    entry_x = data['close'].iloc[0]
    entry_o = data['close'].iloc[0]
    target_price = data['close'].iloc[0]

    # Start the loop on the second day, loop through each day's close price after that.
    for i in range(1, len(data)):

        if data['plot_symbol'].iloc[i - 1] == 'X':   #If previous Plot Symbol = "X", then:

            if data['close'].iloc[i] >= data['close'].iloc[i - 1]:     #If current price >= previous price, then:
                data.loc[i, 'plot_symbol'] = 'X'        # Today's Plot Symbol = "X".
                data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]    #and copy yesterday's signal to today.

                if data['close'].iloc[i] > high_point:    #And if today's price is higher than the most recent high price, 
                    high_point = data['close'].iloc[i]       #then make today's price the high price,
                    data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]   #and copy yesterday's signal to today.

                if data['close'].iloc[i] > last_high_point:  #And if today's price is higher than the high point from the last X column,
                    data.loc[i, 'signal_name'] = "BUY"           #then today's signal = "BUY".

            elif data['close'].iloc[i] < high_point * (1 - reversalAmount):     #Else if today's price is less than the running high times (1 - reversal),
                data.loc[i, 'plot_symbol'] = 'O'                                     #the Plot Symbol reverses to "O",
                low_point = data['close'].iloc[i]                                   #and the low point is today's price,
                data.loc[i, 'reversal'] = 1                                         #and reversal = 1,
                prev_high_point = last_high_point                                        #and prev_high_point = last_high_point, saving this value to use in the Target Price calc below
                last_high_point = high_point                                               #and last_high_point = most recent high point
                entry_o = data['close'].iloc[i - 1]                                 #and entry_o = previous day's closing price, used in next_entry and stop_loss calcs

                if data['close'].iloc[i] < last_low_point:   #And if today's price is lower than the low point from the last O column,
                    data.loc[i, 'signal_name'] = "SELL"          #then today's signal = "SELL".
                else:
                    data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]   #Else copy yesterday's signal to today.

            else:
                data.loc[i, 'plot_symbol'] = 'X'  #Else, Plot Symbol = "X" (price is down but not enough to trigger a reversal)
                data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]   #and copy yesterday's signal to today.


        if data['plot_symbol'].iloc[i - 1] == 'O':   #If previous Plot Symbol = "O", then:

            if data['close'].iloc[i] < data['close'].iloc[i - 1]:            #If current price < previous price (strictly lower; an unchanged price falls through to the branches below), then:
                data.loc[i, 'plot_symbol'] = 'O'         # Today's Plot Symbol = "O".
                data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]

                if data['close'].iloc[i] < low_point:       #And if today's price is lower than the most recent low price, 
                    low_point = data['close'].iloc[i]         #then make today's price the low price.
                    data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]   #and copy yesterday's signal to today.

                if data['close'].iloc[i] < last_low_point:   #And if today's price is lower than the low point from the last O column,
                    data.loc[i, 'signal_name'] = "SELL"         #then today's signal = "SELL".


            elif data['close'].iloc[i] > low_point * (1 + reversalAmount):       #Else if today's price is greater than the running low times (1 + reversal),
                data.loc[i, 'plot_symbol'] = 'X'                                       #the Plot Symbol reverses to "X",
                high_point = data['close'].iloc[i]                                    #and the high point is today's price,
                data.loc[i, 'reversal'] = 1                                           #and reversal = 1,
                prev_low_point = last_low_point                                            #and prev_low_point = last_low_point, saving this value to use in the Target Price calc below
                last_low_point = low_point                                                   #and last_low_point = most recent low point
                entry_x = data['close'].iloc[i - 1]                                   #and entry_x = previous day's closing price, used in next_entry and stop_loss calcs

                if data['close'].iloc[i] > last_high_point:  #And if today's price is higher than the high point from the last X column,
                    data.loc[i, 'signal_name'] = "BUY"          #then today's signal = "BUY".

                else:
                    data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]     #Else copy yesterday's signal to today.

            else:
                data.loc[i, 'plot_symbol'] = 'O'  #Else, Plot Symbol = "O" (price is not lower, but not up enough to trigger a reversal)
                data.loc[i, 'signal_name'] = data['signal_name'].iloc[i - 1]   #and copy yesterday's signal to today.

        # Record the running values on today's row.
        data.loc[i, 'high_point'] = high_point            #high_point = current "high_point"
        data.loc[i, 'low_point'] = low_point             #low_point = current "low_point"
        data.loc[i, 'last_high_point'] = last_high_point  #last_high_point = current "last_high_point"
        data.loc[i, 'last_low_point'] = last_low_point    #last_low_point = current "last_low_point"
        data.loc[i, 'prev_high_point'] = prev_high_point  #prev_high_point = current "prev_high_point"
        data.loc[i, 'prev_low_point'] = prev_low_point    #prev_low_point = current "prev_low_point"

        if data['signal_name'].iloc[i] == "BUY":

            next_entry = entry_o * (1 + boxSize)         #Set next_entry at one box up from the close on the day before the last reversal from X to O
            data.loc[i, 'next_entry'] = next_entry
            stop_loss = entry_x * (1 - boxSize)          #Set the stop_loss at one box down from the close on the day before the last reversal from O to X
            data.loc[i, 'stop_loss'] = stop_loss

            if data['signal_name'].iloc[i - 1] == "SELL":
                target_price = ((last_high_point - prev_low_point) * reversalBoxes) + prev_low_point   #Upon change from SELL to BUY, set the target_price to (last_high_point - prev_low_point)
                                                                                                # times the number of reversal boxes, added to prev_low_point. Once calculated, it does not
                                                                                                # change for the balance of the current BUY signal.
            data.loc[i, 'target_price'] = target_price

        else:
            next_entry = entry_x * (1 - boxSize)         #Set next_entry at one box down from the close on the day before the last reversal from O to X
            data.loc[i, 'next_entry'] = next_entry
            stop_loss = entry_o * (1 + boxSize)          #Set the stop_loss at one box up from the close on the day before the last reversal from X to O
            data.loc[i, 'stop_loss'] = stop_loss

            if data['signal_name'].iloc[i - 1] == "BUY":
                target_price = prev_high_point - ((prev_high_point - last_low_point) * reversalBoxes)  #Upon change from BUY to SELL, set the target_price to (prev_high_point - last_low_point)
                                                                                                # times the number of reversal boxes, subtracted from prev_high_point. Once calculated, it does not
                                                                                                # change for the balance of the current SELL signal.
            data.loc[i, 'target_price'] = target_price
            
        data.loc[i, 'entry_x'] = entry_x            #entry_x = current "entry_x"
        data.loc[i, 'entry_o'] = entry_o            #entry_o = current "entry_o"

    # Return the rows as plain lists so the caller can stitch the securities together.
    data_list = data.values.tolist()
    new_data_list.extend(data_list)
    
    return new_data_list


In [31]:
# Run all the calculations and prepare final dataframe.

def run_all_calcs(df_price_data):
    """Run the Point & Figure calculations for every figi and build the final load frame.

    Maps ``generate_pnf_calcs`` over the unique figi codes with a
    multiprocessing pool, combines the per-security rows into one dataframe,
    saves it to ``df_pnf_data_history.csv`` under ``my_path``, then adds
    ``key_id``, ``last_updated_date`` and ``last_corp_action_date``.

    Note: ``generate_pnf_calcs`` reads the module-level ``df_price_data``, so
    the argument here only supplies the list of figi codes to process.

    Args:
        df_price_data: cleaned price dataframe with a ``figi`` column.

    Returns:
        pandas.DataFrame: the final frame with 27 columns. It is also stored in
        the module-level ``df_pnf_data_history_complete_load``; the
        intermediate frame is stored in ``df_pnf_data_history``.
    """

    global df_pnf_data_history_complete_load
    global df_pnf_data_history

    from itertools import chain
    from multiprocessing import Pool
    import time

    start_time = time.time()

    figi_list = df_price_data['figi'].unique().tolist()  # Get the list of figi codes to run P&F calcs on.

    # Use multiprocessor pool to spread the work over several processors.
    # The pool is always cleaned up, including when a worker raises.
    p = Pool()
    try:
        result = p.map(generate_pnf_calcs, figi_list)
    except BaseException:
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()

    end_time = time.time()  # Keep track of the time spent on calcs.
    elapsed_time = end_time - start_time

    print("Elapsed time was", round(elapsed_time/60, 2), "minutes.")

    # Flatten the per-security row lists into one list of rows.
    new_data_list = list(chain.from_iterable(result))

    myColumns = ['date', 'figi', 'ticker', 'open', 'high', 'low', 'close', 'change', 'percent_change', 'volume', 'plot_symbol', 'reversal',
                 'signal_name', 'high_point', 'last_high_point', 'prev_high_point', 'low_point', 'last_low_point', 'prev_low_point', 'entry_x', 'entry_o',
                 'next_entry', 'stop_loss', 'target_price']

    df_pnf_data_history = pd.DataFrame(new_data_list, columns = myColumns) # Convert to dataframe and save it to CSV here in case we need to QC it later.

    df_pnf_data_history.to_csv(path_or_buf = my_path + "/df_pnf_data_history.csv", index=False)

    print(df_pnf_data_history.shape)


    # Add key_id and a few useful date columns to final dataframe.
    # Both date columns are set to the latest date present in the data.
    df_pnf_data_history_complete_load = df_pnf_data_history.copy()
    df_pnf_data_history_complete_load['date'] = pd.to_datetime(df_pnf_data_history_complete_load['date'])
    latest_date = pd.to_datetime(df_pnf_data_history_complete_load['date'].max()).normalize()
    df_pnf_data_history_complete_load['last_updated_date'] = latest_date
    df_pnf_data_history_complete_load['last_corp_action_date'] = latest_date
    df_pnf_data_history_complete_load['key_id'] = df_pnf_data_history_complete_load['ticker'] + df_pnf_data_history_complete_load['figi'] + df_pnf_data_history_complete_load['date'].dt.strftime('%Y-%m-%d')

    df_pnf_data_history_complete_load = df_pnf_data_history_complete_load[['key_id'] + myColumns + ['last_updated_date', 'last_corp_action_date']]

    print(df_pnf_data_history_complete_load.shape)

    return df_pnf_data_history_complete_load


## For backup or archive purposes, save the final dataframe to CSV and/or parquet files and push them to AWS S3.

In [9]:
# Create the low level functional AWS client

def push_data_to_S3(df_pnf_data_history_complete_load, bucket = 'bns-intrinio-data', region_name = 'us-east-1'):
    """Save the final dataframe locally as zipped CSV and parquet, then upload both files to S3.

    Args:
        df_pnf_data_history_complete_load: the final dataframe to archive.
        bucket: name of the target S3 bucket. Defaults to the bucket that was
            previously hard-coded, so existing calls behave as before.
        region_name: AWS region for the S3 client. Defaults to 'us-east-1'.

    Returns:
        str: "Done." once both uploads have completed.
    """

    # Create the low level functional AWS client
    client = boto3.client(
        's3',
        aws_access_key_id = aws_key,
        aws_secret_access_key = aws_secret_key,
        region_name = region_name
    )

    # Export the price history dataframe to a zipped CSV file then push to AWS S3.
    local_zip = my_path + "/df_pnf_data_history_complete_load.zip"
    compression_opts = dict(method='zip', archive_name='df_pnf_data_history_complete_load.csv')
    df_pnf_data_history_complete_load.to_csv(path_or_buf = local_zip, index=False, compression=compression_opts)
    client.upload_file(local_zip, bucket, "price-data-historical/csv_files/df_pnf_data_history_complete_load.zip")


    # Write parquet file to local drive, then push to AWS S3.
    local_file = my_path + "/df_pnf_data_history_complete_load.parquet"
    parquet_table = pa.Table.from_pandas(df_pnf_data_history_complete_load)
    pq.write_table(parquet_table, local_file)
    client.upload_file(local_file, bucket, "price-data-historical/parquet_files/df_pnf_data_history_complete_load.parquet")

    print("Data saved to S3 in zipped CSV and parquet.")
    S3_push_status = "Done."

    return S3_push_status


In [10]:
# Load the complete history file back from the local zipped CSV if needed

def upload_data():
    """Reload the complete P&F history from the zipped CSV saved under ``my_path``.

    Despite the name, nothing is sent anywhere: the function reads
    ``df_pnf_data_history_complete_load.zip`` from local disk.

    Returns:
        pandas.DataFrame: the reloaded history. The same frame is also stored
        in the module-level ``df_pnf_data_history_complete_load``.
    """

    global df_pnf_data_history_complete_load

    zip_path = my_path + "/" + "df_pnf_data_history_complete_load.zip"

    # CSV stores dates as text, so parse the three date columns on the way in.
    # Otherwise the reloaded frame would carry strings where the freshly
    # computed frame carries datetimes.
    df_pnf_data_history_complete_load = pd.read_csv(
        zip_path,
        low_memory=False,
        parse_dates=['date', 'last_updated_date', 'last_corp_action_date'],
    )

    print('Upload done.')

    return df_pnf_data_history_complete_load

## Finally, create the MySQL price history table in RDS and push the history data into the table.

In [20]:
# Import SQL libraries

def create_and_fill_RDS_table(df_pnf_data_history_complete_load):
    """Create the ``base_pnf_data_historical`` table in MySQL (RDS), load the history and index it.

    Args:
        df_pnf_data_history_complete_load: the final P&F history dataframe.

    Returns:
        str: "Done." once the data is loaded and the indexes are created.

    Raises:
        Errors from mysql.connector, SQLAlchemy or pandas are propagated; the
        cursor, connection and engine are released in every case.
    """

    from urllib.parse import quote_plus

    import mysql.connector

    from sqlalchemy import create_engine

    # Create the data table in MySQL with MySQL Connector library

    create_pnf_data_history_table = """
    CREATE TABLE IF NOT EXISTS `base_pnf_data_historical` (
    `key_id` varchar(40) PRIMARY KEY,
    `date` datetime NOT NULL,
    `figi` varchar(14) NOT NULL,
    `ticker` varchar(8) NOT NULL,
    `open` float NULL,
    `high` float NULL,
    `low` float NULL,
    `close` float NULL,
    `change` float NULL,
    `percent_change` float NULL,
    `volume` float NULL,
    `plot_symbol` varchar(1) NULL,
    `reversal` integer NULL,
    `signal_name` varchar(4) NULL,
    `high_point` float NULL,
    `last_high_point` float NULL,
    `prev_high_point` float NULL,
    `low_point` float NULL,
    `last_low_point` float NULL,
    `prev_low_point` float NULL,
    `next_entry` float NULL,
    `stop_loss` float NULL,
    `entry_x` float NULL,
    `entry_o` float NULL,
    `target_price` float NULL,
    `last_updated_date` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    `last_corp_action_date` datetime NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
    """

    # Establish the MySQL connection

    connection = mysql.connector.connect(host=rds_host,
                                 user=rds_user,
                                 password=rds_password,
                                 database=rds_database,
                                 charset=rds_charset)

    mycursor = None
    engine = None

    try:
        mycursor = connection.cursor()

        mycursor.execute(create_pnf_data_history_table)
        connection.commit()

        print("The base_pnf_data_historical table is created in RDS.")


        # Push the final dataframe into SQL DB on AWS RDS.

        df = df_pnf_data_history_complete_load

        # Set SQLAlchemy database credentials. User and password are URL-escaped so
        # characters such as '@', ':' or '/' cannot break the connection string.
        creds = {'usr': quote_plus(str(rds_user)),
                 'pwd': quote_plus(str(rds_password)),
                 'hst': rds_host,
                 'prt': 3306,
                 'dbn': rds_database}

        # MySQL connection string.
        connstr = 'mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}'

        # Create sqlalchemy engine for MySQL connection.
        engine = create_engine(connstr.format(**creds))

        # Write DataFrame to MySQL using the engine created above, in about 1000 batches.
        # The batch size is kept at 1 or more, because a frame with fewer than
        # 1000 rows would otherwise produce a chunksize of 0.
        chunk = max(1, len(df) // 1000)

        # NOTE(review): if_exists='replace' drops the table before inserting, so the
        # schema created above (primary key, column types) is discarded and pandas
        # recreates the table from the dataframe dtypes. Behaviour is kept as-is;
        # confirm whether a load into the predefined schema was intended.
        df.to_sql(name='base_pnf_data_historical',
                  con=engine,
                  if_exists='replace',
                  chunksize=chunk,
                  index=False)

        # Create indexes for ticker, figi, dates, signal and plot symbol

        mycursor.execute("CREATE INDEX idx_ticker ON base_pnf_data_historical (ticker(8));")
        mycursor.execute("CREATE INDEX idx_figi ON base_pnf_data_historical (figi(14));")
        mycursor.execute("CREATE INDEX idx_date ON base_pnf_data_historical (date);")
        mycursor.execute("CREATE INDEX idx_update ON base_pnf_data_historical (last_updated_date);")
        mycursor.execute("CREATE INDEX idx_corp_action ON base_pnf_data_historical (last_corp_action_date);")
        mycursor.execute("CREATE INDEX idx_signal ON base_pnf_data_historical (signal_name(4));")
        mycursor.execute("CREATE INDEX idx_plot ON base_pnf_data_historical (plot_symbol(1));")

    finally:
        # Release database resources whether or not the load succeeded.
        if mycursor is not None:
            mycursor.close()
        connection.close()
        if engine is not None:
            engine.dispose()

    print("The pnf history data is loaded and the indexes are set.")
    rds_table_status = "Done."

    return rds_table_status


In [36]:
# Test the ETL process.

# Each step's return value is bound explicitly. In particular, filter_price_history
# returns a new filtered frame and does not update the module-level df_price_history,
# so dropping its result would leave the unfiltered history in use.

#df_price_history = assemble_bulk_history()
df_price_history = download_bulk_history()
df_price_history = filter_price_history(df_price_history)

df_price_history.to_csv(path_or_buf = my_path + "/df_price_history.csv", index=False)

df_price_data = clean_up_data(df_price_history)
df_pnf_data_history_complete_load = run_all_calcs(df_price_data)

df_pnf_data_history_complete_load.to_csv(path_or_buf = my_path + "/df_pnf_data_history_complete_load.csv", index=False)

push_data_to_S3(df_pnf_data_history_complete_load)
create_and_fill_RDS_table(df_pnf_data_history_complete_load)



Data saved to S3 in zipped CSV and parquet.
The base_pnf_data_historical table is created in RDS.
The pnf history data is loaded and the indexes are set.


'Done.'

### A few extra modules in case you need them.

In [19]:
# Double check for duplicates

# Every row whose key_id occurs more than once, then a per-ticker count of those rows (largest first).
df_duplicate_rows = df_pnf_data_history_complete_load.loc[
    df_pnf_data_history_complete_load['key_id'].duplicated(keep = False)
]
dupe_tickers_count = (
    df_duplicate_rows
    .groupby('ticker')
    .size()
    .sort_values(ascending=False)
    .to_frame('row_count')
)
print(dupe_tickers_count)
print(dupe_tickers_count.shape)


Empty DataFrame
Columns: [row_count]
Index: []
(0, 1)


In [18]:

# Reload the saved history from local disk, then rebuild the RDS table from it.
df_pnf_data_history_complete_load = upload_data()
create_and_fill_RDS_table(df_pnf_data_history_complete_load)


Upload done.
The base_pnf_data_historical table is created in RDS.
The pnf history data is loaded and the indexes are set.


'Done.'