### This module downloads stock and company metadata from Intrinio to fill out and maintain a Stock Info table.

In [1]:
# Import credentials
#
# NOTE: the values are read positionally, so the order of the keys inside
# credentials.json must match the order of the assignments below.

import json

# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open("/. .<your file path here> . . /credentials.json") as f:
    credentials = json.load(f)

# Materialise the values once instead of rebuilding the list for every lookup.
credential_values = list(credentials.values())

file_path = credential_values[0]
intrinio_key = credential_values[1]
aws_key = credential_values[2]
aws_secret_key = credential_values[3]
rds_host = credential_values[4]
rds_user = credential_values[5]
rds_password = credential_values[6]
rds_database = credential_values[7]
rds_charset = credential_values[8]


In [2]:
# Import Intrinio libraries and set API key

from __future__ import print_function
import intrinio_sdk as intrinio
from intrinio_sdk.rest import ApiException

# Store the Intrinio key read from credentials.json under the 'api_key' entry of the SDK client configuration.
intrinio.ApiClient().configuration.api_key['api_key'] = intrinio_key

# Import needed Python libraries

import pandas as pd
import numpy as np
import time
from datetime import datetime
from tqdm.notebook import tqdm, trange  # to be used in loop iterations

# Import FinViz library for getting sector, industry and other metadata

from finvizfinance.quote import finvizfinance

# Import the AWS libraries

import boto3
from boto3.s3.transfer import TransferConfig
import io
import pyarrow as pa
import pyarrow.parquet as pq

# Import Zip file libraries

from zipfile import ZipFile
from io import BytesIO
import urllib.request as urllib2

# Import SQL connection libraries

import mysql.connector 
from mysql.connector import errorcode
from sqlalchemy import create_engine

# Create the low-level functional AWS S3 client, authenticated with the keys
# read from credentials.json.

client = boto3.client(
    's3',
    aws_access_key_id = aws_key,
    aws_secret_access_key = aws_secret_key,
    region_name = 'us-east-1'
)

# Set local path. It is used as the output directory prefix for the CSV exports.
# No `global` statement is needed here: a name assigned at the top level of a
# cell is already module-level, so the declaration was a no-op.

my_path = file_path


In [3]:
# Get the active stock (EQS & DR) and ETF tickers and figis. The EQS/DR subset is kept separately
# because it is the list later used for the per-stock sector/industry lookups.

def _fetch_active_securities(code, instrument_type_name):
    """Download all active, primary-listed USCOMP securities of one instrument type.

    Parameters
    ----------
    code : str
        Security type code passed to the API ('EQS', 'DR' or 'ETF').
    instrument_type_name : str
        Human-readable label stored in the 'instrument_type_name' column.

    Returns
    -------
    pandas.DataFrame
        One row per security, sorted by ticker, with the identifier columns
        plus 'instrument_type_name', 'instrument_type_code' and 'active_status' (always 1).
    """
    security_columns = ['ticker', 'figi', 'name', 'company_id', 'id', 'composite_ticker',
                        'share_class_figi', 'composite_figi', 'currency']

    records = []
    next_page = ''
    while True:
        response = intrinio.SecurityApi().get_all_securities(
            active=True,
            delisted=False,
            code=code,
            composite_mic='USCOMP',
            page_size=10000,
            primary_listing=True,
            next_page=next_page,
        )
        records.extend(x.to_dict() for x in (response.securities or []))

        # Keep requesting pages until the API stops handing back a paging token,
        # so a result set larger than one page is not silently truncated.
        next_page = getattr(response, 'next_page', None)
        if not next_page:
            break

    # Passing `columns=` keeps only the wanted fields and still yields a
    # correctly-shaped (empty) frame when the API returns no securities.
    df_securities = pd.DataFrame(records, columns=security_columns).sort_values('ticker')
    df_securities['instrument_type_name'] = instrument_type_name
    df_securities['instrument_type_code'] = code
    df_securities['active_status'] = 1

    return df_securities


def get_active_tickers():
    """Build the lists of currently active tickers from the Intrinio securities API.

    Side effects
    ------------
    Publishes both result frames as the module-level globals
    `df_active_tickers` and `df_active_EQSDR_tickers`, and prints their row counts.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        `df_active_tickers` (equities, depository receipts and ETFs) and
        `df_active_EQSDR_tickers` (equities and depository receipts only).
    """
    global df_active_tickers
    global df_active_EQSDR_tickers

    # Get the active equity share tickers and figis
    df_active_EQS = _fetch_active_securities('EQS', "Equity Shares")

    # Get the active ADR tickers and figis
    df_active_DR = _fetch_active_securities('DR', "Depository Receipts")

    # Get the active ETF tickers and figis
    df_active_ETF = _fetch_active_securities('ETF', "Exchange Traded Fund")

    # Assemble the EQS, DR and ETF ticker lists to one dataframe
    df_active_tickers = pd.concat([df_active_EQS, df_active_DR, df_active_ETF], ignore_index = True)

    # Assemble just the EQS and DR ticker lists to one dataframe
    df_active_EQSDR_tickers = pd.concat([df_active_EQS, df_active_DR], ignore_index = True)

    print("There are ", df_active_tickers.shape[0], " currently active tickers in total, including ETFs.")
    print("There are ", df_active_EQSDR_tickers.shape[0], " equity and ADR tickers currently active.")

    return df_active_tickers, df_active_EQSDR_tickers
    
    

In [4]:
# Get company metadata from Intrinio API.

def get_company_metadata():
    """Download the Intrinio bulk companies file and return the metadata columns of interest.

    The zip archive behind the first bulk-download link is fetched, its
    `companies.csv` member is parsed, the column names are lower-cased, a fixed
    set of columns is kept and `id` is renamed to `company_id`.

    Side effects
    ------------
    Publishes the result as the module-level global `df_company_metadata` and
    prints its shape.

    Returns
    -------
    pandas.DataFrame
        The selected company metadata columns.
    """
    global df_company_metadata

    metadata_columns = ['ticker', 'id','lei', 'country', 'sector_name', 'industry_category_name',
                        'industry_group_name', 'sic', 'stock_exchange', 'short_description',
                        'long_description', 'legal_name', 'ceo', 'company_url', 'business_address',
                        'employees', 'cik', 'first_stock_price_date', 'last_stock_price_date',
                        'standardized_active', 'first_fundamental_date', 'last_fundamental_date',
                        'latest_filing_date', 'statement_template']

    response = intrinio.BulkDownloadsApi().get_bulk_download_links()

    # NOTE(review): assumes the first link of the first bulk download is the
    # archive containing companies.csv -- confirm against the API response.
    url = response.bulk_downloads[0].links[0].url

    # Context managers make sure the HTTP response, the archive and the CSV
    # member are all closed once the data has been read.
    with urllib2.urlopen(url) as remote_file:
        archive_bytes = remote_file.read()

    with ZipFile(BytesIO(archive_bytes)) as archive:
        with archive.open("companies.csv") as companies_csv:
            df_companies = pd.read_csv(companies_csv)

    df_companies.columns = df_companies.columns.str.lower()
    df_company_metadata = df_companies[metadata_columns].rename(columns={'id': 'company_id'})

    print("The shape of the metadata DF is ", df_company_metadata.shape)

    return df_company_metadata
    

In [5]:
# Get Finviz data downloader for sector and industry mapping.

def finviz_sectors(myTicker):
    """Look up one ticker's sector and industry on FinViz and record the outcome.

    On success `[ticker, sector, industry]` is appended to the module-level
    list `stock_sector_info`. On any lookup error the ticker is appended to
    the module-level list `bad_tickers_finviz` instead and nothing is added to
    `stock_sector_info`.

    Parameters
    ----------
    myTicker : str
        Ticker symbol to look up.

    Returns
    -------
    list
        The shared `stock_sector_info` list. It must already exist as a
        module-level name before this function is called.
    """
    global stock_sector_info

    # Create the failure list only if it does not exist yet. Re-creating it on
    # every call would wipe the failures recorded by earlier calls.
    failed_tickers = globals().setdefault('bad_tickers_finviz', [])

    ticker = myTicker

    try:
        result = finvizfinance(ticker).TickerFundament()
        data = [ticker, result['Sector'], result['Industry']]
    except Exception:
        # Best effort: a ticker FinViz cannot resolve is recorded and skipped.
        failed_tickers.append(ticker)
    else:
        # Only append when the lookup succeeded; after a failure there is no row to add.
        stock_sector_info.append(data)

    return stock_sector_info


In [6]:
# Use FinViz to get sector and industry for each ticker

def get_finviz_sector_info(df_active_EQSDR_tickers):
    """Fetch FinViz sector/industry for every ticker in the given frame, using a thread pool.

    Parameters
    ----------
    df_active_EQSDR_tickers : pandas.DataFrame
        Frame with a 'ticker' column listing the symbols to look up.

    Side effects
    ------------
    * Resets the module-level list `stock_sector_info`, which `finviz_sectors` fills.
    * Publishes `arg_list` (the requested tickers), `df_sector_info` (the result)
      and `bad_tickers_finviz` (requested tickers with no result row) as globals.
    * Writes the result to `<my_path>/df_company_info.csv` and prints its shape.

    Returns
    -------
    pandas.DataFrame
        Columns 'ticker', 'fv_sector', 'fv_industry', one row per resolved ticker.
    """
    import concurrent.futures

    global arg_list
    global bad_tickers_finviz
    global df_sector_info
    global stock_sector_info

    arg_list = list(df_active_EQSDR_tickers['ticker'])
    stock_sector_info = []

    # Leaving the `with` block waits for every worker to finish. The map
    # results are deliberately not consumed, so a failing lookup does not
    # abort the whole run; failures are accounted for below instead.
    with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
        executor.map(finviz_sectors, arg_list)

    df_sector_info = pd.DataFrame(data = stock_sector_info, columns = ['ticker', 'fv_sector', 'fv_industry'])
    df_sector_info = df_sector_info.drop_duplicates(subset=['ticker'], keep = 'first')

    # Every requested ticker that produced no row is a failed lookup. Deriving
    # the list here makes it complete regardless of what the workers recorded.
    resolved_tickers = set(df_sector_info['ticker'])
    bad_tickers_finviz = [t for t in dict.fromkeys(arg_list) if t not in resolved_tickers]

    #Push dataframe to CSV
    # NOTE(review): the file is named df_company_info.csv although it holds the sector info frame.
    df_sector_info.to_csv(path_or_buf = my_path + "/df_company_info.csv", index=False)

    print("The sector info dataframe shape is ", df_sector_info.shape)

    return df_sector_info


In [7]:
# Merge the metadata with the original active ticker list

def merge_data(df_active_tickers, df_company_metadata, df_sector_info):
    """Combine the active ticker list with company metadata and FinViz sector info.

    Both joins are left joins, so every active ticker is kept even when no
    metadata or sector row matches. Duplicate tickers are reduced to their
    first occurrence and the result is sorted by ticker. A `key_id` column
    (ticker concatenated with figi) and a `last_updated_date` column (today at
    midnight) are added before the columns are put in their final order.

    The result is also published as the module-level global
    `df_active_tickers_info`, and its shape is printed.

    Returns
    -------
    pandas.DataFrame
        The merged frame with 38 columns, starting with `key_id`.
    """
    global df_active_tickers_info

    output_columns = ['key_id', 'ticker', 'figi', 'name', 'company_id', 'id', 'composite_ticker',
           'share_class_figi', 'composite_figi', 'currency',
           'instrument_type_name', 'instrument_type_code', 'active_status', 'lei',
           'country', 'sector_name', 'industry_category_name',
           'industry_group_name', 'sic', 'stock_exchange', 'short_description',
           'long_description', 'legal_name', 'ceo', 'company_url',
           'business_address', 'employees', 'cik', 'first_stock_price_date',
           'last_stock_price_date', 'standardized_active',
           'first_fundamental_date', 'last_fundamental_date', 'latest_filing_date',
           'statement_template', 'fv_sector', 'fv_industry', 'last_updated_date']

    # Join, de-duplicate and order in one chained expression.
    merged = (
        df_active_tickers
        .merge(df_company_metadata, how = 'left', on = ['ticker', 'company_id'])
        .merge(df_sector_info, how = 'left', on = 'ticker')
        .drop_duplicates(subset=['ticker'], keep = 'first')
        .sort_values(by = ['ticker'], ascending = True)
    )

    # Add the composite key and the last update column.
    merged['key_id'] = merged['ticker'] + merged['figi']
    merged['last_updated_date'] = pd.to_datetime('today').normalize()

    df_active_tickers_info = merged[output_columns]

    print("The shape of the new active ticker info DF is ", df_active_tickers_info.shape)

    return df_active_tickers_info


In [8]:
# Push the dataframe to CSV on S3

def push_new_data_to_S3(df_active_tickers_info):
    """Upload the given frame as a dated CSV object to the 'bns-intrinio-data' bucket.

    The object key is `security_info/df_active_tickers_info_<YYYY-MM-DD>.csv`,
    using today's date. The upload goes through the module-level S3 `client`,
    and the HTTP status of the response is printed. Nothing is returned.
    """
    import io

    bucket_name = 'bns-intrinio-data'
    date_stamp = datetime.today().strftime('%Y-%m-%d')
    object_key = "security_info/df_active_tickers_info_" + date_stamp + ".csv"

    # Serialise to an in-memory text buffer; no temporary file is written.
    with io.StringIO() as csv_buffer:
        df_active_tickers_info.to_csv(csv_buffer, index=False)
        csv_payload = csv_buffer.getvalue()

    response = client.put_object(Bucket = bucket_name, Key = object_key, Body = csv_payload)
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status != 200:
        print(f"Unsuccessful S3 put_object response. Status - {status}")
    else:
        print(f"Successful S3 put_object response. Status - {status}")
            

### Copy Old Table and Update Existing Table

In [9]:
# Use MySQL Connector to get all records from the existing Security_Info table

def get_old_info_records():
    """Read every row of the `security_info` table into a DataFrame.

    The column names are taken from the module-level `df_active_tickers_info`
    frame, so `merge_data` must have run first and the table must have the
    same number of columns as that frame.

    Side effects
    ------------
    Publishes the result as the module-level global
    `df_previous_active_tickers_info` and prints its row count.

    Returns
    -------
    pandas.DataFrame
        The current contents of the `security_info` table.
    """
    import mysql.connector

    global df_previous_active_tickers_info

    mydb = mysql.connector.connect(
      host = rds_host,
      user = rds_user,
      password = rds_password,
      database = rds_database
    )

    # Always release the cursor and the connection, even when the query fails.
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute("SELECT * FROM security_info")
            myresult = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        mydb.close()

    df_previous_active_tickers_info = pd.DataFrame(myresult, columns = df_active_tickers_info.columns)

    print("There are", df_previous_active_tickers_info.shape[0], "records in the current securities_info table.")

    return df_previous_active_tickers_info


In [10]:
# Use SQL Alchemy to push existing data from Security_Info table to Security_Info_old (backup) table in 
# SQL DB on AWS RDS:

def push_old_data_to_backup_table(df_previous_active_tickers_info):
    """Write the given frame to the `security_info_old` backup table, replacing its contents.

    Parameters
    ----------
    df_previous_active_tickers_info : pandas.DataFrame
        Rows to store in the backup table.

    Side effects
    ------------
    Replaces the `security_info_old` table, prints a confirmation and sets
    the module-level global `old_data_table_update_status`.

    Returns
    -------
    str
        The status string "Done".
    """
    from urllib.parse import quote

    global old_data_table_update_status

    # Set database credentials. User and password are percent-encoded so that
    # characters such as '@', '/' or ':' cannot corrupt the connection URL.
    creds = {'usr': quote(str(rds_user), safe=''),
             'pwd': quote(str(rds_password), safe=''),
             'hst': rds_host,
             'prt': 3306,
             'dbn': rds_database}

    # Generate MySQL connection string.
    connstr = 'mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}'

    # Create sqlalchemy engine for MySQL connection.
    engine = create_engine(connstr.format(**creds))

    # Write DataFrame to MySQL using the engine (connection) created above,
    # then release the engine's pooled connections whether or not it worked.
    try:
        df_previous_active_tickers_info.to_sql(name='security_info_old',
                                              con=engine,
                                              if_exists='replace',
                                              index=False)
    finally:
        engine.dispose()

    print("The current securities list has been copied to the security_info_old table.")

    old_data_table_update_status = "Done"

    return old_data_table_update_status


In [11]:
# Compare new Active Tickers dataframe to current Security_Info table to isolate records that are no longer in
# the new data set and therefore no longer actively traded.

def transform_new_data_records(df_active_tickers_info, df_previous_active_tickers_info):
    """Append tickers that dropped out of the active list to the new data, flagged inactive.

    Rows of the previous table whose ticker does not appear in the new active
    list are kept with `active_status` set to 0 and concatenated after the new
    rows. The five stock-price / fundamental / filing date columns are
    converted to datetimes and `last_updated_date` is set to today at midnight
    for every row.

    Parameters
    ----------
    df_active_tickers_info : pandas.DataFrame
        Freshly built active ticker info.
    df_previous_active_tickers_info : pandas.DataFrame
        Current contents of the security info table, with the same columns.
        It is not modified.

    Side effects
    ------------
    Publishes the result as the module-level global `df_ticker_info` and
    prints the dead-ticker count, the result shape and the duplicate count.

    Returns
    -------
    pandas.DataFrame
        New rows followed by the dead-ticker rows.
    """
    global df_ticker_info

    date_columns = ['first_stock_price_date', 'last_stock_price_date',
                    'first_fundamental_date', 'last_fundamental_date',
                    'latest_filing_date']

    # Anti-join on ticker: keep the previous rows that have no counterpart in
    # the new data. Selecting rows directly avoids depending on the number of
    # columns, and `.copy()` gives an independent frame that is safe to modify.
    still_active = df_previous_active_tickers_info['ticker'].isin(df_active_tickers_info['ticker'])
    df_dead_tickers = df_previous_active_tickers_info.loc[~still_active].copy()

    print("There are", df_dead_tickers.shape[0], "records in the old securities list " \
          "where the ticker is no longer trading.")

    # With old records, set 'Active_Status' to 0 and then join to new data set and configure date columns to be
    # MySQL friendly. Also check for dupe tickers.

    df_dead_tickers['active_status'] = 0

    df_ticker_info = pd.concat([df_active_tickers_info, df_dead_tickers])
    dupes = df_ticker_info[df_ticker_info.duplicated(subset = ['ticker'], keep=False)]

    for date_column in date_columns:
        df_ticker_info[date_column] = pd.to_datetime(df_ticker_info[date_column])
    df_ticker_info['last_updated_date'] = pd.to_datetime('today').normalize()

    print("The shape of the new ticker info DF is", df_ticker_info.shape)
    print("There are", dupes.shape[0], "duplicated tickers in this set." )

    return df_ticker_info


In [12]:
# Push the final dataframe into SQL DB on AWS RDS:

def push_new_data_to_info_table(df_ticker_info):
    """Replace the `security_info` table with the contents of the given frame.

    Parameters
    ----------
    df_ticker_info : pandas.DataFrame
        Final ticker info to store.

    Side effects
    ------------
    Drops and recreates the `security_info` table (`if_exists='replace'`) and
    prints a confirmation.

    Returns
    -------
    str
        The status string "Update done.".
    """
    from urllib.parse import quote

    # Set database credentials. User and password are percent-encoded so that
    # characters such as '@', '/' or ':' cannot corrupt the connection URL.
    creds = {'usr': quote(str(rds_user), safe=''),
             'pwd': quote(str(rds_password), safe=''),
             'hst': rds_host,
             'prt': 3306,
             'dbn': rds_database}

    # MySQL connection string.
    connstr = 'mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}'

    # Create sqlalchemy engine for MySQL connection.
    engine = create_engine(connstr.format(**creds))

    # Rows written per batch. The floor of 1 matters for frames with fewer
    # than 1000 rows, where the division alone yields 0 and to_sql rejects it.
    chunk = max(1, len(df_ticker_info) // 1000)

    # Write DataFrame to MySQL using the engine (connection) created above,
    # then release the engine's pooled connections whether or not it worked.
    try:
        df_ticker_info.to_sql(
            name='security_info',
            con=engine,
            if_exists='replace',
            chunksize=chunk,
            index=False
        )
    finally:
        engine.dispose()

    print("The security_info table has been updated.")

    new_data_update_status = "Update done."

    return new_data_update_status


In [15]:
# Run ETL process.
# Each step's return value is bound explicitly so the data flow between the
# steps can be read off this cell. The functions publish the same objects
# under the same global names themselves.

df_active_tickers, df_active_EQSDR_tickers = get_active_tickers()
df_company_metadata = get_company_metadata()
df_sector_info = get_finviz_sector_info(df_active_EQSDR_tickers)
df_active_tickers_info = merge_data(df_active_tickers, df_company_metadata, df_sector_info)
push_new_data_to_S3(df_active_tickers_info)
df_previous_active_tickers_info = get_old_info_records()
old_data_table_update_status = push_old_data_to_backup_table(df_previous_active_tickers_info)
df_ticker_info = transform_new_data_records(df_active_tickers_info, df_previous_active_tickers_info)
push_new_data_to_info_table(df_ticker_info)


There are  8414  currently active tickers in total, including ETFs.
There are  5763  equity and ADR tickers currently active.
The shape of the metadata DF is  (18467, 24)
The sector info dataframe shape is  (5291, 3)
The shape of the new active ticker info DF is  (8412, 38)
Successful S3 put_object response. Status - 200


In [58]:
# Push all four info dataframes to CSV files.

csv_exports = [
    (df_active_tickers, "/df_active_tickers.csv"),
    (df_active_EQSDR_tickers, "/df_active_EQSDR_tickers.csv"),
    (df_company_metadata, "/df_company_metadata.csv"),
    (df_sector_info, "/df_sector_info.csv"),
]

for export_frame, export_file in csv_exports:
    export_frame.to_csv(path_or_buf = my_path + export_file, index=False)

# Push bad figis/tickers to CSV for further analysis

df_bad_tickers_finviz = pd.DataFrame(bad_tickers_finviz)
bad_tickers_csv_path = my_path + "/df_bad_tickers_finviz.csv"
df_bad_tickers_finviz.to_csv(path_or_buf = bad_tickers_csv_path, index=False)


In [None]:
# Push final dataframe to CSV with today's date added to the file name.

today = datetime.today().strftime('%Y-%m-%d')
dated_csv_path = my_path + "/df_active_tickers_info_" + today + ".csv"

df_active_tickers_info.to_csv(path_or_buf = dated_csv_path, index=False)