# Processing AlgoSeek's Trade and Quote Minute Bar Data
---

### Import Libraries 

In [1]:
import warnings 
warnings.filterwarnings('ignore')

In [4]:
#% matplotlib inline

from pathlib import Path
from tqdm import tqdm 

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

In [5]:
# Short alias for pandas' IndexSlice helper (used to build label-based slices
# on MultiIndex objects with .loc).
idx = pd.IndexSlice

# Apply seaborn's 'whitegrid' style to subsequently created figures.
sns.set_style(style='whitegrid')

# AlgoSeek Trade and Quote Minute Bar Data 
---
### Data Dictionary 
-  Quote fields based on changes to the National Best Bid and Offer [(NBBO)](https://www.investopedia.com/terms/n/nbbo.asp) 
    - quote reporting *highest bid price* and *lowest ask price* amongst exchanges and trading venues 
    - represents the tightest composite bid-ask spread in a security -> brokers are supposed to trade at the best available ask and bid
    - Calculated and Disseminated by the Securities Information Processors (SIP)
> - *Field*: Name of Field
> - *Q/T*: Field based on Quotes or Trades 
> - *Type*: Field Format
> - *No Value*: Value of Field when there is No Value or Data 
>   - 'Never' means field should always have a value EXCEPT for the first bar of the day 
> - *Description*: Description of the Field 

# Data Prep 
---
- [AlgoSeek Documentation](https://us-equity-market-data-docs.s3.amazonaws.com/algoseek.US.Equity.TAQ.Minute.Bars.pdf)
- shorten field names to reduce typing 
- Minute-Bar data comes in compressed .csv files that contain the data for one symbol and day 
    - Organized in 3 directories, one for each year (2015-2017)

In [None]:
# Field names that end in 'time' (presumably the per-bar timestamp columns --
# confirm against the code that consumes `tcols`). Built from the event stems
# so the shared suffix is written only once; order matches the original list.
tcols = [
    event + 'time'
    for event in (
        'openbar',
        'firsttrade',
        'highbid',
        'highask',
        'hightrade',
        'lowbid',
        'lowask',
        'lowtrade',
        'closebar',
        'lasttrade',
    )
]

# Three source field names grouped under `drop_cols` (presumably removed when
# the raw files are loaded -- the consuming code is not visible here).
drop_cols = 'unknowntickvolume cancelsize tradeatcrossorlocked'.split()

# Source field names grouped under `keep` (presumably the subset retained for
# analysis -- confirm where `keep` is used). Assembled from thematically
# grouped pieces; the resulting list has the same 23 names in the same order.
keep = (
    # first / high / low / last trade prices
    [f'{stat}tradeprice' for stat in ('first', 'high', 'low', 'last')]
    # spread, volume-weighted price and quote count
    + ['minspread', 'maxspread', 'volumeweightprice', 'nbboquotecount']
    # "tradeat..." fields
    + [f'tradeat{level}' for level in ('bid', 'bidmid', 'mid', 'midask', 'ask')]
    # volume and trade counts, including the "finra..." fields
    + ['volume', 'totaltrades', 'finravolume', 'finravolumeweightprice']
    # "...tickvolume" fields
    + [f'{kind}tickvolume' for kind in ('up', 'down', 'repeatup', 'repeatdown')]
    # "tradetomid..." fields
    + ['tradetomidvolweight', 'tradetomidvolweightrelative']
)

# Mapping from long source field names to short aliases ("shorten field names
# to reduce typing"). The regular families (quote price/size fields and trade
# sizes) are generated; the irregular names are spelled out. Insertion order
# is the same as in a fully written-out literal.
columns = {
    # volume-weighted prices and tick volumes
    'volumeweightprice': 'price',
    'finravolume': 'fvolume',
    'finravolumeweightprice': 'fprice',
    'uptickvolume': 'up',
    'downtickvolume': 'down',
    'repeatuptickvolume': 'rup',
    'repeatdowntickvolume': 'rdown',
    # '<stat>tradeprice' -> '<stat>'
    **{f'{stat}tradeprice': stat for stat in ('first', 'high', 'low', 'last')},
    # counts
    'nbboquotecount': 'nbbo',
    'totaltrades': 'ntrades',
    # '<stat><side><field>' -> first letter of stat + first letter of side + field,
    # e.g. 'openbidprice' -> 'obprice'
    **{
        f'{stat}{side}{field}': f'{stat[0]}{side[0]}{field}'
        for stat in ('open', 'high', 'low', 'close')
        for side in ('bid', 'ask')
        for field in ('price', 'size')
    },
    # '<stat>tradesize' -> '<stat>size'
    **{f'{stat}tradesize': f'{stat}size' for stat in ('first', 'high', 'low', 'last')},
    # trade-to-mid volume weights
    'tradetomidvolweight': 'volweight',
    'tradetomidvolweightrelative': 'volweightrel',
}

### `extract_and_combine_data()` reads the ~80k source files and combines them into a single hdf5 file for faster access 