# Download and Parse a Sample File of ITCH Messages
--- 
[GITHUB](https://github.com/PacktPublishing/Machine-Learning-for-Algorithmic-Trading-Second-Edition/blob/master/02_market_and_fundamental_data/01_NASDAQ_TotalView-ITCH_Order_Book/01_parse_itch_order_flow_messages.ipynb)

In [1]:
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown
# for the rest of the session (presumably to keep cell output uncluttered).
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline
import gzip
import shutil 

import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

from matplotlib.ticker import FuncFormatter 
from struct import unpack
from collections import namedtuple, Counter, defaultdict
from pathlib import Path
from urllib.request import urlretrieve
from urllib.parse import urljoin
from datetime import timedelta
from time import time


In [4]:
# Use seaborn's 'whitegrid' style for the plots created in this notebook.
sns.set_style('whitegrid')

In [5]:
def format_time(t):
    """Format a duration given in seconds as 'HH:MM:SS.ss'.

    The value is split into whole hours, whole minutes and the remaining
    seconds; hours and minutes are zero-padded to two digits and the
    seconds are shown with two decimals.
    """
    minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:0>2.0f}:{:0>2.0f}:{:0>5.2f}'.format(hours, minutes, seconds)

# Get NASDAQ ITCH Data from FTP Server 
--- 
- Nasdaq offers samples of daily binary files for several months 
- parse a sample file of ITCH messages 
- reconstruct executed trades and the order book for any given tick 
- The dataset is large: downloading and parsing take considerable time and require 16GB+ of space

### Set Data Paths
- Store the data in a `data` subdirectory and convert the result to `hdf` format 
- Sample Files: [NASDAQ ftp server](ftp://emi.nasdaq.com/ITCH/)

In [6]:
# Root folder for the downloaded sample and everything derived from it.
data_path = Path('data') # SET TO EXTERNAL HARDDRIVE -> LARGE DATASET
# HDF5 target for the parsed messages, kept as a plain string.
itch_store = str(data_path.joinpath('itch.h5'))
# HDF5 target for the order book, kept as a Path.
order_book_store = data_path.joinpath('order_book.h5')

In [7]:
# Base FTP address of the sample directory and the archive used in this example.
# The part of the file name before the first '.' is later extracted as the
# sample's date (presumably MMDDYYYY, i.e. 2019-10-30 -- TODO confirm).
# NOTE(review): the recorded run of this notebook timed out while downloading
# from this FTP address -- confirm the endpoint is still served.
FTP_URL = 'ftp://emi.nasdaq.com/ITCH/Nasdaq ITCH/'
SOURCE_FILE = '10302019.NASDAQ_ITCH50.gz'

### Download and Unzip 

In [8]:
def may_be_downloaded(url, data_dir=None):
    """Download and unzip ITCH data if not yet available.

    The archive named by the last path component of ``url`` is stored in the
    data directory and decompressed next to it as ``<archive stem>.bin``.
    Steps whose result already exists on disk are skipped, so the function
    can be re-run cheaply.

    Both the download and the decompression write to a temporary ``.part``
    file that is renamed only after the step succeeds. A failed or
    interrupted step therefore never leaves a truncated file that a later
    run would mistake for a finished one.

    Parameters
    ----------
    url : str
        Address of the gzip-compressed file; its last path component is used
        as the local file name.
    data_dir : str or Path, optional
        Directory to store the files in. Defaults to the notebook-level
        ``data_path``.

    Returns
    -------
    Path
        Location of the decompressed ``.bin`` file.

    Raises
    ------
    ValueError
        If ``url`` does not end in a file name.
    urllib.error.URLError, OSError
        Propagated unchanged when the download or the decompression fails.
    """
    target_dir = data_path if data_dir is None else Path(data_dir)

    archive_name = url.split('/')[-1]
    if not archive_name:
        # A URL ending in '/' would otherwise resolve to the directory itself.
        raise ValueError(f'URL does not name a file: {url!r}')

    if not target_dir.exists():
        print('Creating Directory')
        # parents=True so that nested locations (e.g. on an external drive) work.
        target_dir.mkdir(parents=True, exist_ok=True)
    else:
        print('Directory Exists')

    filename = target_dir / archive_name
    if not filename.exists():
        print('Downloading...', url)
        partial = filename.with_name(filename.name + '.part')
        try:
            urlretrieve(url, partial)
            partial.replace(filename)
        finally:
            # Only present when the download failed part-way through.
            if partial.exists():
                partial.unlink()
    else:
        print('File Exists')

    unzipped = target_dir / (filename.stem + '.bin')
    if not unzipped.exists():
        print('Unzipping to', unzipped)
        partial = unzipped.with_name(unzipped.name + '.part')
        try:
            with gzip.open(str(filename), 'rb') as f_in, open(partial, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            partial.replace(unzipped)
        finally:
            # Only present when decompression failed (e.g. corrupt archive).
            if partial.exists():
                partial.unlink()
    else:
        print('File Already Unpacked')

    return unzipped

In [10]:
# Fetch (or reuse) the sample archive and keep the path of the unpacked file.
file_name = may_be_downloaded(
    urljoin(FTP_URL, SOURCE_FILE)
)
# The date token is everything before the first '.' in the file name.
date = file_name.name.partition('.')[0]

Directory Exists
Downloading... ftp://emi.nasdaq.com/ITCH/Nasdaq ITCH/10302019.NASDAQ_ITCH50.gz


URLError: <urlopen error ftp error: TimeoutError(60, 'Operation timed out')>