In [None]:
import warnings
warnings.filterwarnings("ignore")
from IPython.display import HTML

# Render a button plus a script that toggles the visibility of the notebook's
# code-input cells (`div.input`). `code_toggle` is also run once on document
# ready, so the inputs start out hidden.
# NOTE(review): the script relies on `$` (jQuery) being available in the page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')


### Research processing CME span files

In [None]:

import zipfile
import glob
import pandas as pd
import numpy as np

from argparse import ArgumentParser
from argparse import RawDescriptionHelpFormatter
import sys
import os
# Put the current and parent directories on the import path (once each);
# presumably so the local `barchartacs` imports below resolve.
for _rel_dir in ('./', '../'):
    if _rel_dir not in sys.path:
        sys.path.append(_rel_dir)

from barchartacs import build_db
from barchartacs import db_info
import plotly.graph_objs as go
from plotly.offline import  init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.tools as tls
from plotly.graph_objs.layout import Font,Margin
from IPython import display

import datetime
import io
from tqdm import tqdm,tqdm_notebook
from barchartacs import pg_pandas as pg
import mibian
import py_vollib
import importlib
from py_vollib import black
from py_vollib.black import implied_volatility
import ipdb
import traceback
import pandas_datareader.data as pdr
import cme_expirations as cmeexp
import re
import requests
import io
import zipfile
import urllib
import pathlib


### important global variables

In [None]:
def figure_crosshairs(fig):
    """Turn on crosshair-style spike lines on both axes of ``fig``.

    Sets ``hovermode`` to ``'x'`` and ``spikedistance`` to 1000 on the
    figure's layout, and enables solid, 1-thick ``toaxis+across`` spikes on
    the x and y axes. The figure is modified in place and also returned.
    """
    layout = fig['layout']
    layout.hovermode = 'x'
    # Both axes receive exactly the same spike configuration.
    for axis in (layout.yaxis, layout.xaxis):
        axis.showspikes = True
        axis.spikemode = "toaxis+across"
        axis.spikedash = "solid"
        axis.spikethickness = 1
    layout.spikedistance = 1000
    return fig


def plotly_plot(df_in,x_column,plot_title=None,
                y_left_label=None,y_right_label=None,
                bar_plot=False,width=800,height=400,
                number_of_ticks_display=20,
                yaxis2_cols=None,
                x_value_labels=None,
                modebar_orientation='v',modebar_color='grey',
                legend_x=None,legend_y=None,
                title_y_pos = 0.9,
                title_x_pos = 0.5):
    """Build a plotly figure with one trace per non-x column of ``df_in``.

    Parameters
    ----------
    df_in : DataFrame holding the x column plus one column per trace.
    x_column : name of the column used for the (categorical) x axis.
    plot_title : figure title text.
    y_left_label, y_right_label : axis titles; default to 'y main' / 'y alt'.
    bar_plot : draw ``go.Bar`` traces instead of ``go.Scatter``.
    width, height : accepted for backward compatibility but currently unused
        (the layout is created with ``autosize=True``).
    number_of_ticks_display : approximate number of x tick labels to show.
    yaxis2_cols : column names to draw against the right-hand y axis.
    x_value_labels : optional mapping from x value to the tick text to show.
    modebar_orientation, modebar_color : passed to the layout's ``modebar``.
    legend_x, legend_y : legend position; applied only when both are given.
    title_y_pos, title_x_pos : title position.

    Returns
    -------
    The figure, after ``figure_crosshairs`` has been applied to it.
    """
    ya2c = [] if yaxis2_cols is None else yaxis2_cols
    ycols = [c for c in df_in.columns.values if c != x_column]
    # create tdvals, which will have x axis labels
    td = list(df_in[x_column])
    nt = len(td)-1 if number_of_ticks_display > len(td) else number_of_ticks_display
    # Guard the tick spacing: a one-row frame (nt == 0), an empty frame, or a
    # non-positive tick count previously caused a ZeroDivisionError or an
    # invalid slice step. Fall back to labelling every point in those cases.
    spacing = max(1, len(td)//nt) if nt > 0 else 1
    tdvals = td[::spacing]
    tdtext = tdvals
    if x_value_labels is not None:
        tdtext = [x_value_labels[i] for i in tdvals]

    # create one trace per y column, routed to the left or right y axis
    trace_type = go.Bar if bar_plot else go.Scatter
    data = [
        trace_type(x=td,y=df_in[ycol],name=ycol,yaxis='y' if ycol not in ya2c else 'y2')
        for ycol in ycols
    ]

    # create a layout
    layout = go.Layout(
        title=plot_title,
        xaxis=dict(
            ticktext=tdtext,
            tickvals=tdvals,
            tickangle=45,
            type='category'),
        yaxis=dict(
            title='y main' if y_left_label is None else y_left_label
        ),
        yaxis2=dict(
            title='y alt' if y_right_label is None else y_right_label,
            overlaying='y',
            side='right'),
        autosize=True,
        margin=Margin(
            b=100
        ),
        modebar={'orientation': modebar_orientation,'bgcolor':modebar_color}
    )

    fig = go.Figure(data=data,layout=layout)
    fig.update_layout(
        title={
            'text': plot_title,
            'y':title_y_pos,
            'x':title_x_pos,
            'xanchor': 'center',
            'yanchor': 'top'})
    if (legend_x is not None) and (legend_y is not None):
        fig.update_layout(legend=dict(x=legend_x, y=legend_y))
    fig = figure_crosshairs(fig)
    return fig


def plotly_shaded_rectangles(beg_end_date_tuple_list,fig,
                             fillcolor="LightSalmon",opacity=0.5):
    """Shade vertical bands on ``fig``, one per ``(begin, end)`` x-range.

    Parameters
    ----------
    beg_end_date_tuple_list : iterable of sequences whose first two items are
        the begin and end x values of a band.
    fig : figure exposing ``update_layout``; modified in place.
    fillcolor, opacity : appearance of every band.

    Returns
    -------
    The same ``fig``. The ``shapes`` layout property is set to the new bands,
    so shapes previously set on the figure through ``shapes`` are replaced.
    """
    ld_shapes = []
    for beg_end_date_tuple in beg_end_date_tuple_list:
        # The tuple items are the band edges themselves; the original code
        # indexed them with an undefined `i`, which raised NameError (or
        # silently picked up a leaked global).
        ld_beg = beg_end_date_tuple[0]
        ld_end = beg_end_date_tuple[1]
        ld_shape = dict(
            type="rect",
            # x-reference is assigned to the x-values
            xref="x",
            # y-reference is assigned to the plot paper [0,1]
            yref="paper",
            x0=ld_beg,
            y0=0,
            x1=ld_end,
            y1=1,
            fillcolor=fillcolor,
            opacity=opacity,
            layer="below",
            line_width=0,
        )
        ld_shapes.append(ld_shape)

    fig.update_layout(shapes=ld_shapes)
    return fig

#### Scrape the CME Confluence documentation index for the links to the SPAN "Expanded Format" record-layout pages; these layouts describe the files found on the CME FTP site (ftp://ftp.cmegroup.com/span/archive/cme/)
* Create the variable `urls`, a list of the documentation-page URLs, one per record type in the fixed-position SPAN files
* Create the variable `rtypes` that contains single character Span Record Types
  * The record type `8` will expand to `81`, `82`, `83`, `84`

In [None]:
# Collect the href of every "Expanded Format ..." link on the SPAN
# positional-format documentation index page.
links_xpath = '//div[@id="main-content"]//ul//li/a[starts-with(text(),"Expanded Format")]/@href'
from lxml import html
import requests
from urllib.parse import urljoin
page = requests.get('https://www.cmegroup.com/confluence/display/pubspan/Risk+Parameter+File+Layouts+for+the+Positional+Formats')
# Fail loudly on an HTTP error instead of scraping an error page into empty lists.
page.raise_for_status()
tree = html.fromstring(page.content)
links = tree.xpath(links_xpath)
# urljoin copes with both root-relative and absolute hrefs and avoids the
# doubled slash that plain string concatenation produced.
urls = [urljoin('https://www.cmegroup.com/', l) for l in links]
# Raw string: '\+' in a normal string literal is an invalid escape sequence.
# rtypes is parallel to urls: the single character following "Type+" in each link.
rtypes = [re.findall(r'Type\+(.)',l)[0] for l in links]


In [None]:
# Display the scraped record-layout documentation URLs.
urls

In [None]:
# Display the record-type characters parsed from the links (parallel to `urls`).
rtypes

#### Create the dictionary `dict_rec_types`, which holds, for each SPAN record type, a DataFrame with that record type's field layout as scraped from the documentation tables

In [None]:
def get_url(url,table_index=0):
    """Fetch ``url`` and return one of the HTML tables on it as a DataFrame.

    Parameters
    ----------
    url : address of the page to download.
    table_index : position of the wanted table among those that
        ``pd.read_html`` finds on the page.

    Returns
    -------
    The selected table, or ``None`` (after printing the error) when the page
    cannot be parsed or has no table at ``table_index``. Errors raised by the
    HTTP request itself are not caught.
    """
    header = {
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest"
    }
    r = requests.get(url, headers=header)
    try:
        # Wrap the markup in a file-like object: passing literal HTML text to
        # read_html is deprecated in recent pandas versions.
        dft = pd.read_html(io.StringIO(r.text))[table_index]
        return dft
    except Exception as e:
        # Report what was actually requested; the original message referenced
        # an undefined name (`rt`), so the handler itself raised NameError.
        print(f'error fetching table {table_index} from {url}: {str(e)}')
        return None
    

# The documentation page for record type "8" carries four tables, one per
# sub-type; every other record type uses the first table on its page.
SPAN_TYPE_8_SUBTYPES = ['81', '82', '83', '84']

dict_rec_types={}
for i in tqdm_notebook(range(len(urls))):
    if rtypes[i] != '8':
        dict_rec_types[rtypes[i]] = get_url(urls[i])
        continue
    cols = None
    for table_index, sub_type in enumerate(SPAN_TYPE_8_SUBTYPES):
        layout = get_url(urls[i],table_index=table_index)
        if layout is None:
            # get_url already printed the error; store None like the non-8
            # branch does instead of crashing on `.iloc`.
            dict_rec_types[sub_type] = None
            continue
        if cols is None:
            # Column names come from the first row of the first table that
            # loaded (normally the '81' table) and are reused for the others.
            # NOTE(review): assumes each type-8 table's first row is a header row — confirm.
            cols = layout.iloc[0].values
        # Drop the header row and label the columns.
        layout = layout.iloc[1:].copy()
        layout.columns = cols
        dict_rec_types[sub_type] = layout
    


In [None]:
# Inspect the field-layout table scraped for record type 84.
dict_rec_types['84']


#### Get example pa2 Span Data files with different types of records

In [None]:
import requests
import io
import zipfile
import urllib


def download_extract_zip(url):
    """
    Download a ZIP archive into memory and return the text of its first member.

    Parameters
    ----------
    url : any URL that ``urllib.request.urlopen`` can open.

    Returns
    -------
    The first member listed in the archive, decoded as UTF-8.

    Raises
    ------
    ValueError if the archive contains no members.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    # Close the connection as soon as the payload has been read.
    with urllib.request.urlopen(url) as response:
        memfile = io.BytesIO(response.read())
    with zipfile.ZipFile(memfile, 'r') as thezip:
        names = thezip.namelist()
        if not names:
            raise ValueError(f'zip archive at {url} contains no files')
        # Only the first member is returned, so only that one is decompressed.
        return thezip.read(names[0]).decode('UTF-8')


In [None]:
# Preview the first 20 CRLF-separated lines of `pa22`.
# NOTE(review): `pa22` is not assigned anywhere in the visible notebook, so
# this cell fails on a fresh "Restart & Run All" unless it is defined elsewhere.
pa22.split('\r\n')[0:20]


In [None]:
# Preview the first 20 lines of the pa2 file.
# NOTE(review): `pa2` is first assigned in the download cell further down, so
# on a fresh kernel this cell only works after that cell has been run.
pa2[0:20]

In [None]:
# Download the zipped pa2 file cme.20200106.c.pa2 from the CME ftp archive and
# split it into lines on CRLF.
print('fetching pa2 zip from CME ftp site ... (takes about 30 seconds)')
pa2_url = 'ftp://ftp.cmegroup.com/span/archive/cme/2020/cme.20200106.c.pa2.zip'
pa2 = download_extract_zip(pa2_url).split('\r\n')
# (A local copy, e.g. under ~/downloads, could be read with open(...).readlines() instead.)

# Bucket the raw lines by record type (rt): a line belongs to a record type
# when it starts with that type's code.
data_dict = {}
for rt in tqdm_notebook(dict_rec_types.keys()):
    print(rt,end=",")
    data_dict[rt] = [line for line in pa2 if line.startswith(rt)]

# Length, in characters, of the first type-81 line.
len(data_dict['81'][0])


In [None]:
# Display the raw type-81 lines.
# NOTE(review): this shows the whole list; slice it if the output gets too large.
data_dict['81']

#### Create `dict_df`, which holds, for each SPAN record type, a DataFrame of the fields sliced out of the fixed-position pa2 lines using the start and end positions given in that record type's layout

In [None]:
def _slice_record(raw_line, layout_rows):
    """Cut one fixed-position line into a {field name: stripped text} dict.

    Each layout row supplies the field name at index 5 and the 1-based start
    and inclusive end positions at indexes 1 and 2. Rows whose name contains
    'filler' (case-insensitive) are skipped.
    """
    fields = {}
    for spec in layout_rows:
        if 'filler' in spec[5].lower():
            continue
        fields[spec[5].strip()] = raw_line[int(spec[1])-1:int(spec[2])].strip()
    return fields


# Build one DataFrame per record type (rt) from that type's raw pa2 lines.
# Any failure for a record type is printed and recorded as None.
dict_df = {}
for rt in tqdm_notebook(dict_rec_types.keys()):
    try:
        dict_df[rt] = pd.DataFrame(
            [
                _slice_record(raw_line, dict_rec_types[rt].values)
                for raw_line in data_dict[rt]
            ])
    except Exception as e:
        dict_df[rt] = None
        print(f'error on {rt}: {str(e)}')


In [None]:
# Keep a second dict with the same keys as dict_df.
# NOTE(review): dict.copy() is shallow, so both dicts reference the same
# DataFrame objects; in-place changes to those frames show up in both.
dict_df_c_recs = dict_df.copy()

In [None]:
# Summary table: number of raw pa2 lines found for each record type.
pd.DataFrame({'rtype':list(dict_rec_types.keys()),'rlen':[len(data_dict[rt]) for rt in dict_rec_types.keys()]})

In [None]:
# Distinct values of the 'Commodity (Product) Code' field across type-81 records.
dict_df['81']['Commodity (Product) Code'].unique()

In [None]:
# Compare the full type-82 column names (`col`) with a version truncated to
# their first eight space-separated words (`col2`).
cols = dict_df['82'].columns.values
cols2 = [' '.join(c.split(' ')[:8]) for c in cols]
pd.DataFrame({'col':cols,'col2':cols2})

In [None]:
# Shorten every column name of the type-8 sub-record frames to its first
# eight space-separated words (the frames are modified in place).
ccs = ['81','82','83','84']
for rec_type in ccs:
    full_names = dict_df[rec_type].columns.values
    dict_df[rec_type].columns = [' '.join(name.split(' ')[:8]) for name in full_names]

In [None]:
# Distinct product codes among type-81 records whose 'Exchange Acronym' is 'NYM'.
dict_df['81'][dict_df['81']['Exchange Acronym']=='NYM']['Commodity (Product) Code'].unique()