In [9]:
import sys, os
import plotly
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Line, Scatter, Figure, Layout
import cufflinks as cf
import pandas as pd
import numpy as np
import datetime, time
from IPython.display import Image
init_notebook_mode(connected=True)

folder_name = 'premium_data_CL2'
# os.listdir returns entries in arbitrary order and may include non-data files
# (e.g. .DS_Store, .ipynb_checkpoints); sort for a reproducible load order and
# keep only CSVs so the fixed-offset file-name parsing below cannot choke.
file_names = sorted(name for name in os.listdir(folder_name)
                    if name.lower().endswith('.csv'))

data_list = []
data_by_name = dict()
columns = ["Date","Open","High","Low","Close","Volume","OpenInt", "Y"]
numeric_columns = ["Open", "High", "Low", "Close", "Volume", "OpenInt"]
# Futures month codes -> two-digit calendar month
mon_lookup = {'F': '01', 'G': '02', 'H': '03', 'J': '04', 
              'K': '05', 'M': '06', 'N': '07', 'Q': '08', 
              'U': '09', 'V': '10', 'X': '11', 'Z': '12'}
for file_name in file_names:
    df = pd.read_csv(os.path.join(folder_name, file_name), index_col=False, names=columns)
    if df.iloc[0].tolist() == columns:
        # The file carried its own header row, which was read as data. Drop it
        # and re-parse the numeric columns: the header text forced them to be
        # read as strings. Cells that still cannot be parsed become NaN.
        df = df.iloc[1:].copy()
        df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    df['FileName'] = file_name
    
    # Dates are stored as YYYYMMDD (int or str depending on the file)
    df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y%m%d')

    # Convert file_name to something actually readable and sortable
    # All files have CL2__1984H.csv style formatting (Crude Oil 1984 March)
    year = file_name[5:9]
    month_code = file_name[9]
    month = mon_lookup[month_code]
    data_name = "CL{}-{}".format(year, month)
    df['contract_name'] = data_name
    df['contract_month'] = int(month)
    df['contract_month_code'] = month_code
    df['contract_year'] = int(year)
    data_list.append(df)
    data_by_name[data_name] = df
    
# One long frame with every contract's daily rows, re-indexed 0..N-1
data = pd.concat(data_list, ignore_index=True)

# Operating an Arbitrage on Basic Assumptions

This is an explorative take on an arbitrage condition that potentially exists. 

In [12]:
# Alternative selection (December vs. July of the same year). It used to be
# assigned to `pairs` and then immediately overwritten by the list below, so
# it never took effect; it is kept under its own name for reference.
december_july_pairs = [['CL2018-12', 'CL2018-07'],
                       ['CL2017-12', 'CL2017-07'],
                       ['CL2016-12', 'CL2016-07'],
                       ['CL2015-12', 'CL2015-07'],]

# Pairs actually plotted below; each entry is [sym1, sym2] for plot_pair.
# NOTE(review): 'CL2017-03' breaks the pattern of the other rows (all 2018) and
# may be a typo for 'CL2018-03' -- left unchanged, please confirm.
pairs = [['CL2018-06', 'CL2018-01'],
         ['CL2018-08', 'CL2018-02'],
         ['CL2018-09', 'CL2017-03'],
         ['CL2018-11', 'CL2018-04']]


In [13]:
def plot_pair(sym1, sym2, max_price=500):
    """Build a plotly figure dict for the daily open-price spread ``sym2 - sym1``.

    Reads the module-level ``data`` frame, keeps the days on which both
    contracts have a row, and returns four traces: the raw daily differences
    (tiny markers coloured by sign), the upper and lower "mean +/- one standard
    deviation" band per calendar month, and the monthly mean line.

    Parameters
    ----------
    sym1, sym2 : str
        ``contract_name`` values such as ``'CL2018-06'``.
        sym1 = short, sym2 = long.
    max_price : float, optional
        Rows whose ``Open`` is not strictly below this value are discarded
        (the raw data contains some bogus prices above $1,099). Default 500.

    Returns
    -------
    dict
        ``{'data': [scatter, std_up, std_down, mean], 'layout': Layout}``,
        suitable for ``iplot``.
    """
    def _opens(symbol):
        # Daily opens of one contract, with the outlier prices removed
        quotes = data.loc[data.contract_name == symbol, ['Date', 'Open']]
        return quotes[quotes['Open'] < max_price]

    data1 = _opens(sym1)
    data2 = _opens(sym2)

    # Inner join on Date; the two 'Open' columns get a per-symbol suffix
    joined_data = data1.join(data2.set_index('Date'), on='Date', how="inner", rsuffix='_'+sym2, lsuffix='_'+sym1)
    print(sym2, "-", sym1)
    joined_data['diff'] = joined_data['Open_'+sym2] - joined_data['Open_'+sym1]
    joined_data['positive'] = joined_data['diff'] > 0

    # colors[0] marks non-positive differences, colors[1] positive ones
    colors = ['rgba(20, 100, 200, .8)', 'rgba(200, 50, 100, .8)']
    scatter = Scatter(x=joined_data['Date'],
                      y=joined_data['diff'],
                      mode='markers',
                      name='Positive differences',
                      hoverinfo='skip',
                      marker = dict(size = 1,
                                    color = joined_data['positive'].map({False: colors[0], True: colors[1]})),
                      text = joined_data.apply(lambda x: "{}={:.3f}<br>{}={:.3f}".format(sym2, x['Open_'+sym2], sym1, x['Open_'+sym1]), axis=1),
                      showlegend=False
                     )

    # pd.TimeGrouper was removed from pandas; pd.Grouper(freq=...) is the
    # equivalent spelling and is also available in old releases.
    g = joined_data[['Date', 'diff']].set_index('Date').groupby(pd.Grouper(freq="M"))
    half_month = datetime.timedelta(days=15)

    # Groups are labelled with the month end; shift back 15 days so each
    # point is drawn near the middle of the month it summarises.
    df_mean = g.mean().reset_index()
    df_mean['Date'] -= half_month
    df_std = g.std().reset_index()
    df_std['Date'] -= half_month

    scatter_mean = Scatter(x=df_mean['Date'],
        y=df_mean['diff'],
        mode='lines',
        line=dict(width=2, shape='spline', smoothing=.1),
        name='{}-{}'.format(sym2[2:].replace('-', '/'), sym1[2:].replace('-', '/')))

    def _band_edge(y_values):
        # One invisible edge of the mean +/- std band; only the fill is drawn.
        return Scatter(x=df_std['Date'],
            y=y_values,
            mode='lines',
            # "444" (no leading '#') is not a valid colour string
            marker=dict(color="#444"),
            line=dict(width=0, shape='spline', smoothing=.2),
            fillcolor='rgba(68, 68, 68, 0.3)',
            fill='tonexty',
            hoverinfo='skip',
            showlegend=False)

    scatter_std_up = _band_edge(df_mean['diff']+df_std['diff'])
    scatter_std_down = _band_edge(df_mean['diff']-df_std['diff'])
    
    # The title has no placeholders, so the former .format(sym2, sym1) was a no-op
    layout = Layout(title = 'Difference between a few contracts from 2015 to 2018',
                  yaxis = dict(zeroline = False),
                  xaxis = dict(zeroline = True),
                  showlegend = True)
    fig = dict(data=[scatter, scatter_std_up, scatter_std_down, scatter_mean], layout=layout)
    return fig

# Accumulate the traces of every pair into one figure and re-render it after
# each pair, so each successive plot shows one more spread than the last.
fig = {'data': []}
for short_sym, long_sym in ((entry[0], entry[1]) for entry in pairs):
    print("Comparing {} and {}".format(short_sym, long_sym))
    pair_fig = plot_pair(short_sym, long_sym)
    fig['data'].extend(pair_fig['data'])
    # The layout of the most recent pair wins
    fig['layout'] = pair_fig['layout']
    iplot(fig)
    time.sleep(1)

Comparing CL2018-06 and CL2018-01
CL2018-01 - CL2018-06


Comparing CL2018-08 and CL2018-02
CL2018-02 - CL2018-08


Comparing CL2018-09 and CL2017-03
CL2017-03 - CL2018-09


Comparing CL2018-11 and CL2018-04
CL2018-04 - CL2018-11


## Analysis when comparing differences in contracts over different years

The above graph was created by taking the January contract for any year and subtracting the July contract for the same year from it. 

As seen above, the general trend has been that Jan-July is positive up until 2015. After 2015, Jan-July is negative, indicating a drastic market shift. Furthermore, future contracts (e.g. 2018) seem to be closer to zero both before and after Jan 2015. 

# Average price of contracts over time

One analysis would be to naively take all contracts and average all of their prices. This could give us a good impression of what contracts do over the course of many years.

In [None]:
# Group every contract's daily open by calendar month.
# pd.TimeGrouper was removed from pandas; pd.Grouper(freq=...) is the replacement.
g = data[['Date', 'Open']].set_index('Date').groupby(pd.Grouper(freq="M"))
df_mean = g.mean().reset_index()
df_std = g.std().reset_index()
# NOTE(review): groupby(...).diff(3) is a 3-row lag computed *within* each
# monthly group (one value per original row), not a month-over-month change
# as the trace name below suggests -- confirm which was intended.
df_diff = g.diff(3).reset_index()
fig = dict(data=[Scatter(x=df_mean['Date'], y=df_mean['Open'], name='Avg Contract price per day'), 
                 # 'line' is not a valid scatter mode; the valid flag is 'lines'
                 Scatter(x=df_diff['Date'], y=df_diff['Open'], name='Change in price from last month', mode='lines', hoverinfo='skip'),
                 Scatter(x=df_std['Date'], y=df_std['Open'], name='Std Dev of contracts per day'),], 
       layout=Layout(title= 'Average and Std Dev of Contracts Per Day'))
iplot(fig)


The price seems to vary with time, as expected, but its variance unexpectedly increases fairly significantly over time. 

## Sanity Checks: How are contracts different? 

Time for some sanity checks. How do the contracts themselves change in their fundamental structure over time? 

In [None]:
# How many rows does each contract have in the historical data?
g = data[['Open', 'contract_name']].groupby(['contract_name'])
df = g.count()
entries_trace = Scatter(x=df.index, y=df['Open'])
fig = {'data': [entries_trace],
       'layout': Layout(title='Number of Entries per contract in historical data')}
iplot(fig)

# Mean and standard deviation of the open price over each contract's
# recorded history, using the same per-contract grouping as above.
g = data[['Open', 'contract_name']].groupby(['contract_name'])
df_mean = g.mean().reset_index()
df_std = g.std().reset_index()
mean_trace = Scatter(x=df_mean['contract_name'], y=df_mean['Open'], name='Avg Contract price')
std_trace = Scatter(x=df_std['contract_name'], y=df_std['Open'], name='Std Dev of contract')
fig = {'data': [mean_trace, std_trace],
       'layout': Layout(title= 'Average Contract Price over its history')}
iplot(fig)

Wow! The prices and the amount of historical data on each contract seem to be related in a significant way. Sharp spikes are seen in June and December of each year from around the year 2000 up to the year 2014, which is suddenly flat. In 2015 the spikes reverse: the prices are now higher in December and June instead of lower. The date ranges are as follows:

In [None]:
# Time between the first and last recorded day of the December 2007 contract
d = data.loc[data['contract_name'].eq('CL2007-12')]
recorded_days = d['Date']
recorded_days.max() - recorded_days.min()

In [None]:
# Time between the first and last recorded day of the November 2007 contract
d = data.loc[data['contract_name'].eq('CL2007-11')]
recorded_days = d['Date']
recorded_days.max() - recorded_days.min()

For the contract that expires in December 2007, 2,552 days of history are recorded. For the contract expiring a month earlier, in November, there is almost 3 times less: only 882 days! 

In [None]:
# Keep only the final 365 days of each contract's recorded history, so that
# contracts with very long histories stop dominating the per-contract plots.

g = data[['Date', 'contract_name']].groupby('contract_name')
# One row per contract: the earliest date we are willing to keep for it
df = (g.max()
       .assign(date_limit=lambda last: last['Date'] - datetime.timedelta(days=365))
       .drop('Date', axis=1))
limited_data = data.join(df, on='contract_name', how='left')
# Discard every row dated before its contract's cutoff
too_old = limited_data['Date'] < limited_data['date_limit']
limited_data = limited_data[~too_old]

In [None]:
# Row count per contract once the history is limited to its last year
g = limited_data[['Open', 'contract_name']].groupby(['contract_name'])
df = g.count()
entries_trace = Scatter(x=df.index, y=df['Open'])
fig = {'data': [entries_trace],
       'layout': Layout(title='Number of Entries per contract in historical data over its last year')}
iplot(fig)

In [None]:
# Mean / std of the open price per contract over its LAST YEAR only.
# This previously aggregated the unrestricted `data` frame, which contradicted
# the title and simply reproduced the earlier full-history plot.
g = limited_data[['Open', 'contract_name']].groupby(['contract_name'])
df_mean = g.mean().reset_index()
df_std = g.std().reset_index()
fig = dict(data=[Scatter(x=df_mean['contract_name'], y=df_mean['Open'], name='Avg Contract price'), 
        Scatter(x=df_std['contract_name'], y=df_std['Open'], name='Std Dev of contract')], 
       layout=Layout(title= 'Average Contract Price over its last year in existance'))
iplot(fig)

## Exploring combos

In order to explore combinations, we need to find the correct and valid combinations of contracts. In general, we are looking to make a table of the following:

> (short contract, long contract, months_apart, date, days_till_expire)

These combos are valid as long as the contracts have historical data on at least one day. The columns are described as follows:

| column name      | description |
| ---------------- | --------------------------- |
| short_contract   | contract that expires first |
| long_contract    | contract that expires last  |
| short_value      | value of short contract on this date |
| long_value       | value of long contract on this date  |
| months_apart     | how many months apart do the contracts expire |
| date             | date of historical data     |
| days_till_expire | how many days until the short contract  <br>no longer has historical data |

In [None]:
# Notes: 
#     Making the date all inclusive will create 9317468 rows. 
#     I am running out of memory. We will only use the year 2000 on
#     which limits the rows to 8319236 (grumble). 
#     Further filtering makes sure pairs of contracts expire within
#     a year of each other (rows become 2526585)


# Compare against a Timestamp: comparing a datetime64 column with a plain
# datetime.date raises TypeError on current pandas. A single .loc also avoids
# the chained [columns][mask] indexing.
df = data.loc[data['Date'] > pd.Timestamp('2000-01-01'), ['Date', 'contract_name']]
# One row per trading day holding the sorted tuple of contracts quoted that day
df = df.groupby('Date')['contract_name'].apply(lambda x: tuple(sorted(x))).reset_index()
df = df.rename(index=str, columns={'contract_name': 'contract_names'})

# Latest contract year considered by make_spreads: two years past the last data point
max_year = data['Date'].max().year + 2

def make_spreads(contracts: list, max_contract_year=None):
    """Return every pair of contracts expiring at most 12 months apart.

    Parameters
    ----------
    contracts : sequence of str
        Contract names formatted ``'CLYYYY-MM'`` (e.g. ``'CL2018-06'``). Any
        sequence works (the notebook passes tuples). Pairs are emitted in
        input order, so the input should be sorted if the first element of
        each pair is meant to be the earlier-expiring contract.
    max_contract_year : int, optional
        Pairs in which either contract's year exceeds this are skipped.
        Defaults to the module-level ``max_year``.

    Returns
    -------
    list of (str, str, int)
        ``(first_contract, second_contract, months_apart)`` tuples.
    """
    if max_contract_year is None:
        max_contract_year = max_year

    # Parse each name once instead of once per pair inside the nested loop
    parsed = [(name, int(name[2:6]), int(name[7:])) for name in contracts]

    pairs = []
    for idx, (c1, y1, m1) in enumerate(parsed):
        for c2, y2, m2 in parsed[idx+1:]:
            if max(y1, y2) > max_contract_year:
                continue
            months_apart = abs((y2*12 + m2) - (y1*12 + m1))
            if months_apart <= 12:
                pairs.append((c1, c2, months_apart))

    return pairs 


In [None]:
# For every trading day, list the valid spreads among that day's contracts
df['spreads'] = df['contract_names'].map(make_spreads)

In [None]:
# Explode the per-day spread lists into one row per (Date, spread):
# build a wide frame (one column per position in the list), stack it into a
# long Series, then move the Date index level back into a column.
spreads_wide = pd.DataFrame(df['spreads'].tolist(), index=df['Date'])
date_to_spreads = spreads_wide.stack().reset_index(level=0)
date_to_spreads = date_to_spreads.rename(index=str, columns={0: 'spread'})

In [None]:
# Unpack each (short, long, months_apart) tuple into three separate columns
spread_fields = ['short_contract', 'long_contract', 'months_apart']
df2 = pd.DataFrame(date_to_spreads['spread'].values.tolist(), 
                   columns=spread_fields)
for field in spread_fields:
    # Assign by position (.values): df2 has a fresh 0..N-1 index that does
    # not line up with date_to_spreads' index labels.
    date_to_spreads[field] = df2[field].values
date_to_spreads = date_to_spreads.drop('spread', axis=1)

In [None]:
# Cross reference data from contract in main table to the spreads dataset
def _attach_leg(spread_frame, leg):
    """Merge the same-day quote of the `leg` ('short' or 'long') contract onto each spread row."""
    quotes = data[['Date', 'contract_name', 'Open', 'contract_month', 'contract_year']]
    merged = pd.merge(quotes, spread_frame,
                      left_on=['Date', 'contract_name'],
                      right_on=['Date', leg + '_contract'])
    # The former rename(index=str, ...) is not reproduced here: the index it
    # produced was discarded by the next merge / set_index anyway.
    merged = merged.rename(columns={'Open': leg + '_open_value',
                                    'contract_month': leg + '_month',
                                    'contract_year': leg + '_year'})
    return merged.drop('contract_name', axis=1)


df = _attach_leg(date_to_spreads, 'short')
df = _attach_leg(df, 'long')

# Keep differences in price
df['long_short_diff'] = df['long_open_value'] - df['short_open_value']

# Get max dates for each contract

max_dates = data[['contract_name', 'Date']].groupby(['contract_name']).max().reset_index()
max_dates = max_dates.rename(index=str, columns={'Date': 'stop_date'})
df = df.set_index('short_contract').join(max_dates.set_index('contract_name'))

# A timedelta Series exposes whole days through the .dt accessor; a bare
# `.days` on the Series raises AttributeError.
df['days_till_stop'] = (df['stop_date'] - df['Date']).dt.days

spreads = df.reset_index()
# Covers the case where the index name was lost in the join and reset_index
# produced a column literally called 'index'.
spreads = spreads.rename(index=str, columns={'index': 'short_contract'})
# spreads.to_csv('spreads_from_year_2000.csv')
# makes a 291mb file!

In [None]:
# Bucket the remaining life of the short contract into ~30.5-day "months"
# (vectorised floor division instead of a per-row lambda).
spreads['months_till_stop'] = (spreads['days_till_stop'] // 30.5).astype(int)
df = spreads
# Only look at spreads whose short leg stops in 3 to 5 months
df = df[(df['months_till_stop'] < 6) & (df['months_till_stop'] >= 3)]
# Look at trends
apart_3_mon = df[df.months_apart == 3]
apart_6_mon = df[df.months_apart == 6]

# numeric_only=True: the frames still carry string columns (contract names),
# which make a plain .mean() raise on current pandas.
g = apart_3_mon.groupby(['Date', 'months_till_stop'])
df3mon = g.mean(numeric_only=True).reset_index()
g = apart_6_mon.groupby(['Date', 'months_till_stop'])
df6mon = g.mean(numeric_only=True).reset_index()
#apart_3_mon


##iplot([Scatter(x=df3mon['Date'], y=df3mon['long_short_diff'], mode='markers', marker=dict(size=1), hoverinfo='skip'),
#       Scatter(x=df6mon['Date'], y=df6mon['long_short_diff'], mode='markers', marker=dict(size=1), hoverinfo='skip')])