In [1]:
import pandas as pd
import sqlalchemy as sa
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import Scatter, Figure, Layout, Histogram, Heatmap
from plotly import tools
# Initialise plotly's notebook mode before any figure is drawn.
# NOTE(review): connected=True presumably loads plotly.js from the CDN
# rather than embedding it in the notebook -- confirm against plotly docs.
init_notebook_mode(connected=True)

# Custom functions for data exchange
from utils.data_utils import update_data, df_from_table, gen_datetime_col

# Custom plotting
from utils.plot_utils import plot, plot_subplot

# Create/open the SQLite database file through SQLAlchemy.
db_url = 'sqlite:///cl_basic_data_analysis.db'
engine = sa.create_engine(db_url)

# Reload the `cl_data` table into a DataFrame and preview the first rows.
# NOTE(review): the third argument is presumably the column used as the
# DataFrame index -- confirm in utils.data_utils.df_from_table.
df_query = df_from_table('cl_data', engine, 'index')
df_query.head()

Unnamed: 0_level_0,date,open,high,low,close,volume,openint,contract_name,year,month,day,contract_symbol,contract_year,contract_month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1983-03-30,29.01,29.56,29.01,29.4,949,470,CL1983-06,1983,3,30,CL,1983,6
1,1983-03-31,29.4,29.6,29.25,29.29,521,523,CL1983-06,1983,3,31,CL,1983,6
2,1983-04-04,29.3,29.7,29.29,29.44,156,583,CL1983-06,1983,4,4,CL,1983,6
3,1983-04-05,29.5,29.8,29.5,29.71,175,623,CL1983-06,1983,4,5,CL,1983,6
4,1983-04-06,29.9,29.92,29.65,29.9,392,640,CL1983-06,1983,4,6,CL,1983,6


## Adjusting contract data for volume

What happens over the life of a contract? Surely its trading volume isn't constant over its whole life. In fact, most of the daily records show no price movement at all, which suggests that contracts go untraded for much of their lives:

In [2]:
# Trading range of every row; a zero range means high and low were identical.
df_query['diff'] = df_query['high'] - df_query['low']
df = df_query

# Percentage of rows whose high and low prices are the same.
flat_rows = int((df['diff'] == 0).sum())
pct = flat_rows * 100. / len(df)
message = '{:.2f}% of contracts have a High - Low difference of 0 (no change)'
print(message.format(pct))

61.42% of contracts have a High - Low difference of 0 (no change)


**This is great!** We can assume that nobody is trading a contract while its price does not change. Not only that, but the volume will be close to 0 for these contracts. Let's look at a single contract:

In [5]:
# Isolate the rows belonging to the January 2001 contract.
is_jan_2001 = df_query['contract_name'] == 'CL2001-01'
df = df_query[is_jan_2001]

# One marker per row: traded volume against the trading date.
volume_trace = Scatter(x=df['date'], y=df['volume'], mode='markers')
plot([volume_trace], title='January 2001 Contract date vs volume')

The trend looks exponential! How do we choose the cutoff for the volumes we care about? For now I will experiment with a few ways of making a good selection. Let's start by comparing the contract to the market:

In [7]:
df = df_query[df_query['contract_name'] == 'CL2001-01']

# Keep only the market rows whose date also appears in our contract above
df_iso = df_query.merge(df[['date']], how='right', on='date')

# Average the open price over every contract quoted on each date.
# Only `open` is aggregated: the frame also carries text columns
# (contract_name, contract_symbol) that cannot be averaged, and recent
# pandas versions raise a TypeError instead of silently dropping them.
# The double brackets keep the result a DataFrame indexed by date.
df_iso = df_iso.groupby('date')[['open']].mean()
plot(
    [
        Scatter(x=df['date'], y=df['open'], name='CL2001-01 open prices'),
        Scatter(
            x=df_iso.index,
            y=df_iso['open'],
            name='Average open prices in market')
    ],
    title='Price comparison: Jan 2001 Contract vs Market')

We see an obvious deviation in January of the year 2000. I strongly suspect this is because of low-volume contracts influencing the market, so let's remove all contracts that are not within a year of their expiration:

In [9]:
df = df_query[df_query['contract_name'] == 'CL2001-01']

# Keep only the market rows whose date also appears in our contract above
df_iso = df_query.merge(df[['date']], how='right', on='date')

# Keep only the contracts that expire within a year of the row's date.
# The test has two parts:
#   * the contract year is not later than the row's year, or
#   * the contract year is exactly one year later and the contract month
#     is not later than the row's month.
# This assumes a contract has already expired once the calendar reaches
# its contract month.
year_diff = df_iso['contract_year'] - df_iso['year']
expires_same_year_or_earlier = year_diff < 1
expires_early_next_year = (
    (year_diff == 1) & (df_iso['contract_month'] <= df_iso['month'])
)
df_bool = expires_same_year_or_earlier | expires_early_next_year
df_iso = df_iso[df_bool]

# Median open price per date across the remaining contracts.
# Only `open` is aggregated: the frame also carries text columns
# (contract_name, contract_symbol), and recent pandas versions raise a
# TypeError instead of silently dropping them. The double brackets keep
# the result a DataFrame indexed by date.
df_iso = df_iso.groupby('date')[['open']].median()
plot(
    [
        Scatter(x=df['date'], y=df['open'], name='CL2001-01 open prices'),
        Scatter(
            x=df_iso.index,
            y=df_iso['open'],
            name='Average open prices in market')
    ],
    title='Price comparison: Jan 2001 Contract vs positive volume Market')

The trend no longer follows. What if we cut off the contracts when 1% of the total volume has been filled?

In [11]:
df = df_query[df_query['contract_name'] == 'CL2001-01']
total_volume = df['volume'].sum()
cum_volume = df['volume'].cumsum()
cum_pcts = cum_volume * 100. / total_volume

# Plain Python lists so the marker coordinates can be looked up by position.
dates = df['date'].tolist()
cum_volumes = cum_volume.tolist()

# For each of the following percentages, find the first row on which the
# cumulative volume reaches that share of the contract's total volume.
x, y, text = [], [], []
for p in [1, 5, 10, 20, 50, 100]:
    reached = (cum_pcts >= p).values
    if not reached.any():
        # No row reaches this percentage (e.g. an empty or zero-volume
        # selection, where the percentages are undefined) -- skip the marker.
        continue
    # argmax on a boolean array returns the position of the first True.
    pos = int(reached.argmax())
    x.append(dates[pos])
    y.append(cum_volumes[pos])
    text.append('{}%'.format(p))

plot(
    [
        Scatter(
            x=df['date'],
            y=cum_volume,
            name='Cumulative Volume of CL2001-01'),
        Scatter(
            x=x,
            y=y,
            mode='markers+text',
            text=text,
            name='Percentage of total volume',
            # plotly spells the position with a space ('top left').
            textposition='top left')
    ],
    title='Volume percentages w.r.t. date')