In [1]:
from datetime import date, datetime, timedelta
from copy import copy
import numpy as np
import pandas as pd
import sqlalchemy as sa
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import Scatter, Figure, Layout, Histogram, Heatmap
from plotly import tools
init_notebook_mode(connected=True)

# Custom functions for data exchange
from utils.data_utils import update_data, df_from_table, gen_datetime_col

# Custom plotting
from utils.plot_utils import plot, plot_subplot

# Open the SQLite database file through SQLAlchemy
db_url = 'sqlite:///cl_basic_data_analysis.db'
engine = sa.create_engine(db_url)

# Load the 'cl_data' table again and preview its first rows.
# NOTE(review): 'index' is presumably the column used as the frame index —
# confirm against utils.data_utils.df_from_table.
df_query = df_from_table('cl_data', engine, 'index')
df_query.head()

Unnamed: 0_level_0,date,open,high,low,close,volume,openint,contract_name,year,month,day,contract_symbol,contract_year,contract_month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1983-03-30,29.01,29.56,29.01,29.4,949,470,CL1983-06,1983,3,30,CL,1983,6
1,1983-03-31,29.4,29.6,29.25,29.29,521,523,CL1983-06,1983,3,31,CL,1983,6
2,1983-04-04,29.3,29.7,29.29,29.44,156,583,CL1983-06,1983,4,4,CL,1983,6
3,1983-04-05,29.5,29.8,29.5,29.71,175,623,CL1983-06,1983,4,5,CL,1983,6
4,1983-04-06,29.9,29.92,29.65,29.9,392,640,CL1983-06,1983,4,6,CL,1983,6


# Spreads

We've all heard of hedge funds. But how did they get their name? I recently found that the name came from something called spreads. The idea is simple: in a perfect market, price data is completely random. Any imbalances are quickly exploited by traders who immediately make a profit. 

### Example of when you need to start using spreads

You are a large company and you buy a large quantity of oil contracts (say, 10% of the entire market). The market reacts: supply drops but demand remains the same and therefore the price should go up. If you are carefully watching these changes in the market something like this would set your alarm off. What would you do? 

The answer is simple. Immediately sell some of your contracts and increase the supply in the market. Within a couple of days the market returns to the price set before you bought the oil contracts. When the market stabilizes, immediately buy back your contracts for the cheaper price. 

This concept is called "shorting", and many, many players in the market participate. There is therefore a low probability that you will have the opportunity to participate with any reasonable gains. 

### Introducing Spreads

The market is random, and quickly corrects for sudden changes. What do you do? One great and simple idea was to sell contracts in pairs. One contract acts as a long-term bet that it will increase in price, and another acts as a short-term bet that its price will decrease. This acts as a counter-balance. 

If two contracts are similar in price and you expect one to increase and one to decrease in price, this is a perfect situation for a spread.

In [2]:
# Simulate a spread of data
def plot_spread(x1, y1, x2, y2, title):
    """Plot two series, each wrapped in a shaded band of +/- one standard deviation.

    Parameters
    ----------
    x1, y1 : array-like
        x and y values of the first series. ``y1`` must support ``y1 + scalar``
        (e.g. a numpy array).
    x2, y2 : array-like
        x and y values of the second series, same requirements as above.
    title : str
        Forwarded to ``plot`` as the figure title.

    Raises
    ------
    AssertionError
        If the four inputs do not all have the same length.
    """
    assert len(x1) == len(x2) == len(y1) == len(y2)
    s1 = np.std(y1)
    s2 = np.std(y2)

    # Each series is drawn from its own x values (x1 / x2). Previously every
    # trace read a module-level `x`, ignoring the arguments.
    ub1 = Scatter(
        x=x1,
        y=y1 + s1,
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        fillcolor='rgba(101, 100, 210, 0.3)',
        fill='tonexty')

    lb1 = Scatter(
        x=x1, y=y1 - s1, marker=dict(color="#444"), line=dict(width=0), mode='lines')

    t1 = Scatter(
        x=x1,
        y=y1,
        mode='lines',
        line=dict(color='rgb(101, 100, 210)'),
        fillcolor='rgba(101, 100, 210, 0.3)',
        fill='tonexty'
        )

    ub2 = Scatter(
        x=x2,
        y=y2 + s2,
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        fillcolor='rgba(31, 119, 60, 0.3)',
        fill='tonexty')

    lb2 = Scatter(
        x=x2, y=y2 - s2, mode='lines', marker=dict(color="#444"), line=dict(width=0))

    t2 = Scatter(
        x=x2,
        y=y2,
        fill='tonexty',
        mode='lines',
        line=dict(color='rgb(31, 119, 60)'),
        fillcolor='rgba(31, 119, 60, 0.3)')

    # Order matters: fill='tonexty' shades towards the trace listed just before,
    # so each group must go lower bound -> series -> upper bound.
    data = [lb1, t1, ub1, lb2, t2, ub2]
    plot(data, title=title, show_legend=False)

# Seed the legacy global RNG so the simulated noise (and therefore the figure)
# is identical on every Restart & Run All.
np.random.seed(42)

n = 200
x = np.linspace(0, 5, num=n)
# Two noisy arctan curves that drift apart: y1 trends up, y2 trends down.
y1 = (np.arctan(x - 2) + 2 * np.e) + np.random.normal(size=n)
s1 = np.std(y1)  # kept for inspection; plot_spread recomputes it internally
y2 = -1 * np.arctan(x - 2) + 3 + np.random.normal(size=n)
s2 = np.std(y2)  # kept for inspection; plot_spread recomputes it internally
plot_spread(x1=x, y1=y1, x2=x, y2=y2, title='Sample Ideal Spread')

To figure out how much money you can make from the above sample, simply find the equation that best fits each graph and take the difference between them.

In [3]:
# Same curves as the simulation above, but without the random noise
n = 200
x = np.linspace(0, 5, num=n)
bend = np.arctan(x - 2)
y1 = (bend + 2 * np.e)
y2 = -1 * bend + 3

# Price gap between the two curves at the first and at the last sample
first_gap = y1[0] - y2[0]
last_gap = y1[-1] - y2[-1]
print('Original difference in price: ${:.02f}'.format(first_gap))
print('Final difference in price: ${:.02f}'.format(last_gap))

Original difference in price: $0.22
Final difference in price: $4.93


## What does that look like with real data? 

If we were to buy the January 2016 contract in 2015 and compare it to the July 2016 contract we could potentially find an opportunity to make money: 

In [4]:
# Row masks: observations recorded in 2015, and one mask per contract of interest
bool_df = (df_query['year'] == 2015)
bool_df_2016_01 = (df_query['contract_name'] == 'CL2016-01')
bool_df_2016_07 = (df_query['contract_name'] == 'CL2016-07')

# 100% filtered data: 2015 rows belonging to either contract.
# The parentheses are required because `&` binds tighter than `|`; without them
# the mask reads (2015 & Jan) | Jul and keeps July-contract rows from every year.
df_2015 = df_query[bool_df & (bool_df_2016_01 | bool_df_2016_07)]

# Grab only the data for the two individual contracts
df_2015_CL2016_01 = df_query[bool_df & bool_df_2016_01]
df_2015_CL2016_07 = df_query[bool_df & bool_df_2016_07]

In [5]:
# Pair up the two contracts on every date they share; the January columns
# get the suffix '_01' and the July columns '_07'.
df = pd.merge(df_2015_CL2016_01, df_2015_CL2016_07,
              on='date', suffixes=('_01', '_07'))

# Spread = July close minus January close, plus its row-to-row change
spread = df['close_07'] - df['close_01']
spread_change = spread.diff()
df['spread'] = spread
df['spread_deriv'] = spread_change
# Split the changes by sign; rows that do not match are left as NaN
df['spread_deriv_pos'] = spread_change.where(spread_change > 0)
df['spread_deriv_neg'] = spread_change.where(spread_change <= 0)

spread_traces = [
    Scatter(x=df['date'], y=df['spread'], name='Spread'),
    Scatter(x=df['date'], y=df['spread_deriv_neg'], name='Negative Change', mode='markers'),
    Scatter(x=df['date'], y=df['spread_deriv_pos'], name='Positive Change', mode='markers'),
]
plot(spread_traces)
# Save the variable
df_2016_07_sub_2016_01 = df

## How do we make money? 

From the above graph we can see a story on spreads and how to make money off of them if we can tell the future. For this data there is a local minimum and a local maximum, which shows that we have the potential of making lots of money. On a day-to-day scale there is not much we can do, but on a scale of months there may be opportunity. 

One way to find out if there is opportunity is to run three tests: 

+ Test 1: What is the most we could have made? 
+ Test 2: What is the average we could make if we randomly chose a time to buy into the spread? [assuming the contract that expires later will always be more expensive than the one that expires earlier]
+ Test 3: What is the average we could make if we randomly chose a time to buy into the spread and randomly choose which contract will be more expensive?

### Test 1: What is the most we could have made?

In [6]:
# Easiest test first
df = df_2016_07_sub_2016_01
# Best case: the full range of the spread over the period
spread_series = df['spread']
profit = spread_series.max() - spread_series.min()
message = ("The most profit you could make in the spread "
           "between CL2016-07 and CL2016-01 is ${:.02f}.")
print(message.format(profit))

The most profit you could make in the spread between CL2016-07 and CL2016-01 is $5.59.


### Test 2: What is the average we could make assuming 2016-07 > 2016-01

This one is a bit more complicated. We want to know if there was a guaranteed expectation of making money if we randomly chose a timeframe in 2015 to hedge our bets. 

Essentially it is a combinatorics problem. In general one would choose every point in **2015** and compare it to every other point after that date. The first step is to generate all possible date pairs in a date range:

In [7]:
def generate_day_combinations(start_date, end_date):
    """Return an iterator over every ordered pair of calendar days in a range.

    Pairs are ``(earlier, later)`` with ``start_date <= earlier < later <= end_date``,
    stepping one day at a time and grouped by the earlier day.

    The arguments are validated when the function is called rather than on
    first iteration, so a bad range fails immediately.

    Parameters
    ----------
    start_date, end_date : datetime.date
        Range bounds (inclusive). Subclasses such as ``datetime.datetime``
        pass the type check as well.

    Returns
    -------
    generator of (start, end) tuples

    Raises
    ------
    TypeError
        If either argument is not a ``date`` instance.
    ValueError
        If ``start_date`` is not strictly before ``end_date``.
    """
    if not isinstance(start_date, date) or not isinstance(end_date, date):
        raise TypeError('start or end date not datetime object.')

    if start_date >= end_date:
        raise ValueError('start date should be less than end date.')

    def _pairs():
        # Dates are immutable, so `+=` rebinds the local names and never
        # touches the caller's objects; no defensive copy is needed.
        day = timedelta(days=1)
        first = start_date
        while first < end_date:
            second = first + day
            while second <= end_date:
                yield (first, second)
                second += day
            first += day

    return _pairs()
    
# Test: all pairs within the first day of the data and the three days after it
df = df_2016_07_sub_2016_01
first_day = df['date'].iloc[0]
list(generate_day_combinations(first_day, first_day + timedelta(days=3)))

[(Timestamp('2015-01-02 00:00:00'), Timestamp('2015-01-03 00:00:00')),
 (Timestamp('2015-01-02 00:00:00'), Timestamp('2015-01-04 00:00:00')),
 (Timestamp('2015-01-02 00:00:00'), Timestamp('2015-01-05 00:00:00')),
 (Timestamp('2015-01-03 00:00:00'), Timestamp('2015-01-04 00:00:00')),
 (Timestamp('2015-01-03 00:00:00'), Timestamp('2015-01-05 00:00:00')),
 (Timestamp('2015-01-04 00:00:00'), Timestamp('2015-01-05 00:00:00'))]

The second component is to find the average value of all the hedges in the date pairs we generated above:

In [20]:
df = df_2016_07_sub_2016_01.set_index('date')
dates = df.index
spread_by_date = df['spread']  # looked up once instead of on every iteration

# One row per (buy date, sell date) pair: the change in the spread between
# the two dates and the time elapsed.
df_2015_spreads = []
for start, end in generate_day_combinations(dates.min(), dates.max()):
    try:
        buy = spread_by_date.loc[start]
        sell = spread_by_date.loc[end]
    except KeyError:
        # The generator yields every calendar day, but the data has no row for
        # some of them (presumably non-trading days); skip those pairs.
        # Only KeyError is caught so that real errors are not hidden.
        continue
    df_2015_spreads.append([start, end, sell - buy, end - start])

In [21]:
# Turn the collected rows into a labelled DataFrame
spread_columns = ('start_date', 'end_date', 'hedge', 'days_delta')
df_2015_spreads = pd.DataFrame(df_2015_spreads, columns=spread_columns)

What is the average amount we could make if we randomly chose any of the spreads to hedge against?

In [24]:
# Mean and standard deviation of the hedge over every date pair
hedge = df_2015_spreads['hedge']
(hedge.mean(), hedge.std())

(0.7594974907995968, 1.536247208962716)

Unfortunately, although we have a positive value (about $0.76) if we randomly chose a bet in 2015 for these futures, we are battling against a much larger standard deviation (about $1.54) and therefore higher risk. Can we lower the standard deviation by increasing or decreasing the amount of time we wait between buying and selling the spread?

For the sake of simplicity, we can do it month-by-month:

In [30]:
# Bucket each holding period into whole 30-day "months"
holding_period = df_2015_spreads['days_delta']
df_2015_spreads['month_delta'] = holding_period.map(lambda delta: delta.days // 30)

# Mean and standard deviation of the hedge for each bucket
hedge_by_month = df_2015_spreads.groupby('month_delta')['hedge']
hedge_by_month.describe()[['mean', 'std']]


Unnamed: 0_level_0,mean,std
month_delta,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.11338,0.665171
1,0.296652,1.242346
2,0.404139,1.521227
3,0.541433,1.625096
4,0.915981,1.688001
5,1.305164,1.797058
6,1.485753,1.659454
7,1.412546,1.620925
8,1.251501,1.396492
9,1.571404,1.315192


From this we can see the chart above reflected in numbers: the trend is generally upwards. Unfortunately, the spreads that are the most profitable and have the lowest relative standard deviation are the farthest apart, decreasing our potential sample size. 