## Build history of commodity COTs

1. Retrieve data from the CFTC website (www.cftc.gov/files);
2. Extract data from the most important Commercial and Non Commercial long and short columns;
3. Create "net" columns for each important category;
4. Merge this data with the ETF history data created in the previous steps


In [1]:
import pandas as pd
import numpy as np
import os, sys
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import zipfile
import urllib.request
from PIL import Image
def str_to_date(d):
    """Parse a ``YYYY-MM-DD`` value into a ``datetime.datetime``.

    Args:
        d: Any value; it is converted with ``str()`` before parsing.

    Returns:
        The parsed ``datetime.datetime``, or ``None`` when the text does not
        match the ``%Y-%m-%d`` format.
    """
    try:
        return datetime.datetime.strptime(str(d), '%Y-%m-%d')
    except (ValueError, TypeError):
        # Only parse failures map to None; KeyboardInterrupt and other
        # unrelated exceptions are allowed to propagate.
        return None

# Make important folders
# makedirs creates TEMP_FOLDER itself along with each sub-folder, and
# exist_ok=True keeps the cell safe to re-run. Unlike a blanket
# `except: pass`, real failures (e.g. permission errors) are still reported.
TEMP_FOLDER = './temp_folder'
os.makedirs(f'{TEMP_FOLDER}/cot', exist_ok=True)
os.makedirs(f'{TEMP_FOLDER}/zip', exist_ok=True)


___
## First, decide if you want to re-create the ETF and COT data, or just retrieve the previously saved DataFrames.

In [2]:
# CSV file that holds the processed COT history.
cot_save_path = './cot_history.csv'
# When True, the cells guarded by this flag rebuild the COT history
# and rewrite the CSV above.
CREATE_COT_DATA = True

___
## Process CFTC COT Data
___

### Initial processing
1. Download zip files from www.cftc.gov/files;
2. Unzip the files using the zipfile package;
3. Read each csv (usually named Annual.TXT), and merge them into the df_cot DataFrame.


In [None]:
if CREATE_COT_DATA:
    import time  # only needed for the pause before the single download retry

    last_year = datetime.datetime.now().year
    # Plain integer years: float years would leak into the cached file
    # names (e.g. "dea_fut_xls_2000.0.zip").
    year_list = list(range(2000, last_year + 1))
    # The {TEMP_FOLDER} placeholder is required so that downloads land in the
    # zip folder created by the setup cell.
    zip_download_folder = f'{TEMP_FOLDER}/zip'
    df_cot_temp = None
    yearly_frames = []
    for y in year_list:
        url = f"https://www.cftc.gov/files/dea/history/deacot{y}.zip"
        path_to_zip_file = f'{zip_download_folder}/dea_fut_xls_{y}.zip'
        # Zips already on disk are reused, except for the current year, which
        # is always fetched again (presumably because it is still growing).
        if not os.path.isfile(path_to_zip_file) or y >= last_year:
            print(f'retrieving cot zip file from {url}')
            try:
                urllib.request.urlretrieve(url, path_to_zip_file)
            except Exception:
                # One retry after a short pause; a second failure propagates.
                time.sleep(1)
                urllib.request.urlretrieve(url, path_to_zip_file)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(zip_download_folder)
        # Each archive is expected to extract to Annual.TXT, overwriting the
        # copy left by the previous year.
        df_cot_temp = pd.read_csv(f'{zip_download_folder}/Annual.TXT')
        yearly_frames.append(df_cot_temp)
        print(f'processed cot csv file from {url}. Length = {len(df_cot_temp)}')

    # Concatenate once at the end instead of growing the frame inside the
    # loop; ignore_index=True yields a fresh 0..n-1 index.
    df_cot = pd.concat(yearly_frames, ignore_index=True)
    # NOTE: nested inside the `if`, this expression is not rendered as cell output.
    df_cot.head()

___
### Make column names easier to process, make main date field a datetime object, and sort the DataFrame
___

In [4]:
if CREATE_COT_DATA:
    # Turn the raw headers into identifier-friendly names:
    # spaces and hyphens become underscores, parentheses are removed.
    header_cleanup = str.maketrans({' ': '_', '-': '_', '(': None, ')': None})
    col_rename_dict = {c: c.translate(header_cleanup) for c in df_cot.columns.values}
    df_cot2 = df_cot.rename(columns=col_rename_dict)

    # Parse the report date so that sorting and later comparisons work on
    # datetimes rather than strings.
    df_cot2['As_of_Date_in_Form_YYYY_MM_DD'] = (
        df_cot2['As_of_Date_in_Form_YYYY_MM_DD'].apply(str_to_date)
    )

    # Order by market, then chronologically, and persist the result.
    df_cot2 = df_cot2.sort_values(
        ['Market_and_Exchange_Names', 'As_of_Date_in_Form_YYYY_MM_DD']
    )
    df_cot2.to_csv(cot_save_path, index=False)

___
### Show important columns for a specific commodity
___

In [5]:
# Number of rows in the combined COT history.
df_cot2.shape[0]

139541

In [None]:
# Load the history from the CSV at cot_save_path and re-parse the date
# column, which comes back from the CSV as text.
df_cot2 = pd.read_csv(cot_save_path)
df_cot2.As_of_Date_in_Form_YYYY_MM_DD = df_cot2.As_of_Date_in_Form_YYYY_MM_DD.apply(str_to_date)
commod = 'CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE EXCHANGE'
# commod = 'GOLD'
# Only keep reports from the last 2000 days.
cot_beg_date = datetime.datetime.now() - datetime.timedelta(2000)

# Build both row filters on the full frame and apply them in a single .loc
# call. Chaining df[mask_a][mask_b] would index the already-filtered frame
# with a mask aligned to the unfiltered one.
# regex=False matches the market name literally; na=False treats missing
# names as non-matches instead of producing an invalid mask.
is_commod = df_cot2.Market_and_Exchange_Names.str.contains(commod, regex=False, na=False)
is_recent = df_cot2.As_of_Date_in_Form_YYYY_MM_DD >= cot_beg_date
df_commod = df_cot2.loc[is_commod & is_recent]

basic_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Open_Interest_All']
long_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Long_All','Commercial_Positions_Long_All',
            'Nonreportable_Positions_Long_All','Traders_Commercial_Long_All',
             'Traders_Noncommercial_Long_All','Traders_Total_Reportable_Long_All']
short_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Short_All','Commercial_Positions_Short_All',
            'Nonreportable_Positions_Short_All','Total_Reportable_Positions_Short_All',
            'Traders_Commercial_Short_All','Traders_Noncommercial_Short_All',
            'Traders_Total_Reportable_Short_All']
# Column subsets used by the display cells and the net calculation below.
df_commod_basic = df_commod[basic_cols]
df_commod_long = df_commod[long_cols]
df_commod_short = df_commod[short_cols]


### Show basic open interest info

In [7]:
# Five most recent open-interest rows, newest first.
(
    df_commod_basic
    .sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False)
    .head(5)
)

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Open_Interest_All
26437,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-09,2021814
26436,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-02,1975009
26435,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-26,1968511
26434,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-19,1962559
26433,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-12,2012864


### Show important "long" position info

In [8]:
# Five most recent long-position rows, newest first.
(
    df_commod_long
    .sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False)
    .head(5)
)

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Long_All,Commercial_Positions_Long_All,Nonreportable_Positions_Long_All,Traders_Commercial_Long_All,Traders_Noncommercial_Long_All,Traders_Total_Reportable_Long_All
26437,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-09,621766,712040,97129,93,137,306
26436,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-02,590312,702767,88727,93,135,315
26435,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-26,560552,696940,79103,91,137,312
26434,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-19,534563,701175,80355,96,121,305
26433,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-12,500169,753879,82657,101,126,317


### Show important "short" position info

In [9]:
# Five most recent short-position rows, newest first.
(
    df_commod_short
    .sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False)
    .head(5)
)

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Short_All,Commercial_Positions_Short_All,Nonreportable_Positions_Short_All,Total_Reportable_Positions_Short_All,Traders_Commercial_Short_All,Traders_Noncommercial_Short_All,Traders_Total_Reportable_Short_All
26437,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-09,105104,1247522,78309,1943505,106,83,285
26436,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-02,108951,1190040,82815,1892194,103,89,289
26435,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-26,111933,1152358,72304,1896207,104,87,296
26434,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-19,119817,1128022,68254,1894305,103,93,286
26433,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-12,137904,1123382,75419,1937445,108,97,295


### Show important "net" position info

In [10]:
def non_comm_net(r):
    """Return non-commercial long minus short positions for row ``r`` as a float.

    ``r`` must expose ``Noncommercial_Positions_Long_All`` and
    ``Noncommercial_Positions_Short_All`` as attributes (e.g. a DataFrame row
    passed by ``apply(axis=1)``).
    """
    long_side = float(r.Noncommercial_Positions_Long_All)
    short_side = float(r.Noncommercial_Positions_Short_All)
    return long_side - short_side
def comm_net(r):
    """Return commercial long minus short positions for row ``r`` as a float.

    ``r`` must expose ``Commercial_Positions_Long_All`` and
    ``Commercial_Positions_Short_All`` as attributes.
    """
    long_side = float(r.Commercial_Positions_Long_All)
    short_side = float(r.Commercial_Positions_Short_All)
    return long_side - short_side
def non_report_net(r):
    """Return non-reportable long minus short positions for row ``r`` as a float.

    ``r`` must expose ``Nonreportable_Positions_Long_All`` and
    ``Nonreportable_Positions_Short_All`` as attributes.
    """
    long_side = float(r.Nonreportable_Positions_Long_All)
    short_side = float(r.Nonreportable_Positions_Short_All)
    return long_side - short_side
def traders_comm_net(r):
    """Return commercial trader count, long minus short, for row ``r`` as a float.

    ``r`` must expose ``Traders_Commercial_Long_All`` and
    ``Traders_Commercial_Short_All`` as attributes.
    """
    long_side = float(r.Traders_Commercial_Long_All)
    short_side = float(r.Traders_Commercial_Short_All)
    return long_side - short_side
def traders_noncomm_net(r):
    """Return non-commercial trader count, long minus short, for row ``r`` as a float.

    ``r`` must expose ``Traders_Noncommercial_Long_All`` and
    ``Traders_Noncommercial_Short_All`` as attributes.
    """
    long_side = float(r.Traders_Noncommercial_Long_All)
    short_side = float(r.Traders_Noncommercial_Short_All)
    return long_side - short_side

# Join the long and short views on market name and report date, then derive
# one "net" column per category using the row-wise helpers defined above.
merge_keys = ['Market_and_Exchange_Names', 'As_of_Date_in_Form_YYYY_MM_DD']
df_commod_net = df_commod_long.merge(df_commod_short, how='inner', on=merge_keys)

# Output column name -> helper that computes it from a single row.
net_builders = {
    'Noncommercial_Positions_Net_All': non_comm_net,
    'Commercial_Positions_Net_All': comm_net,
    'Nonreportable_Positions_Net_All': non_report_net,
    'Traders_Commercial_Net_All': traders_comm_net,
    'Traders_Noncommercial_Net_All': traders_noncomm_net,
}
for net_col, net_fn in net_builders.items():
    df_commod_net[net_col] = df_commod_net.apply(net_fn, axis=1)

# Keep only the keys and the derived columns, then show the newest five rows.
net_cols = merge_keys + list(net_builders)
df_commod_net = df_commod_net[net_cols]
df_commod_net.sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False).head(5)


Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Net_All,Commercial_Positions_Net_All,Nonreportable_Positions_Net_All,Traders_Commercial_Net_All,Traders_Noncommercial_Net_All
222,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-09,516662.0,-535482.0,18820.0,-13.0,54.0
221,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-04-02,481361.0,-487273.0,5912.0,-10.0,46.0
220,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-26,448619.0,-455418.0,6799.0,-13.0,50.0
219,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-19,414746.0,-426847.0,12101.0,-7.0,28.0
218,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2019-03-12,362265.0,-369503.0,7238.0,-7.0,29.0


## END

In [11]:
# Distinct market names that mention wheat (case-insensitive).
df_cot2.loc[
    df_cot2.Market_and_Exchange_Names.str.lower().str.contains('wheat'),
    'Market_and_Exchange_Names',
].unique()


array(['BLACK SEA WHEAT FINANCIAL - CHICAGO BOARD OF TRADE',
       'WHEAT - CHICAGO BOARD OF TRADE ',
       'WHEAT - KANSAS CITY BOARD OF TRADE ',
       'WHEAT - MINNEAPOLIS GRAIN EXCHANGE ',
       'WHEAT-HRSpring - MINNEAPOLIS GRAIN EXCHANGE',
       'WHEAT-HRSpring - MINNEAPOLIS GRAIN EXCHANGE ',
       'WHEAT-HRW - CHICAGO BOARD OF TRADE',
       'WHEAT-HRW - CHICAGO BOARD OF TRADE ',
       'WHEAT-SRW - CHICAGO BOARD OF TRADE',
       'WHEAT-SRW - CHICAGO BOARD OF TRADE '], dtype=object)

In [12]:
# Column names that contain the report-date field name.
[col for col in df_cot2.columns.values if 'As_of_Date_in_Form_YYYY_MM_DD' in col]

['As_of_Date_in_Form_YYYY_MM_DD']

In [13]:
# Rows whose market name contains the SRW wheat contract string, and their count.
df_wheat = df_cot2.loc[
    df_cot2.Market_and_Exchange_Names.str.contains('WHEAT-SRW - CHICAGO BOARD OF TRADE')
]
df_wheat.shape[0]

278

In [14]:
# Latest report date among rows whose market name contains the HRW wheat contract string.
df_cot2.loc[
    df_cot2.Market_and_Exchange_Names.str.contains('WHEAT-HRW - CHICAGO BOARD OF TRADE'),
    'As_of_Date_in_Form_YYYY_MM_DD',
].max()

Timestamp('2019-04-09 00:00:00')