## Build history of commodity COTs

1. Retrieve data from the CFTC website (www.cftc.gov/files);
2. Extract data from the most important Commercial and Non Commercial long and short columns;
3. Create "net" columns for each important category;
4. Merge this data with the ETF history data created in the previous steps


In [1]:
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown in
# the cell outputs below.
# NOTE(review): this also hides deprecation warnings from pandas/numpy, so API
# removals will surface as hard errors rather than early warnings.
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import os, sys
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
# import plotly.plotly as py
import plotly.graph_objs as go
import zipfile
import urllib.request
from PIL import Image
def str_to_date(d):
    """Parse a ``YYYY-MM-DD`` value into a ``datetime.datetime``.

    Parameters
    ----------
    d : any
        Value to parse. It is converted with ``str()`` first, so strings and
        ``datetime.date`` objects both work.

    Returns
    -------
    datetime.datetime or None
        The parsed timestamp (midnight of that day), or ``None`` when the
        text does not match the ``%Y-%m-%d`` format (e.g. ``None``, NaN,
        empty strings).
    """
    try:
        return datetime.datetime.strptime(str(d), '%Y-%m-%d')
    except (TypeError, ValueError):
        # Only parsing failures mean "no date". A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit, making a long ``apply`` over
        # the whole frame impossible to interrupt.
        return None

# Make important folders
TEMP_FOLDER = './temp_folder'
# os.makedirs creates TEMP_FOLDER itself when it is missing, and exist_ok=True
# makes re-running this cell a no-op. Unlike the previous ``try/except: pass``
# blocks, genuine failures (e.g. no write permission) are no longer hidden.
os.makedirs(f'{TEMP_FOLDER}/cot', exist_ok=True)
os.makedirs(f'{TEMP_FOLDER}/zip', exist_ok=True)


___
## First, decide if you want to re-create the ETF and COT data, or just retrieve the previously saved data DataFrames.

In [3]:
# CSV file the cleaned COT history is written to and later read back from.
cot_save_path = './cot_history.csv'

# True  -> download the yearly CFTC archives and rebuild the history.
# False -> skip the download and rebuild cells guarded by this flag.
CREATE_COT_DATA = True

___
## Process CFTC COT Data
___

### Initial processing
1. Download zip files from www.cftc.gov/files;
2. Unzip the files using the zipfile package;
3. Read each csv (usually named Annual.TXT), and merge them into the df_cot DataFrame.


In [4]:
if CREATE_COT_DATA:
    import time  # only needed for the pause between download attempts

    first_year = 2000
    last_year = datetime.datetime.now().year
    # Integer years: the previous np.linspace call produced floats, which
    # leaked into the cache file names as 'dea_fut_xls_2000.0.zip'.
    year_list = list(range(first_year, last_year + 1))

    # Bug fix: the original f-string had no braces, so it pointed at a literal
    # 'TEMP_FOLDER/zip' directory instead of the zip folder under TEMP_FOLDER.
    zip_download_folder = f'{TEMP_FOLDER}/zip'
    os.makedirs(zip_download_folder, exist_ok=True)

    yearly_frames = []   # one DataFrame per successfully processed year
    df_cot_temp = None
    for y in year_list:
        url = f"https://www.cftc.gov/files/dea/history/deacot{y}.zip"
        path_to_zip_file = f'{zip_download_folder}/dea_fut_xls_{y}.zip'

        # Archives already on disk are reused; the current year is always
        # downloaded again.
        if not os.path.isfile(path_to_zip_file) or y >= last_year:
            print(f'retrieving cot zip file from {url}')
            download_error = None
            for attempt in range(2):
                try:
                    urllib.request.urlretrieve(url, path_to_zip_file)
                    download_error = None
                    break
                except Exception as e:
                    # Best effort: remember the failure, wait a second before
                    # the single retry, and give up on this year afterwards.
                    download_error = e
                    if attempt == 0:
                        time.sleep(1)
            if download_error is not None:
                print(f'{url} {download_error}')
                continue

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(zip_download_folder)
        # NOTE(review): assumes every archive contains a member named
        # Annual.TXT; if one does not, the file left over from the previous
        # year would be read again - confirm against the CFTC archives.
        df_cot_temp = pd.read_csv(f'{zip_download_folder}/Annual.TXT')
        yearly_frames.append(df_cot_temp)
        print(f'processed cot csv file from {url}. Length = {len(df_cot_temp)}')

    if not yearly_frames:
        raise RuntimeError('No COT archives could be downloaded or read; df_cot was not built.')

    # A single concat replaces the per-year DataFrame.append calls, which were
    # quadratic and no longer exist in pandas 2.x. ignore_index=True yields
    # the same 0..n-1 index as before.
    df_cot = pd.concat(yearly_frames, ignore_index=True)

processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2000.zip. Length = 3261
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2001.zip. Length = 2782
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2002.zip. Length = 2854
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2003.zip. Length = 3135
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2004.zip. Length = 3465
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2005.zip. Length = 3909
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2006.zip. Length = 4926
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2007.zip. Length = 4926
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2008.zip. Length = 5631
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2009.zip. Length = 6570
processed cot csv file from ht

___
### Make column names easier to process, make main date field a datetime object, and sort the DataFrame
___

In [5]:
if CREATE_COT_DATA:
    # Spaces and dashes become underscores, parentheses are dropped, so every
    # column name can be used as a plain attribute / identifier.
    char_map = str.maketrans({' ': '_', '-': '_', '(': None, ')': None})
    col_rename_dict = {}
    for raw_name in df_cot.columns.values:
        col_rename_dict[raw_name] = raw_name.translate(char_map)
    df_cot2 = df_cot.rename(columns=col_rename_dict)

    # Turn the report date into datetime objects (unparseable values -> None).
    date_col = 'As_of_Date_in_Form_YYYY_MM_DD'
    df_cot2[date_col] = df_cot2[date_col].apply(str_to_date)

    # Order by market first, then chronologically, and cache the result.
    df_cot2 = df_cot2.sort_values(['Market_and_Exchange_Names', date_col])
    df_cot2.to_csv(cot_save_path, index=False)

___
### Show important columns for a specific  commodity
___

In [6]:
# Number of rows in the cleaned COT history.
df_cot2.shape[0]

156536

In [7]:
# Reload the history from the CSV cache and restore the date column, which
# comes back from read_csv as plain text.
df_cot2 = pd.read_csv(cot_save_path)
df_cot2['As_of_Date_in_Form_YYYY_MM_DD'] = df_cot2['As_of_Date_in_Form_YYYY_MM_DD'].apply(str_to_date)

commod = 'CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE EXCHANGE'
# commod = 'GOLD'

# Only keep reports from roughly the last 2000 days.
cot_beg_date = datetime.datetime.now() - datetime.timedelta(days=2000)

# Build both conditions on the full frame and combine them with `&`.
# The previous df[mask_a][mask_b] form applied a full-length mask to an
# already-filtered frame and relied on pandas silently reindexing it.
# regex=False: the market name is a literal substring, not a pattern.
# na=False: rows with a missing market name simply do not match.
is_commod = df_cot2['Market_and_Exchange_Names'].str.contains(commod, regex=False, na=False)
is_recent = df_cot2['As_of_Date_in_Form_YYYY_MM_DD'] >= cot_beg_date
df_commod = df_cot2[is_commod & is_recent]

# Column groups used by the display cells below.
basic_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Open_Interest_All']
long_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Long_All','Commercial_Positions_Long_All',
            'Nonreportable_Positions_Long_All','Traders_Commercial_Long_All',
             'Traders_Noncommercial_Long_All','Traders_Total_Reportable_Long_All']
short_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Short_All','Commercial_Positions_Short_All',
            'Nonreportable_Positions_Short_All','Total_Reportable_Positions_Short_All',
            'Traders_Commercial_Short_All','Traders_Noncommercial_Short_All',
            'Traders_Total_Reportable_Short_All']
df_commod_basic = df_commod[basic_cols]
df_commod_long = df_commod[long_cols]
df_commod_short = df_commod[short_cols]


### Show basic open interest info

In [8]:
# Five most recent reports, newest first.
df_commod_basic.sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False).iloc[:5]

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Open_Interest_All
30065,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-28,1992556
30064,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-21,1956768
30063,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-14,2019705
30062,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-07,1972049
30061,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-06-30,2008618


### Show important "long" position info

In [9]:
# Five most recent reports of the long-side columns, newest first.
df_commod_long.sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False).iloc[:5]

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Long_All,Commercial_Positions_Long_All,Nonreportable_Positions_Long_All,Traders_Commercial_Long_All,Traders_Noncommercial_Long_All,Traders_Total_Reportable_Long_All
30065,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-28,676822,639196,91511,96,113,295
30064,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-21,687910,613323,89870,95,119,293
30063,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-14,692258,662645,86950,98,114,301
30062,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-07,686543,630952,91001,96,116,295
30061,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-06-30,704393,638159,91667,100,116,299


### Show important "short" position info

In [10]:
# Five most recent reports of the short-side columns, newest first.
df_commod_short.sort_values(by='As_of_Date_in_Form_YYYY_MM_DD', ascending=False).iloc[:5]

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Short_All,Commercial_Positions_Short_All,Nonreportable_Positions_Short_All,Total_Reportable_Positions_Short_All,Traders_Commercial_Short_All,Traders_Noncommercial_Short_All,Traders_Total_Reportable_Short_All
30065,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-28,144253,1203653,59623,1932933,106,95,287
30064,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-21,139034,1191979,60090,1896678,109,85,282
30063,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-14,152507,1223438,65908,1953797,106,106,293
30062,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-07,151226,1193278,63992,1908057,109,96,293
30061,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-06-30,160567,1208544,65108,1943510,108,99,289


### Show important "net" position info

In [11]:
def non_comm_net(r):
    """Return the non-commercial net position of row ``r`` (long minus short) as a float."""
    long_side = float(r.Noncommercial_Positions_Long_All)
    short_side = float(r.Noncommercial_Positions_Short_All)
    return long_side - short_side
def comm_net(r):
    """Return the commercial net position of row ``r`` (long minus short) as a float."""
    long_side = float(r.Commercial_Positions_Long_All)
    short_side = float(r.Commercial_Positions_Short_All)
    return long_side - short_side
def non_report_net(r):
    """Return the non-reportable net position of row ``r`` (long minus short) as a float."""
    long_side = float(r.Nonreportable_Positions_Long_All)
    short_side = float(r.Nonreportable_Positions_Short_All)
    return long_side - short_side
def traders_comm_net(r):
    """Return the net count of commercial traders in row ``r`` (long minus short) as a float."""
    long_side = float(r.Traders_Commercial_Long_All)
    short_side = float(r.Traders_Commercial_Short_All)
    return long_side - short_side
def traders_noncomm_net(r):
    """Return the net count of non-commercial traders in row ``r`` (long minus short) as a float."""
    long_side = float(r.Traders_Noncommercial_Long_All)
    short_side = float(r.Traders_Noncommercial_Short_All)
    return long_side - short_side

# Put the long and short columns of each report side by side.
df_commod_net = df_commod_long.merge(df_commod_short,how='inner',on=['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD'])

# net column -> (long column, short column) it is derived from
net_sources = {
    'Noncommercial_Positions_Net_All': ('Noncommercial_Positions_Long_All', 'Noncommercial_Positions_Short_All'),
    'Commercial_Positions_Net_All': ('Commercial_Positions_Long_All', 'Commercial_Positions_Short_All'),
    'Nonreportable_Positions_Net_All': ('Nonreportable_Positions_Long_All', 'Nonreportable_Positions_Short_All'),
    'Traders_Commercial_Net_All': ('Traders_Commercial_Long_All', 'Traders_Commercial_Short_All'),
    'Traders_Noncommercial_Net_All': ('Traders_Noncommercial_Long_All', 'Traders_Noncommercial_Short_All'),
}
# Whole-column arithmetic gives the same float "long minus short" values as
# the five row-by-row apply(axis=1) passes it replaces, without calling a
# Python function per row. It also yields an empty column when no rows
# matched the commodity filter; row-wise apply on an empty frame does not
# return a Series that can be assigned to a single column.
for net_name, (long_name, short_name) in net_sources.items():
    df_commod_net[net_name] = df_commod_net[long_name].astype(float) - df_commod_net[short_name].astype(float)

net_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Noncommercial_Positions_Net_All','Commercial_Positions_Net_All','Nonreportable_Positions_Net_All','Traders_Commercial_Net_All','Traders_Noncommercial_Net_All']
df_commod_net = df_commod_net[net_cols]
# Five most recent reports, newest first.
df_commod_net.sort_values('As_of_Date_in_Form_YYYY_MM_DD',ascending=False).head()


Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Net_All,Commercial_Positions_Net_All,Nonreportable_Positions_Net_All,Traders_Commercial_Net_All,Traders_Noncommercial_Net_All
282,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-28,532569.0,-564457.0,31888.0,-10.0,18.0
281,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-21,548876.0,-578656.0,29780.0,-14.0,34.0
280,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-14,539751.0,-560793.0,21042.0,-8.0,8.0
279,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-07-07,535317.0,-562326.0,27009.0,-13.0,20.0
278,"CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE E...",2020-06-30,543826.0,-570385.0,26559.0,-8.0,17.0


## END

In [12]:
# Distinct market names that mention "bond" (case-insensitive).
df_cot2.loc[
    df_cot2['Market_and_Exchange_Names'].str.lower().str.contains('bond'),
    'Market_and_Exchange_Names',
].unique()


array(['LONG-TERM U.S. TREASURY BONDS - CHICAGO BOARD OF TRADE',
       'LONG-TERM U.S. TREASURY BONDS - CHICAGO BOARD OF TRADE ',
       'MUNICIPAL BOND INDEX - CHICAGO BOARD OF TRADE ',
       'U.S. TREASURY BONDS - CHICAGO BOARD OF TRADE',
       'U.S. TREASURY BONDS - CHICAGO BOARD OF TRADE ',
       'U.S. TREASURY BONDS - MIDAMERICA COMMODITY EXCHANGE ',
       'ULTRA U.S. TREASURY BONDS - CHICAGO BOARD OF TRADE'], dtype=object)

In [13]:
# Column names containing the report-date label.
[col for col in df_cot2.columns.values if 'As_of_Date_in_Form_YYYY_MM_DD' in col]

['As_of_Date_in_Form_YYYY_MM_DD']

In [14]:
# Rows whose market name matches the Chicago SRW wheat contract, then their count.
df_wheat = df_cot2.loc[
    df_cot2['Market_and_Exchange_Names'].str.contains('WHEAT-SRW - CHICAGO BOARD OF TRADE')
]
df_wheat.shape[0]

344

In [15]:
# Most recent report date among rows matching the Chicago HRW wheat contract.
df_cot2.loc[
    df_cot2['Market_and_Exchange_Names'].str.contains('WHEAT-HRW - CHICAGO BOARD OF TRADE'),
    'As_of_Date_in_Form_YYYY_MM_DD',
].max()

Timestamp('2020-07-28 00:00:00')