## Build history of commodity COTs

1. Retrieve data from the CFTC website (www.cft.gov/files);
2. Extract data from the most important Commercial and Non Commercial long and short columns;
3. Create "net" columns for each important category;
4. Merge this data with the ETF history data created in the previous steps


In [1]:
import pandas as pd
import numpy as np
import os, sys
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import zipfile
import urllib.request
from PIL import Image
def str_to_date(d):
    try:
        dt = datetime.datetime.strptime(str(d),'%Y-%m-%d')
    except:
        return None
    return dt

# Make important folders
TEMP_FOLDER = './temp_folder'
try:
    os.mkdir(TEMP_FOLDER)
except:
    pass
try:
    os.mkdir(f'{TEMP_FOLDER}/cot')
except:
    pass
try:
    os.mkdir(f'{TEMP_FOLDER}/zip')
except:
    pass


___
## First, decide if you want to re-create the ETF and COT data, or just retrieve the previously saved data DataFrames.

In [2]:
CREATE_COT_DATA = True
cot_save_path = './cot_history.csv'

___
## Process CFTC COT Data
___

### Initial processing
1. Download zip files from www.cft.gov/files;
2. Unip the files using the zipfile package;
3. Read each csv (usually named Annual.TXT), and merge them into the df_cot DataFrame.


In [3]:
if CREATE_COT_DATA:
    year_list = np.linspace(2000,2019,20)
#     directory_to_extract_to = TEMP_FOLDER
    zip_download_folder = f'TEMP_FOLDER/zip'
#     annual_path = f'{zip_download_folder}/Annual.TXT'
    df_cot_temp = None
    df_cot = None
    for y in year_list:
        yint = int(y)
        url = f"https://www.cftc.gov/files/dea/history/deacot{yint}.zip"
        path_to_zip_file = f'{zip_download_folder}/dea_fut_xls_{y}.zip'
        print(f'retrieving cot zip file from {url}')
        try:
            urllib.request.urlretrieve(url, path_to_zip_file)    
        except:
            import time
            time.sleep(1)
            urllib.request.urlretrieve(url, path_to_zip_file)    
        zip_ref = zipfile.ZipFile(path_to_zip_file, 'r')
        zip_ref.extractall(zip_download_folder)
        zip_ref.close()
        df_cot_temp = pd.read_csv(f'{zip_download_folder}/Annual.TXT')
        if df_cot is None:
            df_cot = df_cot_temp.copy()
        else:
            df_cot = df_cot.append(df_cot_temp,ignore_index=True)
            df_cot.index = list(range(len(df_cot)))
        print(f'processed cot csv file from {url}. Length = {len(df_cot_temp)}')    
    df_cot.head()

retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2000.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2000.zip. Length = 3261
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2001.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2001.zip. Length = 2782
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2002.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2002.zip. Length = 2854
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2003.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2003.zip. Length = 3135
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2004.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2004.zip. Length = 3465
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2005.zip
processed cot csv


Columns (3,37,38,39,40,41,42,43,44,45,46,126) have mixed types. Specify dtype option on import or set low_memory=False.



processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2017.zip. Length = 12314
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2018.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2018.zip. Length = 13410
retrieving cot zip file from https://www.cftc.gov/files/dea/history/deacot2019.zip
processed cot csv file from https://www.cftc.gov/files/dea/history/deacot2019.zip. Length = 1741


___
### Make column names easier to process, make main date field a datetime object, and sort the DataFrame
___

In [4]:
if CREATE_COT_DATA:
    col_rename_dict = {c:c.replace(' ','_').replace('-','_').replace('(','').replace(')','') for c in df_cot.columns.values}
    df_cot2 = df_cot.rename(columns=col_rename_dict)
    df_cot2.As_of_Date_in_Form_YYYY_MM_DD = df_cot2.As_of_Date_in_Form_YYYY_MM_DD.apply(str_to_date)
    df_cot2 = df_cot2.sort_values(['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD'])
    df_cot2.columns.values
    df_cot2.to_csv(cot_save_path,index=False)

___
### Show important columns for a specific  commodity
___

In [5]:
df_cot2 = pd.read_csv(cot_save_path)
df_cot2.As_of_Date_in_Form_YYYY_MM_DD = df_cot2.As_of_Date_in_Form_YYYY_MM_DD.apply(str_to_date)
# commod = 'CRUDE OIL, LIGHT SWEET'
commod = 'GOLD'
cot_beg_date = datetime.datetime.now() - datetime.timedelta(2000)
df_commod = df_cot2[df_cot2.Market_and_Exchange_Names.str.contains(commod)][df_cot2.As_of_Date_in_Form_YYYY_MM_DD>=cot_beg_date]
basic_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Open_Interest_All']
long_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Long_All','Commercial_Positions_Long_All',
            'Nonreportable_Positions_Long_All','Traders_Commercial_Long_All',
             'Traders_Noncommercial_Long_All','Traders_Total_Reportable_Long_All']
short_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Short_All','Commercial_Positions_Short_All',
            'Nonreportable_Positions_Short_All','Total_Reportable_Positions_Short_All',
            'Traders_Commercial_Short_All','Traders_Noncommercial_Short_All',
            'Traders_Total_Reportable_Short_All']
df_commod_basic = df_commod[basic_cols]
df_commod_long = df_commod[long_cols]
df_commod_short = df_commod[short_cols]



Boolean Series key will be reindexed to match DataFrame index.



### Show basic open interest info

In [6]:
df_commod_basic.head()

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Open_Interest_All
41286,GOLD - COMMODITY EXCHANGE INC.,2015-01-06,394021
41287,GOLD - COMMODITY EXCHANGE INC.,2015-01-13,402108
41288,GOLD - COMMODITY EXCHANGE INC.,2015-01-20,430128
41289,GOLD - COMMODITY EXCHANGE INC.,2015-01-27,438279
41290,GOLD - COMMODITY EXCHANGE INC.,2015-02-03,419524


### Show important "long" position info

In [7]:
df_commod_long.head()

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Long_All,Commercial_Positions_Long_All,Nonreportable_Positions_Long_All,Traders_Commercial_Long_All,Traders_Noncommercial_Long_All,Traders_Total_Reportable_Long_All
41286,GOLD - COMMODITY EXCHANGE INC.,2015-01-06,187705,130427,34990,47,126,204
41287,GOLD - COMMODITY EXCHANGE INC.,2015-01-13,192959,129436,38984,51,130,214
41288,GOLD - COMMODITY EXCHANGE INC.,2015-01-20,223257,120946,43445,53,161,254
41289,GOLD - COMMODITY EXCHANGE INC.,2015-01-27,238407,117326,43731,49,170,247
41290,GOLD - COMMODITY EXCHANGE INC.,2015-02-03,229006,117174,44670,49,166,244


### Show important "short" position info

In [8]:
df_commod_short.head()

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Short_All,Commercial_Positions_Short_All,Nonreportable_Positions_Short_All,Total_Reportable_Positions_Short_All,Traders_Commercial_Short_All,Traders_Noncommercial_Short_All,Traders_Total_Reportable_Short_All
41286,GOLD - COMMODITY EXCHANGE INC.,2015-01-06,65527,253099,34496,359525,59,82,190
41287,GOLD - COMMODITY EXCHANGE INC.,2015-01-13,62733,267112,31534,370574,61,76,185
41288,GOLD - COMMODITY EXCHANGE INC.,2015-01-20,60802,298756,28090,402038,63,83,194
41289,GOLD - COMMODITY EXCHANGE INC.,2015-01-27,49482,323486,26496,411783,66,68,184
41290,GOLD - COMMODITY EXCHANGE INC.,2015-02-03,43991,320447,26412,393112,55,61,162


### Show important "net" position info

In [10]:
def non_comm_net(r):
    return float(r.Noncommercial_Positions_Long_All) - float(r.Noncommercial_Positions_Short_All)
def comm_net(r):
    return float(r.Commercial_Positions_Long_All) - float(r.Commercial_Positions_Short_All)
def non_report_net(r):
    return float(r.Nonreportable_Positions_Long_All) - float(r.Nonreportable_Positions_Short_All)
def traders_comm_net(r):
    return float(r.Traders_Commercial_Long_All) - float(r.Traders_Commercial_Short_All)
def traders_noncomm_net(r):
    return float(r.Traders_Noncommercial_Long_All) - float(r.Traders_Noncommercial_Short_All)

df_commod_net = df_commod_long.merge(df_commod_short,how='inner',on=['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD'])
df_commod_net['Noncommercial_Positions_Net_All'] = df_commod_net.apply(non_comm_net,axis=1)
df_commod_net['Commercial_Positions_Net_All'] = df_commod_net.apply(comm_net,axis=1)
df_commod_net['Nonreportable_Positions_Net_All'] = df_commod_net.apply(non_report_net,axis=1)
df_commod_net['Traders_Commercial_Net_All'] = df_commod_net.apply(traders_comm_net,axis=1)
df_commod_net['Traders_Noncommercial_Net_All'] = df_commod_net.apply(traders_noncomm_net,axis=1)
net_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Noncommercial_Positions_Net_All','Commercial_Positions_Net_All','Nonreportable_Positions_Net_All','Traders_Commercial_Net_All','Traders_Noncommercial_Net_All']
df_commod_net = df_commod_net[net_cols]
df_commod_net.head()


Unnamed: 0,Market_and_Exchange_Names,As_of_Date_in_Form_YYYY_MM_DD,Noncommercial_Positions_Net_All,Commercial_Positions_Net_All,Nonreportable_Positions_Net_All,Traders_Commercial_Net_All,Traders_Noncommercial_Net_All
0,GOLD - COMMODITY EXCHANGE INC.,2015-01-06,122178.0,-122672.0,494.0,-12.0,44.0
1,GOLD - COMMODITY EXCHANGE INC.,2015-01-13,130226.0,-137676.0,7450.0,-10.0,54.0
2,GOLD - COMMODITY EXCHANGE INC.,2015-01-20,162455.0,-177810.0,15355.0,-10.0,78.0
3,GOLD - COMMODITY EXCHANGE INC.,2015-01-27,188925.0,-206160.0,17235.0,-17.0,102.0
4,GOLD - COMMODITY EXCHANGE INC.,2015-02-03,185015.0,-203273.0,18258.0,-6.0,105.0


## END