## Build history of commodity COTs

1. Retrieve data from the CFTC website (www.cftc.gov/files);
2. Extract data from the most important Commercial and Non Commercial long and short columns;
3. Create "net" columns for each important category;
4. Merge this data with the ETF history data created in the previous steps


In [None]:
import pandas as pd
import numpy as np
import os, sys,glob
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import zipfile
import urllib.request
from PIL import Image
def str_to_date(d):
    """Parse a ``YYYY-MM-DD`` value into a ``datetime.datetime``.

    ``d`` is passed through ``str()`` first, so any object is accepted.

    Returns:
        The parsed ``datetime.datetime``, or ``None`` when the text does not
        match the ``%Y-%m-%d`` format (e.g. ``None``, ``nan``, an empty string).
    """
    try:
        return datetime.datetime.strptime(str(d), '%Y-%m-%d')
    except ValueError:
        # Only a format mismatch means "no date"; anything else (for example
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        return None

# Make important folders
TEMP_FOLDER = './temp_folder'
# makedirs creates TEMP_FOLDER itself as the parent of each sub-folder.
# exist_ok=True keeps re-runs harmless when the folders already exist, while
# genuine failures (e.g. permission errors) are raised instead of being
# silently swallowed by a bare ``except: pass``.
os.makedirs(f'{TEMP_FOLDER}/cot', exist_ok=True)
os.makedirs(f'{TEMP_FOLDER}/zip', exist_ok=True)


___
## First, decide if you want to re-create the ETF and COT data, or just retrieve the previously saved DataFrames.

In [None]:
# When True, the cells below download the yearly CFTC zip archives and rebuild
# the cleaned COT CSV; when False, those cells are skipped and the previously
# saved CSV at cot_save_path is read back instead.
CREATE_COT_DATA = True
# CSV holding the cleaned, sorted COT history (written by the column-cleanup
# cell, re-read by the cell after it).
cot_save_path = './cot_new_history.csv'
# CSV that receives the merged long/short frame with the added
# "<type>_net" and "<type>_ratio" columns.
cot_net_save_path = './cot_net_new_history.csv'

### Main column names

In [None]:
# Identifying columns carried into every derived frame.
basic_cols = ['Market_and_Exchange_Names', 'As_of_Date_In_Form_YYMMDD', 'Open_Interest_All']

# Position columns, listed in the same order as ``summary_types`` below so the
# two lists can be paired positionally.
_long_position_cols = [
    'Prod_Merc_Positions_Long_All',
    'Swap_Positions_Long_All',
    'M_Money_Positions_Long_All',
    'Other_Rept_Positions_Long_All',
    'NonRept_Positions_Long_All',
    'Tot_Rept_Positions_Long_All',
]
_short_position_cols = [
    'Prod_Merc_Positions_Short_All',
    'Swap_Positions_Short_All',
    'M_Money_Positions_Short_All',
    'Other_Rept_Positions_Short_All',
    'NonRept_Positions_Short_All',
    'Tot_Rept_Positions_Short_All',
]
long_cols = basic_cols + _long_position_cols
short_cols = basic_cols + _short_position_cols

summary_types = ['prod', 'swap', 'monman', 'other', 'nonrep', 'totrep']
# Maps each summary type to its (long column, short column) pair. Pairing via
# zip avoids the index arithmetic (i + len(basic_cols)) that silently breaks
# if basic_cols ever changes length.
summary_cols_dict = dict(zip(summary_types, zip(_long_position_cols, _short_position_cols)))


In [None]:
# Display the summary-type -> (long column, short column) mapping.
summary_cols_dict

___
## Process CFTC COT Data
___

### Initial processing
1. Download zip files from www.cftc.gov/files;
2. Unzip the files using the zipfile package;
3. Read each extracted csv (f_year.txt for the `fut_disagg_txt_*` archives used below; the commented-out legacy code read Annual.TXT), and merge them into the df_cot DataFrame.


In [None]:
# Folder that holds the downloaded yearly zip archives and their extracted contents.
zip_download_folder = f'{TEMP_FOLDER}/zip'

In [None]:
# Scratch cell: show the glob pattern for CSV files inside the zip folder.
f'{zip_download_folder}/*.csv'

In [None]:
# Scratch cell: list everything currently present in the zip folder.
glob.glob(f'{zip_download_folder}/*')

In [None]:
if CREATE_COT_DATA:
    # Dry run: for every year since 2010, report whether a per-year CSV is
    # already present in the zip folder. Nothing is downloaded in this cell.
    last_year = datetime.datetime.now().year
    year_list = np.arange(2010, last_year + 1)
    zip_download_folder = f'{TEMP_FOLDER}/zip'
    # Reset the accumulators used by the download cell.
    df_cot_temp = None
    df_cot = None
    for y in year_list:
        if glob.glob(f'{zip_download_folder}/fut_disagg_txt_{y}.csv'):
            print(f'Already downloaded fut_disagg_txt_{y}.csv')
            continue
        print(f'About to download fut_disagg_txt_{y}.csv')


In [None]:
# Start from a defined value so the preview on the last line of this cell
# cannot fail when CREATE_COT_DATA is False or when no year was loaded.
df_cot = None
if CREATE_COT_DATA:
    import time  # only needed for the pause before the download retry

    last_year = datetime.datetime.now().year
    year_list = np.arange(2010, last_year + 1)
    zip_download_folder = f'{TEMP_FOLDER}/zip'
    df_cot_temp = None
    yearly_frames = []
    for y in year_list:
        yint = int(y)
        # An earlier version of this cell used .../history/deacot{yint}.zip
        # and read Annual.TXT instead.
        url = f'https://www.cftc.gov/files/dea/history/fut_disagg_txt_{yint}.zip'
        path_to_zip_file = f'{zip_download_folder}/fut_disagg_txt_{y}.zip'
        # NOTE(review): no cell in this notebook writes fut_disagg_txt_{y}.csv,
        # so this branch only fires if such a file was placed there by other
        # means; when it does fire, that year is skipped and NOT added to df_cot.
        if glob.glob(f'{zip_download_folder}/fut_disagg_txt_{y}.csv'):
            print(f'Already downloaded fut_disagg_txt_{y}.csv')
            continue
        # Always refresh the current year's archive; reuse older ones if present.
        if not os.path.isfile(path_to_zip_file) or y >= last_year:
            print(f'retrieving cot zip file from {url}')
            try:
                urllib.request.urlretrieve(url, path_to_zip_file)
            except Exception:
                # One retry after a short pause; a second failure propagates.
                # ``Exception`` (not a bare except) so Ctrl-C still interrupts.
                time.sleep(1)
                urllib.request.urlretrieve(url, path_to_zip_file)
        # The context manager closes the archive even if extraction raises.
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(zip_download_folder)
        # Each archive is extracted into the same folder and read right away
        # from f_year.txt, before the next year's extraction.
        df_cot_temp = pd.read_csv(f'{zip_download_folder}/f_year.txt')
        yearly_frames.append(df_cot_temp)
        print(f'processed cot csv file from {url}. Length = {len(df_cot_temp)}')

    if yearly_frames:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the accumulated frame on every iteration.
        df_cot = pd.concat(yearly_frames, ignore_index=True)

df_cot.head() if df_cot is not None else None

___
### Make column names easier to process, make main date field a datetime object, and sort the DataFrame
___

___
### Show important columns for a specific commodity
___

In [None]:
if CREATE_COT_DATA:
    def _yymmdd_to_datetime(raw):
        """Convert a ``YYMMDD`` value (int or str) into a ``datetime.datetime``.

        The value is left-padded to six digits first: if the column was parsed
        as an integer, leading zeros are lost (e.g. 090106 becomes 90106) and
        slicing the unpadded text would read the wrong year/month/day. A
        trailing ``.0`` from a float rendering is ignored. The century is
        always taken to be 2000.
        """
        digits = str(raw).split('.')[0].zfill(6)
        return datetime.datetime(2000 + int(digits[0:2]), int(digits[2:4]), int(digits[4:6]))

    # Normalise header text so every column can be used as an identifier:
    # trim, collapse double underscores, turn spaces/dashes into underscores
    # and drop parentheses (in that order).
    col_rename_dict = {
        c: c.strip().replace('__', '_').replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
        for c in df_cot.columns.values
    }
    df_cot2 = df_cot.rename(columns=col_rename_dict)
    df_cot2 = df_cot2.drop(columns=['Report_Date_as_MM_DD_YYYY'])
    df_cot2['Market_and_Exchange_Names'] = df_cot2['Market_and_Exchange_Names'].str.strip()
    df_cot2['As_of_Date_In_Form_YYMMDD'] = df_cot2['As_of_Date_In_Form_YYMMDD'].apply(_yymmdd_to_datetime)
    df_cot2 = df_cot2.sort_values(['Market_and_Exchange_Names', 'As_of_Date_In_Form_YYMMDD'])
    # Persist the cleaned history so later runs can skip the download.
    df_cot2.to_csv(cot_save_path, index=False)
    
    

In [None]:
# Reload the cleaned history from disk and turn the date column back into
# datetimes (values that do not parse become missing).
df_cot2 = pd.read_csv(cot_save_path)
df_cot2['As_of_Date_In_Form_YYMMDD'] = df_cot2['As_of_Date_In_Form_YYMMDD'].apply(str_to_date)

# Keep only reports from the last 2000 days; every market is retained
# (no per-commodity filter is applied here).
cot_beg_date = datetime.datetime.now() - datetime.timedelta(days=2000)
df_commod = df_cot2[df_cot2['As_of_Date_In_Form_YYMMDD'] >= cot_beg_date]

# Column-wise views used by the display and "net" cells below.
df_commod_basic, df_commod_long, df_commod_short = (
    df_commod[cols] for cols in (basic_cols, long_cols, short_cols)
)


### Show basic open interest info

In [None]:
# Market used by the example display cells below.
commod = 'CRUDE OIL, LIGHT SWEET - NEW YORK MERCANTILE EXCHANGE'
# List each distinct market name containing 'CRUDE' once (in order of first
# appearance) rather than once per report row; missing names are skipped.
[c for c in df_commod.Market_and_Exchange_Names.dropna().unique() if 'CRUDE' in c]

In [None]:
# Latest open-interest rows for ``commod``. The mask is built from the frame
# being filtered (not from df_cot2, whose index is a superset and forces
# pandas to reindex the mask with a warning), and the name is matched as a
# literal substring rather than as a regular expression.
df_commod_basic[df_commod_basic.Market_and_Exchange_Names.str.contains(commod, regex=False, na=False)].sort_values('As_of_Date_In_Form_YYMMDD', ascending=False).head()


### Show important "long" position info

In [None]:
# Latest "long" position rows for ``commod``. The mask is built from the frame
# being filtered (not from df_cot2, whose index is a superset and forces
# pandas to reindex the mask with a warning), and the name is matched as a
# literal substring rather than as a regular expression.
df_commod_long[df_commod_long.Market_and_Exchange_Names.str.contains(commod, regex=False, na=False)].sort_values('As_of_Date_In_Form_YYMMDD', ascending=False).head()

### Show important "short" position info

In [None]:
# Latest "short" position rows for ``commod``. The mask is built from the frame
# being filtered (not from df_cot2, whose index is a superset and forces
# pandas to reindex the mask with a warning), and the name is matched as a
# literal substring rather than as a regular expression.
df_commod_short[df_commod_short.Market_and_Exchange_Names.str.contains(commod, regex=False, na=False)].sort_values('As_of_Date_In_Form_YYMMDD', ascending=False).head()

### CREATE important "net" position info for other analysis notebooks

In [None]:
# Preview the first rows of the long-position frame.
df_commod_long.head()

In [None]:
# Preview the first rows of the long-position frame (same output as the cell above).
df_commod_long.head()

In [None]:
# Preview the first rows of the short-position frame.
df_commod_short.head()

In [None]:
# Pair every long row with the short row sharing the same identifying columns,
# then print the merged/long/short row counts side by side as a sanity check.
df_commod_net = pd.merge(df_commod_long, df_commod_short, how='inner', on=basic_cols)
print(*(len(frame) for frame in (df_commod_net, df_commod_long, df_commod_short)))


In [None]:
# Join long and short positions on the identifying columns and report the
# merged/long/short row counts.
df_commod_net = df_commod_long.merge(df_commod_short, how='inner', on=basic_cols)
print(len(df_commod_net), len(df_commod_long), len(df_commod_short))

# Keep only rows in which every position column contains at least one digit.
# The per-column checks are combined into one mask so the frame is filtered
# once instead of once per column.
net_cols = [c1 for c1 in long_cols + short_cols if c1 not in basic_cols]
has_digits = pd.Series(True, index=df_commod_net.index)
for c2 in net_cols:
    has_digits &= df_commod_net[c2].astype(str).str.contains(r'[0-9]')
# .copy() gives an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning on a filtered selection.
df_commod_net = df_commod_net[has_digits].copy()
print(len(df_commod_net))

# Add "<type>_net" (long - short) and "<type>_ratio" (long / short) columns.
for p, (lc, sc) in summary_cols_dict.items():
    long_values = df_commod_net[lc].astype(float)
    short_values = df_commod_net[sc].astype(float)
    df_commod_net[p + '_net'] = long_values - short_values
    # Float division: a zero short position yields inf (NaN when long is 0 too).
    df_commod_net[p + '_ratio'] = long_values / short_values
print(len(df_commod_net))

sort_cols = ['Market_and_Exchange_Names', 'As_of_Date_In_Form_YYMMDD']
df_commod_net = df_commod_net.sort_values(sort_cols)


In [None]:
# Ratio columns shown in the preview below.
ratio_cols = ['prod_ratio','monman_ratio','swap_ratio','other_ratio','nonrep_ratio']
# Most recent ratio rows for ``commod``: select rows and columns in one step,
# then order newest-first.
commod_rows = df_commod_net.Market_and_Exchange_Names.str.contains(commod)
df_commod_net.loc[commod_rows, sort_cols + ratio_cols].sort_values(sort_cols, ascending=False).head()



In [None]:
# Columns of the net frame that contain at least one missing value.
nan_cols = df_commod_net.columns[df_commod_net.isna().any()].tolist()
if nan_cols:
    print(nan_cols)
    # Show the rows with a missing monman_ratio BEFORE filling; printing after
    # fillna(0) would always produce an empty frame.
    print(df_commod_net[df_commod_net.monman_ratio.isnull()][ratio_cols])
    df_commod_net = df_commod_net.fillna(0)

In [None]:
# List the column names of the net frame.
df_commod_net.columns.values

In [None]:
# Report the row count, then write the net frame to cot_net_save_path without
# the index column.
record_count = len(df_commod_net)
print(f'saving {record_count} records')
df_commod_net.to_csv(cot_net_save_path, index=False)

## END

In [None]:
# Count the net rows for COMEX silver. regex=False matches the name literally;
# as a regular expression the trailing '.' would match any character.
len(df_commod_net[df_commod_net.Market_and_Exchange_Names.str.contains('SILVER - COMMODITY EXCHANGE INC.', regex=False)])

In [None]:
# Distinct market names whose lower-cased text contains '10'.
market_names = df_cot2.Market_and_Exchange_Names
market_names[market_names.str.lower().str.contains('10')].unique()


In [None]:
# Column names of df_cot2 that contain the date-field name.
[col for col in df_cot2.columns.values if 'As_of_Date_In_Form_YYMMDD' in col]

In [None]:
# Rows for the CBOT soft red winter wheat market, followed by their count.
wheat_rows = df_cot2.Market_and_Exchange_Names.str.contains('WHEAT-SRW - CHICAGO BOARD OF TRADE')
df_wheat = df_cot2.loc[wheat_rows]
len(df_wheat)
# np.sort(df_wheat.As_of_Date_In_Form_YYMMDD.unique())

In [None]:
# Most recent report date available for the 'WHEAT-HRW - CHICAGO BOARD OF TRADE' market.
hrw_rows = df_cot2.Market_and_Exchange_Names.str.contains('WHEAT-HRW - CHICAGO BOARD OF TRADE')
df_cot2.loc[hrw_rows, 'As_of_Date_In_Form_YYMMDD'].max()
