___
# Merge csv files for ETF shares, CME Open Interest, and CFTC COT reports.
___

In [None]:
import pandas as pd
import numpy as np
import os, sys
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import zipfile
import urllib.request
from PIL import Image
import jupyter_utilities as ju
import importlib
import pandasql as psql

# Make important folders.
# os.makedirs(..., exist_ok=True) tolerates a folder that already exists but,
# unlike a bare `except: pass`, still surfaces real failures (for example a
# permission error) instead of hiding them until a later savefig call fails.
TEMP_FOLDER = './temp_folder'
os.makedirs(TEMP_FOLDER, exist_ok=True)
SAVE_IMAGE_FOLDER = f'{TEMP_FOLDER}/gold'
os.makedirs(SAVE_IMAGE_FOLDER, exist_ok=True)

def to_int(s):
    """Convert *s* to an int via ``float(str(s))``, truncating toward zero.

    Returns ``None`` (after printing a diagnostic line) when the value cannot
    be converted, e.g. non-numeric text, ``None``, NaN or infinity.
    """
    try:
        return int(float(str(s)))
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; OverflowError: +/- infinity.
        # Only conversion errors are caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(f'to_int exception on value:{s}')
        return None

# Raise pandas' per-column display width to 1000 characters.
pd.set_option('display.max_colwidth',1000)

# Input locations: a folder holding one CME open-interest csv per year, plus
# single csv files for the COT history and the ETF history.
cme_csv_save_folder = './cme_oi_data'
cot_data_path = './cot_history.csv'
etf_data_path = './etf_cap_hist.csv'

### Define the commodity and ETF identifiers used in the CSV files

In [None]:
# Values matched against the Product_Description column of the open-interest data.
OI_ID_GOLD = 'GOLD FUTURES'
OI_ID_SILVER = 'SILVER FUTURES'
OI_ID_10Y = '10Y NOTE FUTURE'
OI_ID_SPY = 'E-MINI S&P 500 FUTURE'

# Values matched against the Market_and_Exchange_Names column of the COT data.
COT_ID_GOLD = 'GOLD - COMMODITY EXCHANGE INC.'
COT_ID_SILVER = 'SILVER - COMMODITY EXCHANGE INC.'
COT_ID_CL = 'CRUDE OIL, LIGHT SWEET'
COT_ID_10Y = '10-YEAR U.S. TREASURY NOTES - CHICAGO BOARD OF TRADE'
COT_ID_SPY = 'E-MINI S&P 500 STOCK INDEX - CHICAGO MERCANTILE EXCHANGE'

# Values matched against the symbol column of the ETF data.
ETF_ID_GOLD = 'GLD'
ETF_ID_SILVER = 'SLV'
ETF_ID_10Y = 'AGG'
ETF_ID_SPY = 'SPY'

# ETF share counts are divided by these numbers before being plotted
# alongside the COT positions.
ETF_SHARES_DIVISOR_GOLD = 1000
ETF_SHARES_DIVISOR_SILVER = 5000
ETF_SHARES_DIVISOR_10Y = 1000
ETF_SHARES_DIVISOR_SPY = 500


# One entry per supported commodity, bundling the identifiers defined above.
ID_DICT = {
    'gold': dict(
        OI=OI_ID_GOLD,
        COT=COT_ID_GOLD,
        ETF=ETF_ID_GOLD,
        ETF_DIVISOR=ETF_SHARES_DIVISOR_GOLD,
    ),
    'silver': dict(
        OI=OI_ID_SILVER,
        COT=COT_ID_SILVER,
        ETF=ETF_ID_SILVER,
        ETF_DIVISOR=ETF_SHARES_DIVISOR_SILVER,
    ),
    '10Y': dict(
        OI=OI_ID_10Y,
        COT=COT_ID_10Y,
        ETF=ETF_ID_10Y,
        ETF_DIVISOR=ETF_SHARES_DIVISOR_10Y,
    ),
    'spy': dict(
        OI=OI_ID_SPY,
        COT=COT_ID_SPY,
        ETF=ETF_ID_SPY,
        ETF_DIVISOR=ETF_SHARES_DIVISOR_SPY,
    ),
}


In [None]:
# importlib.reload(ju)

___
### Get cme open interest, COT and ETF data from csv files
___

In [None]:
# --- CME open interest: one csv per year, stacked into a single frame ---
years = np.linspace(2013,2019,7,dtype=int)
oi_frames = []
for y in years:
    df_temp = pd.read_csv(f'{cme_csv_save_folder}/cme_open_interest_{y}.csv')
    # Keep only the rows that carry an open-interest value.
    df_temp = df_temp[~df_temp.Open_Interest.isnull()]
    oi_frames.append(df_temp)
# DataFrame.append was removed in pandas 2.0; a single concat is the supported
# replacement and avoids re-copying the accumulated frame on every iteration.
df_oi = pd.concat(oi_frames, ignore_index=True)
# to_int yields None for unparseable values; rows whose volume could not be
# parsed are dropped right after the conversion.
df_oi['Open_Interest'] = df_oi['Open_Interest'].apply(to_int)
df_oi['Total_Volume'] = df_oi['Total_Volume'].apply(to_int)
df_oi = df_oi[~df_oi.Total_Volume.isnull()]
print(f'oi length:{len(df_oi)}')

# --- ETF history ---
df_etf = pd.read_csv(etf_data_path)
# NOTE(review): ju.str_to_yyyymmdd presumably turns a date into a yyyymmdd
# value comparable with the open-interest trade_date column - confirm in ju.
df_etf['trade_date'] = df_etf.date.apply(ju.str_to_yyyymmdd)
print(f'etf length:{len(df_etf)}')

# --- CFTC COT history ---
df_cot2 = pd.read_csv(cot_data_path)
df_cot2['As_of_Date_in_Form_YYYY_MM_DD'] = df_cot2['As_of_Date_in_Form_YYYY_MM_DD'].apply(ju.str_to_date)
# Strip surrounding whitespace so exact matches against the COT_ID_* names work.
df_cot2['Market_and_Exchange_Names'] = df_cot2['Market_and_Exchange_Names'].str.strip()
print(f'cot length:{len(df_cot2)}')


### Find identifier strings for specific Open Interest and COT rows in their respective DataFrames
1. Enter values for oi_key_word, cot_key_word and etf_key_word below
2. Choose the product/market_and_exchange_name/symbol that has the highest open_interest or volume

In [None]:
# Choose which ID_DICT entry drives the rest of the notebook, then unpack its
# four identifiers into the module-level names used by the later cells.
commod_to_use = 'gold'
_selected_ids = ID_DICT[commod_to_use]
OI_ID, COT_ID, ETF_ID, ETF_DIVISOR = (
    _selected_ids[field] for field in ('OI', 'COT', 'ETF', 'ETF_DIVISOR')
)

In [None]:

# Lower-cased search keys; the next cell reuses them for exact matching.
oi_key_word = OI_ID.lower()
cot_key_word = COT_ID.lower()
etf_key_word = ETF_ID.lower()
print('use the results of the groupbys below to select the right ID for each of the  oi, cot and etf Dataframes')

# Open interest: every product description containing the key word,
# with its summed open interest.
print('oi list')
oi_candidates = [
    desc for desc in df_oi.Product_Description.unique()
    if oi_key_word in str(desc).lower()
]
oi_rows = df_oi.Product_Description.isin(oi_candidates)
df_oi_sub = df_oi.loc[oi_rows, ['Product_Description','Open_Interest']]
print(df_oi_sub.groupby('Product_Description',as_index=False).sum())
print()

# COT: every market/exchange name containing the key word.
print('cot list')
cot_candidates = [
    market for market in df_cot2.Market_and_Exchange_Names.unique()
    if cot_key_word in str(market).lower()
]
cot_rows = df_cot2.Market_and_Exchange_Names.isin(cot_candidates)
df_cot_sub = df_cot2.loc[cot_rows, ['Market_and_Exchange_Names','Open_Interest_All']]
print(df_cot_sub.groupby('Market_and_Exchange_Names',as_index=False).sum())
print()

# ETF: every symbol containing the key word.
print('etf list')
etf_candidates = [
    sym for sym in df_etf.symbol.unique()
    if etf_key_word in str(sym).lower()
]
etf_rows = df_etf.symbol.isin(etf_candidates)
df_etf_sub = df_etf.loc[etf_rows, ['symbol','shares']]
print(df_etf_sub.groupby('symbol',as_index=False).sum())


### After updating the values that make up ID_DICT, run this cell to confirm that each ID matches exactly one identifier in each of the oi, cot and etf DataFrames

In [None]:
print('Each groupby should only return one value below.')

# Exact (case-insensitive) match of each key word against its identifier column.
l = [s for s in df_oi.Product_Description.unique() if oi_key_word == str(s).lower()]
df_oi_sub = df_oi[df_oi.Product_Description.isin(l)][['Product_Description','Open_Interest']]
df_oi_gb = df_oi_sub.groupby('Product_Description',as_index=False).sum()

l = [s for s in df_cot2.Market_and_Exchange_Names.unique() if cot_key_word == str(s).lower()]
df_cot_sub = df_cot2[df_cot2.Market_and_Exchange_Names.isin(l)][['Market_and_Exchange_Names','Open_Interest_All']]
df_cot_gb = df_cot_sub.groupby('Market_and_Exchange_Names',as_index=False).sum()

l = [s for s in df_etf.symbol.unique() if etf_key_word == str(s).lower()]
df_etf_sub = df_etf[df_etf.symbol.isin(l)][['symbol','shares']]
df_etf_gb = df_etf_sub.groupby('symbol',as_index=False).sum()

# Each source must match exactly once. Summing the three counts and comparing
# with 3 would wrongly accept combinations such as 0 + 1 + 2.
id_checks = {
    'oi': len(df_oi_gb) == 1,
    'cot': len(df_cot_gb) == 1,
    'etf': len(df_etf_gb) == 1,
}
if all(id_checks.values()):
    print(f"all ID's for commodity: {commod_to_use} are OK")
else:
    print(f"!!!!! ALL ID's FOR COMMODITY: {commod_to_use} ARE NOT OK!!!!!!")
    for source_name, is_ok in id_checks.items():
        print(f'{source_name} is OK: {is_ok}')
    
          



### Merge Open Interest and ETF datasets

In [None]:
# Open-interest rows for the selected product. The explicit .copy() makes this
# an independent frame, so the column assignments below do not operate on a
# slice of df_oi (which triggers pandas' SettingWithCopyWarning).
df_oi_single = df_oi[df_oi.Product_Description == OI_ID][['trade_date','Open_Interest','Total_Volume']].copy()
df_oi_single['Open_Interest'] = df_oi_single['Open_Interest'].apply(to_int)
df_oi_single['Total_Volume'] = df_oi_single['Total_Volume'].astype(float).astype(int)
# ETF rows for the selected symbol.
df_etf_single = df_etf[df_etf.symbol==ETF_ID]

# COT rows for the selected market, split into long-side and short-side
# column sets that share the market-name and report-date key columns.
df_cot_single = df_cot2[df_cot2.Market_and_Exchange_Names==COT_ID]
basic_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Open_Interest_All']
long_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Long_All','Commercial_Positions_Long_All',
            'Nonreportable_Positions_Long_All','Traders_Commercial_Long_All',
             'Traders_Noncommercial_Long_All','Traders_Total_Reportable_Long_All']
short_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD',
            'Noncommercial_Positions_Short_All','Commercial_Positions_Short_All',
            'Nonreportable_Positions_Short_All','Total_Reportable_Positions_Short_All',
            'Traders_Commercial_Short_All','Traders_Noncommercial_Short_All',
            'Traders_Total_Reportable_Short_All']
df_commod_long = df_cot_single[long_cols]
df_commod_short = df_cot_single[short_cols]

def non_comm_net(r):
    """Return the non-commercial long positions minus short positions of row *r* as a float."""
    long_side = float(r.Noncommercial_Positions_Long_All)
    short_side = float(r.Noncommercial_Positions_Short_All)
    return long_side - short_side
def comm_net(r):
    """Return the commercial long positions minus short positions of row *r* as a float."""
    long_side = float(r.Commercial_Positions_Long_All)
    short_side = float(r.Commercial_Positions_Short_All)
    return long_side - short_side
def non_report_net(r):
    """Return the non-reportable long positions minus short positions of row *r* as a float."""
    long_side = float(r.Nonreportable_Positions_Long_All)
    short_side = float(r.Nonreportable_Positions_Short_All)
    return long_side - short_side
def traders_comm_net(r):
    """Return the commercial trader count on the long side minus the short side of row *r* as a float."""
    long_side = float(r.Traders_Commercial_Long_All)
    short_side = float(r.Traders_Commercial_Short_All)
    return long_side - short_side
def traders_noncomm_net(r):
    """Return the non-commercial trader count on the long side minus the short side of row *r* as a float."""
    long_side = float(r.Traders_Noncommercial_Long_All)
    short_side = float(r.Traders_Noncommercial_Short_All)
    return long_side - short_side

# Re-join the long and short column sets on market name + report date, then
# derive one net (long minus short) column per category with the helpers above.
df_commod_net = df_commod_long.merge(df_commod_short,how='inner',on=['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD'])
df_commod_net['Noncommercial_Positions_Net_All'] = df_commod_net.apply(non_comm_net,axis=1)
df_commod_net['Commercial_Positions_Net_All'] = df_commod_net.apply(comm_net,axis=1)
df_commod_net['Nonreportable_Positions_Net_All'] = df_commod_net.apply(non_report_net,axis=1)
df_commod_net['Traders_Commercial_Net_All'] = df_commod_net.apply(traders_comm_net,axis=1)
df_commod_net['Traders_Noncommercial_Net_All'] = df_commod_net.apply(traders_noncomm_net,axis=1)
net_cols = ['Market_and_Exchange_Names','As_of_Date_in_Form_YYYY_MM_DD','Noncommercial_Positions_Net_All','Commercial_Positions_Net_All','Nonreportable_Positions_Net_All','Traders_Commercial_Net_All','Traders_Noncommercial_Net_All']
# .copy() detaches the column subset from the merged frame so that adding
# cot_yyyymmdd below is not an assignment on a slice (SettingWithCopyWarning).
df_commod_net = df_commod_net[net_cols].copy()
df_commod_net['cot_yyyymmdd'] = df_commod_net.As_of_Date_in_Form_YYYY_MM_DD.apply(ju.str_to_yyyymmdd)
# Order by report date and renumber the rows 0..n-1.
df_commod_net = df_commod_net.sort_values('cot_yyyymmdd').reset_index(drop=True)

# Short names for the net columns (applied by the rename at the end of this block).
cols_to_change = {'Noncommercial_Positions_Net_All':'noncomm','Commercial_Positions_Net_All':'comm',
                  'Nonreportable_Positions_Net_All':'nonrep','Traders_Commercial_Net_All':'trade_com',
                 'Traders_Noncommercial_Net_All':'trade_noncomm'}

# Give every report row the date of the report that follows it. The final row
# has no successor, so it gets its own date plus 7 days.
final_report_day = df_commod_net.iloc[-1].cot_yyyymmdd
last_date = ju.str_to_date(str(final_report_day),sep='') + datetime.timedelta(days=7)
last_date_yyyymmdd = ju.str_to_yyyymmdd(last_date)
following_report_days = list(df_commod_net.iloc[1:].cot_yyyymmdd)
following_report_days.append(last_date_yyyymmdd)
df_commod_net['next_cot_yyyymmdd'] = following_report_days

df_commod_net = df_commod_net.rename(columns=cols_to_change)

# Daily ETF nav/shares joined with daily open interest, plus day-over-day
# percentage changes of each series.
df_etf_oi = df_etf_single[['trade_date','nav','shares']].merge(df_oi_single,how='inner',on='trade_date')
df_etf_oi['nav_diff'] = df_etf_oi.nav.pct_change()
df_etf_oi['share_diff'] = df_etf_oi.shares.pct_change()
df_etf_oi['oi_diff'] = df_etf_oi.Open_Interest.pct_change()

etf_oi_cols = list(df_etf_oi.columns.values)
cot_cols = list(cols_to_change.values()) + ['cot_yyyymmdd','next_cot_yyyymmdd']
etf_oi_cot_cols = etf_oi_cols + cot_cols

# Range join: attach to each trading day the COT report whose
# [cot_yyyymmdd, next_cot_yyyymmdd) interval contains it.
# Only the needed COT columns take part in the join. A later cell adds a
# trade_date column to df_commod_net; with `select *` over the full frame,
# re-running this cell afterwards would yield two trade_date columns.
df_cot_join = df_commod_net[cot_cols]
q = ("select * from df_etf_oi inner join df_cot_join "
     "on df_etf_oi.trade_date >= df_cot_join.cot_yyyymmdd "
     "and df_etf_oi.trade_date < df_cot_join.next_cot_yyyymmdd")
# Pass the two frames explicitly instead of exposing every name in scope.
df_etf_oi_cot = psql.sqldf(q, {'df_etf_oi': df_etf_oi, 'df_cot_join': df_cot_join})
df_etf_oi_cot = df_etf_oi_cot[etf_oi_cot_cols]

print(f'df_commod_net length:{len(df_commod_net)}')
print(f'df_etf_oi length:{len(df_etf_oi)}')
print(f'df_etf_oi_cot length:{len(df_etf_oi_cot)}')


In [None]:
# Show the last 10 rows of the merged ETF / open-interest / COT frame.
df_etf_oi_cot.tail(10)

In [None]:
# pl = 1
# pl_hist = []
# trade_hist = []
# last_n = 1000
# # df_b4 = df_etf_oi_3[-last_n:]
# df_b4 = df_etf_oi[-last_n:]
# print(len(df_b4))

# for i in df_b4.index:
#     r = df_b4.loc[i]
#     nav_roll = float(r.nav_roll)
#     oi_roll = float(r.oi_roll)
#     nav_diff = float(r.nav_diff)
#     if abs(nav_roll) < .002 or abs(oi_roll) < .002:
#         continue
#     if nav_roll * oi_roll >=0:
#         continue
#     sign = 1
#     if nav_roll < 0:
#         sign = -1
#     trade = nav_diff*sign
#     pl = (1+trade) * pl
#     trade_hist.append(trade)
#     pl_hist.append(pl)

In [None]:
# from scipy import stats
# pl_std = np.array(trade_hist).std()
# pl_geomean = stats.gmean(np.array(trade_hist)+1)-1
# pl_avg = np.array(trade_hist).mean()
# pl,pl_geomean,pl_std,pl_geomean/pl_std,(pl_geomean+1)**len(trade_hist),pl_avg/pl_std

In [None]:
# Show the last rows of the ETF history for the selected symbol.
df_etf_single.tail()

In [None]:
# ju.multi_plot(df_etf_single[['trade_date','shares']],'trade_date',dates_per_plot=100,figsize=(30,20),bar_plot=True)


In [None]:
# Bar-plot the ETF share count for (at most) the last 3000 rows and save the
# figure into the temp folder.
etf_share_history = df_etf_single[['trade_date','shares']].iloc[-3000:]
ax = ju.plot_pandas(
    etf_share_history,
    'trade_date',
    num_of_x_ticks=40,
    bar_plot=True,
    figsize=(16,10),
)
etf_shares_fig = ax.get_figure()
etf_shares_fig.savefig(f'{TEMP_FOLDER}/etf_shares.png')
ax.plot()

In [None]:
# Key the COT rows by trade_date so they can be joined to the ETF rows that
# fall on the same date.
df_commod_net['trade_date'] = df_commod_net.As_of_Date_in_Form_YYYY_MM_DD.apply(ju.str_to_yyyymmdd)
df_commod_net.index = list(range(len(df_commod_net)))

cot_side = df_commod_net[['trade_date','noncomm']].sort_values('trade_date')
etf_side = df_etf_single[['trade_date','nav','shares']]
df_cn = cot_side.merge(etf_side,how='inner',on='trade_date')
# Divide the share count by the per-commodity divisor before plotting it
# next to the other series.
df_cn['shares'] = df_cn['shares'] / ETF_DIVISOR
df_cn = df_cn[['trade_date','nav','shares','noncomm']]

ax = ju.plot_pandas(df_cn,'trade_date',bar_plot=False,figsize=(16,10))
cot_vs_etf_fig = ax.get_figure()
cot_vs_etf_fig.savefig(f'{TEMP_FOLDER}/cot_vs_etf.png')
ax.plot()

In [None]:
# Plot the non-commercial net position and the ETF nav/shares as two separate charts.
noncomm_view = df_cn[['trade_date','noncomm']]
ju.plot_pandas(noncomm_view,'trade_date',bar_plot=False,figsize=(16,5))
nav_shares_view = df_cn[['trade_date','nav','shares']]
ju.plot_pandas(nav_shares_view,'trade_date',bar_plot=False,figsize=(16,5))

In [None]:
from sklearn import preprocessing

# Rescale nav, shares and the non-commercial net position with a min-max
# scaler over the last `last_n_dates` rows so they can share one chart.
min_max_scaler = preprocessing.MinMaxScaler()
last_n_dates = 500
df_cn2 = df_cn.iloc[-last_n_dates:].reset_index(drop=True)


def _fit_and_scale(column):
    """Refit the shared scaler on one column and return the scaled values as a 1-D array."""
    as_column_vector = np.array(column).reshape(-1,1)
    return min_max_scaler.fit_transform(as_column_vector).reshape(-1)


nav_scaled = _fit_and_scale(df_cn2.nav)
shares_scaled = _fit_and_scale(df_cn2.shares)
non_commercial_scaled = _fit_and_scale(df_cn2.noncomm)
df_cn_scaled = pd.DataFrame({
    'trade_date': list(df_cn2.trade_date),
    'nav': nav_scaled,
    'shares': shares_scaled,
    'non_commercial': non_commercial_scaled,
})

ax = ju.plot_pandas(df_cn_scaled,'trade_date',bar_plot=False,figsize=(16,10))
# ax.get_figure().savefig(f'{TEMP_FOLDER}/cot_vs_etf.png')
ax.plot()

# Print the unscaled highs and lows of each series.
navh, navl = round(df_cn2.nav.max(),2), round(df_cn2.nav.min(),2)
sharesh, sharesl = round(df_cn2.shares.max(),2), round(df_cn2.shares.min(),2)
nch, ncl = round(df_cn2.noncomm.max(),2), round(df_cn2.noncomm.min(),2)
print(f'nav_high:{navh}, nav_low:{navl}, shares_high:{sharesh}, shares_low:{sharesl}, non_com_high:{nch}, non_com_low:{ncl}')
ax.get_figure().savefig(f'{TEMP_FOLDER}/cot_vs_etf_norm.png')


In [None]:
# Plot only the scaled nav and scaled non-commercial series against trade_date.
ax = ju.plot_pandas(df_cn_scaled[['trade_date','nav','non_commercial']],'trade_date',bar_plot=False,figsize=(16,10))

In [None]:
# List the column names of the merged frame.
df_etf_oi_cot.columns.values 

In [None]:
# Plot nav against the non-commercial net position for the last 1000 merged rows.
nav_and_noncomm = df_etf_oi_cot[['trade_date','nav','noncomm']]
df_etf_oi_cot_2 = nav_and_noncomm.iloc[-1000:]
ju.plot_pandas(df_etf_oi_cot_2,'trade_date',bar_plot=False,figsize=(16,10))

In [None]:
# Plot window, counted in rows back from the end of the data:
# db1 rows back is where the window starts, db2 rows back is where it ends.
# db2 == 0 means "through the most recent row".
db1 = 100
db2 = 0
# .copy() so that the rolling-mean columns are added to an independent frame
# rather than to a slice of df_etf_oi_cot (SettingWithCopyWarning).
df_etf_oi_cot_2 = df_etf_oi_cot[['trade_date','nav','noncomm']].copy()
df_etf_oi_cot_2['nav_10'] = df_etf_oi_cot_2.nav.rolling(10).mean()
df_etf_oi_cot_2['noncomm_10'] = df_etf_oi_cot_2.noncomm.rolling(10).mean()
df_etf_oi_cot_3 = df_etf_oi_cot_2[['trade_date','nav','noncomm_10','noncomm']]
# A positive db2 trims that many rows off the end. With db2 == 0 the stop
# must be None, because iloc[-db1:-0] would be empty. The earlier `db2>=0`
# test ignored every positive db2 and only sliced with it when it was
# negative. A non-positive db2 is now treated like 0.
window_stop = -db2 if db2 > 0 else None
ju.plot_pandas(df_etf_oi_cot_3.iloc[-db1:window_stop],'trade_date',bar_plot=False,figsize=(16,7))

___
### Do a multi-plot of all commodities for 1000 days
___

In [None]:
# Draw one plot per dict entry, using trade_date as the x column.
# NOTE(review): all three entries ('es1', 'es2', 'es3') currently hold the same
# frame, df_etf_oi_cot_2 - confirm whether per-commodity frames were intended.
ju.multi_df_plot(dict_df={'es1':df_etf_oi_cot_2,'es2':df_etf_oi_cot_2,'es3':df_etf_oi_cot_2},x_column='trade_date')