## Load downloaded zip futures files to Postgres DB

In [None]:
import zipfile
import glob
import pandas as pd
import numpy as np

from argparse import ArgumentParser
from argparse import RawDescriptionHelpFormatter
import sys
import os
if  not './' in sys.path:
    sys.path.append('./')
if  not '../' in sys.path:
    sys.path.append('../')

from barchartacs import build_db
from barchartacs import db_info
import datetime
import io
from tqdm import tqdm,tqdm_notebook
from barchartacs import pg_pandas as pg
import importlib
import pathlib
HOME_FOLDER = pathlib.Path.home()

# Month number (1-12) -> lowercase three-letter month abbreviation, kept in
# calendar order so the values can be used to sort monthly files.
MMM_LIST = dict(
    enumerate(
        (
            'jan', 'feb', 'mar', 'apr', 'may', 'jun',
            'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        ),
        start=1,
    )
)



### Set Important constants

In [None]:
# Postgres user handed to psql with -U; None or '' omits the -U option.
DB_USER_NAME = None
# NOTE(review): DB_PASSWORD is not referenced anywhere else in this notebook.
DB_PASSWORD = None
# NOTE(review): DELETE_ALL is not referenced anywhere else in this notebook.
DELETE_ALL = False # set to True if you want to delete all data in postgres db for UNDERLYING_TABLE_NAME
# When False, psql_copy() only prints the psql command instead of running it.
WRITE_DATA=False # set to True if you want to copy new data to postgres using psql copy
# Parent folder of the downloads; futures files are looked up in '<parent>/futures'.
ZIP_FOLDER_PARENT = './temp_folder/zip_files' #os.path.join(HOME_FOLDER, 'barchart_downloads/barchart')
# First and last year to load (both inclusive), written the way the year
# appears at the end of the file names ('*<mon><yy>.txt' / '*<mon><yy>.zip').
BEGIN_YY = 21
END_YY = 21
# Month abbreviations (values of MMM_LIST) whose files should be loaded.
MONTHS_TO_INCLUDE = ['feb','mar','apr','may','jun','jul'] #MMM_LIST.values() #['jul','aug']
# Commodity codes to keep; compared against the first two characters of the
# contract column of each file.
COMMODS_TO_INCLUDE = ['SB'] #['HO','RB','CL','NG','ES']

In [None]:
# Display all month abbreviations defined in MMM_LIST.
list(MMM_LIST.values())

### Create variables derived from the constants above

In [None]:
# Years to load, BEGIN_YY through END_YY inclusive.
YEARS_TO_INCLUDE = list(np.arange(BEGIN_YY,END_YY+1))
# Database coordinates of the table that receives the futures data.
DB_NAME = 'sec_db'
SCHEMA_NAME = 'sec_schema'
UNDERLYING_TABLE_NAME = 'underlying_table'
FULL_TABLE_NAME = f'{SCHEMA_NAME}.{UNDERLYING_TABLE_NAME}'
# Temporary CSV that is written from the final DataFrame and then copied to Postgres.
CSV_TEMP_PATH = './temp_folder/df_all_temp.csv'
# Where the downloaded futures files live and where zips get extracted to.
FUTURES_ZIP_FOLDER = f'{ZIP_FOLDER_PARENT}/futures'
FUTURES_UNZIP_FOLDER = './temp_folder/unzipfolder_futures'
if not os.path.isdir(FUTURES_UNZIP_FOLDER):
    print(f'futures unzip folder {FUTURES_UNZIP_FOLDER} being created')
    # makedirs also creates a missing './temp_folder' parent, which a plain
    # os.mkdir would fail on.
    os.makedirs(FUTURES_UNZIP_FOLDER, exist_ok=True)
else:
    print(f'futures unzip folder {FUTURES_UNZIP_FOLDER} already created')


### Unzip futures files

In [None]:
# Build the chronologically ordered list of monthly futures .txt files.
# For every requested year/month an existing .txt in FUTURES_ZIP_FOLDER is
# used as-is; otherwise the matching .zip is extracted into
# FUTURES_UNZIP_FOLDER and the extracted .txt is used.
# NOTE(review): the "already extracted" check looks in FUTURES_ZIP_FOLDER, not
# FUTURES_UNZIP_FOLDER, so zips are re-extracted on every run - confirm intent.
all_names_ordered = []
for yy in YEARS_TO_INCLUDE:
    # Start from an empty list for every year so an empty MONTHS_TO_INCLUDE
    # neither raises a NameError nor reuses the previous year's files.
    fnames = []
    for mm in MONTHS_TO_INCLUDE:
        txt_file_list = glob.glob(f'{FUTURES_ZIP_FOLDER}/*{mm}{yy}.txt')
        if txt_file_list:
            fnames.append(txt_file_list[0])
            continue
        zip_file_list = glob.glob(f'{FUTURES_ZIP_FOLDER}/*{mm}{yy}.zip')
        if not zip_file_list:
            raise FileNotFoundError(
                f'no .txt or .zip file for {mm}{yy} found in {FUTURES_ZIP_FOLDER}'
            )
        path_to_zip_file = zip_file_list[0]
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(FUTURES_UNZIP_FOLDER)
        extracted_list = glob.glob(f'{FUTURES_UNZIP_FOLDER}/*{mm}{yy}.txt')
        if not extracted_list:
            raise FileNotFoundError(
                f'{path_to_zip_file} did not contain a file matching *{mm}{yy}.txt'
            )
        txt_file = extracted_list[0]
        fnames.append(txt_file)

    # Key each file by the three-letter month that precedes the two-character
    # year at the end of its name, then emit the files in calendar order.
    d = {}
    for fname in fnames:
        stem = os.path.splitext(os.path.basename(fname))[0]
        mmm = stem[:-2][-3:]
        d[mmm] = fname
    fnames_ordered = [d[mmm] for mmm in MMM_LIST.values() if mmm in d]
    all_names_ordered += fnames_ordered
all_names_ordered

# Read every monthly file, keep only the commodities in COMMODS_TO_INCLUDE and
# combine the pieces into df_all (with a fresh 0..n-1 index).
# The files have no header row; columns are named in this order.
header = ['contract','month_year','yymmdd','open','high','low','close','volume','open_interest']

frames = []
for fname in tqdm_notebook(all_names_ordered):
    df_temp = pd.read_csv(fname,header=None)
    df_temp.columns = header
    # The commodity code is the first two characters of the contract column.
    df_temp['commod'] = df_temp.contract.str.slice(0,2)
    frames.append(df_temp[df_temp.commod.isin(COMMODS_TO_INCLUDE)])

if frames:
    # One concat instead of DataFrame.append per file: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    df_all = pd.concat(frames,ignore_index=True)
else:
    # No input files: keep an empty frame with the expected columns so the
    # following cells do not fail on None.
    df_all = pd.DataFrame(columns=header + ['commod'])
    

In [None]:
# Number of raw rows loaded for the selected commodities.
len(df_all)

In [None]:
# Clean the combined raw rows and reshape them into df_final, whose columns
# are symbol, settle_date, open, high, low, close, adj_close, volume and
# open_interest.
df_temp = df_all.copy()#.iloc[:1000]
# Drop rows without a settle date or without open interest.
isnas = df_temp.yymmdd.isna()
df_temp = df_temp[~isnas]
df_temp = df_temp[~df_temp.open_interest.isna()]
# Drop rows whose open price contains more than one decimal point (it could
# not be converted to float below). Raw string: '\.' is an invalid escape in a
# normal string literal.
df_temp = df_temp[df_temp.open.astype(str).str.count(r'\.')<=1]
# reset_index returns a new frame, so the assignments below do not write into
# a filtered slice (no SettingWithCopyWarning).
df_temp = df_temp.reset_index(drop=True)
# Missing volume counts as zero.
df_temp['volume'] = df_temp.volume.fillna(0)
# Rows whose month_year is just 'Y' get the placeholder '2099Z'.
df_temp.loc[df_temp.month_year=='Y','month_year'] = '2099Z'
# symbol = contract + last character of month_year + characters 2-3 of
# month_year, e.g. contract 'SB' with month_year '2021H' gives 'SBH21'.
symbols = df_temp.contract + df_temp.month_year.str.slice(-1,)  + df_temp.month_year.str.slice(2,4)
# yymmdd -> yyyymmdd as an integer. Adding 20000000 numerically (instead of
# prefixing the string '20') stays correct when yymmdd was parsed as a number
# and lost its leading zero, e.g. 50201 -> 20050201.
settle_dates = 20000000 + df_temp.yymmdd.astype(float).astype(int)
opens = df_temp.open.astype(float)
highs = df_temp.high.astype(float)
lows = df_temp.low.astype(float)
closes = df_temp.close.astype(float)
volumes = df_temp.volume.astype(int)
open_interests = df_temp.open_interest.astype(int)
df_final = pd.DataFrame({'symbol':symbols,
    'settle_date':settle_dates,
    'open':opens,
    'high':highs,
    'low':lows,
    'close':closes,
    # adj_close is populated with the unadjusted close.
    'adj_close':closes,
    'volume':volumes,
    'open_interest':open_interests})
df_final.head()

#### add month_num to df_final

In [None]:
# Load the month-code lookup table from the working directory. The merge
# further down expects it to provide 'month_code' and 'month_num' columns.
df_monthnum = pd.read_csv('month_codes.csv')
df_monthnum

In [None]:
# Distinct contract roots: each symbol with its last three characters
# (month code + two-digit year) removed.
df_final.symbol.str.slice(0,-3).unique()

In [None]:
# Add contract_num to the data: for each (contract, settle_date) the listed
# expiries are numbered 1, 2, 3, ... in ascending expiry (yyyymm) order.
dfu2 = df_final.copy()
# Symbol layout: <contract><month code><two-digit year>.
dfu2['contract'] = dfu2.symbol.str.slice(0,-3)
dfu2['year'] = 2000 + dfu2.symbol.str.slice(-2).astype(int)
dfu2['month_code'] = dfu2.symbol.str.slice(-3,-2)
# Inner merge: rows whose month_code is missing from month_codes.csv are
# dropped here; the row counts printed below make such a loss visible.
dfu3 = dfu2.merge(df_monthnum,on='month_code',how='inner')
dfu3['yyyymm'] = dfu3.year*100+dfu3.month_num
display(dfu2.month_code.unique())
# .copy() so that adding a column does not write into a slice of dfu3.
dfu4 = dfu3[['contract','symbol','settle_date','yyyymm']].copy()
# Dense ranking keeps the numbers consecutive integers even when the same
# expiry appears more than once for a settle date; the default 'average'
# method would yield x.5 ranks that astype(int) truncates, leaving gaps.
dfu4['contract_num'] = (
    dfu4.groupby(['contract','settle_date']).yyyymm.rank(method='dense').astype(int)
)
dfu4 = dfu4.sort_values(['settle_date','contract','yyyymm']).reset_index(drop=True)
print(len(df_final),len(dfu4))
dfu5 = df_final.merge(dfu4[['symbol','settle_date','contract_num']],on=['symbol','settle_date'])
dfu5 = dfu5.reset_index(drop=True)
# Round all price columns to 8 decimals.
price_cols = ['open','high','low','close','adj_close']
dfu5[price_cols] = dfu5[price_cols].round(8)



In [None]:
# Row counts of the raw, cleaned and final frames, for a quick comparison.
len(df_all),len(df_final),len(dfu5)

#### Check for duplicate (symbol, settle_date) rows

In [None]:
# Count the non-null close values per (symbol, settle_date) key; a key with
# more than one means the same contract/day occurs more than once.
ag = ['symbol', 'settle_date']
df_counts = dfu5.loc[:, ag + ['close']].groupby(ag, as_index=False).count()
repeated_keys = df_counts.loc[df_counts.close > 1]
dupes_exist = not repeated_keys.empty
dupes_exist

#### if there are dupes, get rid of them

In [None]:
# Remove exact duplicate rows when the duplicate check flagged repeated keys.
if dupes_exist:
    dfu5 = dfu5.drop_duplicates().reset_index(drop=True)
    # drop_duplicates() only removes rows that are identical in every column.
    # Report keys that still repeat with differing values instead of letting
    # them reach the database unnoticed.
    n_remaining = int(dfu5.duplicated(subset=['symbol', 'settle_date']).sum())
    if n_remaining > 0:
        print(f'WARNING: {n_remaining} rows still share a (symbol, settle_date) key after drop_duplicates()')
    

#### show unique contract_num numbers

In [None]:
# Distinct contract_num values present in the final data.
dfu5.contract_num.unique()

### Now write the data for all years


#### First write the data to a temporary CSV file (the database helper `pga` is created in the next cell)

In [None]:
# Export the final data to the temporary CSV, restricted to the columns listed
# below and in that order.
tbname = FULL_TABLE_NAME
csv_temp_path = CSV_TEMP_PATH
# (column name, column type) pairs.
col_tuple_list = [
    ('symbol', 'text'),
    ('settle_date', 'integer'),
    ('contract_num', 'integer'),
    ('open', 'numeric'),
    ('high', 'numeric'),
    ('low', 'numeric'),
    ('close', 'numeric'),
    ('adj_close', 'numeric'),
    ('volume', 'integer'),
    ('open_interest', 'integer'),
]
col_list = [col_name for col_name, _col_type in col_tuple_list]
print(f'creating csv file {csv_temp_path}: {datetime.datetime.now()}')
dfu5.loc[:, col_list].to_csv(os.path.abspath(csv_temp_path), index=False)


In [None]:
# NOTE(review): DEBUG_IT, opttab and futtab are not referenced elsewhere in
# this notebook.
DEBUG_IT=False
opttab = 'sec_schema.options_table'
futtab = 'sec_schema.underlying_table'
# Database helper; its get_sql() method is used below to run queries.
pga = db_info.get_db_info()


#### Delete all rows if necessary

In [None]:
# Show the current WRITE_DATA setting before anything is copied.
WRITE_DATA

In [None]:
def psql_copy():
    """Copy the temporary CSV into the underlying table using psql.

    Builds a psql client-side COPY command that loads CSV_TEMP_PATH (made
    absolute) into FULL_TABLE_NAME of database DB_NAME, adding ``-U`` when
    DB_USER_NAME is set. When WRITE_DATA is true the command is executed and
    psql's output is printed; otherwise the command is only printed.

    psql is started with an argument list (no shell), so the function works
    outside IPython and is not affected by shell quoting.
    NOTE(review): the CSV path is embedded in single quotes inside the COPY
    command, so a path containing a single quote would break it.

    Raises:
        FileNotFoundError: if WRITE_DATA is true and the psql executable
            cannot be found.
    """
    import subprocess

    copy_cmd = (
        f"\\COPY {FULL_TABLE_NAME} FROM '{os.path.abspath(CSV_TEMP_PATH)}' "
        "DELIMITER ',' CSV HEADER;"
    )
    psql_args = ['psql']
    if DB_USER_NAME:
        psql_args += ['-U', DB_USER_NAME]
    psql_args += ['-d', DB_NAME, '-c', copy_cmd]

    if not WRITE_DATA:
        # Dry run: show a shell-style rendering of the command.
        print(' '.join(psql_args[:-1] + [f'"{copy_cmd}"']))
        return

    result = subprocess.run(
        psql_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f'psql exited with return code {result.returncode}')
        

In [None]:
def get_data_by_contract(pga,contract,
            month_code_clause=None,
             settle_date_clause=None,
             limit=None,
             table_name=None):
    """Query the futures table for one contract and return the result.

    The SQL text is assembled by string interpolation, so pass trusted values
    only. The query is printed before it is executed.

    Args:
        pga: database helper exposing ``get_sql(sql)``.
        contract: two-character contract code, matched against the first two
            characters of ``symbol``.
        month_code_clause: optional SQL comparison applied to the third
            character of ``symbol``, e.g. ``"= 'H'"``.
        settle_date_clause: optional SQL comparison applied to
            ``settle_date``, e.g. ``">= 20210101"``.
        limit: optional maximum number of rows.
        table_name: table to query; defaults to the module-level ``tbname``.

    Returns:
        Whatever ``pga.get_sql`` returns for the query.
    """
    if table_name is None:
        table_name = tbname
    cl_month_code = '' if month_code_clause is None else f"and substring(symbol,3,1) {month_code_clause}"
    # No quote before settle_date: the stray "'" previously emitted here made
    # the statement invalid whenever a settle_date_clause was supplied.
    cl_sd = '' if settle_date_clause is None else f"and settle_date {settle_date_clause}"
    clim = '' if limit is None else f"limit {limit}"
    other_criteria = f'{cl_month_code} {cl_sd} {clim}'
    sql = f"select * from {table_name} where substring(symbol,1,2) = '{contract}' {other_criteria};"
    print(sql)
    return pga.get_sql(sql)

# Show up to three rows from the table for each commodity in COMMODS_TO_INCLUDE.
for commod in COMMODS_TO_INCLUDE:
    dfc = get_data_by_contract(pga,commod,limit=3)
    display(dfc)


In [None]:
# No WRITE_DATA guard is needed here: psql_copy() checks WRITE_DATA itself and
# only prints the psql command when it is False.
psql_copy()


In [None]:
# Sanity check after the copy: latest settle_date stored in the table for each
# two-character symbol prefix present in dfu5.
syms_string = ','.join(f"'{s}'" for s in dfu5.symbol.str.slice(0,2).unique())

# Query FULL_TABLE_NAME (the table psql_copy() loads) rather than a
# hard-coded copy of its name.
sql_cmd = f"""
select substring(symbol,1,2),max(settle_date) from {FULL_TABLE_NAME} 
where substring(symbol,1,2) in ({syms_string}) 
group by substring(symbol,1,2) order  by substring(symbol,1,2);
"""
pga.get_sql(sql_cmd)


In [None]:
#!jupyter nbconvert step_03_underlying_table_loader.ipynb --to python