# CFTC data

Scraper for the Commitments of Traders (COT) reports published by the CFTC.

## Compare files CFTC x ICE

Let's compare file schemas.

In [None]:
import pandas as pd

# Input files whose column sets are compared in the following cells.
filename = r'C:\Users\ROSA_L\PycharmProjects\scraper\filestore\com_theice_COTHist2021.csv'
file_cftc = r'C:\Users\ROSA_L\Downloads\f_year.xls'

# ICE csv: the first row is the header, no column is used as index, and
# single-space cells as well as the '#VALUE!' marker are read as missing values.
df = pd.read_csv(
    filename,
    header=0,
    index_col=False,
    skip_blank_lines=True,
    na_values=[' ', '#VALUE!'],
)

# CFTC workbook (.xls), read through the xlrd engine.
df_cftc = pd.read_excel(file_cftc, engine='xlrd')

In [None]:
# Column names of each frame as sets, so they can be diffed in both directions.
ice_cols, cftc_cols = set(df.columns), set(df_cftc.columns)

for label, columns in (('ice', ice_cols), ('cftc', cftc_cols)):
    print(f'lenght {label}: {len(columns)}')

# Columns present on one side only, sorted for a stable listing.
print(f'ice - cftc: {sorted(ice_cols.difference(cftc_cols))}')
print('\n')
print(f'cftc - ice: {sorted(cftc_cols.difference(ice_cols))}')

In [None]:
# Column renames applied to the CFTC frame so that its column names line up
# with the ones found in the ICE frame (differences are letter case, the
# 'Othr'/'Other' suffix and a doubled underscore after 'Swap').
cftc_ice_mapping = {
    'M_Money_Positions_Long_ALL': 'M_Money_Positions_Long_All',
    'M_Money_Positions_Short_ALL': 'M_Money_Positions_Short_All',
    'M_Money_Positions_Spread_ALL': 'M_Money_Positions_Spread_All',
    'Other_Rept_Positions_Long_ALL': 'Other_Rept_Positions_Long_All',
    'Other_Rept_Positions_Short_ALL': 'Other_Rept_Positions_Short_All',
    'Other_Rept_Positions_Spread_ALL': 'Other_Rept_Positions_Spread_All',
    'Other_Rept_Positions_Spread_Othr': 'Other_Rept_Positions_Spread_Other',
    'Pct_of_OI_Other_Rept_Spread_Othr': 'Pct_of_OI_Other_Rept_Spread_Other',
    'Prod_Merc_Positions_Long_ALL': 'Prod_Merc_Positions_Long_All',
    'Prod_Merc_Positions_Short_ALL': 'Prod_Merc_Positions_Short_All',
    # Deliberately left unmapped for now:
    # 'Report_Date_as_MM_DD_YYYY': 'As_of_Date_In_Form_YYMMDD',
    'Swap__Positions_Short_All': 'Swap_Positions_Short_All',
    'Swap__Positions_Spread_All': 'Swap_Positions_Spread_All',
}

# Rebind the renamed frame instead of using inplace=True: the object loaded by
# the earlier cell is not mutated behind the reader's back, and re-running this
# cell gives the same result.
df_cftc = df_cftc.rename(columns=cftc_ice_mapping)

# Repeat the column comparison after the rename to see what still differs.
ice_cols = set(df.columns)
cftc_cols = set(df_cftc.columns)

print(f'ice - cftc: {sorted(ice_cols - cftc_cols)}')
print('\n')
print(f'cftc - ice: {sorted(cftc_cols - ice_cols)}')

To make the two scrapers (CFTC and ICE) share the same table, we need to:

- add column Report_Date_as_MM_DD_YYYY
- add provider column
- fill in provider dimension



## Test scraper
Let's test the scraper.

In [1]:
%cd ..

C:\Users\ROSA_L\PycharmProjects\scraper


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import logging
# Root logger at DEBUG. basicConfig() only takes effect when the root logger
# has no handlers yet, so the level is also set explicitly on the logger.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
from scraper.core import factory

In [11]:
# Job for provider 'gov_cftc', dataset 'futures_and_options_comb', requested
# with full_load=True. Variant without the flag, kept for quick switching:
#   job = factory.get_scraper_job('gov_cftc', 'futures_and_options_comb')
job = factory.get_scraper_job(
    'gov_cftc',
    'futures_and_options_comb',
    full_load=True,
)

DEBUG:scraper.core.factory:Loading module scraper.jobs.gov_cftc.futures_and_options_comb
DEBUG:scraper.core.factory:Getting class FuturesAndOptionsCombJob


In [None]:
# Presumably populates job.sources, which the next cell inspects — confirm against the job class.
job.get_sources()

In [None]:
# Attribute dictionary of every source object attached to the job.
list(map(vars, job.sources))

In [None]:
# Call the individual stages by hand so the job state can be inspected between them.
# NOTE(review): assumes job.run() chains these same stages in this order — confirm.
job.download_and_get_checksum()
job.rm_sources_up_to_date()
job.transform()


In [None]:
# Display the data currently held by the job (expected to be set by transform() — TODO confirm).
job.data

In [12]:
# Run the whole job; with the root logger at DEBUG, each step is traced in the cell output.
job.run()

INFO:scraper.jobs.gov_cftc.futures_and_options_comb:Defining list of files to load.
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Start year: 2017
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Generating source files for com
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and 2021
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and 2020
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and 2019
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and 2018
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and 2017
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for com and hist_2006_2016
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Generating source files for fut
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating source for fut and 2021
DEBUG:scraper.jobs.gov_cftc.futures_and_options_comb:Creating so

### Test futures only

In [9]:
# Job for provider 'gov_cftc', dataset 'futures_only', requested with
# full_load=True. Variant without the flag, kept for quick switching:
#   job = factory.get_scraper_job('gov_cftc', 'futures_only')
job = factory.get_scraper_job(
    'gov_cftc',
    'futures_only',
    full_load=True,
)

DEBUG:scraper.core.factory:Loading module scraper.jobs.gov_cftc.futures_only
DEBUG:scraper.core.factory:Getting class FuturesOnlyJob


In [7]:
# Run the futures-only job; with the root logger at DEBUG, each step is traced in the cell output.
job.run()

INFO:scraper.jobs.gov_cftc.futures_only:Defining list of files to load.
DEBUG:scraper.jobs.gov_cftc.futures_only:Start year: 2021
DEBUG:scraper.jobs.gov_cftc.futures_only:Creating source for 2021
DEBUG:scraper.core.job:remove_existing_dynamic_dim: query - http://vipenta:8000/dimension/source
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): vipenta:8000
DEBUG:urllib3.connectionpool:http://vipenta:8000 "GET /dimension/source HTTP/1.1" 200 767979
DEBUG:scraper.core.job:self.dynamic_dim['source'] size before: 1
DEBUG:scraper.core.job:self.dynamic_dim['source'] size after: 0
DEBUG:scraper.core.job:download: True, parallel download: True
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.cftc.gov:443
DEBUG:urllib3.connectionpool:https://www.cftc.gov:443 "GET /files/dea/history/fut_disagg_txt_2021.zip HTTP/1.1" 301 290
DEBUG:urllib3.connectionpool:https://www.cftc.gov:443 "GET /sites/default/files/files/dea/history/fut_disagg_txt_2021.zip HTTP/1.1" 200 591301
IN

In [8]:
# Longest string value found in each text column of job.data.
# A single loop over the column list replaces one print statement per column,
# and every column is listed exactly once.
text_columns = [
    'CFTC_SubGroup_Code',
    'CFTC_Market_Code_Quotes',
    'CFTC_Contract_Market_Code_Quotes',
    'Market_and_Exchange_Names',
    'As_of_Date_In_Form_YYMMDD',
    'Report_Date_as_YYYY-MM-DD',
    'CFTC_Contract_Market_Code',
    'CFTC_Market_Code',
    'CFTC_Commodity_Code',
    'Contract_Units',
    'CFTC_Commodity_Code_Quotes',
    'FutOnly_or_Combined',
    'source',
    'provider',
]

for column in text_columns:
    # .str.len() gives the per-row string length; max() skips missing values.
    print(column, job.data[column].str.len().max())

CFTC_SubGroup_Code 3
CFTC_Market_Code_Quotes 4
CFTC_Contract_Market_Code_Quotes 6
Market_and_Exchange_Names 78
As_of_Date_In_Form_YYMMDD 6
Report_Date_as_YYYY-MM-DD 10
CFTC_Contract_Market_Code 6
CFTC_Market_Code 4
CFTC_Commodity_Code 4
Contract_Units 48
CFTC_Contract_Market_Code_Quotes 6
CFTC_Commodity_Code_Quotes 4
FutOnly_or_Combined 7
source 28
provider 8


In [None]:
# Contract_Units values whose string length is exactly 114 characters.
job.data.loc[job.data['Contract_Units'].str.len() == 114, 'Contract_Units'].tolist()

In [None]:
# Let's go step by step on fixing these errors
from zipfile import ZipFile
import pandas as pd
from typing import List

# Column name -> dtype intended for pd.read_csv, in file column order.
# Not applied at the moment: the names=/dtype= arguments of the read_csv call
# further down in this cell are commented out.
csv_columns = {
    # --- identification columns ---
    'Market_and_Exchange_Names': str,
    'As_of_Date_In_Form_YYMMDD': str,
    'Report_Date_as_YYYY-MM-DD': str,
    'CFTC_Contract_Market_Code': str,
    'CFTC_Market_Code': str,
    'CFTC_Region_Code': int,
    'CFTC_Commodity_Code': str,
    # --- positions, *_All ---
    'Open_Interest_All': float,
    'Prod_Merc_Positions_Long_All': int,
    'Prod_Merc_Positions_Short_All': int,
    'Swap_Positions_Long_All': int,
    'Swap__Positions_Short_All': int,
    'Swap__Positions_Spread_All': int,
    'M_Money_Positions_Long_All': int,
    'M_Money_Positions_Short_All': int,
    'M_Money_Positions_Spread_All': int,
    'Other_Rept_Positions_Long_All': int,
    'Other_Rept_Positions_Short_All': int,
    'Other_Rept_Positions_Spread_All': int,
    'Tot_Rept_Positions_Long_All': float,
    'Tot_Rept_Positions_Short_All': float,
    'NonRept_Positions_Long_All': int,
    'NonRept_Positions_Short_All': int,
    # --- positions, *_Old ---
    'Open_Interest_Old': float,
    'Prod_Merc_Positions_Long_Old': int,
    'Prod_Merc_Positions_Short_Old': int,
    'Swap_Positions_Long_Old': int,
    'Swap__Positions_Short_Old': int,
    'Swap__Positions_Spread_Old': int,
    'M_Money_Positions_Long_Old': int,
    'M_Money_Positions_Short_Old': int,
    'M_Money_Positions_Spread_Old': int,
    'Other_Rept_Positions_Long_Old': int,
    'Other_Rept_Positions_Short_Old': int,
    'Other_Rept_Positions_Spread_Old': int,
    'Tot_Rept_Positions_Long_Old': float,
    'Tot_Rept_Positions_Short_Old': float,
    'NonRept_Positions_Long_Old': float,
    'NonRept_Positions_Short_Old': float,
    # --- positions, *_Other ---
    'Open_Interest_Other': float,
    'Prod_Merc_Positions_Long_Other': float,
    'Prod_Merc_Positions_Short_Other': float,
    'Swap_Positions_Long_Other': float,
    'Swap__Positions_Short_Other': float,
    'Swap__Positions_Spread_Other': float,
    'M_Money_Positions_Long_Other': float,
    'M_Money_Positions_Short_Other': float,
    'M_Money_Positions_Spread_Other': float,
    'Other_Rept_Positions_Long_Other': float,
    'Other_Rept_Positions_Short_Other': float,
    'Other_Rept_Positions_Spread_Other': float,
    'Tot_Rept_Positions_Long_Other': float,
    'Tot_Rept_Positions_Short_Other': float,
    'NonRept_Positions_Long_Other': float,
    'NonRept_Positions_Short_Other': float,
    # --- Change_in_* ---
    'Change_in_Open_Interest_All': float,
    'Change_in_Prod_Merc_Long_All': float,
    'Change_in_Prod_Merc_Short_All': float,
    'Change_in_Swap_Long_All': float,
    'Change_in_Swap_Short_All': float,
    'Change_in_Swap_Spread_All': float,
    'Change_in_M_Money_Long_All': float,
    'Change_in_M_Money_Short_All': float,
    'Change_in_M_Money_Spread_All': float,
    'Change_in_Other_Rept_Long_All': float,
    'Change_in_Other_Rept_Short_All': float,
    'Change_in_Other_Rept_Spread_All': float,
    'Change_in_Tot_Rept_Long_All': float,
    'Change_in_Tot_Rept_Short_All': float,
    'Change_in_NonRept_Long_All': float,
    'Change_in_NonRept_Short_All': float,
    # --- Pct_of_*, *_All ---
    'Pct_of_Open_Interest_All': float,
    'Pct_of_OI_Prod_Merc_Long_All': float,
    'Pct_of_OI_Prod_Merc_Short_All': float,
    'Pct_of_OI_Swap_Long_All': float,
    'Pct_of_OI_Swap_Short_All': float,
    'Pct_of_OI_Swap_Spread_All': float,
    'Pct_of_OI_M_Money_Long_All': float,
    'Pct_of_OI_M_Money_Short_All': float,
    'Pct_of_OI_M_Money_Spread_All': float,
    'Pct_of_OI_Other_Rept_Long_All': float,
    'Pct_of_OI_Other_Rept_Short_All': float,
    'Pct_of_OI_Other_Rept_Spread_All': float,
    'Pct_of_OI_Tot_Rept_Long_All': float,
    'Pct_of_OI_Tot_Rept_Short_All': float,
    'Pct_of_OI_NonRept_Long_All': float,
    'Pct_of_OI_NonRept_Short_All': float,
    # --- Pct_of_*, *_Old ---
    'Pct_of_Open_Interest_Old': float,
    'Pct_of_OI_Prod_Merc_Long_Old': float,
    'Pct_of_OI_Prod_Merc_Short_Old': float,
    'Pct_of_OI_Swap_Long_Old': float,
    'Pct_of_OI_Swap_Short_Old': float,
    'Pct_of_OI_Swap_Spread_Old': float,
    'Pct_of_OI_M_Money_Long_Old': float,
    'Pct_of_OI_M_Money_Short_Old': float,
    'Pct_of_OI_M_Money_Spread_Old': float,
    'Pct_of_OI_Other_Rept_Long_Old': float,
    'Pct_of_OI_Other_Rept_Short_Old': float,
    'Pct_of_OI_Other_Rept_Spread_Old': float,
    'Pct_of_OI_Tot_Rept_Long_Old': float,
    'Pct_of_OI_Tot_Rept_Short_Old': float,
    'Pct_of_OI_NonRept_Long_Old': float,
    'Pct_of_OI_NonRept_Short_Old': float,
    # --- Pct_of_*, *_Other ---
    'Pct_of_Open_Interest_Other': float,
    'Pct_of_OI_Prod_Merc_Long_Other': float,
    'Pct_of_OI_Prod_Merc_Short_Other': float,
    'Pct_of_OI_Swap_Long_Other': float,
    'Pct_of_OI_Swap_Short_Other': float,
    'Pct_of_OI_Swap_Spread_Other': float,
    'Pct_of_OI_M_Money_Long_Other': float,
    'Pct_of_OI_M_Money_Short_Other': float,
    'Pct_of_OI_M_Money_Spread_Other': float,
    'Pct_of_OI_Other_Rept_Long_Other': float,
    'Pct_of_OI_Other_Rept_Short_Other': float,
    'Pct_of_OI_Other_Rept_Spread_Other': float,
    'Pct_of_OI_Tot_Rept_Long_Other': float,
    'Pct_of_OI_Tot_Rept_Short_Other': float,
    'Pct_of_OI_NonRept_Long_Other': float,
    'Pct_of_OI_NonRept_Short_Other': float,
    # --- Traders_*, *_All ---
    'Traders_Tot_All': float,
    'Traders_Prod_Merc_Long_All': float,
    'Traders_Prod_Merc_Short_All': float,
    'Traders_Swap_Long_All': float,
    'Traders_Swap_Short_All': float,
    'Traders_Swap_Spread_All': float,
    'Traders_M_Money_Long_All': float,
    'Traders_M_Money_Short_All': float,
    'Traders_M_Money_Spread_All': float,
    'Traders_Other_Rept_Long_All': float,
    'Traders_Other_Rept_Short_All': float,
    'Traders_Other_Rept_Spread_All': float,
    'Traders_Tot_Rept_Long_All': float,
    'Traders_Tot_Rept_Short_All': float,
    # --- Traders_*, *_Old ---
    'Traders_Tot_Old': float,
    'Traders_Prod_Merc_Long_Old': float,
    'Traders_Prod_Merc_Short_Old': float,
    'Traders_Swap_Long_Old': float,
    'Traders_Swap_Short_Old': float,
    'Traders_Swap_Spread_Old': float,
    'Traders_M_Money_Long_Old': float,
    'Traders_M_Money_Short_Old': float,
    'Traders_M_Money_Spread_Old': float,
    'Traders_Other_Rept_Long_Old': float,
    'Traders_Other_Rept_Short_Old': float,
    'Traders_Other_Rept_Spread_Old': float,
    'Traders_Tot_Rept_Long_Old': float,
    'Traders_Tot_Rept_Short_Old': float,
    # --- Traders_*, *_Other ---
    'Traders_Tot_Other': float,
    'Traders_Prod_Merc_Long_Other': float,
    'Traders_Prod_Merc_Short_Other': float,
    'Traders_Swap_Long_Other': float,
    'Traders_Swap_Short_Other': float,
    'Traders_Swap_Spread_Other': float,
    'Traders_M_Money_Long_Other': float,
    'Traders_M_Money_Short_Other': float,
    'Traders_M_Money_Spread_Other': float,
    'Traders_Other_Rept_Long_Other': float,
    'Traders_Other_Rept_Short_Other': float,
    'Traders_Other_Rept_Spread_Other': float,
    'Traders_Tot_Rept_Long_Other': float,
    'Traders_Tot_Rept_Short_Other': float,
    # --- Conc_*, *_All ---
    'Conc_Gross_LE_4_TDR_Long_All': float,
    'Conc_Gross_LE_4_TDR_Short_All': float,
    'Conc_Gross_LE_8_TDR_Long_All': float,
    'Conc_Gross_LE_8_TDR_Short_All': float,
    'Conc_Net_LE_4_TDR_Long_All': float,
    'Conc_Net_LE_4_TDR_Short_All': float,
    'Conc_Net_LE_8_TDR_Long_All': float,
    'Conc_Net_LE_8_TDR_Short_All': float,
    # --- Conc_*, *_Old ---
    'Conc_Gross_LE_4_TDR_Long_Old': float,
    'Conc_Gross_LE_4_TDR_Short_Old': float,
    'Conc_Gross_LE_8_TDR_Long_Old': float,
    'Conc_Gross_LE_8_TDR_Short_Old': float,
    'Conc_Net_LE_4_TDR_Long_Old': float,
    'Conc_Net_LE_4_TDR_Short_Old': float,
    'Conc_Net_LE_8_TDR_Long_Old': float,
    'Conc_Net_LE_8_TDR_Short_Old': float,
    # --- Conc_*, *_Other ---
    'Conc_Gross_LE_4_TDR_Long_Other': float,
    'Conc_Gross_LE_4_TDR_Short_Other': float,
    'Conc_Gross_LE_8_TDR_Long_Other': float,
    'Conc_Gross_LE_8_TDR_Short_Other': float,
    'Conc_Net_LE_4_TDR_Long_Other': float,
    'Conc_Net_LE_4_TDR_Short_Other': float,
    'Conc_Net_LE_8_TDR_Long_Other': float,
    'Conc_Net_LE_8_TDR_Short_Other': float,
    # --- trailing descriptive columns ---
    'Contract_Units': str,
    'CFTC_Contract_Market_Code_Quotes': float,
    'CFTC_Market_Code_Quotes': float,
    'CFTC_Commodity_Code_Quotes': float,
    'CFTC_SubGroup_Code': float,
    'FutOnly_or_Combined': str,
}

# Cell contents that pd.read_csv should treat as missing values.
null_values = ['.', ' ']


path = r'C:\Users\ROSA_L\PycharmProjects\scraper\filestore\gov_cftc_com_disagg_txt_2021.zip'

# Read every member of the archive into its own frame, then concatenate once.
dfs: List[pd.DataFrame] = []
with ZipFile(path) as z:  # context manager closes the archive, also on error
    for filename in z.namelist():
        logger.debug(f'Reading {filename} from {path}')
        with z.open(filename) as file:
            try:
                member_df = pd.read_csv(
                    file,
                    skip_blank_lines=True,
                    # names=csv_columns.keys(),
                    # dtype=csv_columns,
                    na_values=null_values,
                    header=0,
                )
            except ValueError:
                # Log with traceback, then re-raise the original exception unchanged.
                logger.exception(f'Error while reading file {filename}')
                raise
            else:
                logger.debug(f'{filename}: {len(member_df)} rows loaded.')
                dfs.append(member_df)

# Concatenate once, after the loop, instead of on every iteration. For an
# archive with no members pd.concat raises ValueError, rather than leaving
# `df` bound to a frame from an earlier cell.
df = pd.concat(dfs)
df

In [None]:
# Number of columns in the loaded frame.
df.shape[1]

In [None]:
# dtype that pandas inferred for each column, as a plain dict keyed by column name.
df.dtypes.to_dict()

In [None]:
# Distinct values of column CFTC_Contract_Market_Code_Quotes (first occurrence of each is kept).
df.loc[:, 'CFTC_Contract_Market_Code_Quotes'].drop_duplicates()