# step_01_download_monthly_acs_files.ipynb

## Overview:
#### This workbook automates the download of monthly options and futures pricing zip files from Barchart ACS.

1. It will use Selenium to login to the site: 
```
ACS_HOME_PAGE = 'http://acs.barchart.com/mri/mripag.htm' 
ACS_LOGIN_INFO = './temp_folder/acs_login.txt'
acs_text = open(ACS_LOGIN_INFO,'r').read()
ACS_USERNAME = acs_text.split(',')[0]
ACS_PASSWORD = acs_text.split(',')[1]
```
2. It will scrape the urls of the zip files, and then download those zipfiles into a folder which is specified using:
```
ZIP_FOLDER_PARENT = open('./temp_folder/zip_folder_parent.txt','r').read()
```

3. You can also manually download one or several months after manually logging in to Barchart ACS at http://acs.barchart.com/mri/mriopt.htm.  Navigate down the page to the table **Monthly Options Zip Files** and then click on the CSV file that you want to download.

## Usage:

1. In **Step 2**, set 
```
ZIP_FOLDER_PARENT
BEGIN_YY
END_YY
```
2. Run all cells below **Step 2**


### Step 1:  Imports

In [1]:
import sel_scrape as sc
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
import requests
from requests.auth import HTTPBasicAuth
import os
import time
import traceback
from tqdm import tqdm,tqdm_notebook
import numpy as np
import importlib
import re
import pathlib
HOME_FOLDER = pathlib.Path.home()

In [None]:
# uncomment the line below if you make changes to the module sel_scrape
# importlib.reload(sc)

### Step 2: Set import variables to determine years to download, and the download location
#### Set the variables 
```
ZIP_FOLDER_PARENT
BEGIN_YY
END_YY
``` 
These values determine 
1. The location to which zip files get downloaded;
2. The first year and last year of daily options settlements to scrape from the Barchart ACS website.


In [None]:
# Folder that receives every downloaded zip file.
# Alternative location under the user's home directory:
#   os.path.join(HOME_FOLDER, 'barchart_downloads/barchart2')
ZIP_FOLDER_PARENT = './temp_folder/zip_files'

# First and last year (inclusive) of the monthly files to fetch; these values
# become the YY part of the 'mmmYY' tokens built in Step 3.
BEGIN_YY, END_YY = 21, 21

# Only consulted by the (currently commented-out) chromedriver download cell.
RELOAD_CHROMEDRIVER = False

print(f'ZIP_FOLDER_PARENT into which files will be download = {ZIP_FOLDER_PARENT}')

### Step 3: Set important constants
#### The constants below should be left as is - DO NOT CHANGE.

In [None]:
# Three-letter month abbreviations used to build the 'mmmYY' tokens below.
MMM_LIST = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
# Every year from BEGIN_YY through END_YY, inclusive, as plain Python ints.
YY_LIST = list(range(BEGIN_YY, END_YY + 1))
# One 'mmmYY' token per month/year pair. The year is zero-padded to two digits
# so that a year below 10 yields e.g. 'jan05' rather than 'jan5', matching the
# two-character YY slot of the 'MMMYY' placeholder in the download URLs.
MMMYY_LIST = [f'{mmm}{yy:02d}' for mmm in MMM_LIST for yy in YY_LIST]
MMMYY_LIST

In [None]:
# Pages of the Barchart ACS site.
ACS_HOME_PAGE = 'http://acs.barchart.com/mri/mripag.htm'
# ACS_FUTURES_PAGE = 'http://acs.barchart.com/mri/mridta.htm'
ACS_FUTURES_PAGE = 'http://acs.barchart.com/mri/mrgfutz.htm'
ACS_OPTIONS_PAGE = 'http://acs.barchart.com/mri/mriopt.htm'

# Text file holding the credentials as 'username,password'.
ACS_LOGIN_INFO = './temp_folder/acs_login.txt'
# Read through a context manager so the file handle is always closed.
with open(ACS_LOGIN_INFO, 'r') as login_file:
    acs_text = login_file.read()
# Strip surrounding whitespace from each field: a trailing newline in the file
# would otherwise become part of the password and break authentication.
acs_fields = [field.strip() for field in acs_text.split(',')]
ACS_USERNAME = acs_fields[0]
ACS_PASSWORD = acs_fields[1]

### Step 4: Set ```ZIP_FOLDER_PARENT```
#### Determine ```ZIP_FOLDER_PARENT```, which represents the folder into which Barchart ACS zip files get downloaded.

In [None]:
# Make sure the download folder exists, creating missing parent folders too.
if os.path.exists(ZIP_FOLDER_PARENT):
    print(f'parent folder {ZIP_FOLDER_PARENT} already exists')
else:
    print(f'making parent folder {ZIP_FOLDER_PARENT}')
    os.makedirs(ZIP_FOLDER_PARENT)

In [None]:
# Home directory of the current user, as a plain string.
MY_HOME = str(pathlib.Path.home())
# Chrome profile folder below the home directory; only referenced by the
# commented-out SelScrape instantiation further down.
PROFILE_PATH = MY_HOME + '/Library/Application Support/Google/Chrome/Default'

### Step 5: Instantiate SelScrape
#### Create an instance of ```sel_scrape.SelScrape``` in order to scrape the Barchart ACS website.

In [None]:
# if RELOAD_CHROMEDRIVER:
#     sc.download_chromedriver()

In [3]:
# sela = sc.SelScrape(headless=False)

In [None]:
# sela = sc.SelScrape(headless=False)
# # sela = sc.SelScrape(headless=False,driver_name='chrome')#,profile_path=PROFILE_PATH)
# sela.wait_implicitly(2)
# sela.goto(ACS_HOME_PAGE)
# # time.sleep(10)
# wait(sela.driver, 5).until(EC.alert_is_present())
# alert = sela.driver.switch_to_alert()
# alert.send_keys(f'{ACS_USERNAME}{Keys.TAB}{ACS_PASSWORD}')
# time.sleep(3)
# alert.accept()


### Step 6:
#### Navigate to Home Page

In [None]:
# sela.goto(ACS_OPTIONS_PAGE)

### Step 7: Obtain Options URLS
#### Scrape the urls for options zip files to be downloaded.

In [None]:
# monthly_csv_files_xpath = "//a[contains(@href,'data/opt/opv')]"
# mcsv_elements = sela.findxpath(monthly_csv_files_xpath)['value']
# mcsv_hrefs_all = []
# for mcsv in mcsv_elements:
#     mcsv_hrefs_all.append(mcsv.get_attribute('href'))
# mcsv_hrefs_all

In [None]:
# def is_valid_yyymm(h):
#     return any([m in h for m in MMMYY_LIST])
# mcsv_hrefs = [h for h in mcsv_hrefs_all if is_valid_yyymm(h) ] 
# mcsv_hrefs

### Step 8:  Execute Download of Options zip files
#### Download the zip files into their appropriate folders.

In [None]:
# URL template for the monthly options zip files; the 'MMMYY' placeholder is
# swapped for each month/year token in MMMYY_LIST.
options_url_base = 'http://acs.barchart.com/mri/data/opt/opvMMMYY.zip'
mcsv_hrefs = [options_url_base.replace('MMMYY', token) for token in MMMYY_LIST]
mcsv_hrefs

In [None]:
# Folder holding one sub-folder per monthly options zip file.
options_parent = ZIP_FOLDER_PARENT+'/options'
if os.path.isdir(options_parent):
    print(f'options folder {options_parent} already exists')
else:
    print(f'making options folder {options_parent}')
    os.mkdir(options_parent)

# Parallel lists: URLs that still need fetching and the local path for each.
hrefs_to_unzip = []
paths_to_unzip_to = []
for mcsv_href in mcsv_hrefs:
    # The zip file name is the last URL segment; its folder drops '.zip'.
    zip_file_name = mcsv_href.rsplit('/', 1)[-1]
    folder_name = zip_file_name.replace('.zip','')
    path_to_zip_folder = f'{options_parent}/{folder_name}'
    if not os.path.isdir(path_to_zip_folder):
        print(f'making {path_to_zip_folder}')
        os.mkdir(path_to_zip_folder)
    path_to_zip_file = f'{path_to_zip_folder}/{zip_file_name}'
    # Files already on disk are not queued again.
    if os.path.isfile(path_to_zip_file):
        continue
    hrefs_to_unzip.append(mcsv_href)
    paths_to_unzip_to.append(path_to_zip_file)

# Fetch every queued options zip file with HTTP basic auth and record the
# local paths that were written. A failure on one file is printed and the
# loop moves on to the next file (best effort).
successful_downloads = []
pending_downloads = list(zip(hrefs_to_unzip, paths_to_unzip_to))
for url, path_to_zip_file in tqdm_notebook(pending_downloads):
    try:
        r = requests.get(url, auth=HTTPBasicAuth(ACS_USERNAME, ACS_PASSWORD))
        # Raise on 4xx/5xx so an error page (bad login, month not published)
        # is never saved as a .zip; such a file would also make later runs
        # skip this month because the path already exists.
        r.raise_for_status()
        with open(path_to_zip_file, 'wb') as f:
            f.write(r.content)
        successful_downloads.append(path_to_zip_file)
    except Exception:
        traceback.print_exc()
        

        

### Step 9: Execute download of Futures files.
#### Download the zip files, each of which contains one month of daily futures settlements for all commodities tracked by Barchart.

In [None]:
# URL template for the monthly futures zip files; the 'MMMYY' placeholder is
# swapped for each month/year token in MMMYY_LIST.
futures_url_base = 'http://acs.barchart.com/mri/data/mrg/mrgMMMYY.zip'
mcsv_hrefs = [futures_url_base.replace('MMMYY', token) for token in MMMYY_LIST]
mcsv_hrefs

In [None]:
# Folder that receives the monthly futures zip files.
futures_parent = ZIP_FOLDER_PARENT+'/futures'
if os.path.isdir(futures_parent):
    print(f'futures folder {futures_parent} already exists')
else:
    print(f'making futures folder {futures_parent}')
    os.mkdir(futures_parent)


In [None]:
# sela = sc.SelScrape(headless=False)
# sela.goto(ACS_HOME_PAGE)
# time.sleep(1)
# wait(sela.driver, 5).until(EC.alert_is_present())
# alert = sela.driver.switch_to_alert()
# alert.send_keys(f'{ACS_USERNAME}{Keys.TAB}{ACS_PASSWORD}')
# time.sleep(3)
# alert.accept()


In [None]:
# sela.goto(ACS_FUTURES_PAGE)
# monthly_csv_files_xpath = "//td/a[contains(@href,'data/mrg/mrg')]"
# mcsv_elements = sela.findxpath(monthly_csv_files_xpath)['value']
# mcsv_hrefs_all = []
# for mcsv in mcsv_elements:
#     mcsv_hrefs_all.append(mcsv.get_attribute('href'))
# all_years = np.arange(BEGIN_YY,END_YY+1)
# mcsv_hrefs = [h for h in mcsv_hrefs_all if int(re.findall('[0-9]{1,2}',h)[0]) in all_years]    

# Queue the futures zip files that are not on disk yet. The two lists are
# parallel: hrefs_to_unzip[i] is saved to paths_to_unzip_to[i].
hrefs_to_unzip = []
paths_to_unzip_to = []
for mcsv_href in mcsv_hrefs:
    zip_file_name = mcsv_href.split('/')[-1]
    path_to_zip_file = f'{futures_parent}/{zip_file_name}'
    if not os.path.isfile(path_to_zip_file):
        hrefs_to_unzip.append(mcsv_href)
        paths_to_unzip_to.append(path_to_zip_file)

# Fetch every queued file with HTTP basic auth and record the paths written.
# Iterate over the queued pairs themselves: looping over len(mcsv_hrefs)
# indexed past the end of the queue whenever some files already existed.
successful_downloads = []
pending_downloads = list(zip(hrefs_to_unzip, paths_to_unzip_to))
for url, path_to_zip_file in tqdm_notebook(pending_downloads):
    try:
        r = requests.get(url, auth=HTTPBasicAuth(ACS_USERNAME, ACS_PASSWORD))
        # Raise on 4xx/5xx so an error page is never saved as a .zip; such a
        # file would also make later runs skip this month.
        r.raise_for_status()
        with open(path_to_zip_file, 'wb') as f:
            f.write(r.content)
        # Record the path of the file just written, not a leftover value
        # from the queueing loop above.
        successful_downloads.append(path_to_zip_file)
    except Exception:
        # Best effort: report the failure and continue with the next file.
        traceback.print_exc()
    

In [None]:
# Display the paths written by the most recently run download cell. Each
# download cell resets this list, so after running the whole notebook it
# holds only the futures files.
successful_downloads

## END

In [None]:
#!jupyter nbconvert step_01_download_monthly_acs_files.ipynb --to python