## Import

In [None]:
%load_ext autoreload
%autoreload 2

import logging
import os

import boto3
import pandas as pd

import helpers.dbg as dbg
import helpers.env as env
import helpers.printing as pri
import helpers.s3 as hs3
import vendors.kibot.utils as kut

In [2]:
print(env.get_system_signature())

pri.config_notebook()

# dbg.init_logger(verb=logging.DEBUG)
dbg.init_logger(verb=logging.INFO)
# dbg.test_logger()

_LOG = logging.getLogger(__name__)

# Packages
         python: 3.7.3
         joblib: 0.14.0
          numpy: 1.17.2
         pandas: 0.25.1
        pyarrow: 0.15.0
          scipy: 1.3.1
        seaborn: 0.9.0
        sklearn: 0.21.3
    statsmodels: 0.10.1
# Last commits:
  * d0c9e40 saggese  PartTask408: Add script to generate TOC for documentation         (  24 hours ago) Sun Oct 20 13:40:28 2019  (HEAD -> master, origin/master, origin/HEAD)
  * 7f5dd00 saggese  Improve documentation                                             (    2 days ago) Sat Oct 19 22:49:38 2019           
  *   bf73214 Paul     Merge pull request #39 from alphamatic/dataflow_cleanup           (    2 days ago) Sat Oct 19 21:51:17 2019           
  |\  


## Functions

In [3]:
def get_file_names_s3_objects(s3_objects):
    contents = s3_objects["Contents"]
    return [cont["Key"] for cont in contents]


def _get_subdirs(file_names):
    return set(map(lambda x: x.split("/")[1], file_names))

In [4]:
def normalize_1_min(df):
    df[0] = pd.to_datetime(df[0] + " " + df[1], format="%m/%d/%Y %H:%M")
    df.drop(columns=[1], inplace=True)
    df.columns = "datetime open high low close vol".split()
    df.set_index("datetime", drop=True, inplace=True)
    _LOG.debug("Add columns")
    df["time"] = [d.time() for d in df.index]
    return df


def normalize_daily(df):
    df[0] = pd.to_datetime(df[0], format="%m/%d/%Y")
    df.columns = "date open high low close vol".split()
    df.set_index("date", drop=True, inplace=True)
    # TODO(GP): Should this be renamed to datetime as described
    # in kibot/utils.py L56?
    return df

In [6]:
AMAZON_MAX_INT = 2147483647

# List s3 files

In [5]:
# sys.maxsize is larger than Amazon's max int

In [7]:
s3 = boto3.client("s3")
s3_objects = s3.list_objects_v2(
    Bucket="default00-bucket", StartAfter="kibot", MaxKeys=AMAZON_MAX_INT
)
s3_objects

Found credentials in shared credentials file: ~/.aws/credentials


{'ResponseMetadata': {'RequestId': 'C4E2E77A6A129ED6',
  'HostId': 'gXSFT25keaiY5jTsBBPqhdsbZCffU5edmh0hTwLrg35PXJimFIT9iyHbnDQJwDAiZ1LMjWrKF5o=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'gXSFT25keaiY5jTsBBPqhdsbZCffU5edmh0hTwLrg35PXJimFIT9iyHbnDQJwDAiZ1LMjWrKF5o=',
   'x-amz-request-id': 'C4E2E77A6A129ED6',
   'date': 'Mon, 21 Oct 2019 13:46:11 GMT',
   'x-amz-bucket-region': 'us-east-2',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'IsTruncated': True,
 'Contents': [{'Key': 'kibot/All_Futures_Continuous_Contracts_1min/AC.csv.gz',
   'LastModified': datetime.datetime(2019, 10, 1, 19, 14, 39, tzinfo=tzlocal()),
   'ETag': '"642c83a029031a1b0c65bd53b2ee3801"',
   'Size': 406608,
   'StorageClass': 'STANDARD'},
  {'Key': 'kibot/All_Futures_Continuous_Contracts_1min/AD.csv.gz',
   'LastModified': datetime.datetime(2019, 10, 1, 19, 14, 39, tzinfo=tzlocal()),
   'ETag': '"496a61f2050dc5889041db4e7

In [8]:
kibot_files = get_file_names_s3_objects(s3_objects)
kibot_files

['kibot/All_Futures_Continuous_Contracts_1min/AC.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/AD.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/AE.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/AEX.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/AJY.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/ALJ.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/ALM.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BB.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BBN.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BD.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BFQ.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BGI.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BL.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BO.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BON.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BP.csv.gz',
 'kibot/All_Futures_Continuous_Contracts_1min/BR.csv.gz',
 'kibo

In [9]:
subdirs = _get_subdirs(kibot_files)
subdirs

{'All_Futures_Continuous_Contracts_1min',
 'All_Futures_Continuous_Contracts_daily',
 'All_Futures_Continuous_Contracts_tick',
 'All_Futures_Contracts_1min'}

In [10]:
kibot_dir = os.path.join(hs3.get_path(), "kibot")
kibot_dir

's3://default00-bucket/kibot'

In [11]:
kibot_subdir = os.path.join(kibot_dir, list(subdirs)[0])
kibot_subdir

's3://default00-bucket/kibot/All_Futures_Continuous_Contracts_daily'

# All_Futures_Continuous_Contracts_1min

In [12]:
symbol = "CL"
# nrows = None
nrows = 10
s3_path = hs3.get_path()
file_name = os.path.join(
    hs3.get_path(),
    "kibot/All_Futures_Continuous_Contracts_1min/%s.csv.gz" % symbol,
)
df = kut.read_data(file_name, nrows)
df.head(3)

args=('s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_1min/CL.csv.gz', 10) kwargs={}


Unnamed: 0_level_0,open,high,low,close,vol,time
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-09-27 18:00:00,66.15,66.35,66.15,66.32,88,18:00:00
2009-09-27 18:01:00,66.36,66.49,66.35,66.37,124,18:01:00
2009-09-27 18:02:00,66.37,66.41,66.37,66.37,25,18:02:00


In [13]:
df = pd.read_csv(file_name, header=None)

In [14]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,09/27/2009,18:00,66.15,66.35,66.15,66.32,88
1,09/27/2009,18:01,66.36,66.49,66.35,66.37,124
2,09/27/2009,18:02,66.37,66.41,66.37,66.37,25
3,09/27/2009,18:03,66.37,66.42,66.37,66.42,33
4,09/27/2009,18:04,66.39,66.46,66.39,66.46,14


In [15]:
df[0] = pd.to_datetime(df[0] + " " + df[1], format="%m/%d/%Y %H:%M")

In [16]:
df.drop(columns=[1], inplace=True)

In [17]:
df.head()

Unnamed: 0,0,2,3,4,5,6
0,2009-09-27 18:00:00,66.15,66.35,66.15,66.32,88
1,2009-09-27 18:01:00,66.36,66.49,66.35,66.37,124
2,2009-09-27 18:02:00,66.37,66.41,66.37,66.37,25
3,2009-09-27 18:03:00,66.37,66.42,66.37,66.42,33
4,2009-09-27 18:04:00,66.39,66.46,66.39,66.46,14


In [18]:
df.columns = "datetime open high low close vol".split()
df.set_index("datetime", drop=True, inplace=True)
_LOG.debug("Add columns")
df["time"] = [d.time() for d in df.index]

In [19]:
df.head()

Unnamed: 0_level_0,open,high,low,close,vol,time
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-09-27 18:00:00,66.15,66.35,66.15,66.32,88,18:00:00
2009-09-27 18:01:00,66.36,66.49,66.35,66.37,124,18:01:00
2009-09-27 18:02:00,66.37,66.41,66.37,66.37,25,18:02:00
2009-09-27 18:03:00,66.37,66.42,66.37,66.42,33,18:03:00
2009-09-27 18:04:00,66.39,66.46,66.39,66.46,14,18:04:00


# All_Futures_Continuous_Contracts_daily

In [20]:
symbol = "CL"
# nrows = None
nrows = 10
s3_path = hs3.get_path()
file_name = os.path.join(
    hs3.get_path(),
    "kibot/All_Futures_Continuous_Contracts_daily/%s.csv.gz" % symbol,
)
df = kut.read_data(file_name, nrows)
df.head(3)

args=('s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_daily/CL.csv.gz', 10) kwargs={}


Unnamed: 0_level_0,open,high,low,close,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996-08-16,21.46,22.1,21.34,22.05,28704
1996-08-19,21.91,22.57,21.91,22.47,41736
1996-08-20,22.41,22.43,22.1,22.11,38759


In [21]:
df = pd.read_csv(file_name, header=None)

In [22]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,08/16/1996,21.46,22.1,21.34,22.05,28704
1,08/19/1996,21.91,22.57,21.91,22.47,41736
2,08/20/1996,22.41,22.43,22.1,22.11,38759
3,08/21/1996,22.08,22.09,21.6,21.72,34230
4,08/22/1996,21.8,22.35,21.75,22.3,40269


In [23]:
df[0] = pd.to_datetime(df[0], format="%m/%d/%Y")

In [24]:
df.columns = "date open high low close vol".split()
df.set_index("date", drop=True, inplace=True)
# TODO(GP): Should this be renamed to datetime as described
# in kibot/utils.py L56?

In [25]:
df.head()

Unnamed: 0_level_0,open,high,low,close,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996-08-16,21.46,22.1,21.34,22.05,28704
1996-08-19,21.91,22.57,21.91,22.47,41736
1996-08-20,22.41,22.43,22.1,22.11,38759
1996-08-21,22.08,22.09,21.6,21.72,34230
1996-08-22,21.8,22.35,21.75,22.3,40269


# All_Futures_Contracts_1min

In [26]:
symbol = "CL"
nrows = 10
s3_path = hs3.get_path()
file_name = os.path.join(
    hs3.get_path(), "kibot/All_Futures_Contracts_1min/%s.csv.gz" % symbol
)
df = kut.read_data(file_name, nrows)
df.head(3)

args=('s3://default00-bucket/kibot/All_Futures_Contracts_1min/CL.csv.gz', 10) kwargs={}


Unnamed: 0_level_0,open,high,low,close,vol,time
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-09-27 18:00:00,66.15,66.35,66.15,66.32,88,18:00:00
2009-09-27 18:01:00,66.36,66.49,66.35,66.37,124,18:01:00
2009-09-27 18:02:00,66.37,66.41,66.37,66.37,25,18:02:00


In [27]:
df = pd.read_csv(file_name, header=None)

In [28]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,09/27/2009,18:00,66.15,66.35,66.15,66.32,88
1,09/27/2009,18:01,66.36,66.49,66.35,66.37,124
2,09/27/2009,18:02,66.37,66.41,66.37,66.37,25
3,09/27/2009,18:03,66.37,66.42,66.37,66.42,33
4,09/27/2009,18:04,66.39,66.46,66.39,66.46,14


The same as continuous contracts 1 min

# All_Futures_Continuous_Contracts_tick

In [29]:
symbol = "AD"
nrows = 10
s3_path = hs3.get_path()
file_name = "kibot/All_Futures_Continuous_Contracts_tick/%s.csv.gz" % symbol
file_name in kibot_files

True

In [30]:
file_name = os.path.join(hs3.get_path(), file_name)
print(file_name)

s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_tick/AD.csv.gz


In [31]:
df = kut.read_data(file_name, nrows)
df.head(3)

args=('s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_tick/AD.csv.gz', 10) kwargs={}
args=('s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_tick/AD.csv.gz', 10) kwargs={}
Reading file_name='s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_tick/AD.csv.gz' nrows=10
Reading only the first nrows=10 rows


ValueError: Invalid dir_name='All_Futures_Continuous_Contracts_tick' in file_name='s3://default00-bucket/kibot/All_Futures_Continuous_Contracts_tick/AD.csv.gz'