In [2]:
import pandas.io.data as web
import datetime
from blaze import data, compute, symbol, resource, transform, by

In [3]:
# Reuse the URI structure that pandas uses for Yahoo, because it works.
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)

sym = 'F'
interval = 'd'  # request data at a daily interval

# The pandas way of fetching the same table:
f = web.DataReader(sym, 'yahoo', start, end)

# QUESTION: blaze cannot load the full range - NA values show up somewhere
# after 5/1, so blaze fails with: end = datetime.datetime(2011, 5, 2)
# Yet downloading table.csv directly from the URL below looks fine and loads
# through the data interface:
# http://ichart.finance.yahoo.com/table.csv?s=F&a=0&b=1&c=2010&d=0&e=27&f=2013&g=d&ignore=.csv
end = datetime.datetime(2011, 5, 1)

_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?'

# Query fields, in the order they appear in the URL. Months are sent as
# month - 1 (the sample URL above has a=0 for a January start date).
_query_fields = [
    ('s', sym),
    ('a', start.month - 1),
    ('b', start.day),
    ('c', start.year),
    ('d', end.month - 1),
    ('e', end.day),
    ('f', end.year),
    ('g', interval),
    ('ignore', '.csv'),
]
uri = _HISTORICAL_YAHOO_URL + '&'.join(
    '%s=%s' % field for field in _query_fields)


In [4]:
# A blaze symbol represents a collection of data. In this instance it is a
# blaze expression bound to the resource at `uri`.
# This allows for some interactive behavior and data exploration.
d = data(uri)

# QUESTION: ok - we can work with this subset, though maybe we need a
# different example, because pandas makes this easier

# The shape and datatypes of the columns are captured in dshape:
# dshape("""var * {
#   Date: ?datetime,
#   Open: float64,
#   High: float64,
#   Low: float64,
#   Close: float64,
#   Volume: int64,
#   'Adj Close': float64
#   }""")
#
# var: the data contains a variable number of rows, so the table length is
#      not known up front.
#      The columns and dtypes of the columns are as follows:
# Date: a datetime, but optional, so it could be missing for some records.
#    This is indicated by the '?' preceding datetime.
# Open, High, Low, Close, 'Adj Close': columns containing float64 values
# Volume: column containing int64 values
d.dshape

dshape("""var * {
  Date: ?datetime,
  Open: float64,
  High: float64,
  Low: float64,
  Close: float64,
  Volume: int64,
  'Adj Close': float64
  }""")

In [5]:
# Let's peek inside the data. This is similar to d.head(), except that in
# interactive mode d.head() shows the first 10 lines and peek() shows the
# first 11 lines.
# NOTE(review): the original note "You can limit the data" was left
# unfinished; how to limit what peek() returns is not shown here.
# QUESTION: need docstring for d.peek()
# QUESTION: the 11 vs 10 lines seems inconsistent
d.peek()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2011-04-29,15.49,15.68,15.4,15.47,40203200,13.176518
1,2011-04-28,15.47,15.7,15.42,15.5,48145500,13.20207
2,2011-04-27,15.85,15.87,15.42,15.58,84380600,13.270209
3,2011-04-26,16.040001,16.18,15.66,15.66,166900000,13.338349
4,2011-04-25,15.51,15.59,15.35,15.54,54775800,13.23614
5,2011-04-21,15.23,15.45,15.1,15.43,53181900,13.142448
6,2011-04-20,14.95,15.2,14.84,15.09,70978900,12.852854
7,2011-04-19,14.65,14.79,14.54,14.66,33575500,12.486603
8,2011-04-18,14.51,14.74,14.49,14.62,49822500,12.452533
9,2011-04-15,14.86,14.91,14.61,14.71,47434700,12.52919


In [6]:
# Can see 10 or fewer lines at the beginning or the end using head(), tail().
# Here: the first 4 rows.
d.head(4)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2011-04-29,15.49,15.68,15.4,15.47,40203200,13.176518
1,2011-04-28,15.47,15.7,15.42,15.5,48145500,13.20207
2,2011-04-27,15.85,15.87,15.42,15.58,84380600,13.270209
3,2011-04-26,16.040001,16.18,15.66,15.66,166900000,13.338349


In [7]:
# The last 8 rows of the data
d.tail(8)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
286,2010-03-12,13.02,13.37,12.98,13.34,104771000,11.362298
287,2010-03-11,12.79,12.97,12.75,12.91,59274700,10.996046
288,2010-03-10,12.88,12.95,12.8,12.82,56850600,10.919389
289,2010-03-09,12.83,13.03,12.73,12.8,83934300,10.902355
290,2010-03-08,12.94,12.96,12.79,12.93,78903200,11.013082
291,2010-03-05,12.92,13.04,12.83,13.0,106920600,11.072704
292,2010-03-04,12.78,12.8,12.52,12.79,89836800,10.893837
293,2010-03-03,12.46,12.83,12.35,12.69,157004600,10.808662


In [8]:
# Get the total number of rows in the result set - 294
d.nrows

In [12]:
# Viewing a single column creates a Field object; it uses single brackets.
# This expression is not the last one in the cell, so its result is not
# displayed.
d['Date']
# Selecting several columns with a list (double brackets) gives a Projection;
# this is the result displayed for the cell.
d[['Date', 'Open', 'Close']]

Unnamed: 0,Date,Open,Close
0,2011-04-29,15.49,15.47
1,2011-04-28,15.47,15.5
2,2011-04-27,15.85,15.58
3,2011-04-26,16.040001,15.66
4,2011-04-25,15.51,15.54
5,2011-04-21,15.23,15.43
6,2011-04-20,14.95,15.09
7,2011-04-19,14.65,14.66
8,2011-04-18,14.51,14.62
9,2011-04-15,14.86,14.71


In [13]:
# Arithmetic operations can be applied to Field objects, so the following is
# valid (every 'Open' value is multiplied by 10):
d['Open'] * 10

Unnamed: 0,Open
0,154.9
1,154.7
2,158.5
3,160.40001
4,155.1
5,152.3
6,149.5
7,146.5
8,145.1
9,148.6


In [14]:
# Projection objects, however, do not support arithmetic operations, so the
# following would raise an AttributeError:
# d[['Open']] * 10
# d[['Open', 'Close']] * 10

# Say we wish to subselect based on a criterion and want to know how big the
# result set is - this results in 19 records
is_after_april_first = d['Date'] > datetime.datetime(2011, 4, 1)
d[is_after_april_first].nrows

In [15]:
# sort('High') orders ascending here, so the first row is where 'High' was at
# its lowest: 2010-06-29
d.sort('High')[['Date', 'High', 'Low']].head(1)

Unnamed: 0,Date,High,Low
211,2010-06-29,10.14,9.75


In [16]:
# The 'High' was at its lowest on 2010-06-29 (ascending sort, shown in the
# previous cell) and at its highest on 2011-01-13 (descending sort, shown
# here).
d.sort('High', ascending=False)[['Date', 'High', 'Low']].head(1)

Unnamed: 0,Date,High,Low
73,2011-01-13,18.969999,18.549999


In [17]:
# In interactive mode, arithmetic operations can be applied to a single column
# but not to multiple columns, because the Projection object does not support
# arithmetic operations. So the following cannot be done in interactive mode:
#   (d[['Open', 'Close']] * (d['Adj Close']/d['Close'])).sort()
#
# A single column, however, works.
# NOTE(review): the output recorded for this cell is identical to the output
# of the ascending=False cell that follows - confirm which sort order the
# recorded output actually belongs to.
adj_to_close = d['Adj Close'] / d['Close']
(d['Open'] * adj_to_close).sort()

Unnamed: 0,Open
73,16.02135
70,15.936175
71,15.893588
72,15.851002
64,15.774344
75,15.740274
76,15.697687
77,15.68917
74,15.638065
78,15.35699


In [18]:
# The same 'Open' * ('Adj Close' / 'Close') column, sorted in descending order
(d['Open'] * (d['Adj Close']/d['Close'])).sort(ascending=False)

Unnamed: 0,Open
73,16.02135
70,15.936175
71,15.893588
72,15.851002
64,15.774344
75,15.740274
76,15.697687
77,15.68917
74,15.638065
78,15.35699


In [19]:
# Use transform to add new columns to the dataset: 'ratio'
# ('Adj Close' / 'Close') and 'year' (the year of 'Date').
# Note that this rebinds d to the transformed expression.
adj_close_ratio = d['Adj Close'] / d['Close']
date_year = d['Date'].year
d = transform(d, ratio=adj_close_ratio, year=date_year)

In [20]:
# To see the raw data normalized by 'Adj Close' for multiple columns
# (['Open', 'High', 'Low', 'Close', 'Adj Close']) we need to depart from
# interactive mode. Let's create a dataframe and operate on that now.
# NOTE(review): 'Adj Close' is itself scaled by 'Adj Close' / 'Close' here
# (13.176518 -> 11.223053 in the first output row) - confirm that including
# it in the column list is intended.
df = compute(d)
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
adjustment = df['Adj Close'] / df['Close']
df[price_columns].multiply(adjustment, axis="index").head()

Unnamed: 0,Open,High,Low,Close,Adj Close
0,13.193553,13.355385,13.116896,13.176518,11.223053
1,13.176518,13.372419,13.13393,13.20207,11.244816
2,13.500181,13.517215,13.13393,13.270209,11.302853
3,13.662013,13.781257,13.338349,13.338349,11.360891
4,13.210588,13.278727,13.074308,13.23614,11.273835


In [21]:
# Or equivalently, reuse the 'ratio' column that transform added
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
df[price_columns].multiply(df['ratio'], axis="index").head()

Unnamed: 0,Open,High,Low,Close,Adj Close
0,13.193553,13.355385,13.116896,13.176518,11.223053
1,13.176518,13.372419,13.13393,13.20207,11.244816
2,13.500181,13.517215,13.13393,13.270209,11.302853
3,13.662013,13.781257,13.338349,13.338349,11.360891
4,13.210588,13.278727,13.074308,13.23614,11.273835


In [22]:
# Group by year and take the minimum and maximum of 'High' for each group.
# The result is only assigned here; nothing is displayed by this cell.
year_key = d[['year']]
yearly_high_minmax = by(year_key,
                        High_min=d.High.min(),
                        High_max=d.High.max())

In [23]:
# QUESTION: seems like we cannot do the above operation in a blaze expression.
# NOTE(review): the original note continued with the fragment "a dataframe in
# order to apply arithmetic operation to subset of data" - presumably a
# dataframe is needed for that; confirm.
#
# Define a blaze symbol. This symbol is not bound to a data
# resource. It has no interactive behavior and an expression must be computed
# in order to visualize results. However, it provides many more operations. It
# also separates the computation (a blaze expression) from the backend data
# source (a blaze resource). We can then compute the same blaze expression
# against various backend data resources, so long as they all have the same
# datashape.
#
# Take the datashape from the raw table at `uri` rather than from `d`: `d` was
# rebound by transform() earlier and now carries the extra 'ratio' and 'year'
# columns, which the CSV behind `uri` does not have.
raw_dshape = data(uri).dshape
s = symbol('s', raw_dshape)
rsx = resource(uri, dshape=raw_dshape)

# Cannot do following in blaze
# expr = (s[['Open', 'High', 'Low', 'Close', 'Adj Close']] * (s['Adj Close']/s['Close'])).sort()
# result = compute(expr, rsx)