# Clean Forward Returns (Solution)

## Install packages

In [1]:
import sys

In [2]:
# !{sys.executable} -m pip install -r requirements.txt

In [3]:
import cvxpy as cvx
import numpy as np
import pandas as pd
import time
import os
#import quiz_tests
import quiz_helper
import matplotlib.pyplot as plt

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plot defaults for every later figure: ggplot style sheet, (14, 8) figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

### data bundle

In [5]:
import os
import quiz_helper
from zipline.data import bundles

In [6]:
# Point zipline at the end-of-day data directory, resolved two levels above
# the current working directory.
zipline_root = os.path.join(os.getcwd(), '..', '..', 'data', 'project_4_eod')
os.environ['ZIPLINE_ROOT'] = zipline_root

# Register a csvdir ingest function for daily bars under the bundle name
# that quiz_helper defines.
bundle_name = quiz_helper.EOD_BUNDLE_NAME
ingest_func = bundles.csvdir.csvdir_equities(['daily'], bundle_name)
bundles.register(bundle_name, ingest_func)
print('Data Registered')

Data Registered


### Build pipeline engine

In [7]:
from zipline.pipeline import Pipeline
from zipline.pipeline.factors import AverageDollarVolume
from zipline.utils.calendars import get_calendar

# Universe screen: the top 500 assets by 120-bar average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500) 
# NYSE calendar; also passed to DataPortal and get_pricing in later cells.
trading_calendar = get_calendar('NYSE') 
# Load the bundle registered in the previous cell. This raises ValueError
# ("no data for bundle ...") when the bundle has not been ingested yet; the
# error message suggests running `zipline ingest -b <bundle name>`.
bundle_data = bundles.load(quiz_helper.EOD_BUNDLE_NAME)
# Pipeline engine built by the course helper from the bundle and the calendar.
engine = quiz_helper.build_pipeline_engine(bundle_data, trading_calendar)

ValueError: no data for bundle 'm4-quiz-eod-quotemedia' on or before 2019-04-22 18:42:20.252472+00:00
maybe you need to run: $ zipline ingest -b m4-quiz-eod-quotemedia

### View Data
With the pipeline engine built, let's get the stocks that are in our universe at the end of the period. We'll use these tickers to generate the returns data for our risk model.

In [None]:
# Last day of the analysis window; the universe is sampled on this single day.
universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')

# Run a pipeline that only applies the universe screen, for that one day.
universe_snapshot = engine.run_pipeline(
    Pipeline(screen=universe),
    universe_end_date,
    universe_end_date,
)

# Level 1 of the result's index holds the assets; flatten it into a plain list.
universe_tickers = universe_snapshot.index.get_level_values(1).values.tolist()

universe_tickers

# Get Returns data

In [None]:
from zipline.data.data_portal import DataPortal

# The bundle's daily bar reader supplies both the first trading day and the
# daily price bars.
daily_bar_reader = bundle_data.equity_daily_bar_reader

# Daily-only portal: no minute-bar reader is provided.
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader,
)

## Get pricing data helper function

In [None]:
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    """Fetch a daily history window of ``field`` for ``assets`` ending at ``end_date``.

    Parameters
    ----------
    data_portal
        Object exposing ``get_history_window`` (a zipline ``DataPortal`` in
        this notebook).
    trading_calendar
        Object whose ``closes.index`` is searched for the two dates.
    assets
        Forwarded unchanged to ``get_history_window``.
    start_date, end_date
        Datetime-like objects with ``strftime``. Only the calendar date is
        used; it is re-interpreted as midnight UTC. Both dates must be
        present in ``trading_calendar.closes.index``.
    field : str, default 'close'
        Bar field to request.

    Returns
    -------
    Whatever ``data_portal.get_history_window`` returns for the request.

    Notes
    -----
    ``bar_count`` is ``end_loc - start_loc``, so the window holds that many
    daily bars ending at ``end_date``. With one index entry per session this
    means the bar for ``start_date`` itself is NOT part of the result; the
    first bar returned is the session after ``start_date``.

    A date that is missing from ``trading_calendar.closes.index`` makes
    ``get_loc`` raise (``KeyError`` in pandas), e.g. for a non-trading day.
    """
    # Normalise both dates to midnight UTC. The legacy ``offset='C'`` keyword
    # is deliberately not passed: pandas deprecated and later removed it from
    # ``pd.Timestamp``, and neither ``get_loc`` nor the history-window call
    # below reads it, so dropping it keeps the result the same while letting
    # the helper run on current pandas.
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC')

    # Positions of the two dates in the calendar's index of closes.
    session_index = trading_calendar.closes.index
    end_loc = session_index.get_loc(end_dt)
    start_loc = session_index.get_loc(start_dt)

    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

## Get pricing data and convert it into a returns dataframe

In [None]:
# Five years of close prices for the universe, ending on the universe end date.
returns_start_date = universe_end_date - pd.DateOffset(years=5)
universe_prices = get_pricing(
    data_portal,
    trading_calendar,
    universe_tickers,
    returns_start_date,
    universe_end_date,
)

# Convert prices into returns: the first row has no previous price, so it is
# dropped; any remaining missing values are filled with 0.
returns_df = universe_prices.pct_change()[1:].fillna(0)

returns_df

## Sector data helper function
We'll create an object for you, which defines a sector for each stock.  The sectors are represented by integers.  We inherit from the Classifier class.  [Documentation for Classifier](https://www.quantopian.com/posts/pipeline-classifiers-are-here), and the [source code for Classifier](https://github.com/quantopian/zipline/blob/master/zipline/pipeline/classifiers/classifier.py)

In [None]:
from zipline.pipeline.classifiers import Classifier
from zipline.utils.numpy_utils import int64_dtype
class Sector(Classifier):
    """Classifier that labels each asset with an integer sector code.

    The codes are read from a NumPy file when the object is initialised and
    are looked up by asset index in ``_compute``. Positions excluded by the
    mask receive ``missing_value`` (-1).
    """
    # Term configuration; presumably consumed by zipline's Classifier
    # machinery - confirm against the Classifier source linked above.
    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        """Load the sector-code array (path is relative to the working directory)."""
        sector_file = '../../data/project_4_sector/data.npy'
        self.data = np.load(sector_file)

    def _compute(self, arrays, dates, assets, mask):
        """Return each asset's sector code where ``mask`` is true, else ``missing_value``."""
        # Index the loaded codes by the asset identifiers passed in.
        sector_codes = self.data[assets]
        return np.where(mask, sector_codes, self.missing_value)

In [None]:
sector = Sector()

## We'll use 2 years of data to calculate the factor

**Note:** Going back exactly 2 years from the universe end date lands on a day when the market is closed. The Pipeline package doesn't handle start or end dates that don't fall on days when the market is open. To fix this, we go back 2 extra days, which lands on the most recent earlier day when the market was open.

In [None]:
# Two years before the universe end date, plus two extra days so that the
# start lands on a day the market was open (evaluates to 2014-01-03).
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
factor_start_date

## Create smoothed momentum factor

In [None]:
from zipline.pipeline.factors import Returns
from zipline.pipeline.factors import SimpleMovingAverage


# Pipeline restricted to the universe screen.
p = Pipeline(screen=universe)

# Momentum factor: returns over a 252-bar (one year) window, demeaned within
# each sector, then ranked and z-scored.
one_year_returns = Returns(window_length=252, mask=universe)
factor = (
    one_year_returns
    .demean(groupby=Sector())  # custom Sector classifier reviewed earlier
    .rank()
    .zscore()
)

# Smoothed version: 5-bar simple moving average of the factor, then ranked
# and z-scored again (no second sector demeaning is needed).
factor_moving_average = SimpleMovingAverage(inputs=[factor], window_length=5)
factor_smoothed = factor_moving_average.rank().zscore()

# Expose both the unsmoothed and the smoothed factor as pipeline columns.
p.add(factor, 'Momentum_Factor')
p.add(factor_smoothed, 'Smoothed_Momentum_Factor')

## visualize the pipeline

Note that if the image is difficult to read in the notebook, right-click and view the image in a separate tab.

In [None]:
p.show_graph(format='png')

## run pipeline and view the factor data

In [None]:
# Evaluate the pipeline from factor_start_date through universe_end_date.
# The result is indexed by (date, asset) with one column per factor added to `p`.
df = engine.run_pipeline(p, factor_start_date, universe_end_date)

In [18]:
df.head()

Unnamed: 0,Unnamed: 1,Momentum_Factor,Smoothed_Momentum_Factor
2014-01-03 00:00:00+00:00,Equity(0 [A]),1.499391,1.484618
2014-01-03 00:00:00+00:00,Equity(1 [AAL]),1.602797,1.573252
2014-01-03 00:00:00+00:00,Equity(2 [AAP]),0.376694,0.206813
2014-01-03 00:00:00+00:00,Equity(3 [AAPL]),-1.484618,-1.477232
2014-01-03 00:00:00+00:00,Equity(4 [ABBV]),0.915884,0.945429


## Evaluate Factors

We'll go over some tools that we can use to evaluate alpha factors.  To do so, we'll use the [alphalens library](https://github.com/quantopian/alphalens)


## Import alphalens

In [19]:
import alphalens as al

## Get price data

Note, we already got the price data and converted it to returns, which we used to calculate a factor.  We'll retrieve the price data again, but won't convert these to returns.  This is because we'll use alphalens functions that take their input as prices and not returns.

## Define the list of assets
Just to make sure we get the prices for the stocks that have factor values, we'll get the list of assets, which may be a subset of the original universe

In [20]:
# Assets that have factor values: level 1 of the factor frame's index,
# converted to a plain list.
asset_level = df.index.levels[1]
assets = asset_level.values.tolist()
print(f"stock universe number of stocks {len(universe_tickers)}, and number of stocks for which we have factor values {len(assets)}")

stock universe number of stocks 490, and number of stocks for which we have factor values 490


In [21]:
factor_start_date

Timestamp('2014-01-03 00:00:00+0000', tz='UTC')

In [22]:
# Close prices over the same start and end dates that were used for the factor.
# `assets` (taken from the factor data) is used instead of universe_tickers;
# in this example the two lists are the same.
pricing = get_pricing(
    data_portal=data_portal,
    trading_calendar=trading_calendar,
    assets=assets,
    start_date=factor_start_date,
    end_date=universe_end_date,
)

## Double check the dates of the pricing data
Check that they make sense compared to the factor data

In [23]:
pricing.head(2)

Unnamed: 0,Equity(0 [A]),Equity(1 [AAL]),Equity(2 [AAP]),Equity(3 [AAPL]),Equity(4 [ABBV]),Equity(5 [ABC]),Equity(6 [ABT]),Equity(7 [ACN]),Equity(8 [ADBE]),Equity(9 [ADI]),...,Equity(481 [XL]),Equity(482 [XLNX]),Equity(483 [XOM]),Equity(484 [XRAY]),Equity(485 [XRX]),Equity(486 [XYL]),Equity(487 [YUM]),Equity(488 [ZBH]),Equity(489 [ZION]),Equity(490 [ZTS])
2014-01-06 00:00:00+00:00,122.824,26.026,110.917,71.475,43.053,65.132,35.407,73.223,58.12,44.021,...,27.697,40.601,85.397,46.704,29.222,32.332,49.399,89.795,28.221,30.894
2014-01-07 00:00:00+00:00,124.687,25.905,112.286,70.963,43.139,65.842,35.135,74.114,58.97,44.253,...,27.697,40.69,86.606,47.426,29.464,32.426,50.093,91.587,28.306,31.01


In [24]:
pricing.tail(2)

Unnamed: 0,Equity(0 [A]),Equity(1 [AAL]),Equity(2 [AAP]),Equity(3 [AAPL]),Equity(4 [ABBV]),Equity(5 [ABC]),Equity(6 [ABT]),Equity(7 [ACN]),Equity(8 [ADBE]),Equity(9 [ADI]),...,Equity(481 [XL]),Equity(482 [XLNX]),Equity(483 [XOM]),Equity(484 [XRAY]),Equity(485 [XRX]),Equity(486 [XYL]),Equity(487 [YUM]),Equity(488 [ZBH]),Equity(489 [ZION]),Equity(490 [ZTS])
2016-01-04 00:00:00+00:00,130.838,39.933,151.537,100.621,52.526,97.613,40.544,96.984,91.97,51.31,...,36.226,43.194,70.617,58.018,26.056,34.923,49.3,99.714,25.983,46.398
2016-01-05 00:00:00+00:00,131.369,39.552,150.502,98.1,52.307,99.041,40.534,97.489,92.34,50.933,...,36.302,43.836,71.218,59.201,26.005,34.913,49.177,101.79,25.701,47.124


In [25]:
pricing.shape

(504, 490)

### Compare to the factor data for a single stock

In [26]:
# Asset label of every row in the factor frame (level 1 of its index).
asset_labels = df.index.get_level_values(1)
# Just pick a stock; in this data the label at position 3 is AAPL.
stock_index_name = asset_labels[3]
# Keep only the rows that belong to the chosen stock.
is_chosen_stock = np.in1d(asset_labels, [stock_index_name])
single_stock_factor_df = df[is_chosen_stock]

In [27]:
single_stock_factor_df.head(2)

Unnamed: 0,Unnamed: 1,Momentum_Factor,Smoothed_Momentum_Factor
2014-01-03 00:00:00+00:00,Equity(3 [AAPL]),-1.484618,-1.477232
2014-01-06 00:00:00+00:00,Equity(3 [AAPL]),-1.469846,-1.492005


In [28]:
single_stock_factor_df.tail(2)

Unnamed: 0,Unnamed: 1,Momentum_Factor,Smoothed_Momentum_Factor
2016-01-04 00:00:00+00:00,Equity(3 [AAPL]),-0.494357,-0.530442
2016-01-05 00:00:00+00:00,Equity(3 [AAPL]),-0.306718,-0.479923


In [29]:
single_stock_factor_df.shape

(505, 2)

## Quiz 1
If you have factor values calculated before time t, what price data would you use to calculate the factor return on that data?

## Answer 1
Use data from time t to time t+1 to calculate forward returns

## Prepare data for use in alphalens

Alphalens makes sure the data is formatted properly so that other neat alphalens functions can work with the data.
For instance, it lines up the price data and factor data and calculates forward returns that are associated with each factor value.
We'll use [alphalens.utils.get_clean_factor_and_forward_returns](https://github.com/quantopian/alphalens/blob/master/alphalens/utils.py)

The source code describes what it's used for (I'm showing just the parameters that we'll use here):
```
def get_clean_factor_and_forward_returns(factor,
                                         prices,
                                         ...
                                         periods=(1, 5, 10),
                                         ...
                                         ):

...
```

We'll give it three inputs: the factor, prices, and periods.  
* The factor is the Series containing the factor scores for each stock on each date.
* The prices are a DataFrame of prices, with one row per date and one column per stock (covering the same dates as the factor).  Note that if the period we give is greater than 1, we'll want to make sure to pad our price data by the period amount beyond the last factor date so that forward returns can be calculated.  Keep reading for details about the "periods" parameter.
* periods: this is the period for which we'll compute forward returns.  For instance, if prices and factor data have one data point per day (daily data), and if we wish to calculate the return of our factor-weighted portfolio every day, then the period would be 1, and input as a list [1].  If we wanted to calculate the weekly return, we would input [5].  If we wanted both daily and weekly, we could input [1,5].

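* returns: this is the function's output, not an input. It is a multi-index pandas DataFrame (indexed by date and asset) containing the cleaned factor values together with their forward returns and factor quantiles.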

## Quiz 2
What alphalens function does get_clean_factor_and_forward_returns call to get forward returns? You'll find the answer in  the [source code](https://github.com/quantopian/alphalens/blob/master/alphalens/utils.py)

## Answer 2
The function `compute_forward_returns` computes forward returns.

## Quiz 3
Clean and line up the factors and forward returns using alphalens

In this case, we have the unsmoothed and smoothed factors

## Answer 3

In [30]:
# The pipeline output has one column per factor; the column names are the factor names.
factor_names = df.columns
print(f"The factor names are {factor_names}")

# Use a dictionary to store each dataframe, one for each factor and its associated forward returns
factor_data = {}
for factor_name in factor_names:
    print("Formatting factor data for: " + factor_name)
    # Line this factor's values up with the price data and compute forward returns.
    # periods=[1] requests single-period (daily) forward returns.
    factor_data[factor_name] = al.utils.get_clean_factor_and_forward_returns(
        factor=df[factor_name],
        prices=pricing,
        periods=[1])

The factor names are Index(['Momentum_Factor', 'Smoothed_Momentum_Factor'], dtype='object')
Formatting factor data for: Momentum_Factor
Dropped 2.3% entries from factor data: 2.3% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Formatting factor data for: Smoothed_Momentum_Factor
Dropped 2.3% entries from factor data: 2.3% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!


## Inspect the cleaned data

In [31]:
# factor_names[1] is 'Smoothed_Momentum_Factor', the second pipeline column.
cleaned_smooth_factor = factor_data[factor_names[1]]
cleaned_smooth_factor.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1D,factor,factor_quantile
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-06 00:00:00+00:00,Equity(0 [A]),0.015168,1.484618,5
2014-01-06 00:00:00+00:00,Equity(1 [AAL]),-0.004649,1.543708,5
2014-01-06 00:00:00+00:00,Equity(2 [AAP]),0.012343,0.310219,3
2014-01-06 00:00:00+00:00,Equity(3 [AAPL]),-0.007163,-1.492005,1
2014-01-06 00:00:00+00:00,Equity(4 [ABBV]),0.001998,0.974973,4


In [32]:
cleaned_smooth_factor.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,1D,factor,factor_quantile
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-04 00:00:00+00:00,Equity(486 [XYL]),-0.000286,-0.097428,3
2016-01-04 00:00:00+00:00,Equity(487 [YUM]),-0.002495,0.155163,3
2016-01-04 00:00:00+00:00,Equity(488 [ZBH]),0.02082,-1.086143,1
2016-01-04 00:00:00+00:00,Equity(489 [ZION]),-0.010853,-0.184031,3
2016-01-04 00:00:00+00:00,Equity(490 [ZTS]),0.015647,0.169597,3


## Quiz 4
What do you think the '1D' column represents?

## Answer 4

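The 1D column holds the 1-day forward return for each asset: the fractional change in price from that row's date to the next trading day. For example, A's price goes from 122.824 on 2014-01-06 to 124.687 on 2014-01-07, which gives the 0.015168 shown in the first row.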