In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn

import os
import math

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Reload all modules imported with %aimport
%load_ext autoreload
%autoreload 1

# Import nn_helper module
import helper
%aimport helper

helper = helper.HELPER()

In [2]:
ticker = "AAPL"
index_ticker = "SPY"
dateAttr = "Dt"
priceAttr = ["Adj Close","Open"]

data = helper.getData([ticker], index_ticker, priceAttr)
print("Start time: ", data.index.min())
print("End time: ", data.index.max())
start_dt = "2018-01-02"
end_dt = "2018-09-28"
train_data_price = None

# Set variable train_data_price to be a DataFrame with two columns
## AAPL_Adj_Close, SPY_Adj_Close
## with dates as the index
## Having minimum date equal to THE DAY BEFORE start_dt
## Having maximum date equal to end_dt

def getRange(df, start_dt, end_dt):
    s_dt="2017-12-29"
    return df.loc[s_dt:end_dt]
    

train_data_price = getRange(data, start_dt, end_dt)
train_data_price

Start time:  2017-01-03
End time:  2019-10-31


Unnamed: 0_level_0,AAPL_Adj_Close,AAPL_Open,SPY_Adj_Close,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-29,164.2589,170.52,258.2823,268.5678
2018-01-02,167.1999,170.16,260.1310,267.8400
2018-01-03,167.1708,172.53,261.7763,268.9600
2018-01-04,167.9473,172.54,262.8796,271.2000
2018-01-05,169.8594,173.44,264.6314,272.5100
...,...,...,...,...
2018-09-24,216.7654,216.82,285.3496,291.3400
2018-09-25,218.1399,219.75,285.0848,291.5300
2018-09-26,216.4021,221.00,284.2318,290.9100
2018-09-27,220.8496,223.82,285.0260,290.4100


In [3]:
test_start_dt = '2018-10-01'
test_end_dt = '2018-12-31'
test_data_price = None

    
ts_dt='2018-09-28'
test_data_price = data.loc[ts_dt:test_end_dt]

test_data_price

Unnamed: 0_level_0,AAPL_Adj_Close,AAPL_Open,SPY_Adj_Close,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-28,221.6252,224.79,285.0555,289.99
2018-10-01,223.1175,227.95,286.0458,292.11
2018-10-02,225.1006,227.25,285.8791,291.56
2018-10-03,227.8398,230.05,286.0359,292.74
2018-10-04,223.8342,230.78,283.8004,291.18
...,...,...,...,...
2018-12-24,144.6565,148.15,231.1158,239.04
2018-12-26,154.8435,148.30,242.7929,235.97
2018-12-27,153.8386,155.84,244.6569,242.57
2018-12-28,153.9174,157.50,244.3412,249.58


In [4]:
train_data_ret = None

def getReturns(df):
    return df.pct_change()
train_data_ret = getReturns(train_data_price)
#train_data_ret.head()
train_data_ret

Unnamed: 0_level_0,AAPL_Adj_Close,AAPL_Open,SPY_Adj_Close,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-29,,,,
2018-01-02,0.017905,-0.002111,0.007158,-0.002710
2018-01-03,-0.000174,0.013928,0.006325,0.004182
2018-01-04,0.004645,0.000058,0.004215,0.008328
2018-01-05,0.011385,0.005216,0.006664,0.004830
...,...,...,...,...
2018-09-24,0.014380,-0.017936,-0.003322,-0.006046
2018-09-25,0.006341,0.013514,-0.000928,0.000652
2018-09-26,-0.007966,0.005688,-0.002992,-0.002127
2018-09-27,0.020552,0.012760,0.002794,-0.001719


In [5]:
## Rename the columns to indicate that they have been transformed from price (Adj_close) to Return
train_data_ret = helper.renamePriceToRet( train_data_ret )

## Drop the first date (the day before `start_dt`) since it has an undefined return
train_data_ret = train_data_ret[ start_dt:]
train_data_ret.head()

Unnamed: 0_level_0,AAPL_Ret,AAPL_Open,SPY_Ret,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-02,0.017905,-0.002111,0.007158,-0.00271
2018-01-03,-0.000174,0.013928,0.006325,0.004182
2018-01-04,0.004645,5.8e-05,0.004215,0.008328
2018-01-05,0.011385,0.005216,0.006664,0.00483
2018-01-08,-0.003714,0.005247,0.001829,0.002936


In [6]:
tickerAttr = ticker + "_Ret"

X_train, y_train =  train_data_ret.drop(columns=[tickerAttr]), train_data_ret[[ tickerAttr ]]
y_train

Unnamed: 0_level_0,AAPL_Ret
Dt,Unnamed: 1_level_1
2018-01-02,0.017905
2018-01-03,-0.000174
2018-01-04,0.004645
2018-01-05,0.011385
2018-01-08,-0.003714
...,...
2018-09-24,0.014380
2018-09-25,0.006341
2018-09-26,-0.007966
2018-09-27,0.020552


In [7]:
test_data_price.head()

Unnamed: 0_level_0,AAPL_Adj_Close,AAPL_Open,SPY_Adj_Close,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-28,221.6252,224.79,285.0555,289.99
2018-10-01,223.1175,227.95,286.0458,292.11
2018-10-02,225.1006,227.25,285.8791,291.56
2018-10-03,227.8398,230.05,286.0359,292.74
2018-10-04,223.8342,230.78,283.8004,291.18


In [8]:
test_data_ret = None
X_test = None
y_test = None

test_data_ret = getReturns(test_data_price)
#test_data_ret
test_data_ret = helper.renamePriceToRet( test_data_ret )

## Drop the first date (the day before `test_start_dt`) since it has an undefined return
test_data_ret = test_data_ret[ test_start_dt:]
#test_data_ret



X_test, y_test =  test_data_ret.drop(columns=[tickerAttr]), test_data_ret[[ tickerAttr ]]

print("test data length", test_data_ret.shape[0])
print("X test length", X_test.shape[0])
print("y test length", y_test.shape[0])
test_data_ret.head()

test data length 63
X test length 63
y test length 63


Unnamed: 0_level_0,AAPL_Ret,AAPL_Open,SPY_Ret,SPY_Open
Dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-01,0.006733,0.014058,0.003474,0.007311
2018-10-02,0.008888,-0.003071,-0.000583,-0.001883
2018-10-03,0.012169,0.012321,0.000548,0.004047
2018-10-04,-0.017581,0.003173,-0.007815,-0.005329
2018-10-05,-0.016229,-0.012219,-0.005597,-0.005117


In [9]:
from sklearn import datasets, linear_model

beta_0 = 0    # The regression parameter for the constant
beta_SPY = 0  # The regression parameter for the return of SPY
ticker = "AAPL"

def createModel():
    '''
    Build your linear regression model using sklearn
    
    Returns
    -------
    An sklearn model object implementing Linear Regression
    '''
    from sklearn.linear_model import LinearRegression
    answer_model=LinearRegression()
    return answer_model

def regress(model, X, y):
    '''
    Do regression using returns of your ticker and index
    
    Parameters
    -----------
    model: model object implementing Linear Regression
        
    X: DataFrame
    - Index returns
    
    y: DataFrame
    - Ticker returns
    
    Returns
    -------
    Tuple (beta_0, beta_SPY)
    where,
        beta_0: Scalar number 
        - Parameter for the constant

        beta_SPY: Scalar number
        - Parameter for the return of SPY
    

    '''
    model.fit(X, y)
    b0=model.intercept_
    b1=model.coef_
    #print(b0,b1)
    return b0, b1

# Assign to answer variables
regr = createModel()

beta_0, beta_SPY = regress(regr, X_train, y_train)
print(beta_0, beta_SPY)

[0.00071172] [[ 0.45611984  0.99868063 -0.6577503 ]]


In [10]:
from sklearn.model_selection import cross_val_score

cross_val_avg = 0 # average score of cross validation
k = 5             # 5-fold cross validation

def compute_cross_val_avg(model, X, y, k):
    '''
    Compute the average score of k-fold cross validation
    
    Parameters
    -----------
    model: An sklearn model
    
    X: DataFrame
    - Index returns
    
    y: DataFrame
    - Ticker returns
    
    k: Scalar number
    - k-fold cross validation
    
    Returns
    --------
    The average, across the k iterations, of the score
    '''
    # YOUR CODE HERE
    
    
    return cross_val_score(model, X, y, cv=k).mean()

    
cross_val_avg = compute_cross_val_avg(regr, X_train, y_train, 5)
print("{t:s}: Avg cross val score = {sc:3.2f}".format(t=ticker, sc=cross_val_avg) )

AAPL: Avg cross val score = 0.43


In [11]:
from sklearn.metrics import mean_squared_error

rmse_in_sample = 0 # in sample loss
rmse_out_sample = 0 # out of sample performance

# Predicted  in-sample returns of AAPL using SPY index
aapl_predicted_in_sample = regr.predict(X_train)
# Predicted out-of-sample returns of AAPL using SPY index
aapl_predicted_out_sample = regr.predict(X_test)

def computeRMSE( target, predicted ):
    '''
    Calculate the RMSE
    
    Parameters
    ----------
    target: DataFrame
    - Real ticker returns
    
    predicted: ndarray
    - Predicted ticker returns
    
    Return
    ------
    Scalar number
    - The value of the RMSE
    '''
    from sklearn.metrics import mean_squared_error

    rms = mean_squared_error(target, predicted, squared=False)
    
    return rms
    
rmse_in_sample = computeRMSE(y_train, aapl_predicted_in_sample)
rmse_out_sample = computeRMSE(y_test, aapl_predicted_out_sample)
print("In Sample Root Mean squared error: {:.3f}".format( rmse_in_sample ) )
print("Out of Sample Root Mean squared error: {:.3f}".format( rmse_out_sample ) )

In Sample Root Mean squared error: 0.010
Out of Sample Root Mean squared error: 0.013


In [12]:
hedged_series = pd.DataFrame()

def compute_hedged_series(model, X, y):

    model.fit(X, y)
    b0=model.intercept_
    b1=model.coef_
    A=X.join(y)
    A['ans']=A['AAPL_Ret']-b1[0][0]*A['SPY_Ret']
    return A['ans']
    

hedged_series = compute_hedged_series(regr, X_test, y_test)
print(hedged_series[:5])

Dt
2018-10-01    0.005096
2018-10-02    0.009163
2018-10-03    0.011910
2018-10-04   -0.013897
2018-10-05   -0.013590
Name: ans, dtype: float64


In [13]:
print('Done')

Done
