In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import quandl
import seaborn as sns
import statsmodels.api as sm
import os
# Quandl API key, read from the QUANDL_API_KEY environment variable so the
# secret is not stored in the notebook. When the variable is unset this is
# None. The key that used to be hardcoded on this line should be treated as
# leaked and rotated.
authtoken = os.environ.get("QUANDL_API_KEY")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import linear_model
%matplotlib inline

# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [94]:
# Download the YALE/SPCOMP dataset from Quandl for the chosen sample window.
# pd.Timestamp replaces the `pd.datetime` alias, which was deprecated in
# pandas 1.0 and removed in pandas 2.0 (AttributeError on current versions).
start_date = pd.Timestamp(1900, 1, 1)
end_date = pd.Timestamp(2018, 3, 31)
yale = quandl.get("YALE/SPCOMP", start_date=start_date, end_date=end_date, authtoken=authtoken)

In [95]:
# Swap the provider's column labels for short, attribute-friendly names.
# The list is positional: it must have one entry per downloaded column, in
# the same order (assumes the dataset returns these nine columns — TODO confirm).
yale.columns = [
    'SP_Comp', 'Dividend', 'Earnings',
    'CPI', 'Long_Int_Rate',
    'Real_Price', 'Real_Dividend', 'Real_Earnings',
    'CAPE',
]

In [96]:
# Preview the first rows to check the renamed columns.
yale.head()

Unnamed: 0_level_0,SP_Comp,Dividend,Earnings,CPI,Long_Int_Rate,Real_Price,Real_Dividend,Real_Earnings,CAPE
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1900-01-31,6.1,0.2175,0.48,7.897091,3.15,197.180979,7.030633,15.51588,18.674275
1900-02-28,6.21,0.225,0.48,7.992232,3.145833,198.347095,7.186489,15.331176,18.703797
1900-03-31,6.26,0.2325,0.48,7.992232,3.141667,199.944093,7.426039,15.331176,18.775793
1900-04-30,6.34,0.24,0.48,7.992232,3.1375,202.499289,7.665588,15.331176,18.936402
1900-05-31,6.04,0.2475,0.48,7.801942,3.133333,197.622574,8.097945,15.705105,18.403197


In [97]:
def FwdReturnAnnualized(prices, n):
    """Annualized forward price change over the next ``n`` observations.

    For each row t the result is ``(prices[t+n] / prices[t]) ** (12 / n) - 1``,
    i.e. the change from t to t+n compounded to a yearly rate, treating each
    observation as one month (12 per year).

    Parameters
    ----------
    prices : pandas.Series
        Price level per period.
    n : int
        Look-ahead horizon in periods (12 = one year, 120 = ten years).

    Returns
    -------
    pandas.Series
        Same index as ``prices``; rows whose look-ahead value falls beyond
        the end of the series are NaN.
    """
    # pct_change(n) is the change from t-n to t; shifting by -n re-labels it
    # so that row t holds the change from t to t+n.
    fwd_change = prices.pct_change(periods=n).shift(periods=-n)
    annualizing_power = 1 / (n / 12)
    return (fwd_change + 1) ** annualizing_power - 1

# Targets: annualized forward change of the S&P Composite price over the
# next 120 and 12 observations.
yale['Fwd10YrRet'] = FwdReturnAnnualized(prices=yale['SP_Comp'], n=120)
yale['Fwd1YrRet'] = FwdReturnAnnualized(prices=yale['SP_Comp'], n=12)

# Valuation features.
yale['RealEarnYield'] = yale['Real_Earnings'] / yale['Real_Price']  # earnings yield
yale['RealDivYield'] = yale['Real_Dividend'] / yale['Real_Price']  # dividend yield
# Earnings yield minus the long interest rate; the rate is divided by 100
# to put it on the same fractional scale as the yield.
yale['ERP_FedModel'] = yale['RealEarnYield'] - (yale['Long_Int_Rate'] / 100)

In [98]:
# Preview again to check the newly added return and yield columns.
yale.head()

Unnamed: 0_level_0,SP_Comp,Dividend,Earnings,CPI,Long_Int_Rate,Real_Price,Real_Dividend,Real_Earnings,CAPE,Fwd10YrRet,Fwd1YrRet,RealEarnYield,RealDivYield,ERP_FedModel
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1900-01-31,6.1,0.2175,0.48,7.897091,3.15,197.180979,7.030633,15.51588,18.674275,0.051509,0.159016,0.078689,0.035656,0.047189
1900-02-28,6.21,0.225,0.48,7.992232,3.145833,198.347095,7.186489,15.331176,18.703797,0.045821,0.167472,0.077295,0.036232,0.045836
1900-03-31,6.26,0.2325,0.48,7.992232,3.141667,199.944093,7.426039,15.331176,18.775793,0.047535,0.199681,0.076677,0.037141,0.045261
1900-04-30,6.34,0.24,0.48,7.992232,3.1375,202.499289,7.665588,15.331176,18.936402,0.043657,0.283912,0.07571,0.037855,0.044335
1900-05-31,6.04,0.2475,0.48,7.801942,3.133333,197.622574,8.097945,15.705105,18.403197,0.046989,0.279801,0.07947,0.040977,0.048137


# train test split


In [122]:
variables = ['CAPE', 'ERP_FedModel', 'RealEarnYield']
target = ['Fwd1YrRet']

# Select the modelling columns BEFORE dropping NaNs, so a row is only
# discarded when one of the columns actually used is missing (previously
# rows were also lost to NaNs in unused columns such as Fwd10YrRet).
data = yale[variables + target].dropna()

# Positional hold-out: the first 1000 rows are used for training and
# validation, everything after is the test set. The test slice starts at
# 1000 (not 1001) so that row 1000 is no longer silently dropped.
train_data = data.iloc[:1000, :]
test_data = data.iloc[1000:, :]
assert len(train_data) + len(test_data) == len(data)

X_test = np.asarray(test_data[variables])
y_test = np.asarray(test_data[target])  # 2-D (n, 1) because `target` is a list
X = np.asarray(train_data[variables])
y = np.asarray(train_data[target])

# NOTE(review): train_test_split shuffles by default. With targets built
# from overlapping 12-period forward returns, neighbouring rows can land on
# both sides of the split — consider shuffle=False if that matters.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# preprocess

In [123]:
# Bin features and target into 4 ordinal buckets (codes 0..3). Each
# discretizer is fitted on the training split only and then applied to the
# validation split.
# The feature and target discretizers get their own names so the fitted
# feature bins stay available for transforming X_test or new data later;
# previously the feature discretizer was overwritten by the target one.
est_X = KBinsDiscretizer(n_bins=4, encode='ordinal')
X_train = est_X.fit_transform(X_train)
X_val = est_X.transform(X_val)

est_y = KBinsDiscretizer(n_bins=4, encode='ordinal')
y_train = est_y.fit_transform(y_train)
y_val = est_y.transform(y_val)

# Backward-compatible alias: `est` used to end up bound to the target discretizer.
est = est_y

In [124]:
def MapToTrinary(num, low, high):
    """Collapse a bin code to -1 / 0 / +1.

    Returns -1 when ``num`` equals ``low``, +1 when it equals ``high``, and
    0 for anything else. The ``low`` check runs first, so it wins when
    ``low == high``.
    """
    if num == low:
        return -1
    return 1 if num == high else 0
# Element-wise version of MapToTrinary for whole arrays.
vfunc = np.vectorize(MapToTrinary)

# Collapse the bin codes: bin 0 -> -1, bin 3 -> +1, the other bins -> 0.
# NOTE: this overwrites its own inputs, so re-running the cell re-maps the
# already-mapped values (0 becomes -1, everything else becomes 0).
X_train, X_val, y_train, y_val = (
    vfunc(binned, low=0, high=3)
    for binned in (X_train, X_val, y_train, y_val)
)

In [125]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Depth-3 decision tree on the trinary features, with a fixed seed so the
# fitted tree is reproducible.
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

# Optional 5-fold check on the training split:
# cross_val_score(clf, X_train, y_train, cv=5)

# Share of validation rows whose trinary class is predicted exactly.
accuracy_score(y_true=y_val, y_pred=clf.predict(X_val))

0.505

In [None]:
# Last 119 rows of the FEATURE columns only. `data` also carries the target
# column, so reshaping the whole frame to the feature width either raises
# (element count not divisible) or scrambles rows; selecting `variables`
# yields the intended (119, n_features) array directly.
current_data = np.asarray(data[variables].iloc[-119:])

# NOTE(review): `neigh` is not defined in any visible cell — presumably a
# model from a removed cell. Confirm which fitted estimator should be used
# here (and whether it expects raw or binned features).
pred_current = neigh.predict(current_data)

In [None]:
# Scatter the last 119 CAPE values in `data` against the predictions made
# for the current rows.
plt.scatter(data.CAPE[-119:],pred_current)
# NOTE(review): current_data[-1] is the entire last row of current_data; it
# is only the CAPE ratio if current_data holds a single CAPE column — confirm.
print("The current CAPE RATIO is:",(current_data[-1]))
# NOTE(review): the label says "10 year forward real return", but what
# pred_current represents depends on the model behind `neigh`, which is not
# defined in the visible cells — confirm the label matches the model target.
print("The predicted 10 year forward real return is:", round(100*pred_current[-1],1),"%")

In [None]:
# Trailing 12-observation change of the S&P Composite price, NaNs removed.
ret_ann = yale.SP_Comp.pct_change(12).dropna()

# Split the returns into three regimes with plain boolean indexing (this
# replaces the multiply-by-mask-then-drop-zeros trick). The middle bucket is
# inclusive at both ends so that returns of exactly 0 or exactly 0.1 are
# counted; previously they fell into no bucket and the three fractions
# below did not sum to 1.
ret_neg = ret_ann[ret_ann < 0]
ret_norm = ret_ann[(ret_ann >= 0) & (ret_ann <= .1)]
ret_high = ret_ann[ret_ann > .1]
assert len(ret_neg) + len(ret_norm) + len(ret_high) == len(ret_ann)

# Fraction of observations in each regime: normal, negative, high.
print(len(ret_norm) / len(ret_ann))
print(len(ret_neg) / len(ret_ann))
print(len(ret_high) / len(ret_ann))

In [None]:
# Overlay histograms (no KDE curve) of the three return buckets; the three
# calls are issued back to back in one cell so they share a figure.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases —
# confirm the installed version before re-running.
sns.distplot(ret_neg, kde=False)
sns.distplot(ret_norm, kde=False)
sns.distplot(ret_high, kde=False)

In [None]:
from sklearn.metrics import mean_squared_error

# Regress the 10-year forward return on CAPE using rows with no missing values.
complete_rows = yale.dropna()
X = np.asarray(complete_rows.CAPE).reshape(-1, 1)  # single feature -> (n, 1)
Y = np.asarray(complete_rows.Fwd10YrRet)

# Split on Y. The previous code passed the lowercase `y` left over from the
# classification cell (a different target with a different length), which
# either raises on inconsistent sample counts or fits the wrong target.
# NOTE(review): the split shuffles rows; with overlapping 120-period forward
# returns the train and test targets overlap heavily — consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# `mse` was never defined or imported; use scikit-learn's mean_squared_error.
mean_squared_error(y_test, pred)

In [None]:
# 10-year forward return on the primary y-axis, CAPE on a secondary y-axis
# of the same chart.
yale['Fwd10YrRet'].plot()
yale['CAPE'].plot(secondary_y=True)

In [None]:
# Last 119 CAPE observations as an (n, 1) column.
current_data = np.asarray(yale.CAPE[-119:]).reshape(-1,1)
# NOTE(review): `neigh` is not defined in any visible cell — presumably a
# model from a removed cell. The `model` fitted in the regression cell takes
# the same single-column CAPE input; confirm which estimator is intended.
pred_current = neigh.predict(current_data)