In [1]:
# Forecasting Interest Rate Hikes by the U.S. Federal Reserve
import pandas as pd
from sklearn.cross_validation import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
import math
import numpy as np

In [2]:
# Load the federal funds rate dataset from the project's data directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which suggests the CSV
# carries a saved index column -- consider index_col=0 once that is confirmed.
rates = pd.read_csv(filepath_or_buffer='data/federalFundsRate.csv')

In [3]:
# Preview the first five rows to check the columns and their value ranges.
rates.head(n=5)

Unnamed: 0,Date,Chairman,PreviousRate,Streak,GDP,Unemployment,CPI,HomeownershipRate,DebtAsPctGDP,DemocraticPres,MonthsUntilElection,RaisedFedFunds
0,1966-02-01,"Martin, William M.",4.42,4,4201.891,4.0,31.88,63.5,40.26076,1,33,1
1,1966-03-01,"Martin, William M.",4.6,5,4201.891,3.8,32.08,63.5,4201.891,1,32,1
2,1966-04-01,"Martin, William M.",4.65,6,4201.891,3.8,32.18,63.5,4201.891,1,31,1
3,1966-05-01,"Martin, William M.",4.67,7,4219.097,3.8,32.28,63.2,39.15969,1,30,1
4,1966-06-01,"Martin, William M.",4.9,8,4219.097,3.9,32.35,63.2,4219.097,1,29,1


In [4]:
# Class balance of the target: count of each RaisedFedFunds value divided by the total row count.
rates['RaisedFedFunds'].value_counts() / rates.shape[0]

1    0.502564
0    0.497436
dtype: float64

In [5]:
# Number of observations recorded under each Fed chairman.
rates['Chairman'].value_counts()

Greenspan, Alan       221
Volcker, Paul          96
Bernanke, Ben          96
Burns, Arthur          96
Martin, William M.     48
Miller, G. William     17
Yellen, Janet           9
dtype: int64

In [6]:
# Store the binary target as a categorical column (this overwrites the column on `rates`).
rates['RaisedFedFunds'] = rates['RaisedFedFunds'].astype('category')
# 70/30 train/test partition. random_state makes the split repeatable here, but the partition
# will not match one produced with the same seed in R.
# NOTE(review): train_test_split is imported from sklearn.cross_validation; newer scikit-learn
# releases expose it from sklearn.model_selection instead.
train, test = train_test_split(
    rates,
    train_size=.7,
    test_size=.3,
    random_state=200,
)

In [7]:
# Logistic regression (binomial GLM) fitted through the formula interface on the training split.
# C(DemocraticPres) asks the formula engine to treat DemocraticPres as a categorical term.
mod1_formula = ('RaisedFedFunds ~ PreviousRate + Streak + Unemployment + HomeownershipRate + C(DemocraticPres)'
                '+ MonthsUntilElection')
mod1 = smf.glm(mod1_formula, data=train, family=sm.families.Binomial()).fit()
# Last expression of the cell, so the coefficient table is displayed.
mod1.summary()

0,1,2,3
Dep. Variable:,RaisedFedFunds,No. Observations:,409.0
Model:,GLM,Df Residuals:,402.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-249.88
Date:,"Fri, 21 Aug 2015",Deviance:,499.77
Time:,11:14:28,Pearson chi2:,449.0
No. Iterations:,5,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,8.0955,5.201,1.557,0.120,-2.097 18.288
C(DemocraticPres)[T.1],0.3597,0.229,1.571,0.116,-0.089 0.808
PreviousRate,0.0045,0.032,0.140,0.889,-0.058 0.067
Streak,0.1416,0.024,5.921,0.000,0.095 0.188
Unemployment,-0.0790,0.065,-1.221,0.222,-0.206 0.048
HomeownershipRate,-0.1177,0.078,-1.512,0.130,-0.270 0.035
MonthsUntilElection,-0.0059,0.008,-0.775,0.438,-0.021 0.009


In [8]:
# Fitted values of the formula model for the held-out rows.
# presumably the estimated probability that RaisedFedFunds == 1 -- TODO confirm
pred_test = mod1.predict(test)
# Actual outcome (rows) against the thresholded prediction (columns), with row/column totals.
# NOTE(review): the column flag is `pred_test < 0.5`, so the "True" column collects rows whose
# predicted value is BELOW one half; read the table with that in mind.
conf_mtx = pd.crosstab(
    index=test.RaisedFedFunds,
    columns=pred_test < 0.5,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
conf_mtx

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,28,65,93
1,59,24,83
All,87,89,176


Predicted probability that the interest rate will be raised given:
Streak=-3, PreviousRate=1.7, Unemployment=5.1, HomeownershipRate=65.3, DemocraticPres=0, MonthsUntilElection=18 

In [9]:
# Odds ratio for the DemocraticPres indicator: exp(coefficient) is the multiplicative change in
# the odds of a rate rise when DemocraticPres is 1 rather than 0, other terms held fixed.
# (.loc replaces the deprecated .ix label lookup.)
odds = math.exp(mod1.params.loc['C(DemocraticPres)[T.1]'])

# Scenario from the note above this cell, keyed by coefficient name so that each value is
# matched to its coefficient explicitly rather than by position in mod1.params.
scenario = pd.Series({
    'Intercept': 1,
    'C(DemocraticPres)[T.1]': 0,
    'PreviousRate': 1.7,
    'Streak': -3,
    'Unemployment': 5.1,
    'HomeownershipRate': 65.3,
    'MonthsUntilElection': 18,
}).reindex(mod1.params.index)  # same order as the coefficients; an unmatched name becomes NaN

# Linear predictor = sum of coefficient * value, then the inverse logit turns it into a probability.
linear_predictor = sum(mod1.params * scenario)
1 / (1 + math.exp(-linear_predictor))

0.37405468218339666

In [10]:
# building logistic regression using sm
train2 = train.copy()
train2.DemocraticPres = train2.DemocraticPres.astype('category')
_dummies = pd.get_dummies(train2.DemocraticPres, prefix='DemocraticPres').iloc[:, 1:] 
train2 = pd.concat([train2, _dummies], axis=1) 
train2 = sm.add_constant(train2)
mod2 = sm.GLM(train2['RaisedFedFunds'], train2[['PreviousRate','Streak','Unemployment',\
                                            'HomeownershipRate','DemocraticPres_1', 'MonthsUntilElection', 'const']],
              family= sm.families.Binomial()).fit()
mod2.summary()
mod2.predict(np.array([1.7, -3, 5.1, 65.3, 0, 18, 1]))

0.37405468218339077

In [11]:
# Prepare the test split with the same steps used for the training design matrix:
# categorical DemocraticPres -> indicator column(s) without the first level -> constant column.
test2 = test.copy()
test2['DemocraticPres'] = test2['DemocraticPres'].astype('category')
_dummies = pd.get_dummies(test2['DemocraticPres'], prefix='DemocraticPres').iloc[:, 1:]
test2 = sm.add_constant(pd.concat([test2, _dummies], axis=1))
# Same columns, in the same order, as the exog matrix mod2 was fitted on.
mod2_columns = ['PreviousRate', 'Streak', 'Unemployment',
                'HomeownershipRate', 'DemocraticPres_1', 'MonthsUntilElection', 'const']
pred_test2 = mod2.predict(test2[mod2_columns])
# NOTE(review): the column flag is `pred_test2 < 0.5`, so the "True" column collects rows whose
# predicted value is BELOW one half; read the table with that in mind.
conf_mtx2 = pd.crosstab(
    index=test.RaisedFedFunds,
    columns=pred_test2 < 0.5,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
# pd.crosstab is used because its output is labelled; the sklearn alternative would be
# confusion_matrix(test2.RaisedFedFunds, pred_test<0.5, labels=[False,True])
conf_mtx2

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,28,65,93
1,59,24,83
All,87,89,176


In [12]:
# Fit the same specification with scikit-learn so its predictions can be compared.
train3 = train.copy()
test3 = test.copy()
for _frame in (train3, test3):
    _frame['DemocraticPres'] = _frame['DemocraticPres'].astype('category')
mod3_features = ['PreviousRate', 'Streak', 'Unemployment', 'HomeownershipRate',
                 'DemocraticPres', 'MonthsUntilElection']
# C is the inverse regularization strength, so a very large value makes the penalty negligible
# (effectively, though not exactly, an unregularized fit).
mod3 = LogisticRegression(C=100000)
mod3.fit(train3[mod3_features], train3['RaisedFedFunds'])
# Displayed as a (coefficients, intercept) pair.
mod3.coef_, mod3.intercept_

(array([[ 0.00448308,  0.14157045, -0.07901033, -0.11748451,  0.35970539,
         -0.00592208]]), array([ 8.08335245]))

In [13]:
# Feature columns, listed once so both prediction calls below use the same set and order.
mod3_features = ['PreviousRate', 'Streak', 'Unemployment', 'HomeownershipRate',
                 'DemocraticPres', 'MonthsUntilElection']
# Hard class labels predicted by the sklearn model -- these are labels, not probabilities.
pred_test3 = mod3.predict(test3[mod3_features])
# Per-class probabilities, one column per label in mod3.classes_.
# The frame is built from a bare array, so it gets a fresh positional index rather than test3's index.
pred_test3_prob = pd.DataFrame(mod3.predict_proba(test3[mod3_features]), columns=mod3.classes_)
# Cross-tabulate actual against predicted labels directly. Thresholding the labels with `< 0.5`
# would treat them as probabilities and flip the meaning of the "Predicted" axis.
conf_mtx3 = pd.crosstab(
    index=test.RaisedFedFunds,
    columns=pred_test3,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
# Last expression of the cell, so the table is actually displayed.
conf_mtx3

In [None]:
# TODO: perform cross-validation using cross_val_score
# TODO: create a CART model using a decision tree (tree)
# TODO: compute the confusion matrix and accuracy of the CART model (~64%)