In [4]:
%matplotlib inline

In [5]:
import multiple_reg as mr
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [6]:
# get loandata
loansData = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')

In [7]:
loansData['Interest.Rate'] = [float(interest[0:-1])/100 for interest in loansData['Interest.Rate']]
loansData['Loan.Length'] = [int(length[0:-7]) for length in loansData['Loan.Length']]
loansData['FICO.Score'] = [int(val.split('-')[0]) for val in loansData['FICO.Range']]

In [8]:
# extract the columns from the data frame returning a series
intrate = loansData['Interest.Rate']
loanamt = loansData['Amount.Requested']
fico = loansData['FICO.Score']

# create new dataframe to try r style formula
# dfrstyle = loansData['Interest.Rate','FICO.Score','Amount.Requested']
# dfrstyle = loansData.ix[:,'Interest.Rate'] NO NEED FOR THIS JUST USE THE DF

In [9]:
# reshape the data
# The dependent variable
y = np.matrix(intrate).transpose()
# The independent variables shaped as columns
x1 = np.matrix(fico).transpose()
x2 = np.matrix(loanamt).transpose()

In [10]:
# put the columns back together to create a matrix
x = np.column_stack([x1,x2])

In [11]:
# create the linear model with statsmodels - # read more about stats models
X = sm.add_constant(x)
model = sm.OLS(y,X)
f = model.fit()

In [12]:
f.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.657
Model:,OLS,Adj. R-squared:,0.656
Method:,Least Squares,F-statistic:,2388.0
Date:,"Mon, 07 Nov 2016",Prob (F-statistic):,0.0
Time:,18:42:19,Log-Likelihood:,5727.6
No. Observations:,2500,AIC:,-11450.0
Df Residuals:,2497,BIC:,-11430.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.7288,0.010,73.734,0.000,0.709 0.748
x1,-0.0009,1.4e-05,-63.022,0.000,-0.001 -0.001
x2,2.107e-06,6.3e-08,33.443,0.000,1.98e-06 2.23e-06

0,1,2,3
Omnibus:,69.496,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77.811
Skew:,0.379,Prob(JB):,1.27e-17
Kurtosis:,3.414,Cond. No.,296000.0


In [13]:
#interest rate = .7299 + -.0009(fico) + 2.108e-06(loanamount)
# fico 700 and loan amount 20000 ir of 14.09%

# irtest = .7288 - (.0009 * 700) + (2.108 * np.e**-6 * 20000)
irtest = .7288 - (.0009 * 700) + (.000002107 * 20000)
print(irtest)

0.14094


In [14]:
f.params

array([  7.28827983e-01,  -8.84424222e-04,   2.10747769e-06])

In [15]:
# reasonableness check
# model takes constant, fico, amount as input and returns interest rate


print("Baseline fico 750 amount 10,000 - rate is {}".format(f.predict([1, 750, 10000])))
print("Test 1 - Increase fico to 800 amount stays 10,000 expect rate to go down - rate is {}".format(f.predict([1, 800, 10000])))
print("test 2 - fico stays at 750 increase amount to 20,000 expect rate to go up - rate is {}".format(f.predict([1, 750, 20000])))


Baseline fico 750 amount 10,000 - rate is [ 0.08658459]
Test 1 - Increase fico to 800 amount stays 10,000 expect rate to go down - rate is [ 0.04236338]
test 2 - fico stays at 750 increase amount to 20,000 expect rate to go up - rate is [ 0.10765937]


In [16]:
# linmodel_D = smf.ols(formula='Rate ~ FICO + loanamt + ocl + rcb + i6m + ai', data=df).fit()
# refactor for r style using smf.formula.api as smf

loansData['ir']=loansData['Interest.Rate']
loansData['amount']=loansData['Amount.Requested']
loansData['fico'] = loansData['FICO.Score']

# g = smf.ols(formula = 'intrate ~ loanamt  + fico' data = )
g = smf.ols(formula = 'ir ~ amount + fico', data = loansData).fit()

In [17]:
for param in g.params:
    print(param)

0.728827983168
2.10747768548e-06
-0.000884424221792


In [18]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [19]:
def mean_square(dftest, model):
    pirdifsum = 0
    
    
    for i in range(dftest.shape[0]): 
        item = dftest.iloc[i]
        pir = kfmodel.predict(item)
        pirdif = pir - item['ir']
        pirdifsum = pirdifsum + pirdif**2

    meansqerror = pirdifsum / dftest.shape[0]
    
    return(meansqerror)
        
        


In [22]:
# use kfold to get test and train sets
kf = KFold(n_splits = 10)
modellist = []

for train, test in kf.split(loansData):
#     print("{} {}".format(train,test))
    dftrain = loansData.iloc[train]
    kfmodel = smf.ols(formula = 'ir ~ amount + fico', data = dftrain).fit()
    modellist.append(kfmodel) # make a list of the models so we can use the one with least squares.
    dftest = loansData.iloc[test]
#     print(kfmodel.params)
    mean_sq_error = mean_square(dftest, kfmodel)
#     print(kfmodel.summary())
    print("mean squared error for model is {}".format(mean_sq_error))
    

# run model against train
# run model against test and review outputs


mean squared error for model is [ 0.00066772]
mean squared error for model is [ 0.00061569]
mean squared error for model is [ 0.00062959]
mean squared error for model is [ 0.0005562]
mean squared error for model is [ 0.00059758]
mean squared error for model is [ 0.00060341]
mean squared error for model is [ 0.00052686]
mean squared error for model is [ 0.00059656]
mean squared error for model is [ 0.00065213]
mean squared error for model is [ 0.00056755]


In [23]:
type(modellist)

list

In [25]:
modellist[0].params


Intercept    0.725206
amount       0.000002
fico        -0.000880
dtype: float64