In [1]:
%matplotlib inline

In [2]:
import multiple_reg as mr
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
# get loandata
loansData = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')

In [4]:
loansData['Interest.Rate'] = [float(interest[0:-1])/100 for interest in loansData['Interest.Rate']]
loansData['Loan.Length'] = [int(length[0:-7]) for length in loansData['Loan.Length']]
loansData['FICO.Score'] = [int(val.split('-')[0]) for val in loansData['FICO.Range']]

In [5]:
# extract the columns from the data frame returning a series
intrate = loansData['Interest.Rate']
loanamt = loansData['Amount.Requested']
fico = loansData['FICO.Score']

# create new dataframe to try r style formula
# dfrstyle = loansData['Interest.Rate','FICO.Score','Amount.Requested']
# dfrstyle = loansData.ix[:,'Interest.Rate'] NO NEED FOR THIS JUST USE THE DF

In [6]:
# reshape the data
# The dependent variable
y = np.matrix(intrate).transpose()
# The independent variables shaped as columns
x1 = np.matrix(fico).transpose()
x2 = np.matrix(loanamt).transpose()

In [7]:
# put the columns back together to create a matrix
x = np.column_stack([x1,x2])

In [8]:
# create the linear model with statsmodels - # read more about stats models
X = sm.add_constant(x)
model = sm.OLS(y,X)
f = model.fit()

In [9]:
f.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.657
Model:,OLS,Adj. R-squared:,0.656
Method:,Least Squares,F-statistic:,2388.0
Date:,"Mon, 07 Nov 2016",Prob (F-statistic):,0.0
Time:,14:33:20,Log-Likelihood:,5727.6
No. Observations:,2500,AIC:,-11450.0
Df Residuals:,2497,BIC:,-11430.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.7288,0.010,73.734,0.000,0.709 0.748
x1,-0.0009,1.4e-05,-63.022,0.000,-0.001 -0.001
x2,2.107e-06,6.3e-08,33.443,0.000,1.98e-06 2.23e-06

0,1,2,3
Omnibus:,69.496,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77.811
Skew:,0.379,Prob(JB):,1.27e-17
Kurtosis:,3.414,Cond. No.,296000.0


In [10]:
#interest rate = .7299 + -.0009(fico) + 2.108e-06(loanamount)
# fico 700 and loan amount 20000 ir of 14.09%

# irtest = .7288 - (.0009 * 700) + (2.108 * np.e**-6 * 20000)
irtest = .7288 - (.0009 * 700) + (.000002107 * 20000)
print(irtest)

0.14094


In [11]:
f.params

array([  7.28827983e-01,  -8.84424222e-04,   2.10747769e-06])

In [12]:
# reasonableness check
# model takes constant, fico, amount as input and returns interest rate


print("Baseline fico 750 amount 10,000 - rate is {}".format(f.predict([1, 750, 10000])))
print("Test 1 - Increase fico to 800 amount stays 10,000 expect rate to go down - rate is {}".format(f.predict([1, 800, 10000])))
print("test 2 - fico stays at 750 increase amount to 20,000 expect rate to go up - rate is {}".format(f.predict([1, 750, 20000])))


Baseline fico 750 amount 10,000 - rate is [ 0.08658459]
Test 1 - Increase fico to 800 amount stays 10,000 expect rate to go down - rate is [ 0.04236338]
test 2 - fico stays at 750 increase amount to 20,000 expect rate to go up - rate is [ 0.10765937]


In [13]:
# linmodel_D = smf.ols(formula='Rate ~ FICO + loanamt + ocl + rcb + i6m + ai', data=df).fit()
# refactor for r style using smf.formula.api as smf

loansData['ir']=loansData['Interest.Rate']
loansData['amount']=loansData['Amount.Requested']
loansData['fico'] = loansData['FICO.Score']

# g = smf.ols(formula = 'intrate ~ loanamt  + fico' data = )
g = smf.ols(formula = 'ir ~ amount + fico', data = loansData).fit()

In [14]:
for param in g.params:
    print(param)

0.728827983168
2.10747768548e-06
-0.000884424221792


In [15]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [16]:
dftest = pd.read_excel('test1.xlsx', sheetname = 'Sheet1', header =0)

In [17]:
dftest

Unnamed: 0,C1,C2,C3
R1,1,2,3
R2,4,5,6
R3,7,8,9
R4,10,11,12
R5,13,14,15
R6,16,17,18
R7,19,20,21
R8,22,23,24
R9,25,26,27
R10,28,29,30


In [18]:
# This is showing the index number for the dftest df split into two sets.  We use all the data swapping each half as test and train
# 50 50 split
kf = KFold(n_splits = 2)
for train, test in kf.split(dftest):
    print("{} {}".format(train,test))

[5 6 7 8 9] [0 1 2 3 4]
[0 1 2 3 4] [5 6 7 8 9]


In [19]:
# kf = KFold(n_splits = 4) # gives 70 30 then 80 20 - output not sequential implies some randomness in selection
# for train, test in kf.split(dftest):
# #     tmptrain = train
     
    
# #     print("{} {}".format(train,test))

IndentationError: expected an indented block (<ipython-input-19-ca0aeda80eab>, line 6)

In [20]:
dftest.iloc[[3,4]] # iloc may not always map to the row #
# dftest.ix[4]

Unnamed: 0,C1,C2,C3
R4,10,11,12
R5,13,14,15


In [35]:
# [model.predict(1, item['fico'], item['amount']) for item in dftest]

for i in range(dftest.shape[0]): 
        item = dftest.iloc[i]
        print(kfmodel.predict(item)) # because we use r format formula we can use predict with the item and it will get fields
        
        

[ 0.08013383]
[ 0.12317492]
[ 0.13263975]
[ 0.1609239]
[ 0.16352236]
[ 0.17116735]
[ 0.13616719]
[ 0.14862073]
[ 0.11337122]
[ 0.06086909]
[ 0.13543428]
[ 0.15175792]
[ 0.11459905]
[ 0.16283705]
[ 0.13298241]
[ 0.1061179]
[ 0.13940336]
[ 0.16283705]
[ 0.11660739]
[ 0.13219811]
[ 0.11562891]
[ 0.14847415]
[ 0.16675852]
[ 0.11263831]
[ 0.17013748]
[ 0.13705047]
[ 0.16837661]
[ 0.08077155]
[ 0.14469546]
[ 0.15180551]
[ 0.13616719]
[ 0.16641587]
[ 0.18905957]
[ 0.09832301]
[ 0.14072638]
[ 0.14391496]
[ 0.08018143]
[ 0.15234566]
[ 0.11474184]
[ 0.14067879]
[ 0.07582021]
[ 0.0638102]
[ 0.12675564]
[ 0.07719272]
[ 0.09832301]
[ 0.04532596]
[ 0.13283583]
[ 0.14077398]
[ 0.07552704]
[ 0.14067879]
[ 0.14881444]
[ 0.14268712]
[ 0.15661217]
[ 0.15460194]
[ 0.18107382]
[ 0.16239446]
[ 0.12293314]
[ 0.15827785]
[ 0.03189584]
[ 0.09900833]
[ 0.14460027]
[ 0.08008624]
[ 0.21185554]
[ 0.11136288]
[ 0.02601362]
[ 0.1707752]
[ 0.15023692]
[ 0.13312757]
[ 0.11086796]
[ 0.10807912]
[ 0.16876781]
[ 0.085768

In [38]:
def mean_square(dftest, model):
    pirdifsum = 0
    
    
    for i in range(dftest.shape[0]): 
        item = dftest.iloc[i]
        pir = kfmodel.predict(item)
        pirdif = pir - item['ir']
        pirdifsum = pirdifsum + pirdif**2

    meansqerror = pirdifsum / dftest.shape[0]
    
    return(meansqerror)
        
        


In [39]:
# use kfold to get test and train sets
kf = KFold(n_splits = 2)
for train, test in kf.split(loansData):
#     print("{} {}".format(train,test))
    dftrain = loansData.iloc[train]
    kfmodel = smf.ols(formula = 'ir ~ amount + fico', data = dftrain).fit()
    dftest = loansData.iloc[test]
    print(kfmodel.params)
    mean_sq_error = mean_square(dftest, kfmodel)
#     print(kfmodel.summary())
    print("mean squared error for model is {}".format(mean_sq_error))
    

# run model against train
# run model against test and review outputs
# somewhere in here calculate average model



Intercept    0.709948
amount       0.000002
fico        -0.000860
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.675
Model:                            OLS   Adj. R-squared:                  0.675
Method:                 Least Squares   F-statistic:                     1297.
Date:                Mon, 07 Nov 2016   Prob (F-statistic):          2.08e-305
Time:                        14:50:08   Log-Likelihood:                 2879.5
No. Observations:                1250   AIC:                            -5753.
Df Residuals:                    1247   BIC:                            -5738.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------