In [30]:
import numpy as np
import pandas as pd
import statsmodels.api as sm


# Load and clean data
loans_data = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')
# Rename Columns
loans_data.columns = loans_data.columns.map(lambda x: x.replace('.', '_').lower())

# Simple Regression: interest_rate ~ monthly_income

In [32]:
df = loans_data.dropna(subset=['home_ownership', 'interest_rate', 'monthly_income'])

# Clean Interest Rate
df['interest_rate'] = df['interest_rate'].apply(lambda x: float(str(x).rstrip('%')))

X = sm.add_constant(df['monthly_income'])
est = sm.OLS(df['interest_rate'], X).fit()

est.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0,1,2,3
Dep. Variable:,interest_rate,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.3764
Date:,"Tue, 07 Jul 2015",Prob (F-statistic):,0.54
Time:,09:36:27,Log-Likelihood:,-7118.1
No. Observations:,2499,AIC:,14240.0
Df Residuals:,2497,BIC:,14250.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,12.9956,0.146,88.868,0.000,12.709 13.282
monthly_income,1.294e-05,2.11e-05,0.614,0.540,-2.84e-05 5.43e-05

0,1,2,3
Omnibus:,69.811,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.205
Skew:,0.274,Prob(JB):,1.7e-12
Kurtosis:,2.531,Cond. No.,12100.0


# Including Categorical Variable: home_ownership

In [27]:
import statsmodels.formula.api as smf

#return home_ownership excluding OTHER and NONE
filtered_df = df[df['home_ownership'].isin(['OWN','RENT','MORTGAGE'])]

est = smf.ols(formula='interest_rate ~ monthly_income + home_ownership', data=filtered_df).fit()

est.summary()

0,1,2,3
Dep. Variable:,interest_rate,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,5.748
Date:,"Tue, 07 Jul 2015",Prob (F-statistic):,0.000647
Time:,09:33:20,Log-Likelihood:,-7095.4
No. Observations:,2494,AIC:,14200.0
Df Residuals:,2490,BIC:,14220.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,12.5396,0.189,66.517,0.000,12.170 12.909
home_ownership[T.OWN],0.2214,0.322,0.688,0.491,-0.409 0.852
home_ownership[T.RENT],0.7248,0.178,4.075,0.000,0.376 1.074
monthly_income,3.04e-05,2.16e-05,1.410,0.159,-1.19e-05 7.27e-05

0,1,2,3
Omnibus:,68.335,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.958
Skew:,0.285,Prob(JB):,1.16e-12
Kurtosis:,2.547,Cond. No.,28200.0


# Include interaction between income and home ownership:

Notice that there is now a home ownership specific coefficent for income. 

In [29]:
est = smf.ols(formula='interest_rate ~ monthly_income * home_ownership', data=filtered_df).fit()

est.summary()

0,1,2,3
Dep. Variable:,interest_rate,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,4.183
Date:,"Tue, 07 Jul 2015",Prob (F-statistic):,0.000869
Time:,09:34:05,Log-Likelihood:,-7093.5
No. Observations:,2494,AIC:,14200.0
Df Residuals:,2488,BIC:,14230.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,12.6717,0.207,61.191,0.000,12.266 13.078
home_ownership[T.OWN],0.3051,0.589,0.518,0.605,-0.851 1.461
home_ownership[T.RENT],0.2115,0.331,0.639,0.523,-0.438 0.861
monthly_income,1.047e-05,2.51e-05,0.416,0.677,-3.88e-05 5.98e-05
monthly_income:home_ownership[T.OWN],-2.529e-05,0.000,-0.250,0.802,-0.000 0.000
monthly_income:home_ownership[T.RENT],9.76e-05,5.27e-05,1.853,0.064,-5.69e-06 0.000

0,1,2,3
Omnibus:,67.204,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.68
Skew:,0.279,Prob(JB):,2.21e-12
Kurtosis:,2.547,Cond. No.,52800.0
