In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [17]:
def bootstrap_replicate_1d(data, func):
    return func(np.random.choice(data, size=len(data)))

In [18]:
def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates."""

    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data,func)

    return bs_replicates

In [19]:
def permutation_sample(data1, data2):
    """Generate a permutation sample from two data sets."""

    # Concatenate the data sets: data
    data = np.concatenate((data1,data2))

    # Permute the concatenated array: permuted_data
    permuted_data = np.random.permutation(data)

    # Split the permuted array into two: perm_sample_1, perm_sample_2
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]

    return perm_sample_1, perm_sample_2

In [20]:
def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""

    # The difference of means of data_1, data_2: diff
    diff = np.mean(data_1) - np.mean(data_2)

    return diff

In [21]:
def draw_perm_reps(data_1, data_2, func, size=1):
    """Generate multiple permutation replicates."""

    # Initialize array of replicates: perm_replicates
    perm_replicates = np.empty(size)

    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data_1,data_2)

        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1,perm_sample_2)

    return perm_replicates

In [22]:
data_tr = pd.read_csv('train.csv')

In [23]:
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [24]:
data_tr.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Only numeric attributes

In [25]:
data_tr.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


# Only non numeric attributes

In [26]:
data_tr.describe(include=['object'])

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,601,611,599,614,582,614,614
unique,614,2,2,4,2,2,3,2
top,LP001194,Male,Yes,0,Graduate,No,Semiurban,Y
freq,1,489,398,345,480,500,233,422


# Univariate Analysis

In [27]:
from altair import *

In [28]:
chart = Chart(data_tr)

In [29]:
chart.mark_bar().encode(
    X('ApplicantIncome',bin=Bin(maxbins=50)),
    Y('count(*):Q')
)

In [30]:
chart.mark_bar().encode(
    X('CoapplicantIncome',bin=Bin(maxbins=50)),
    Y('count(*):Q')
)

In [31]:
chart.mark_bar().encode(
   X('LoanAmount',bin=Bin(maxbins=50)),
   Y('count(*):Q')
)

In [32]:
chart.mark_bar().encode(
   X('Loan_Amount_Term',bin=Bin(maxbins=20)),
   Y('count(*):Q')
)

In [33]:
chart.mark_bar().encode(
    X('Credit_History',bin=True),
    Y('count(*):Q')
)

In [34]:
chart.mark_bar().encode(
    X('Gender:N'),
    Y('count(*):Q'),
    color='Gender:N'
)

In [35]:
chart.mark_bar().encode(
    X('Dependents:N'),
    Y('count(*):Q'),
    color='Dependents:N'
)

In [36]:
chart.mark_bar().encode(
    X('Married:N'),
    Y('count(*):Q'),
    color='Married:N'
)

In [37]:
chart.mark_bar().encode(
    X('Property_Area:N'),
    Y('count(*):Q'),
    color='Property_Area:N'
)

In [38]:
chart.mark_bar().encode(
    X('Education:N'),
    Y('count(*):Q'),
    color='Education:N'
)

In [39]:
chart.mark_bar().encode(
    X('Self_Employed:N'),
    Y('count(*):Q'),
    color='Self_Employed:N'
)

# Bivariate EDA with Altair

In [40]:
# Loan_Status vs ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History

In [41]:
data_tr['Log_Total_Income'] = (data_tr['ApplicantIncome']+data_tr['CoapplicantIncome']).apply(lambda x:np.log10(x))

In [42]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    X('Log_Total_Income', bin=Bin(maxbins=50)),
    Y('*:Q', aggregate='count'),
    Color('Loan_Status')
).configure_cell(width=800,height=500)

In [43]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    X('LoanAmount', bin=Bin(maxbins=50)),
    Y('*:Q', aggregate='count'),
    Color('Loan_Status'),
    row='Property_Area'
).configure_cell(height=200)

In [44]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    Y('Loan_Amount_Term:O',sort='descending'),
    X('*:Q', aggregate='count'),
    Color('Loan_Status:N')
).configure_cell(width=800)

In [45]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    X('Credit_History:O'),
    Y('*:Q', aggregate='count'),
    Color('Loan_Status:N')
).configure_cell(width=500)

In [46]:
# Loan_Status vs Gender Married Dependents Education Self_Employed Property_Area

In [47]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    X('Gender:N'),
    Y('*:Q', aggregate='count'),
    Color('Loan_Status:N',scale=Scale(range=["#FF9800", "#03A9F4"])),
    row='Education',
    column='Self_Employed'
)

In [48]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    Y('Married:N'),
    X('*:Q', aggregate='count'),
    Color('Loan_Status:N',scale=Scale(range=["#FF9800", "#03A9F4"]))
)

In [49]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    Y('Dependents:O',sort='descending'),
    X('*:Q', aggregate='count'),
    Color('Loan_Status:N',scale=Scale(range=["#FF9800", "#03A9F4"]))
)

In [50]:
Chart(data_tr).mark_bar(stacked='normalize').encode(
    X('Education:N'),
    Y('*:Q', aggregate='count'),
    Color('Loan_Status:N',scale=Scale(range=["#FF9800", "#03A9F4"])),
    column='Property_Area'
)

# Chi-square test

In [51]:
import scipy.stats as st

In [52]:
# contingency table of observed counts
ct1=pd.crosstab(data_tr['Loan_Status'], data_tr['Education'])
print (ct1)

Education    Graduate  Not Graduate
Loan_Status                        
N                 140            52
Y                 340            82


In [53]:
# column percentages
colsum=ct1.sum(axis=0)
colpct=ct1/colsum
print(colpct)

Education    Graduate  Not Graduate
Loan_Status                        
N            0.291667       0.38806
Y            0.708333       0.61194


In [54]:
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct1)
print (cs1)

chi-square value, p value, expected counts
(4.091490413303621, 0.043099621293573552, 1, array([[ 150.09771987,   41.90228013],
       [ 329.90228013,   92.09771987]]))


In [55]:
ct2=pd.crosstab(data_tr['Loan_Status'], data_tr['Dependents'])
print (ct2)

Dependents     0   1   2  3+
Loan_Status                 
N            107  36  25  18
Y            238  66  76  33


In [56]:
# column percentages
colsum=ct2.sum(axis=0)
colpct=ct2/colsum
print(colpct)

Dependents          0         1         2        3+
Loan_Status                                        
N            0.310145  0.352941  0.247525  0.352941
Y            0.689855  0.647059  0.752475  0.647059


In [57]:
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct2)
print (cs1)

chi-square value, p value, expected counts
(3.1583397706982632, 0.36785067408632099, 3, array([[ 107.12854758,   31.67278798,   31.36227045,   15.83639399],
       [ 237.87145242,   70.32721202,   69.63772955,   35.16360601]]))


In [58]:
ct3=pd.crosstab(data_tr['Loan_Status'], data_tr['Property_Area'])
print (ct3)

Property_Area  Rural  Semiurban  Urban
Loan_Status                           
N                 69         54     69
Y                110        179    133


In [59]:
# column percentages
colsum=ct3.sum(axis=0)
colpct=ct3/colsum
print(colpct)

Property_Area     Rural  Semiurban     Urban
Loan_Status                                 
N              0.385475    0.23176  0.341584
Y              0.614525    0.76824  0.658416


In [60]:
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct3)
print (cs1)

chi-square value, p value, expected counts
(12.297623130485677, 0.0021360187811644915, 2, array([[  55.97394137,   72.85993485,   63.16612378],
       [ 123.02605863,  160.14006515,  138.83387622]]))


In [61]:
ct3=pd.crosstab(data_tr['Loan_Status'], data_tr['Married'])
print (ct3)
# column percentages
colsum=ct3.sum(axis=0)
colpct=ct3/colsum
print(colpct)
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct3)
print (cs1)

Married       No  Yes
Loan_Status          
N             79  113
Y            134  285
Married            No      Yes
Loan_Status                   
N            0.370892  0.28392
Y            0.629108  0.71608
chi-square value, p value, expected counts
(4.4750193483150973, 0.034393813015799878, 1, array([[  66.93289689,  125.06710311],
       [ 146.06710311,  272.93289689]]))


In [62]:
ct4=pd.crosstab(data_tr['Loan_Status'], data_tr['Self_Employed'])
print (ct4)
# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct4)
print (cs1)

Self_Employed   No  Yes
Loan_Status            
N              157   26
Y              343   56
Self_Employed     No       Yes
Loan_Status                   
N              0.314  0.317073
Y              0.686  0.682927
chi-square value, p value, expected counts
(0.0052927701100011137, 0.94200392422237178, 1, array([[ 157.21649485,   25.78350515],
       [ 342.78350515,   56.21649485]]))


In [63]:
ct4=pd.crosstab(data_tr['Loan_Status'], data_tr['Credit_History'])
print (ct4)
# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct4)
print (cs1)

Credit_History  0.0  1.0
Loan_Status             
N                82   97
Y                 7  378
Credit_History       0.0       1.0
Loan_Status                       
N               0.921348  0.204211
Y               0.078652  0.795789
chi-square value, p value, expected counts
(174.63729658142535, 7.1847595487507458e-40, 1, array([[  28.2464539,  150.7535461],
       [  60.7535461,  324.2464539]]))


In [64]:
ct4=pd.crosstab(data_tr['Loan_Status'], data_tr['Loan_Amount_Term'])
print (ct4)
# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)
# chi-square
print ('chi-square value, p value, expected counts')
cs1= st.chi2_contingency(ct4)
print (cs1)

Loan_Amount_Term  12.0   36.0   60.0   84.0   120.0  180.0  240.0  300.0  \
Loan_Status                                                                
N                     0      2      0      1      0     15      1      5   
Y                     1      0      2      3      3     29      3      8   

Loan_Amount_Term  360.0  480.0  
Loan_Status                     
N                   153      9  
Y                   359      6  
Loan_Amount_Term  12.0   36.0   60.0   84.0   120.0     180.0  240.0  \
Loan_Status                                                            
N                   0.0    1.0    0.0   0.25    0.0  0.340909   0.25   
Y                   1.0    0.0    1.0   0.75    1.0  0.659091   0.75   

Loan_Amount_Term     300.0     360.0  480.0  
Loan_Status                                  
N                 0.384615  0.298828    0.6  
Y                 0.615385  0.701172    0.4  
chi-square value, p value, expected counts
(14.013166955181328, 0.12185762421276597, 9, ar

# Recode Loan_Status as a binary variable

In [65]:
data_tr['Target'] = data_tr['Loan_Status'].apply(lambda x: 1 if(x=='Y') else 0)

# Begin imputation of missing data

In [66]:
data_tr.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
Log_Total_Income      0
Target                0
dtype: int64

In [67]:
data_tr['Self_Employed'].fillna('No',inplace=True)

In [68]:
data_tr['Credit_History'].fillna(1.0,inplace=True)

# Imputation of Loan Amount based on pivot table of education/self employed

In [69]:
pv_tab = data_tr.pivot_table(values='LoanAmount', index='Self_Employed' ,columns='Education', aggfunc=np.median)

In [70]:
def medv(x):
    return(pv_tab.loc[x['Self_Employed'],x['Education']])

In [71]:
data_tr['LoanAmount'].fillna(data_tr[data_tr['LoanAmount'].isnull()].apply(medv, axis=1),inplace=True)

In [72]:
################FROM DATACAMP##################
for _ in range(50):
    # Generate permutation samples
    perm_sample_1, perm_sample_2 = permutation_sample(rain_july, rain_november)


    # Compute ECDFs
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)

    # Plot ECDFs of permutation sample
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none',
                 color='red', alpha=0.02)
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none',
                 color='blue', alpha=0.02)

# Create and plot ECDFs from original data
x_1, y_1 = ecdf(rain_july)
x_2, y_2 = ecdf(rain_november)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')

# Label axes, set margin, and show plot
plt.margins(0.02)
_ = plt.xlabel('monthly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.show()

NameError: name 'rain_july' is not defined