In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math



from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
from sklearn import feature_selection as fs
from sklearn import metrics
from sklearn.model_selection import cross_validate

%matplotlib inline

pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

import warnings
warnings.filterwarnings('ignore')

## Prepare Dataset for Models

Drop Row ID from training set as it is not a predictor.

In [28]:
# Load test dataset
df_test_dataset = pd.read_csv('Data/test_values.csv')

print("Home mortgage test data values has {} data points with {} variables each.".format(*df_test_dataset.shape))

Home mortgage test data values has 500000 data points with 22 variables each.


In [32]:
print(df_test_dataset['loan_type'].unique())

[2 1 3 4]


In [30]:
display(df_test_dataset.head())

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,2,1,3,1,115.0,3,101,16,276,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,True
1,1,1,1,1,1,252.0,2,87,20,68,...,1,107.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,True
2,2,1,1,1,1,270.0,1,-1,-1,-1,...,2,119.0,,,,,,,4701,False
3,3,2,1,1,1,179.0,2,376,20,11,...,2,44.0,4795.0,29.676,57766.0,100.0,1426.0,1765.0,2153,True
4,4,2,1,1,1,36.0,2,254,48,156,...,3,32.0,5246.0,5.11,63332.0,100.0,1452.0,2092.0,5710,False


In [31]:
df_test_dataset.columns

Index(['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy',
       'loan_amount', 'preapproval', 'msa_md', 'state_code', 'county_code',
       'applicant_ethnicity', 'applicant_race', 'applicant_sex',
       'applicant_income', 'population', 'minority_population_pct',
       'ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
       'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
       'lender', 'co_applicant'],
      dtype='object')

In [9]:
df_test_dataset.isnull().sum()

row_id                                0
loan_type                             0
property_type                         0
loan_purpose                          0
occupancy                             0
loan_amount                           0
preapproval                           0
msa_md                                0
state_code                            0
county_code                           0
applicant_ethnicity                   0
applicant_race                        0
applicant_sex                         0
applicant_income                  40141
population                        22480
minority_population_pct           22482
ffiecmedian_family_income         22453
tract_to_msa_md_income_pct        22517
number_of_owner-occupied_units    22574
number_of_1_to_4_family_units     22550
lender                                0
co_applicant                          0
dtype: int64

In [10]:
df_test_dataset['msa_md'] = df_test_dataset['msa_md'].replace(-1, df_test_dataset['msa_md'].median())
df_test_dataset['state_code'] = df_test_dataset['state_code'].replace(-1, df_test_dataset['state_code'].median())
df_test_dataset['county_code'] = df_test_dataset['county_code'].replace(-1, df_test_dataset['county_code'].median())
#df_test_dataset['co_applicant'] = df_test_dataset['co_applicant'].replace(True, 1)
#df_test_dataset['co_applicant'] = df_test_dataset['co_applicant'].replace(False, df_test_dataset['co_applicant'].median())

In [11]:
display(df_test_dataset.head())

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,2,1,3,1,115.0,3,101,16,276,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,True
1,1,1,1,1,1,252.0,2,87,20,68,...,1,107.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,True
2,2,1,1,1,1,270.0,1,192,26,131,...,2,119.0,,,,,,,4701,False
3,3,2,1,1,1,179.0,2,376,20,11,...,2,44.0,4795.0,29.676,57766.0,100.0,1426.0,1765.0,2153,True
4,4,2,1,1,1,36.0,2,254,48,156,...,3,32.0,5246.0,5.11,63332.0,100.0,1452.0,2092.0,5710,False


In [12]:
# Imputing the missing value with mean.
df_test_dataset['loan_amount'].fillna(df_test_dataset['loan_amount'].median(), inplace=True)
df_test_dataset['applicant_income'].fillna(df_test_dataset['applicant_income'].median(), inplace=True)
df_test_dataset['population'].fillna(df_test_dataset['population'].median(), inplace=True)
df_test_dataset['minority_population_pct'].fillna(df_test_dataset['minority_population_pct'].median(), inplace=True)
df_test_dataset['ffiecmedian_family_income'].fillna(df_test_dataset['ffiecmedian_family_income'].median(), inplace=True)
df_test_dataset['tract_to_msa_md_income_pct'].fillna(df_test_dataset['tract_to_msa_md_income_pct'].median(), inplace=True)
df_test_dataset['number_of_owner-occupied_units'].fillna(df_test_dataset['number_of_owner-occupied_units'].median(), inplace=True)
df_test_dataset['number_of_1_to_4_family_units'].fillna(df_test_dataset['number_of_1_to_4_family_units'].median(), inplace=True)
df_test_dataset['msa_md'].fillna(df_test_dataset['msa_md'].median(), inplace=True)
df_test_dataset['state_code'].fillna(df_test_dataset['state_code'].median(), inplace=True)
df_test_dataset['county_code'].fillna(df_test_dataset['county_code'].median(), inplace=True)
df_test_dataset['lender'].fillna(df_test_dataset['lender'].median(), inplace=True)

In [13]:
df_test_dataset.isnull().sum()

row_id                            0
loan_type                         0
property_type                     0
loan_purpose                      0
occupancy                         0
loan_amount                       0
preapproval                       0
msa_md                            0
state_code                        0
county_code                       0
applicant_ethnicity               0
applicant_race                    0
applicant_sex                     0
applicant_income                  0
population                        0
minority_population_pct           0
ffiecmedian_family_income         0
tract_to_msa_md_income_pct        0
number_of_owner-occupied_units    0
number_of_1_to_4_family_units     0
lender                            0
co_applicant                      0
dtype: int64

In [14]:
df_clean = df_test_dataset

In [15]:
print(df_clean['co_applicant'].unique())
def encode_string(cat_feature):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_feature)
    enc_cat_feature = enc.transform(cat_feature)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_feature.reshape(-1,1))
    return encoded.transform(enc_cat_feature.reshape(-1,1)).toarray()
    
categorical_columns= ['co_applicant']

for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_co_applicant = np.concatenate([temp], axis = 1)
#explorer categorical (bool) columns 
enc_co_applicant = ['co_applicant_True', 'co_applicant_False']
print(Features_co_applicant.shape)
print(Features_co_applicant[:10, :])

[ True False]
(500000, 2)
[[0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [16]:
print(df_clean['loan_type'].unique())
categorical_columns= ['loan_type']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_loan_type = np.concatenate([temp], axis = 1)
enc_loan_type = ['loan_type_conv','loan_type_FHA','loan_type_VA','loan_type_FSA_RHS']
print(Features_loan_type.shape)
print(Features_loan_type[:5, :]) 

[2 1 3 4]
(500000, 4)
[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]


In [17]:
print(df_clean['property_type'].unique())
categorical_columns= ['property_type']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_property_type = np.concatenate([temp], axis = 1)
enc_property_type = ['property_type_One_to_four_family','property_type_Manufactured_housing', 'property_type_Multifamily']
print(Features_property_type.shape)
print(Features_property_type[:5, :]) 

[1 2 3]
(500000, 3)
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [18]:
print(df_clean['loan_purpose'].unique())
categorical_columns= ['loan_purpose']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_loan_purpose = np.concatenate([temp], axis = 1)
enc_loan_purpose = ['loan_purpose_Home_purchase','loan_purpose_Home_improvement','loan_purpose_Refinancing']
print(Features_loan_purpose.shape)
print(Features_loan_purpose[:5, :]) 

[3 1 2]
(500000, 3)
[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [19]:
print(df_clean['occupancy'].unique())
categorical_columns= ['occupancy']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_occupancy = np.concatenate([temp], axis = 1)
enc_occupancy = ['occupancy_Owner_occupied','occupancy_Not_owner_occupied','occupancy_Not_applicable']
print(Features_occupancy.shape)
print(Features_occupancy[:5, :]) 

[1 2 3]
(500000, 3)
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [20]:
print(df_clean['preapproval'].unique())
categorical_columns= ['preapproval']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_preapproval = np.concatenate([temp], axis = 1)
enc_preapproval = ['preapproval_Preapproval_requested','preapproval_Preapproval_not_requested','preapproval_Not_applicable']
print(Features_preapproval.shape)
print(Features_preapproval[:5, :]) 

[3 2 1]
(500000, 3)
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [21]:
print(df_clean['applicant_ethnicity'].unique())
categorical_columns= ['applicant_ethnicity']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_applicant_ethnicity = np.concatenate([temp], axis = 1)
enc_applicant_ethnicity = ['applicant_ethnicity_Hispanic_Latino',
           'applicant_ethnicity_Not_Hispanic_Latino','applicant_ethnicity_Information_not_provided',
           'applicant_ethnicity_Not_applicable']
print(Features_applicant_ethnicity.shape)
print(Features_applicant_ethnicity[:5, :]) 

[2 3 1 4]
(500000, 4)
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [22]:
print(df_clean['applicant_race'].unique())
categorical_columns= ['applicant_race']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_applicant_race = np.concatenate([temp], axis = 1)
enc_applicant_race = ['applicant_race_American_Indian','applicant_race_Asian', 'applicant_race_African_American',
           'applicant_race_Native_Hawaiian','applicant_race_White','applicant_race_Information_not_provided',
           'applicant_race_Not_applicable']
print(Features_applicant_race.shape)
print(Features_applicant_race[:5, :]) 

[5 1 2 6 3 7 4]
(500000, 7)
[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]]


In [23]:
print(df_clean['applicant_sex'].unique())
categorical_columns= ['applicant_sex']
for col in categorical_columns:
    temp = encode_string(df_clean[col])
    Features_applicant_sex = np.concatenate([temp], axis = 1)
enc_applicant_sex = ['applicant_sex_Male','applicant_sex_Female', 'applicant_sex_Information_not_provided',
                 'applicant_sex_Not_applicable']
print(Features_applicant_sex.shape)
print(Features_applicant_sex[:5, :])

[1 2 3 4]
(500000, 4)
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [24]:
df_test_dataset['loan_amount_log'] = np.log((1 + df_test_dataset['loan_amount']))
#df_test_dataset['loan_amount_log'] = np.round(df_test_dataset['loan_amount_log'])
df_test_dataset['applicant_income_log'] = np.log((1 + df_test_dataset['applicant_income']))
#df_test_dataset['applicant_income_log'] = np.round(df_test_dataset['applicant_income_log'])
df_test_dataset['msa_md_log'] = np.log((1 + df_test_dataset['msa_md']))
#df_test_dataset['msa_md_log'] = np.round(df_test_dataset['msa_md_log'])
df_test_dataset['ffiecmedian_family_income_log'] = np.log((1 + df_test_dataset['ffiecmedian_family_income']))
#df_test_dataset['ffiecmedian_family_income_log'] = np.round(df_test_dataset['ffiecmedian_family_income_log'])
df_test_dataset['tract_to_msa_md_income_pct_log'] = np.log((1 + df_test_dataset['tract_to_msa_md_income_pct']))
#df_test_dataset['tract_to_msa_md_income_pct_log'] = np.round(df_test_dataset['tract_to_msa_md_income_pct_log'])
df_test_dataset['number_of_owner-occupied_units_log'] = np.log((1 + df_test_dataset['number_of_owner-occupied_units']))
#df_test_dataset['number_of_owner-occupied_units_log'] = np.round(df_test_dataset['number_of_owner-occupied_units_log'])
df_test_dataset['number_of_1_to_4_family_units_log'] = np.log((1 + df_test_dataset['number_of_1_to_4_family_units']))
#df_test_dataset['number_of_1_to_4_family_units_log'] = np.round(df_test_dataset['number_of_1_to_4_family_units_log'])
df_test_dataset['lender_log'] = np.log((1 + df_test_dataset['lender']))
#df_test_dataset['lender_log'] = np.round(df_test_dataset['lender_log'])
df_test_dataset['msa_md_log'] = np.log((1 + df_test_dataset['msa_md']))
#df_test_dataset['msa_md_log'] = np.round(df_test_dataset['msa_md_log'])
df_test_dataset['state_code_log'] = np.log((1 + df_test_dataset['state_code']))
#df_test_dataset['state_code_log'] = np.round(df_test_dataset['state_code_log'])
df_test_dataset['county_code_log'] = np.log((1 + df_test_dataset['county_code']))
#df_test_dataset['county_code_log'] = np.round(df_test_dataset['county_code_log'])
df_test_dataset['minority_population_pct_log'] = np.log((1 + df_test_dataset['minority_population_pct']))
#df_test_dataset['minority_population_pct_log'] = np.round(df_test_dataset['minority_population_pct_log'])
df_test_dataset['population_log'] = np.log((1 + df_test_dataset['population']))
#df_test_dataset['population_log'] = np.round(df_test_dataset['population_log'])

In [25]:
# To see and get number columns
category_num_cols_logs =(df_clean.dtypes == float) | (df_clean.dtypes==np.int64)
raw_category_num_cols_logs = [c for c in category_num_cols_logs.index if category_num_cols_logs[c]]
raw_category_num_cols_logs

['row_id',
 'loan_type',
 'property_type',
 'loan_purpose',
 'occupancy',
 'loan_amount',
 'preapproval',
 'msa_md',
 'state_code',
 'county_code',
 'applicant_ethnicity',
 'applicant_race',
 'applicant_sex',
 'applicant_income',
 'population',
 'minority_population_pct',
 'ffiecmedian_family_income',
 'tract_to_msa_md_income_pct',
 'number_of_owner-occupied_units',
 'number_of_1_to_4_family_units',
 'lender',
 'loan_amount_log',
 'applicant_income_log',
 'msa_md_log',
 'ffiecmedian_family_income_log',
 'tract_to_msa_md_income_pct_log',
 'number_of_owner-occupied_units_log',
 'number_of_1_to_4_family_units_log',
 'lender_log',
 'state_code_log',
 'county_code_log',
 'minority_population_pct_log',
 'population_log']

In [26]:
cols = raw_category_num_cols_logs + enc_co_applicant + enc_loan_type + enc_property_type + enc_loan_purpose + enc_occupancy + enc_preapproval + enc_applicant_ethnicity + enc_applicant_race + enc_applicant_sex
#cols = raw_category_num_cols_logs
len(cols)
cols

['row_id',
 'loan_type',
 'property_type',
 'loan_purpose',
 'occupancy',
 'loan_amount',
 'preapproval',
 'msa_md',
 'state_code',
 'county_code',
 'applicant_ethnicity',
 'applicant_race',
 'applicant_sex',
 'applicant_income',
 'population',
 'minority_population_pct',
 'ffiecmedian_family_income',
 'tract_to_msa_md_income_pct',
 'number_of_owner-occupied_units',
 'number_of_1_to_4_family_units',
 'lender',
 'loan_amount_log',
 'applicant_income_log',
 'msa_md_log',
 'ffiecmedian_family_income_log',
 'tract_to_msa_md_income_pct_log',
 'number_of_owner-occupied_units_log',
 'number_of_1_to_4_family_units_log',
 'lender_log',
 'state_code_log',
 'county_code_log',
 'minority_population_pct_log',
 'population_log',
 'co_applicant_True',
 'co_applicant_False',
 'loan_type_conv',
 'loan_type_FHA',
 'loan_type_VA',
 'loan_type_FSA_RHS',
 'property_type_One_to_four_family',
 'property_type_Manufactured_housing',
 'property_type_Multifamily',
 'loan_purpose_Home_purchase',
 'loan_purpose_Ho

In [None]:
df_clean.head()

In [None]:
df_test_enc= np.concatenate([df_clean[raw_category_num_cols_logs],Features_co_applicant,Features_loan_type,Features_property_type,Features_loan_purpose,Features_occupancy,Features_preapproval,Features_applicant_ethnicity,Features_applicant_race,Features_applicant_sex],axis=1)
df_test_enc = pd.DataFrame(df_test_enc, columns=cols)

In [None]:
cols_of_interest = ['loan_amount_log',
 'applicant_income_log',
 'msa_md_log',
 'ffiecmedian_family_income_log',
 'tract_to_msa_md_income_pct_log',
 'number_of_owner-occupied_units_log',
 'number_of_1_to_4_family_units_log',
 'lender_log',
 'state_code_log',
 'county_code_log',
 'minority_population_pct_log',
 'population_log',
 'co_applicant_True',
 'co_applicant_False',
 'loan_type_conv',
 'loan_type_FHA',
 'loan_type_VA',
 'loan_type_FSA_RHS',
 'property_type_One_to_four_family',
 'property_type_Manufactured_housing',
 'property_type_Multifamily',
 'loan_purpose_Home_purchase',
 'loan_purpose_Home_improvement',
 'loan_purpose_Refinancing',
 'occupancy_Owner_occupied',
 'occupancy_Not_owner_occupied',
 'occupancy_Not_applicable',
 'preapproval_Preapproval_requested',
 'preapproval_Preapproval_not_requested',
 'preapproval_Not_applicable',
 'applicant_ethnicity_Hispanic_Latino',
 'applicant_ethnicity_Not_Hispanic_Latino',
 'applicant_ethnicity_Information_not_provided',
 'applicant_ethnicity_Not_applicable',
 'applicant_race_American_Indian',
 'applicant_race_Asian',
 'applicant_race_African_American',
 'applicant_race_Native_Hawaiian',
 'applicant_race_White',
 'applicant_race_Information_not_provided',
 'applicant_race_Not_applicable',
 'applicant_sex_Male',
 'applicant_sex_Female',
 'applicant_sex_Information_not_provided',
 'applicant_sex_Not_applicable']

In [None]:
#select the cols of interest and assign back to the df:
df_test_enc = df_test_enc[cols_of_interest]

In [None]:
# convert all DataFrame columns to the int64 dtype
#df_test_enc = df_test_enc.astype(int)
#df_test_enc.dtypes

In [None]:
df_test_enc.head()

In [None]:
df_test_enc.to_csv('Data/df_test_enc_2.csv')

In [None]:
df_test_enc.head()

In [None]:
df_test_enc.shape

### transforming numerical data

In [None]:
df_test_trf = pd.read_csv('Data/df_test_enc_2.csv')

In [None]:
data_norm = preprocessing.normalize(df_test_trf[categorical], axis = 1)

In [None]:
df_test_trf = np.concatenate([data_norm,Features],axis=1)

In [None]:
df_test_trf = pd.DataFrame(df_test_trf, columns=cols)
df_test_trf.to_csv('Data/df_test_trf.csv')

In [None]:
df_test_trf.shape

In [None]:
df_test_trf.head()

In [7]:

# import the packages
import numpy as np
import pandas as pd
import category_encoders as ce

# make some data
df = pd.DataFrame({
 'color':["a", "b", "a", "c"],
 'left':["a", "b", "a", "c"],
 'outcome':[1, 2, 3, 2]})

# split into X and y
X = df.drop('outcome', axis = 1)
y = df.drop(['color','left'], axis = 1)

# instantiate an encoder - here we use Binary()
ce_binary = ce.BinaryEncoder(cols = ['left','color'])

# fit and transform and presto, you've got encoded data
ce_binary.fit_transform(X, y)

Unnamed: 0,left_0,left_1,color_0,color_1
0,0,0,0,0
1,0,1,0,1
2,0,0,0,0
3,1,0,1,0


In [4]:
from sklearn.preprocessing import RobustScaler,Normalizer, MinMaxScaler,FunctionTransformer, PolynomialFeatures, Imputer
df_test_values_trf = pd.read_csv('Data/df_test_enc_2.csv')
# df_test_values_trf = preprocessing.normalize(df_test_values_trf, axis =0)
# #df_trf = df_trf.astype(int)
# #df_trf = df_trf.round()
# #df_enc.dtypes
# df_test_values_trf = pd.DataFrame(df_test_values_trf,columns = df_trf.columns)


# #df_test_values_trf = clean_dataset(df_test_values_trf)
col_names = df_test_values_trf.columns
#features = df_test_values_trf[col_names]
imp = Imputer().fit(df_test_values_trf)
features = imp.transform(df_test_values_trf)

scaler = RobustScaler().fit(features)
features = scaler.transform(features)
df_test_values_trf[col_names] = features
test_values = df_test_values_trf.drop(['Unnamed: 0'],axis=1)
test_values.head()
test_values=np.array(test_values)

In [5]:
test_values

array([[-3.25838160e-01, -9.89562556e+02,  2.12586543e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.21129035e-01,  4.41919275e-01, -6.58977049e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.86964637e-01,  5.45843993e-01, -4.15407094e+03, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-4.68009197e-01, -7.27547330e-01, -2.03095859e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.71702373e-01,  6.24795920e-01, -3.09134342e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.94837221e-01,  1.45904352e-01, -9.33529738e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [4]:
testing = pd.read_csv('Data/test_values.csv', index_col = "row_id")
testing.head()

Unnamed: 0_level_0,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,applicant_ethnicity,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,1,3,1,115.0,3,101,16,276,2,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,True
1,1,1,1,1,252.0,2,87,20,68,2,...,1,107.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,True
2,1,1,1,1,270.0,1,-1,-1,-1,2,...,2,119.0,,,,,,,4701,False
3,2,1,1,1,179.0,2,376,20,11,2,...,2,44.0,4795.0,29.676,57766.0,100.0,1426.0,1765.0,2153,True
4,2,1,1,1,36.0,2,254,48,156,3,...,3,32.0,5246.0,5.11,63332.0,100.0,1452.0,2092.0,5710,False
