# Feature Extraction
The goal of this notebook is to clean up and preprocess the application data, then extract the resulting features into a separate CSV file.

In [12]:
import numpy as np
import pandas as pd

from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler, LabelBinarizer
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
    # SimpleImputer is its replacement and accepts the same `strategy` argument,
    # so it is bound to the legacy name to keep the rest of the notebook unchanged.
    from sklearn.impute import SimpleImputer as Imputer

# Pretty display for notebooks
%matplotlib inline

In [3]:
# Read the raw train and test application tables (zipped CSVs) into DataFrames.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/application_train.csv.zip',
        'input/application_test.csv.zip',
    )
)

In [4]:
# Remove the label column so that train and test both end up with 121 columns
# before they are stacked. errors='ignore' keeps this cell re-runnable: without
# it, executing the cell a second time raises KeyError because TARGET has
# already been dropped from app_train.
app_train = app_train.drop(columns=['TARGET'], errors='ignore')
print(app_train.shape)
print(app_test.shape)

(307511, 121)
(48744, 121)


Concatenating the train and test dataframes so that preprocessing is applied to both simultaneously.

In [5]:
# Stack the train rows on top of the test rows so every later transformation
# is applied to both at once.
# NOTE(review): each frame keeps its own row labels, so index labels repeat in
# `app`; prefer positional slicing over .loc when splitting it back apart.
app = pd.concat(objs=[app_train, app_test], axis=0)
display(app.iloc[:2])

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Row count should equal train rows + test rows; the column count is unchanged.
app.shape

(356255, 121)

## Categorical Features

In [7]:
# One-hot encode the categorical (object-typed) columns; numeric columns pass
# through untouched. Encoding the combined frame gives train and test the same
# set of dummy columns.
app = pd.get_dummies(data=app)

In [8]:
# Peek at the first two rows of the encoded frame.
display(app.iloc[:2])

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,0,1,0,0,0,0,0,0,1,0


In [9]:
# Tally the columns by dtype after one-hot encoding, then show the new frame shape.
print(app.dtypes.value_counts())
app.shape

uint8      140
float64     65
int64       40
dtype: int64


(356255, 245)

## Continuous Features Normalization

In [13]:
# Fill missing values in every float64 column with that column's median.
# Only float64 columns are selected; integer and dummy columns are left as-is.
# NOTE(review): the imputer is fitted on the combined train+test frame, so
# test rows contribute to the medians -- confirm this is intended.
numerical = list(app.select_dtypes(include='float64').columns)
imputer = Imputer(strategy='median')

app[numerical] = imputer.fit(app[numerical]).transform(app[numerical])

In [14]:
# Rescale the imputed float64 columns to the [0, 1] range (MinMaxScaler's
# default, spelled out here). Columns outside `numerical` are not scaled.
scaler = MinMaxScaler(feature_range=(0, 1))
app[numerical] = scaler.fit_transform(app[numerical])
display(app.iloc[:2])

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,0,0.001512,0.090287,0.090032,0.077441,0.256702,-9461,-637,0.85214,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0.002089,0.311736,0.132924,0.271605,0.045506,-16765,-1188,0.951929,...,0,1,0,0,0,0,0,0,1,0
