# Feature Engineering

The goal of this notebook is see how to clean the data and to do the feature engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
test_df = pd.read_csv('../data/raw/application_test.csv')
train_df = pd.read_csv('../data/raw/application_train.csv')

In [3]:
print(train_df.shape, test_df.shape)

(307511, 122) (48744, 121)


## Delete missing values 

In [4]:
# checking missing data in train_df

number = train_df.isnull().sum().sort_values(ascending = False)
percent = (train_df.isnull().sum() / train_df.isnull().count() * 100).sort_values(ascending = False)

missing_train_df = pd.concat([number , percent] , axis = 1 , keys = ['Total' , 'Percent'])
print(missing_train_df.shape)
print(missing_train_df.head(40))

(122, 2)
                           Total    Percent
COMMONAREA_MEDI           214865  69.872297
COMMONAREA_AVG            214865  69.872297
COMMONAREA_MODE           214865  69.872297
NONLIVINGAPARTMENTS_MODE  213514  69.432963
NONLIVINGAPARTMENTS_MEDI  213514  69.432963
NONLIVINGAPARTMENTS_AVG   213514  69.432963
FONDKAPREMONT_MODE        210295  68.386172
LIVINGAPARTMENTS_MEDI     210199  68.354953
LIVINGAPARTMENTS_MODE     210199  68.354953
LIVINGAPARTMENTS_AVG      210199  68.354953
FLOORSMIN_MEDI            208642  67.848630
FLOORSMIN_MODE            208642  67.848630
FLOORSMIN_AVG             208642  67.848630
YEARS_BUILD_MEDI          204488  66.497784
YEARS_BUILD_AVG           204488  66.497784
YEARS_BUILD_MODE          204488  66.497784
OWN_CAR_AGE               202929  65.990810
LANDAREA_MODE             182590  59.376738
LANDAREA_AVG              182590  59.376738
LANDAREA_MEDI             182590  59.376738
BASEMENTAREA_MEDI         179943  58.515956
BASEMENTAREA_AVG       

In [5]:
# checking missing data in test_df
number = test_df.isnull().sum().sort_values(ascending = False)
percent = (test_df.isnull().sum() / test_df.isnull().count() * 100).sort_values(ascending = False)

missing_test_df = pd.concat([number , percent] , axis = 1 , keys = ['Total' , 'Percent'])
print(missing_test_df.shape)
print(missing_test_df.head(40))

(121, 2)
                          Total    Percent
COMMONAREA_MEDI           33495  68.716150
COMMONAREA_AVG            33495  68.716150
COMMONAREA_MODE           33495  68.716150
NONLIVINGAPARTMENTS_MODE  33347  68.412523
NONLIVINGAPARTMENTS_MEDI  33347  68.412523
NONLIVINGAPARTMENTS_AVG   33347  68.412523
FONDKAPREMONT_MODE        32797  67.284179
LIVINGAPARTMENTS_AVG      32780  67.249302
LIVINGAPARTMENTS_MEDI     32780  67.249302
LIVINGAPARTMENTS_MODE     32780  67.249302
FLOORSMIN_MEDI            32466  66.605121
FLOORSMIN_MODE            32466  66.605121
FLOORSMIN_AVG             32466  66.605121
OWN_CAR_AGE               32312  66.289184
YEARS_BUILD_MEDI          31818  65.275726
YEARS_BUILD_MODE          31818  65.275726
YEARS_BUILD_AVG           31818  65.275726
LANDAREA_AVG              28254  57.964057
LANDAREA_MODE             28254  57.964057
LANDAREA_MEDI             28254  57.964057
BASEMENTAREA_AVG          27641  56.706466
BASEMENTAREA_MODE         27641  56.706466
BA

In [6]:
train_df['TARGET'].unique

<bound method Series.unique of 0         1
1         0
2         0
3         0
4         0
         ..
307506    0
307507    0
307508    0
307509    1
307510    0
Name: TARGET, Length: 307511, dtype: int64>

In [7]:
#we drop columns with more than 60% missing values
def dropna(df):
    mv=df.isna().sum()/df.shape[0]
    val=mv[mv>0.60]
    l=[i for i in val.index]
    dat=df.drop(l,axis=1)
    return dat

In [8]:
train_df=dropna(train_df)

In [9]:
test_df=dropna(test_df)

In [10]:
print(train_df.shape, test_df.shape)

(307511, 105) (48744, 104)


In [21]:
for col in test_df.select_dtypes(include=[object]).columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode(dropna=True)[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode(dropna=True)[0])

In [12]:
for col in test_df.select_dtypes(include=[int,float]).columns:
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(test_df[col].median())

In [18]:
train_df.isnull().sum().sum()

0

In [19]:
test_df.isnull().sum().sum()

0

In [13]:
print(train_df.shape, test_df.shape)

(307511, 105) (48744, 104)


In [14]:
# Use dummies
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)
target = train_df['TARGET']

train_df, test_df = train_df.align(test_df, join = 'inner', axis = 1)
train_df['TARGET'] = target

In [15]:
print(train_df.shape, test_df.shape)

(307511, 223) (48744, 222)


In [16]:
train_df.to_csv(r'../data/processed/train.csv')

In [17]:
test_df.to_csv(r'../data/processed/test.csv')