# Importing required libraries

In [2]:
# Load Keras libraries used in this example

import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.utils import plot_model
from keras.models import load_model
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [3]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random as rn
from keras import backend as K
from sklearn.model_selection import train_test_split

import os
import boto3
import sys

if sys.version_info[0] < 3: 
    from StringIO import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x
    

import psutil
process = psutil.Process(os.getpid())

In [5]:
from sklearn.feature_selection import RFECV, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Data is fetched from the url and saved in the hard disk.

--- will be done in production environment ---

# Deciding Chunk Size on the basis of available RAM

--- will be done in production environment --- <br>
Assuming the chunk size to be 1000.

# Splitting the data randomly into train and test for feature selection

In the production script, every bash command below would be run through the following call. <br>
os.system('---command---')

In [6]:
# Getting the total rows in the df

In [7]:
%%bash
# Count the lines of the full CSV. Feeding the file through stdin makes wc
# print only the number, without the file name.
# NOTE(review): the count presumably includes the header line - confirm.
wc -l < data/large_house_data.csv

23217


In [8]:
# Fraction (not percentage) of rows to sample for the test set: the assumed
# chunk size (1000) divided by the line count reported by `wc -l` above (23217).
1000/23217

0.04307188697936857

In [9]:
# Getting the test_sample on the basis of pct_to_split

In [10]:
%%bash
# Draw a random ~4.3% sample of the non-empty lines as the test set; the header
# (first line) is always kept. The seed is fixed so that re-running the notebook
# reproduces the same train/test split, and awk reads the file directly
# instead of going through `cat`.
awk 'BEGIN {srand(42)} !/^$/ { if (rand() <= 0.043 || FNR==1) print $0}' data/large_house_data.csv > data/large_house_data_test.csv

In [11]:
# Getting the remaining train data except the test_sample

In [12]:
%%bash
# First pass (test file): remember every line as a key of `exclude`.
# Second pass (full file): emit the header plus every line not seen in the test file.
awk '
    NR == FNR { exclude[$0]; next }
    FNR == 1 || !($0 in exclude)
' data/large_house_data_test.csv data/large_house_data.csv > data/large_house_data_train.csv

# Pre-processing the test data and again saving it to the hard disk

Assuming the testing data is already pre-processed.

# RFE - Automated Feature Selection

In [17]:
def rfe_feat_selection(train_x,train_y,label_col,test_path='data/large_house_data_test.csv'):
    """Pick the best-performing feature subset for one training chunk via RFE.

    For each percentage in 10, 20, ..., 90 of the available columns, a
    RandomForestRegressor-backed RFE selector is fitted on the chunk and
    scored (mean squared error) on the held-out test CSV. The column names
    of the run with the lowest error are returned.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Feature columns of the current chunk (its `.columns` are used to
        name the selected features).
    train_y : array-like
        Target values aligned with `train_x`.
    label_col : str
        Name of the target column inside the test CSV.
    test_path : str, optional
        Path of the held-out test CSV. Defaults to the file written by the
        split step of this notebook.

    Returns
    -------
    list
        Column labels of `train_x` kept by the best RFE run.

    Raises
    ------
    ValueError
        If `train_x` has no columns.
    """
    testing_data = pd.read_csv(test_path)
    testing_data_X = testing_data.loc[:,testing_data.columns != label_col]
    testing_data_y = testing_data[label_col]

    no_cols = train_x.shape[1]
    if no_cols == 0:
        raise ValueError('train_x has no feature columns to select from')

    # Drop roughly 1% of the columns per elimination round.
    step = int(np.ceil(no_cols / 100))
    print('Step Value:',step)

    estimator = RandomForestRegressor(warm_start=True, random_state=42,n_jobs=-1)

    # pct -> (test error, selected column labels, number of features kept)
    result_dict = {}
    tried_counts = set()

    for pct in range(10,100,10):

        # With fewer than 10 columns the raw count rounds down to 0, which is
        # not a valid RFE target; keep at least one feature.
        n_features = max(1, int((pct * no_cols) / 100))

        # Narrow frames map several percentages to the same count; fitting
        # them again would only repeat an identical, expensive run.
        if n_features in tried_counts:
            continue
        tried_counts.add(n_features)

        print('Fitting with {} features'.format(n_features))
        selector = RFE(estimator,step=step,n_features_to_select=n_features,verbose=0)
        selector = selector.fit(train_x, train_y)
        predicted_Y = selector.predict(testing_data_X)

        error = mean_squared_error(testing_data_y,predicted_Y)

        # support_ is a boolean mask over the training columns.
        selected_cols = list(train_x.columns[selector.support_])

        result_dict[pct] = (error, selected_cols, n_features)

    # Compare on the error alone; on a tie the smallest percentage wins
    # because min() keeps the first minimum it encounters.
    best_pct = min(result_dict, key=lambda p: result_dict[p][0])
    selected_cols = result_dict[best_pct][1]
    print('Best results with {} features for current chunk'.format(result_dict[best_pct][2]))

    return selected_cols

# Loading training data chunk-wise and getting the best features

In [18]:
# One entry per training chunk: the feature names RFE selected for that chunk.
list_of_features = []
label_col = 'SalePrice'

# Stream the training CSV in chunks of 1000 rows so the whole file never has
# to sit in memory at once.
for num,chunk in enumerate(pd.read_csv('data/large_house_data_train.csv',iterator=True,chunksize=1000)):
    print('Current Chunk Number: ',num+1)

    # Split the chunk into features and target.
    X = chunk.loc[:,chunk.columns != label_col]
    y = chunk[label_col]

    # Pass the same label column that was used for the split above, so that
    # changing `label_col` cannot leave the two out of sync.
    chunk_feature_list = rfe_feat_selection(X,y,label_col=label_col)

    list_of_features.append(chunk_feature_list)
    

Current Chunk Number:  1
Step Value: 12
Fitting with 114 features
Fitting with 229 features
Fitting with 344 features
Fitting with 459 features
Fitting with 574 features
Fitting with 688 features
Fitting with 803 features
Fitting with 918 features
Fitting with 1033 features
Best results with 229 features for current chunk
Current Chunk Number:  2
Step Value: 12
Fitting with 114 features
Fitting with 229 features
Fitting with 344 features
Fitting with 459 features
Fitting with 574 features
Fitting with 688 features
Fitting with 803 features
Fitting with 918 features
Fitting with 1033 features
Best results with 574 features for current chunk
Current Chunk Number:  3
Step Value: 12
Fitting with 114 features
Fitting with 229 features
Fitting with 344 features
Fitting with 459 features
Fitting with 574 features
Fitting with 688 features
Fitting with 803 features
Fitting with 918 features
Fitting with 1033 features
Best results with 114 features for current chunk
Current Chunk Number:  4
Ste

In [21]:
len(list_of_features)

12