In [14]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Read the training data
import os
train_file_path = os.getcwd() + '/house-prices-advanced-regression-techniques/train.csv'
test_file_path = os.getcwd() + '/house-prices-advanced-regression-techniques/test.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

In [15]:
# Print shape of the data
print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


Traing data has 81 columns which is one more than the column number of test data. Extra column is target variable, 'SalePrice'

Make some data preprocessings.

Check for missing values

In [42]:
train_data_features_with_null = []
for feature in train_data:
    if train_data[feature].isna().sum() > 0:
        train_data_features_with_null.append(feature)

test_data_features_with_null = []
for feature in test_data:
    if test_data[feature].isna().sum() > 0:
        test_data_features_with_null.append(feature)

print('# of features with null values in training data :', len(train_data_features_with_null))
print('Features with null values in training data :', *train_data_features_with_null,sep='\n')
print('\n\n')
print('# of features with null values in test data :', len(test_data_features_with_null))
print('Features with null values in test data :', *test_data_features_with_null,sep='\n')

# of features with null values in training data: 19
Features with null values in training data:
LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature



# of features with null values in test data: 33
Features with null values in test data:
MSZoning
LotFrontage
Alley
Utilities
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
BsmtFullBath
BsmtHalfBath
KitchenQual
Functional
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PoolQC
Fence
MiscFeature
SaleType


Check for the features which have null values in both training and test data.

In [43]:
common_features_with_null = []
for feature in train_data_features_with_null:
    if feature in test_data_features_with_null:
        common_features_with_null.append(feature)

print('# of common features containing null values :', len(common_features_with_null))

print('Common features containing null values :', *common_features_with_null, sep = '\n')

# of common features containing null values : 18
Common features containing null values :
LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


Calculate percentage of missing values in training data.

In [138]:
nan_percent_train = {col : train_data[col].isnull().mean() for col in train_data.columns}
nan_percent_train = dict(sorted(nan_percent.items(), key = lambda x: x[1], reverse = True))
# Remove features with 0 percent nan values from this list.
new_nan_percent_train = {}
for (key, value) in nan_percent_train.items():
    if value > 0:
        new_nan_percent_train[key] = value
nan_percent_train = new_nan_percent_train

# Print these features
for key in nan_percent_train.keys():
    print(key,' \t\t: ',nan_percent_train[key])

PoolQC  		:  0.9952054794520548
MiscFeature  		:  0.963013698630137
Alley  		:  0.9376712328767123
Fence  		:  0.8075342465753425
FireplaceQu  		:  0.4726027397260274
LotFrontage  		:  0.1773972602739726
GarageType  		:  0.05547945205479452
GarageYrBlt  		:  0.05547945205479452
GarageFinish  		:  0.05547945205479452
GarageQual  		:  0.05547945205479452
GarageCond  		:  0.05547945205479452
BsmtExposure  		:  0.026027397260273973
BsmtFinType2  		:  0.026027397260273973
BsmtQual  		:  0.025342465753424658
BsmtCond  		:  0.025342465753424658
BsmtFinType1  		:  0.025342465753424658
MasVnrType  		:  0.005479452054794521
MasVnrArea  		:  0.005479452054794521
Electrical  		:  0.0006849315068493151


Calculate percentage of missing values in training data.

In [139]:
nan_percent_test = {col : train_data[col].isnull().mean() for col in train_data.columns}
nan_percent_test = dict(sorted(nan_percent_test.items(), key = lambda x: x[1], reverse = True))
# Remove features with 0 percent nan values from this list.
new_nan_percent_test = {}
for (key, value) in nan_percent_test.items():
    if value > 0:
        new_nan_percent_test[key] = value
nan_percent_test = new_nan_percent_test

# Print these features
for key in nan_percent_test.keys():
    print(key,' \t\t: ',nan_percent_test[key])

PoolQC  		:  0.9952054794520548
MiscFeature  		:  0.963013698630137
Alley  		:  0.9376712328767123
Fence  		:  0.8075342465753425
FireplaceQu  		:  0.4726027397260274
LotFrontage  		:  0.1773972602739726
GarageType  		:  0.05547945205479452
GarageYrBlt  		:  0.05547945205479452
GarageFinish  		:  0.05547945205479452
GarageQual  		:  0.05547945205479452
GarageCond  		:  0.05547945205479452
BsmtExposure  		:  0.026027397260273973
BsmtFinType2  		:  0.026027397260273973
BsmtQual  		:  0.025342465753424658
BsmtCond  		:  0.025342465753424658
BsmtFinType1  		:  0.025342465753424658
MasVnrType  		:  0.005479452054794521
MasVnrArea  		:  0.005479452054794521
Electrical  		:  0.0006849315068493151


Drop the features with percentage of nan values greater than 50.

In [140]:
high_nan_percent = {}
for (key, value) in nan_percent_train.items():
    if value > 0.5 or nan_percent_test[key] > 0.5:
        high_nan_percent[key] = value

In [141]:
reduced_train_data = train_data.drop(list(high_nan_percent), axis = 'columns')
reduced_test_data = test_data.drop(list(high_nan_percent), axis = 'columns')
#reduced_test_data = test_data.copy()
#for (key, value) in nan_percent:
#    if value > 0.5:
#        reduced_train_data.drop

In [142]:
print(reduced_train_data.shape)
print(reduced_test_data.shape)

(1460, 77)
(1459, 76)
