In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import mlflow
import mlflow.sklearn

# Read the training data
import os
train_file_path = os.getcwd() + '/house-prices-advanced-regression-techniques/train.csv'
test_file_path = os.getcwd() + '/house-prices-advanced-regression-techniques/test.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)


In [2]:
mlflow.end_run()

In [3]:
mlflow.start_run()

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



<ActiveRun: >

In [4]:
# Print shape of the data
print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


Traing data has 81 columns which is one more than the column number of test data. Extra column is target variable, 'SalePrice'

In [5]:
y = train_data.SalePrice

Make data preprocessings.

Check for missing values

In [6]:
train_data_features_with_null = []
for feature in train_data:
    if train_data[feature].isna().sum() > 0:
        train_data_features_with_null.append(feature)

test_data_features_with_null = []
for feature in test_data:
    if test_data[feature].isna().sum() > 0:
        test_data_features_with_null.append(feature)

print('# of features with null values in training data :', len(train_data_features_with_null))
print('Features with null values in training data :', *train_data_features_with_null,sep='\n')
print('\n\n')
print('# of features with null values in test data :', len(test_data_features_with_null))
print('Features with null values in test data :', *test_data_features_with_null,sep='\n')

# of features with null values in training data : 19
Features with null values in training data :
LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature



# of features with null values in test data : 33
Features with null values in test data :
MSZoning
LotFrontage
Alley
Utilities
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
BsmtFullBath
BsmtHalfBath
KitchenQual
Functional
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PoolQC
Fence
MiscFeature
SaleType


Check for the features which have null values in both training and test data.

In [7]:
common_features_with_null = []
for feature in train_data_features_with_null:
    if feature in test_data_features_with_null:
        common_features_with_null.append(feature)

print('# of common features containing null values :', len(common_features_with_null))

print('Common features containing null values :', *common_features_with_null, sep = '\n')

# of common features containing null values : 18
Common features containing null values :
LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


Calculate percentages of missing values in training data.

In [8]:
nan_percent_train = {col : train_data[col].isnull().mean() for col in train_data.columns}
nan_percent_train = dict(sorted(nan_percent_train.items(), key = lambda x: x[1], reverse = True))
# Remove features with 0 percent nan values from this list.
new_nan_percent_train = {}
for (key, value) in nan_percent_train.items():
    if value > 0:
        new_nan_percent_train[key] = value
nan_percent_train = new_nan_percent_train

# Print these features
for key in nan_percent_train.keys():
    print(key,' \t\t: ',nan_percent_train[key])

PoolQC  		:  0.9952054794520548
MiscFeature  		:  0.963013698630137
Alley  		:  0.9376712328767123
Fence  		:  0.8075342465753425
FireplaceQu  		:  0.4726027397260274
LotFrontage  		:  0.1773972602739726
GarageType  		:  0.05547945205479452
GarageYrBlt  		:  0.05547945205479452
GarageFinish  		:  0.05547945205479452
GarageQual  		:  0.05547945205479452
GarageCond  		:  0.05547945205479452
BsmtExposure  		:  0.026027397260273973
BsmtFinType2  		:  0.026027397260273973
BsmtQual  		:  0.025342465753424658
BsmtCond  		:  0.025342465753424658
BsmtFinType1  		:  0.025342465753424658
MasVnrType  		:  0.005479452054794521
MasVnrArea  		:  0.005479452054794521
Electrical  		:  0.0006849315068493151


Calculate percentage of missing values in test data.

In [9]:
nan_percent_test = {col : test_data[col].isnull().mean() for col in test_data.columns}
nan_percent_test = dict(sorted(nan_percent_test.items(), key = lambda x: x[1], reverse = True))
# Remove features with 0 percent nan values from this list.
new_nan_percent_test = {}
for (key, value) in nan_percent_test.items():
    if value > 0:
        new_nan_percent_test[key] = value
nan_percent_test = new_nan_percent_test

# Print these features
for key in nan_percent_test.keys():
    print(key,' \t\t: ',nan_percent_test[key])

PoolQC  		:  0.997943797121316
MiscFeature  		:  0.9650445510623715
Alley  		:  0.9266620973269363
Fence  		:  0.8012337217272104
FireplaceQu  		:  0.5003427004797807
LotFrontage  		:  0.15558601782042494
GarageYrBlt  		:  0.053461274845784786
GarageFinish  		:  0.053461274845784786
GarageQual  		:  0.053461274845784786
GarageCond  		:  0.053461274845784786
GarageType  		:  0.0520904729266621
BsmtCond  		:  0.030843043180260453
BsmtQual  		:  0.03015764222069911
BsmtExposure  		:  0.03015764222069911
BsmtFinType1  		:  0.02878684030157642
BsmtFinType2  		:  0.02878684030157642
MasVnrType  		:  0.010966415352981495
MasVnrArea  		:  0.01028101439342015
MSZoning  		:  0.0027416038382453737
Utilities  		:  0.0013708019191226869
BsmtFullBath  		:  0.0013708019191226869
BsmtHalfBath  		:  0.0013708019191226869
Functional  		:  0.0013708019191226869
Exterior1st  		:  0.0006854009595613434
Exterior2nd  		:  0.0006854009595613434
BsmtFinSF1  		:  0.0006854009595613434
BsmtFinSF2  		:  0.0006854

Drop the features with percentage of nan values greater than 50.

In [10]:
high_nan_percent = {}
for (key, value) in nan_percent_train.items():
    if value > 0.5:
        high_nan_percent[key] = value
        
for (key, value) in nan_percent_test.items():
    if value > 0.5:
        high_nan_percent[key] = value

# Remove the keys from full lists
for key in high_nan_percent.keys():
    nan_percent_train.pop(key)
    nan_percent_test.pop(key)

In [11]:
mlflow.log_param("Parameters with high percentage of NaN values", high_nan_percent.keys())
mlflow.log_param("Percentages of NaN values", high_nan_percent.values())

dict_values([0.997943797121316, 0.9650445510623715, 0.9266620973269363, 0.8012337217272104, 0.5003427004797807])

Drop the features with high percentage of NaN values

In [12]:
reduced_train_data = train_data.drop(list(high_nan_percent), axis = 'columns')
reduced_test_data = test_data.drop(list(high_nan_percent), axis = 'columns')

Impute training data and test data. Numeric features will be imputed by "mean" while categorical features will be imputed by "mode" value.

In [13]:
print(reduced_train_data.shape)
print(y.shape)

(1460, 76)
(1460,)


In [14]:
from sklearn.impute import SimpleImputer
my_numeric_imputer = SimpleImputer()
my_categorical_imputer = SimpleImputer(strategy = 'most_frequent')

# Before imputation, select training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(reduced_train_data, y, train_size=0.8,
                                                      test_size=0.2, random_state=0)
X_test = reduced_test_data.copy()

# Seperate numerical and categorical features

num_X_train = X_train.select_dtypes(exclude = 'object')
num_X_valid = X_valid.select_dtypes(exclude = 'object')
num_X_test = reduced_test_data.select_dtypes(exclude = 'object')

cat_X_train = X_train.select_dtypes(include = 'object')
cat_X_valid = X_valid.select_dtypes(include = 'object')
cat_X_test = reduced_test_data.select_dtypes(include = 'object')

# Impute missing values in numerical data with mean values
imputed_num_X_train = pd.DataFrame(my_numeric_imputer.fit_transform(num_X_train))
imputed_num_X_valid = pd.DataFrame(my_numeric_imputer.transform(num_X_valid))
imputed_num_X_test = pd.DataFrame(my_numeric_imputer.transform(num_X_test))

imputed_cat_X_train = pd.DataFrame(my_categorical_imputer.fit_transform(cat_X_train))
imputed_cat_X_valid = pd.DataFrame(my_categorical_imputer.transform(cat_X_valid))
imputed_cat_X_test = pd.DataFrame(my_categorical_imputer.transform(cat_X_test))

Check if there is any missing value

print("# of missing values in imputed_num_X_train\t:", imputed_num_X_train.isnull().sum().sum() )
print("# of missing values in imputed_num_X_valid\t:", imputed_num_X_valid.isnull().sum().sum() )
print("# of missing values in imputed_num_X_test\t:", imputed_num_X_test.isnull().sum().sum() )
print("# of missing values in imputed_cat_X_train\t:", imputed_cat_X_train.isnull().sum().sum() )
print("# of missing values in imputed_cat_X_valid\t:", imputed_cat_X_valid.isnull().sum().sum() )
print("# of missing values in imputed_cat_X_test\t:", imputed_cat_X_test.isnull().sum().sum() )

print(imputed_num_X_train.shape)
print(imputed_cat_X_train.shape)
print(imputed_num_X_test.shape)
print(imputed_cat_X_test.shape)
print(imputed_num_X_valid.shape)
print(imputed_cat_X_valid.shape)

Now, concatenate numerical and categorical features

imputed_X_train = pd.concat([imputed_num_X_train, imputed_cat_X_train], axis = 1)
imputed_X_test = pd.concat([imputed_num_X_test, imputed_cat_X_test], axis = 1)
imputed_X_valid = pd.concat([imputed_num_X_valid, imputed_cat_X_valid], axis = 1)

In [15]:
imputed_X_train = X_train.copy()
imputed_X_test = X_test.copy()
imputed_X_valid = X_valid.copy()

for label,_ in imputed_X_train.items():
    if imputed_X_train[label].dtype == 'object':
        imputed_X_train[label].fillna(imputed_X_train[label].mode()[0], inplace = True)
        imputed_X_test[label].fillna(imputed_X_train[label].mode()[0], inplace = True)
    else:
        imputed_X_train[label].fillna(imputed_X_train[label].mean(), inplace = True)
        imputed_X_test[label].fillna(imputed_X_train[label].mean(), inplace = True)

KeyError: 'SalePrice'

Apply ordinal encoder.

In [None]:
X_train.isnull().sum()

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
label_X_test = X_test.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

Now look at correlation matrix

In [None]:
corr = label_X_train.corr()
# Filter the features whose correlation with "SalePrice" is higher than 0.5
highest_corr_features = corr.index[abs(corr['SalePrice']) > 0.5]
fig = plt.figure(figsize = (10,10))
g = sns.heatmap(label_X_train[highest_corr_features].corr(), annot = True)

In [None]:
mlflow.log_figure(fig1, "Abs_correlation_matrix.jpg")

Sort the correlation matrix to see the features that are most related to 'SalePrice'

In [None]:
highest_corr = label_X_train[highest_corr_features].corr()
sorted_corr = highest_corr.abs().sort_values('SalePrice', ascending = False)
fig = plt.figure(figsize = (10,10))
g = sns.heatmap(sorted_corr, annot = True)

In [None]:
features_sorted = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
                 'TotalBsmtSF', 'ExterQual', '1stFlrSF', 'BsmtQual', 'KitchenQual',
                 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']

If there are columns that are highly correlated to each other, drop one of them.

In [None]:
clean_features_sorted = features_sorted.copy()
for col in features_sorted:
    for index in features_sorted:
        if col != index and highest_corr[col][index] > 0.8:
            if col in clean_features_sorted:
                clean_features_sorted.remove(col)

In [None]:
print(clean_features_sorted)

In [None]:
reduced_label_X_train = label_X_train[clean_features_sorted];
reduced_label_X_test = label_X_test[clean_features_sorted];
reduced_label_X_valid = label_X_test[clean_features_sorted];


Apply ML Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor(bootstrap = True, random_state = 12345)

RF.fit(label_X_train, y_train)