In [212]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import pickle
from pickle import dump
from pickle import load
from scipy import stats
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
#from mlmachine.features.preprocessing import GroupbyImputer
from sktutor.preprocessing import GroupByImputer
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization

# Show every column and every row when a DataFrame is displayed.
# `pd.set_option` is the documented entry point; `pd.pandas` only resolves
# because the package happens to expose itself as an attribute.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Raw training data, read relative to the notebook's working directory.
dataset_train = pd.read_csv("train.csv")

##preprocessing done based on insights from EDA analysis

In [None]:
def find_missing_datatypes(dataset):
    """Print and return the column groups used by the later imputation steps.

    A column is treated as categorical when its dtype compares equal to
    ``'object'`` and as numerical otherwise. For each of the five groups the
    number of columns and the list of names are printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    tuple
        ``(features_nan, catfeatures, catfeatures_nan, numfeatures,
        numfeatures_nan)`` -- five lists of column names in column order.
    """
    def has_missing(column):
        # True when the column contains at least one null value.
        return dataset[column].isnull().sum() > 0

    def is_object(column):
        return dataset[column].dtype == 'object'

    def report(label, names):
        # Print the group size followed by the names, then hand the list back.
        print(label, len(names))
        print(names)
        return names

    features_nan = report(
        ' Total Features with missing values:',
        [column for column in dataset.columns if has_missing(column)],
    )

    # Categorical columns, then the subset of them that holds nulls.
    catfeatures = report(
        ' Total Categorical Features:',
        [column for column in dataset.columns if is_object(column)],
    )
    catfeatures_nan = report(
        ' Total Categorical Features with missing values:',
        [column for column in dataset.columns
         if has_missing(column) and is_object(column)],
    )

    # Numerical columns, then the subset of them that holds nulls.
    numfeatures = report(
        ' Total Numerical Features:',
        [column for column in dataset.columns if not is_object(column)],
    )
    numfeatures_nan = report(
        ' Total Numerical Features with missing values:',
        [column for column in dataset.columns
         if has_missing(column) and not is_object(column)],
    )

    return features_nan, catfeatures, catfeatures_nan, numfeatures, numfeatures_nan




## Date/time variables: fill missing GarageYrBlt values (no house-age feature is derived here)
def features_age(dataset):
    """Fill missing ``GarageYrBlt`` values with 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``GarageYrBlt`` column. The column is replaced on the
        frame that is passed in, so the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection: that chained form is not guaranteed to write
    # through to `dataset` (it does not under pandas Copy-on-Write).
    # A numeric 0 is used instead of the string '0' so the year column keeps
    # a numeric dtype instead of becoming a mix of floats and strings.
    dataset['GarageYrBlt'] = dataset['GarageYrBlt'].fillna(0)
    return dataset

In [None]:

def impute_object(dataset):
    """Impute the categorical columns and persist the fitted group imputer.

    Steps, in order:

    1. ``MSSubClass`` is cast to ``object`` so it is handled as a category.
    2. Columns where a null means "the house lacks this feature" get the
       literal ``'No Feature'``; ``Electrical`` nulls get ``'SBrkr'``.
    3. A ``GroupByImputer`` (``impute_type="most_frequent"``, grouped by
       ``Neighborhood`` and ``MSSubClass``) is fitted on every column except
       ``SalePrice``, pickled to ``CatGroupbyImputer.pkl`` and applied.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``SalePrice`` and the columns named below. Steps 1-2
        modify this frame.

    Returns
    -------
    pandas.DataFrame
        ``SalePrice`` followed by the imputed feature columns.
    """
    dataset['MSSubClass'] = dataset['MSSubClass'].astype(object)

    cat_nan_nofeature = [
        'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
        'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MasVnrType',
        'MiscFeature',
    ]

    # Assign back instead of fillna(inplace=True) on a column selection; the
    # chained form does not write through under pandas Copy-on-Write.
    for feature in cat_nan_nofeature:
        dataset[feature] = dataset[feature].fillna('No Feature')

    dataset['Electrical'] = dataset['Electrical'].fillna("SBrkr")

    # Exact comparison: `feature not in 'SalePrice'` was a substring test and
    # would also have excluded any column whose name is part of that string.
    features = [feature for feature in dataset.columns if feature != 'SalePrice']

    catgroupbyImpute = GroupByImputer(group=['Neighborhood', 'MSSubClass'], impute_type="most_frequent")
    catgroupbyImpute.fit(dataset[features])
    # Context manager so the pickle file is flushed and closed.
    with open('CatGroupbyImputer.pkl', 'wb') as handle:
        dump(catgroupbyImpute, handle)

    imputed = pd.DataFrame(catgroupbyImpute.transform(dataset[features]), columns=features)
    data = pd.concat([dataset[['SalePrice']].reset_index(drop=True), imputed], axis=1)

    # Report on the imputed frame (not the input), so the message reflects
    # what is actually returned.
    features_nan = [feature for feature in data.columns if data[feature].isnull().sum() > 0]

    print(' Total Features with missing values after imputing object types:', len(features_nan))
    print(features_nan)

    return data


In [None]:

def impute_numeric(dataset):
    """Fill nulls in the non-object, non-year columns with the column median.

    Columns whose dtype is ``object`` and the three year columns
    (``YearBuilt``, ``YearRemodAdd``, ``GarageYrBlt``) are left untouched.
    After imputation the names of all columns that still hold nulls are
    printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Modified on the frame that is passed in.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

    numfeatures = [
        feature for feature in dataset.columns
        if dataset[feature].dtypes != 'object' and feature not in year_features
    ]

    # Assign back instead of fillna(inplace=True) on a column selection; the
    # chained form does not write through under pandas Copy-on-Write.
    for feature in numfeatures:
        dataset[feature] = dataset[feature].fillna(dataset[feature].median())

    features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

    print(' Total Features with missing values after imputing numeric types:', len(features_nan))
    print(features_nan)
    return dataset

In [None]:

def catfeatures_ordinal(dataset):
    """Encode the ordered categorical columns with hand-written rank mappings.

    Each listed column is mapped to integers where a larger number means a
    better/greater level and ``'No Feature'`` maps to 0. The fitted encoder is
    pickled to ``catfeatures_ordinalmap_encoder.pkl``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``SalePrice`` and every column named in the mappings.

    Returns
    -------
    pandas.DataFrame
        ``SalePrice`` followed by the feature columns, with the ordinal
        columns replaced by their numeric codes.
    """
    # A comma was missing between 'Functional' and 'Utilities', which Python
    # silently joined into the single name 'FunctionalUtilities'.
    cat_ord_no_nan = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual',
                      'CentralAir', 'Functional', 'Utilities', 'LandSlope']
    cat_ord_nan = ['FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                   'PoolQC', 'Fence', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                   'BsmtFinType1', 'BsmtFinType2']

    # Scales shared by several columns; each column gets its own dict copy.
    quality = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No Feature': 0}
    basement_finish = {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2,
                       "Unf": 1, "No Feature": 0}

    catfeatures_ord_map = [
        {"col": "ExterQual", "mapping": dict(quality)},
        {"col": "ExterCond", "mapping": dict(quality)},
        {"col": "HeatingQC", "mapping": dict(quality)},
        {"col": "KitchenQual", "mapping": dict(quality)},
        {"col": "CentralAir", "mapping": {"Y": 2, "N": 1, "No Feature": 0}},
        {"col": "Functional",
         "mapping": {"Typ": 8, "Min1": 7, "Min2": 6, "Mod": 5, "Maj1": 4,
                     "Maj2": 3, "Sev": 2, "Sal": 1, "No Feature": 0}},
        # NOTE(review): the lowest level is spelled "LO" here; confirm it
        # matches the spelling used in the data (possibly "ELO").
        {"col": "Utilities",
         "mapping": {"AllPub": 4, "NoSewr": 3, "NoSeWa": 2, "LO": 1,
                     "No Feature": 0}},
        {"col": "LandSlope",
         "mapping": {"Sev": 3, "Mod": 2, "Gtl": 1, "No Feature": 0}},
        {"col": "FireplaceQu", "mapping": dict(quality)},
        {"col": "GarageQual", "mapping": dict(quality)},
        {"col": "GarageCond", "mapping": dict(quality)},
        {"col": "BsmtQual", "mapping": dict(quality)},
        {"col": "BsmtCond", "mapping": dict(quality)},
        {"col": "BsmtFinType1", "mapping": dict(basement_finish)},
        {"col": "BsmtFinType2", "mapping": dict(basement_finish)},
        {"col": "BsmtExposure",
         "mapping": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "No Feature": 0}},
        {"col": "GarageFinish",
         "mapping": {"Fin": 3, "RFn": 2, "Unf": 1, "No Feature": 0}},
        {"col": "PoolQC",
         "mapping": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "No Feature": 0}},
        {"col": "Fence",
         "mapping": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1,
                     "No Feature": 0}},
    ]

    target = ['SalePrice']
    features = [feature for feature in dataset.columns if feature not in target]

    # 'value' is the documented option that encodes unseen categories as -1;
    # the string '-1' is not one of the documented choices
    # ('error', 'return_nan', 'value').
    encoder1 = ce.OrdinalEncoder(mapping=catfeatures_ord_map,
                                 cols=cat_ord_no_nan + cat_ord_nan,
                                 return_df=True, handle_unknown='value')
    encoder1.fit(dataset[features])
    # Context manager so the pickle file is flushed and closed.
    with open('catfeatures_ordinalmap_encoder.pkl', 'wb') as handle:
        dump(encoder1, handle)

    dataset = pd.concat([dataset[['SalePrice']].reset_index(drop=True),
                    pd.DataFrame(encoder1.transform(dataset[features]), columns=features)],
                    axis=1)
    return dataset

   
    
def catfeatures_oe(dataset):
    """Integer-encode the unordered categorical columns.

    A ``category_encoders.OrdinalEncoder`` without an explicit mapping is
    fitted on every column except ``SalePrice`` (only the listed columns are
    encoded), pickled to ``catfeatures_ordinal_encoder.pkl`` and applied.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``SalePrice`` and the columns in ``cat_features``.

    Returns
    -------
    pandas.DataFrame
        ``SalePrice`` followed by the feature columns, with the listed
        categorical columns replaced by integer codes.
    """
    cat_features = [
        'MSZoning', 'Alley', 'Street', 'LotShape', 'LandContour', 'LotConfig',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'Foundation', 'Heating', 'Electrical', 'GarageType', 'PavedDrive',
        'MiscFeature', 'SaleType', 'SaleCondition',
    ]

    target = ['SalePrice']
    features = [feature for feature in dataset.columns if feature not in target]

    # 'value' is the documented option that encodes unseen categories as -1;
    # the string '-1' is not one of the documented choices
    # ('error', 'return_nan', 'value').
    oe = ce.OrdinalEncoder(cols=cat_features, return_df=True, handle_unknown='value')
    oe.fit(dataset[features])
    # Context manager so the pickle file is flushed and closed.
    with open('catfeatures_ordinal_encoder.pkl', 'wb') as handle:
        dump(oe, handle)

    dataset = pd.concat([dataset[['SalePrice']].reset_index(drop=True),
                    pd.DataFrame(oe.transform(dataset[features]), columns=features)],
                    axis=1)
    return dataset



##not used
##check and fix skewness in final numeric variables
def skewness_numfeatures(dataset):
    """Log-transform the skewed numeric columns of ``dataset``.

    Candidate columns are the non-object columns of the module-level
    ``dataset_train`` frame, excluding the year columns, three columns
    dropped for correlation reasons and ``SalePrice``. Skewness is measured
    on ``dataset``; every candidate whose absolute skew exceeds 0.5 is cast
    to float and replaced by ``log(1 + x)`` on the frame that was passed in.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    excluded = [
        'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',   # year columns
        'TotRmsAbvGrd', 'GarageArea', '1stFlrSF',     # correlated columns
        'SalePrice',                                  # target
    ]
    candidates = [
        name for name in dataset_train.columns
        if dataset_train[name].dtype != 'object' and name not in excluded
    ]

    skew_by_column = dataset[candidates].skew().sort_values(ascending=False)
    is_skewed = skew_by_column.abs() > 0.5
    skewed_features = list(skew_by_column[is_skewed].index)
    print('skewed features: {}'.format(len(skewed_features)))
    print(skewed_features)

    # Cast first, then take log(1 + x); the +1 keeps zeros finite.
    dataset[skewed_features] = dataset[skewed_features].astype(float)
    dataset[skewed_features] = np.log(1 + dataset[skewed_features])
    return dataset

##not used
##check and fix skewness in final numeric variables
def reskewness_numfeatures(dataset):
    """Drop the skewed numeric columns from ``dataset``.

    Candidate columns are the non-object columns of the module-level
    ``dataset_train`` frame, excluding the year columns, three columns
    dropped for correlation reasons and ``SalePrice``. Every candidate whose
    absolute skew (measured on ``dataset``) exceeds 0.5 is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the skewed columns.
    """
    excluded = [
        'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',   # year columns
        'TotRmsAbvGrd', 'GarageArea', '1stFlrSF',     # correlated columns
        'SalePrice',                                  # target
    ]
    candidates = [
        name for name in dataset_train.columns
        if dataset_train[name].dtype != 'object' and name not in excluded
    ]

    skew_by_column = dataset[candidates].skew().sort_values(ascending=False)
    is_skewed = skew_by_column.abs() > 0.5
    skewed_features = list(skew_by_column[is_skewed].index)
    print('skewed features: {}'.format(len(skewed_features)))
    print(skewed_features)

    # The skewed columns are removed outright rather than transformed.
    return dataset.drop(columns=skewed_features)

##check and fix skewness in final numeric variables
def skewness_median(dataset):
    """Replace IQR outliers in every skewed numeric column with the median.

    Candidate columns are the non-object columns of the module-level
    ``dataset_train`` frame, excluding the year columns and ``SalePrice``.
    For each candidate whose absolute skew (measured on ``dataset``) exceeds
    0.5, values outside ``[q1 - 1.5*iqr, q3 + 1.5*iqr]`` are replaced with
    that column's median.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every candidate column. Columns are replaced on the
        frame that is passed in.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, also when no column is skewed.
    """
    year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
    numfeatures = [feature for feature in dataset_train.columns
                   if dataset_train[feature].dtype != 'object']
    num_features = [feature for feature in numfeatures
                    if feature not in year_features + ['SalePrice']]

    skewness = dataset[num_features].skew().sort_values(ascending=False)
    skewed_features = list(skewness[abs(skewness) > 0.5].index)

    for feature in skewed_features:
        q1, q3 = np.nanpercentile(dataset[feature], [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        median = dataset[feature].median()
        # When iqr is 0 both bounds equal the quartile, so every value that
        # differs from it is treated as an outlier.
        is_outlier = (dataset[feature] < lower_bound) | (dataset[feature] > upper_bound)
        # mask() upcasts when needed, so a fractional median can be written
        # into an integer column.
        dataset[feature] = dataset[feature].mask(is_outlier, median)

    # Return after the loop: returning inside it handled only the first
    # skewed column and yielded None when there were none.
    return dataset


def feature_scaling(dataset):
    """Min-max scale every column except ``Id`` and ``SalePrice``.

    A ``MinMaxScaler`` is fitted on the feature columns, pickled to
    ``MinMaxScaler.pkl`` and applied.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``Id`` and ``SalePrice``.

    Returns
    -------
    pandas.DataFrame
        ``Id`` and ``SalePrice`` (unscaled) followed by the scaled feature
        columns, with a fresh 0..n-1 index.
    """
    features_scale = [feature for feature in dataset.columns
                      if feature not in ['Id', 'SalePrice']]
    scaler = MinMaxScaler()
    scaler.fit(dataset[features_scale])
    # Context manager so the pickle file is flushed and closed.
    with open('MinMaxScaler.pkl', 'wb') as handle:
        dump(scaler, handle)
    dataset = pd.concat([dataset[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[features_scale]), columns=features_scale)],
                    axis=1)
    return dataset

##not used
def drop_correlated(dataset):
    """Return ``dataset`` without ``TotRmsAbvGrd``, ``GarageArea`` and ``1stFlrSF``.

    The number of remaining columns is printed. The input frame itself is
    left unchanged; a new frame is returned.
    """
    correlated_columns = ['TotRmsAbvGrd', 'GarageArea', '1stFlrSF']
    reduced = dataset.drop(columns=correlated_columns)
    print('selected features: {}'.format(len(reduced.columns)))
    return reduced


def feature_selection(dataset):
    """Select features with a Lasso-based ``SelectFromModel``.

    ``SalePrice`` is the target and every column except ``Id`` and
    ``SalePrice`` is a candidate predictor. The total and selected feature
    counts and the selected names are printed.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` -- the selected predictor columns and the
        single-column ``SalePrice`` frame.
    """
    target_frame = dataset[['SalePrice']]
    predictors = dataset.drop(columns=['Id', 'SalePrice'])

    selector = SelectFromModel(Lasso(alpha=0.005, random_state=0))
    selector.fit(predictors, target_frame)

    # Boolean mask over the predictor columns kept by the selector.
    support_mask = selector.get_support()
    selected_feat = predictors.columns[support_mask]

    print('total features: {}'.format((predictors.shape[1])))
    print('selected features: {}'.format(len(selected_feat)))
    print(selected_feat)

    return dataset[selected_feat], target_frame

In [None]:
# Run the preprocessing steps in order; each step consumes the previous
# step's output, so the statement order matters.
# NOTE(review): this module-level list is not read by any function shown in
# this notebook section; the functions define their own local copies.
year_features = ['YearBuilt','YearRemodAdd','GarageYrBlt']
# Prints the column groups; the returned tuple (d1) is not used below.
d1 = find_missing_datatypes(dataset_train)
# features_age and impute_object write to the frame they receive, so
# dataset_train itself is changed by the next two calls.
d2 = features_age(dataset_train)
d3 = impute_object(d2)
d4 = impute_numeric(d3)
d5 = catfeatures_ordinal(d4)
d6 = catfeatures_oe(d5)
# skewness_median also reads the module-level dataset_train to decide which
# columns count as numeric.
d7 = skewness_median(d6)
d8 = feature_scaling(d7)
# X: selected feature columns, Y: single-column SalePrice frame.
X, Y = feature_selection(d8)

In [None]:
# Put the target (Y) first, followed by the selected features (X), and write
# the result to train_processed.csv without the index column.
dp =pd.concat([Y,X],axis=1)
dp.to_csv('train_processed.csv', index=False)