In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv("D:\\JUPYTER\\Data Cleaning\\csv_file\\train.csv")
test = pd.read_csv("D:\\JUPYTER\\Data Cleaning\\csv_file\\test.csv")

In [3]:
print(f"Shape of the trian df: {train.shape}")
print(f"Shape of the trian df: {test.shape}")

Shape of the trian df: (1460, 81)
Shape of the trian df: (1459, 80)


In [4]:
x_train = train.drop(columns="SalePrice", axis=1)
y_train = train["SalePrice"]
x_test = test.copy()
print("Shape of X_train = ", x_train.shape)
print("Shape of y_train = ", y_train.shape)
print("Shape of X_test =", x_test.shape)

Shape of X_train =  (1460, 80)
Shape of y_train =  (1460,)
Shape of X_test = (1459, 80)


In [5]:
isnull_sum = x_train.isnull().sum()

In [6]:
numerical_col = x_train.select_dtypes(include=["int64","float64"]).columns
numerical_col

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [7]:
miss_value_col = [var for var in numerical_col if isnull_sum[var]>0]
miss_value_col

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [8]:
# finding the categorical variable which have mising value
categorical_col = x_train.select_dtypes(include=["object"]).columns
categorical_col

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [9]:
miss_value_cat_col = [var for var in categorical_col if isnull_sum[var]>0]
miss_value_cat_col

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [10]:
numerical_apply_col_mean = ['LotFrontage']
numerical_apply_col_median = ['MasVnrArea', 'GarageYrBlt']
categorical_apply_col_mode = ['Alley',
                             'MasVnrType',
                             'BsmtQual',
                             'BsmtCond',
                             'BsmtExposure',
                             'BsmtFinType1',
                             'BsmtFinType2',
                             'Electrical',
                             'FireplaceQu']
categorical_apply_col_missing = ['GarageType',
                                 'GarageFinish',
                                 'GarageQual',
                                 'GarageCond',
                                 'PoolQC',
                                 'Fence',
                                 'MiscFeature']

In [11]:
numerical_col_mean_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="mean"))])
numerical_col_median_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="median"))])
categorical_col_mode_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])
categorical_col_missing_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="constant",fill_value="missing"))])

In [15]:
preprocessor = ColumnTransformer(transformers=[("mean_imputer",numerical_col_mean_imputer,numerical_apply_col_mean),
                                              ("median_imputer",numerical_col_median_imputer,numerical_apply_col_median),
                                              ("mode_imputer",categorical_col_mode_imputer,categorical_apply_col_mode),
                                              ("missing_imputer",categorical_col_missing_imputer,categorical_apply_col_missing)],remainder="passthrough")

In [16]:
preprocessor.fit(x_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_impu

In [17]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(remainder='passthrough',
                  transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
 

In [18]:
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([70.04995837])

In [19]:
x_train_clean = preprocessor.transform(x_train)
x_test_clean = preprocessor.transform(x_test)

In [20]:
x_train_clean

array([[65.0, 196.0, 2003.0, ..., 2008, 'WD', 'Normal'],
       [80.0, 0.0, 1976.0, ..., 2007, 'WD', 'Normal'],
       [68.0, 162.0, 2001.0, ..., 2008, 'WD', 'Normal'],
       ...,
       [66.0, 0.0, 1941.0, ..., 2010, 'WD', 'Normal'],
       [68.0, 0.0, 1950.0, ..., 2010, 'WD', 'Normal'],
       [75.0, 0.0, 1965.0, ..., 2008, 'WD', 'Normal']], dtype=object)

In [21]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'passthrough',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   

In [23]:
x_train_clean_miss_var = pd.DataFrame(x_train_clean)

In [24]:
x_train_clean_miss_var

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,...,61,0,0,0,0,0,2,2008,WD,Normal
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,...,0,0,0,0,0,0,5,2007,WD,Normal
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,...,42,0,0,0,0,0,9,2008,WD,Normal
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62,0,1999,Grvl,,Gd,TA,No,Unf,Unf,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,85,119,1978,Grvl,Stone,Gd,TA,No,ALQ,Rec,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,66,0,1941,Grvl,,TA,Gd,No,GLQ,Unf,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,68,0,1950,Grvl,,TA,TA,Mn,GLQ,Rec,...,0,112,0,0,0,0,4,2010,WD,Normal


In [28]:
x_train_clean_miss_var = pd.DataFrame(x_train_clean,columns= x_train.columns)

In [30]:
x_train_clean_miss_var

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,...,61,0,0,0,0,0,2,2008,WD,Normal
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,...,0,0,0,0,0,0,5,2007,WD,Normal
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,...,42,0,0,0,0,0,9,2008,WD,Normal
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62,0,1999,Grvl,,Gd,TA,No,Unf,Unf,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,85,119,1978,Grvl,Stone,Gd,TA,No,ALQ,Rec,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,66,0,1941,Grvl,,TA,Gd,No,GLQ,Unf,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,68,0,1950,Grvl,,TA,TA,Mn,GLQ,Rec,...,0,112,0,0,0,0,4,2010,WD,Normal


In [31]:
x_train_clean_miss_var.shape

(1460, 80)