In [60]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [61]:
# Load the training and test CSV files. The paths live in named constants so
# they can be changed in one place.
# NOTE(review): the file names are reproduced exactly as written; confirm the
# spelling matches the files on disk.
TRAIN_CSV_PATH = '/content/house_prince_train_data.csv'
TEST_CSV_PATH = '/content/housse_prince_test_datta.csv'

train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

In [62]:
# Dimensions of the test frame as (rows, columns).
test.shape

(1459, 80)

In [63]:
# Dimensions of the training frame as (rows, columns); it still includes the target column.
train.shape

(1460, 81)

In [64]:
# Separate the target column from the predictors.
TARGET_COLUMN = 'SalePrice'
y_train = train[TARGET_COLUMN]
x_train = train.drop(columns=[TARGET_COLUMN])

In [65]:
# Predictor frame dimensions after removing the target column.
x_train.shape

(1460, 80)

# Missing-value imputation

In [66]:
# Lift pandas' display truncation so every row and column is shown.
# Setting an option to None removes the limit.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)



In [67]:
# Percentage of missing entries in every predictor column.
missing_counts = x_train.isna().sum()
per_var = missing_counts / len(x_train) * 100
per_var

Unnamed: 0,0
Id,0.0
MSSubClass,0.0
MSZoning,0.0
LotFrontage,17.739726
LotArea,0.0
Street,0.0
Alley,93.767123
LotShape,0.0
LandContour,0.0
Utilities,0.0


In [68]:
# Labels of the columns whose missing percentage exceeds 20.
high_missing_mask = per_var.gt(20)
greater_then_20 = per_var.loc[high_missing_mask].index

In [69]:
# Number of columns flagged as having more than 20% missing values.
greater_then_20.shape

(6,)

In [70]:
# Copy of the predictors without the columns flagged in greater_then_20.
# NOTE(review): the later cells in this notebook keep selecting from and
# fitting on x_train rather than drop_col — confirm whether that is intended.
drop_col = x_train.drop(greater_then_20, axis=1)


In [71]:
# Remaining missing-value counts per column after the high-missing columns were removed.
drop_col.isnull().sum()

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
Street,0
LotShape,0
LandContour,0
Utilities,0
LotConfig,0


In [72]:
# Column labels of the int64/float64 predictors.
numeric_dtypes = ['int64', 'float64']
num_var = x_train.select_dtypes(include=numeric_dtypes).columns


In [73]:
# Show the numeric column labels selected above.
num_var

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [74]:
# Numeric columns that contain at least one missing value, in num_var order.
numeric_null_counts = x_train[num_var].isna().sum()
greater_then_0 = numeric_null_counts[numeric_null_counts > 0].index.tolist()

In [75]:

# Show the numeric columns that need imputation.
greater_then_0

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [76]:
# Object-dtype columns (treated as categorical), then the subset of them
# that contains at least one missing value.
cat_var = x_train.select_dtypes(include='object').columns
cat_null_counts = x_train[cat_var].isna().sum()
cat_vr_greater_then_0 = cat_null_counts[cat_null_counts > 0].index.tolist()
cat_vr_greater_then_0

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [77]:
# Column groups, one per imputation strategy.

# Numeric columns filled with the column mean.
num_var_mean = ['LotFrontage']

# Numeric columns filled with the column median.
num_var_median = ['MasVnrArea', 'GarageYrBlt']

# Categorical columns filled with the most frequent category.
# NOTE(review): 'Alley' is ~94% missing in the percentages shown earlier, so
# mode imputation assigns one category to nearly every row — confirm intended.
cat_var_mode = [
    'Alley',
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
]

# Categorical columns filled with a constant placeholder category.
cat_var_missing = [
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]

In [87]:
# Imputation pipelines, one per strategy.
def _imputer_pipeline(**imputer_kwargs):
    """Return a one-step Pipeline whose 'imputer' step is a SimpleImputer built from the given keyword arguments."""
    return Pipeline(steps=[('imputer', SimpleImputer(**imputer_kwargs))])


num_var_mean_imputer = _imputer_pipeline(strategy='mean')
num_var_mefisn_imputer = _imputer_pipeline(strategy='median')  # median imputer; name kept because later cells use it
cat_var_mode_imputer = _imputer_pipeline(strategy='most_frequent')
cat_var_missing_imputer = _imputer_pipeline(strategy='constant', fill_value='missing')

In [88]:

# Column transformer: route each column group to its imputation pipeline.
# Columns not listed in any group are dropped (the default remainder setting).
imputation_plan = [
    ('mean_imputer', num_var_mean_imputer, num_var_mean),
    ('median_imputer', num_var_mefisn_imputer, num_var_median),
    ('mode_imputer', cat_var_mode_imputer, cat_var_mode),
    ('missing_imputer', cat_var_missing_imputer, cat_var_missing),
]
preprocessor = ColumnTransformer(transformers=imputation_plan)

In [89]:
# Display the (not yet fitted) column transformer.
preprocessor

In [90]:
# Fit every imputer on the training predictors.
preprocessor.fit(x_train)


In [93]:
# Fill value learned by the mean imputer during fit (one entry per column it handles).
preprocessor.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([70.04995837])

In [94]:
# Apply the fitted imputers to x_train to produce the cleaned array.
clean_data_set=preprocessor.transform(x_train)

In [95]:
# Show the transformed array.
clean_data_set

array([[65.0, 196.0, 2003.0, ..., 'missing', 'missing', 'missing'],
       [80.0, 0.0, 1976.0, ..., 'missing', 'missing', 'missing'],
       [68.0, 162.0, 2001.0, ..., 'missing', 'missing', 'missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'missing', 'missing', 'missing'],
       [75.0, 0.0, 1965.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [96]:
# Inspect the fitted transformers: each entry lists a name, its pipeline, and the columns it handles.
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   

In [97]:
# Rebuild a labelled DataFrame from the transformer output.
# Column order follows the order of the groups passed to the ColumnTransformer.
imputed_numeric_columns = num_var_mean + num_var_median
imputed_columns = imputed_numeric_columns + cat_var_mode + cat_var_missing

# Reuse the original row index so the cleaned frame stays aligned with x_train / y_train.
x_train_clean = pd.DataFrame(clean_data_set, columns=imputed_columns, index=x_train.index)

# The transformer output mixes numbers and strings, so it is an object-dtype
# array; cast the numeric columns back to float so they behave as numbers
# (describe(), arithmetic, model fitting) instead of generic Python objects.
x_train_clean = x_train_clean.astype({column: 'float64' for column in imputed_numeric_columns})

In [98]:
# Preview the first rows of the imputed frame.
x_train_clean.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,196.0,2003.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing
1,80.0,0.0,1976.0,Grvl,BrkFace,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
2,68.0,162.0,2001.0,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
3,60.0,0.0,1998.0,Grvl,BrkFace,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,missing,missing,missing
4,84.0,350.0,2000.0,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing


In [99]:
# Total count of remaining missing values across the imputed frame.
x_train_clean.isnull().sum().sum()

0

In [100]:
# Dimensions of the imputed frame; only the columns routed through an imputer are present.
x_train_clean.shape

(1460, 19)

In [101]:
# Closing message marking the end of the notebook.
print("thanku...")

thanku...
