In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os 

import missingno as msno

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
from collections import Counter

In [2]:
import random

# Single seed shared by every random source used in this notebook.
seed = 42

# Seed Python's and NumPy's global generators with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so this
# assignment presumably only affects child processes, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
# Competition inputs: training split, test split, and the submission template.
train_df = pd.read_csv('/kaggle/input/2024-smarcle-ks-2house-price/train.csv')
test_df = pd.read_csv('/kaggle/input/2024-smarcle-ks-2house-price/test.csv')
submit = pd.read_csv('/kaggle/input/2024-smarcle-ks-2house-price/sample_submission.csv')

In [4]:
# Preview the first rows of the training split to eyeball columns and values.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Per-column non-null counts and dtypes; columns with fewer non-null entries
# than rows (e.g. LotFrontage, Alley) contain missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
# Show (rows, columns) for the training split first, then the test split.
for frame in (train_df, test_df):
    print(frame.shape)

(1460, 81)
(1459, 80)


In [7]:
# Partition the training columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
categorical = train_df.select_dtypes(include=['object']).columns
numerical = train_df.select_dtypes(exclude=['object']).columns

# Report how many columns landed in each group.
for label, cols in (("Num of numerical feats: ", numerical),
                    ("Num of categorical feats: ", categorical)):
    print(label, len(cols))

Num of numerical feats:  38
Num of categorical feats:  43


In [8]:
# List the numerical column names, a ruler line, then the categorical names.
separator = "=" * 90
print(
    train_df[numerical].columns,
    separator,
    train_df[categorical].columns,
    sep="\n",
)

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', '

In [9]:
def detect_outliers(df, n, features):
    """Return index labels of rows that are outliers in more than ``n`` columns.

    A value is an outlier for a column when it lies outside Tukey's fences,
    i.e. below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; it is not modified.
    n : int
        Threshold: a row is reported only if it is an outlier in strictly
        more than ``n`` of the given columns.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in features:
        values = df[col]

        # Series.quantile skips NaN. np.percentile returns NaN as soon as the
        # column has a single missing value, which turned both fences into NaN
        # and silently disabled outlier detection for that column.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        outlier_step = 1.5 * (q3 - q1)

        # Comparisons against NaN are False, so missing values are never flagged.
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(values[is_outlier].index)

    return [label for label, hits in outlier_counts.items() if hits > n]
        
# Numeric columns scanned for outliers (the target SalePrice is left out).
outlier_features = [
    'Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
]

# Collect training rows that are outliers in more than 2 of those columns.
Outliers_to_drop = detect_outliers(train_df, n=2, features=outlier_features)

In [10]:
# Remove the flagged rows, then renumber the index from 0.
# NOTE: this cell is not idempotent. After reset_index the labels stored in
# Outliers_to_drop point at different rows, so re-running it without
# reloading train_df would drop the wrong rows.
train_df = (
    train_df
    .drop(index=Outliers_to_drop)
    .reset_index(drop=True)
)

In [11]:
# Print the percentage of missing values for every column of the test split.
n_rows = test_df.shape[0]
for col in test_df.columns:
    null_pct = 100 * (test_df[col].isnull().sum() / n_rows)
    msperc = 'column: {:>10}\t Null: {:.2f}%'.format(col, null_pct)
    print(msperc)

column:         Id	 Null: 0.00%
column: MSSubClass	 Null: 0.00%
column:   MSZoning	 Null: 0.27%
column: LotFrontage	 Null: 15.56%
column:    LotArea	 Null: 0.00%
column:     Street	 Null: 0.00%
column:      Alley	 Null: 92.67%
column:   LotShape	 Null: 0.00%
column: LandContour	 Null: 0.00%
column:  Utilities	 Null: 0.14%
column:  LotConfig	 Null: 0.00%
column:  LandSlope	 Null: 0.00%
column: Neighborhood	 Null: 0.00%
column: Condition1	 Null: 0.00%
column: Condition2	 Null: 0.00%
column:   BldgType	 Null: 0.00%
column: HouseStyle	 Null: 0.00%
column: OverallQual	 Null: 0.00%
column: OverallCond	 Null: 0.00%
column:  YearBuilt	 Null: 0.00%
column: YearRemodAdd	 Null: 0.00%
column:  RoofStyle	 Null: 0.00%
column:   RoofMatl	 Null: 0.00%
column: Exterior1st	 Null: 0.07%
column: Exterior2nd	 Null: 0.07%
column: MasVnrType	 Null: 61.27%
column: MasVnrArea	 Null: 1.03%
column:  ExterQual	 Null: 0.00%
column:  ExterCond	 Null: 0.00%
column: Foundation	 Null: 0.00%
column:   BsmtQual	 Null: 3

**Credit**:

https://www.kaggle.com/code/kj0409/house-prices-kj