In [44]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print each one.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Location of the House Prices training split inside the Kaggle input directory.
# Point this at your own CSV when running outside Kaggle.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

# Read the training data into a DataFrame
house_prices_data = pd.read_csv(train_csv_path)


In [46]:
# Split the frame into the prediction target and the remaining feature columns
target_column = 'SalePrice'
y = house_prices_data[target_column]                 # Target
X = house_prices_data.drop(columns=[target_column])  # Features (everything except the target)


In [47]:
# Descriptive statistics
# info() prints each column's dtype and non-null count;
# the trailing head() is the cell's displayed value (first five rows).
house_prices_data.info()
house_prices_data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [48]:
# --- Numeric columns: mean-impute missing values, then standardize ---
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# --- Categorical columns: fill gaps with the mode, then one-hot encode ---
# handle_unknown='ignore' is passed so categories unseen during fit do not raise at transform time.
categorical_features = X.select_dtypes(include=['object']).columns
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [49]:
# Route each column group through its own transformer; ColumnTransformer
# concatenates the transformed groups into a single feature matrix.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


In [50]:
#Outlier Detection

from scipy import stats

def detect_outliers_zscore(X, threshold=3):
    """Flag rows that hold at least one extreme value, judged by per-column z-score.

    Parameters
    ----------
    X : array-like or DataFrame of shape (n_samples, n_features)
        Input data. For a DataFrame only the numeric columns are examined,
        because z-scores are undefined for strings.
    threshold : float, default=3
        A value is an outlier when ``abs(z) > threshold``.

    Returns
    -------
    numpy.ndarray of bool, shape (n_samples,)
        True for every row containing at least one outlying value.

    Notes
    -----
    Missing values are skipped when computing each column's mean and standard
    deviation. With the default ``nan_policy='propagate'`` a single NaN turns
    the whole column's z-scores into NaN, so that column could never flag an
    outlier. NaN cells and constant (zero-variance) columns never count as
    outliers.
    """
    # Keep only numeric columns when given a DataFrame (duck-typed to avoid a hard pandas dependency).
    if hasattr(X, 'select_dtypes'):
        X = X.select_dtypes(include='number')
    values = np.asarray(X, dtype=float)

    # Constant columns give 0/0; silence the warning, the resulting NaN compares False below.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = stats.zscore(values, axis=0, nan_policy='omit')

    abs_z_scores = np.abs(z_scores)
    outliers = (abs_z_scores > threshold).any(axis=1)
    return outliers

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows whose z-score exceeds ``threshold`` in any numeric column.

    ``fit`` learns each numeric column's mean and standard deviation (ignoring
    NaN). ``transform`` scores the rows it is given against those statistics,
    so it works on data of any length rather than replaying a row mask that
    only matches the training set.

    Parameters
    ----------
    method : str, default='zscore'
        Detection method. Only 'zscore' is implemented; any other value makes
        ``fit`` a no-op and ``transform`` return X unchanged.
    threshold : float, default=3
        A value is an outlier when ``abs(z) > threshold``.

    Attributes
    ----------
    mean_, scale_ : numpy.ndarray
        Per-column statistics learned in ``fit``. Zero-variance columns get a
        NaN scale so they can never flag a row.
    outliers_ : numpy.ndarray of bool
        Outlier mask of the data passed to ``fit``.

    Notes
    -----
    ``transform`` changes the number of rows. NOTE(review): inside a
    scikit-learn Pipeline the target ``y`` is not filtered alongside X, so
    confirm row alignment before chaining this with a supervised estimator.
    """

    def __init__(self, method='zscore', threshold=3):
        self.method = method
        self.threshold = threshold

    @staticmethod
    def _numeric_values(X):
        """Return the numeric part of X as a 2-D float array."""
        # DataFrames may hold object columns; z-scores only make sense for numeric ones.
        if hasattr(X, 'select_dtypes'):
            X = X.select_dtypes(include='number')
        values = np.asarray(X, dtype=float)
        return values.reshape(-1, 1) if values.ndim == 1 else values

    def _outlier_mask(self, X):
        """Boolean mask of rows in X with any |z| above the threshold, using fitted statistics."""
        values = self._numeric_values(X)
        # NaN cells and NaN scales produce NaN z-scores, which compare False.
        with np.errstate(divide='ignore', invalid='ignore'):
            z_scores = (values - self.mean_) / self.scale_
        return (np.abs(z_scores) > self.threshold).any(axis=1)

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from X; ``y`` is ignored."""
        if self.method == 'zscore':
            values = self._numeric_values(X)
            self.mean_ = np.nanmean(values, axis=0)
            scale = np.nanstd(values, axis=0)
            # A zero scale would turn any deviation into an infinite z-score; disable such columns.
            self.scale_ = np.where(scale > 0, scale, np.nan)
            self.outliers_ = self._outlier_mask(X)
        return self

    def transform(self, X):
        """Return X without the rows flagged as outliers (X unchanged for unknown methods)."""
        if self.method == 'zscore':
            return X[~self._outlier_mask(X)]
        return X

In [51]:
# Stand-alone z-score outlier remover: rows with any |z| above 3 are dropped
outlier_detector = OutlierRemover(threshold=3, method='zscore')


In [56]:
#Multicollinearity detection

class MulticollinearityRemover(BaseEstimator, TransformerMixin):
    """Drop numeric features that are highly correlated with an earlier feature.

    Parameters
    ----------
    threshold : float, default=0.9
        Feature ``j`` is dropped when ``abs(corr(i, j)) > threshold`` for some
        kept feature ``i`` that precedes it.

    Attributes
    ----------
    correlation_matrix_ : numpy.ndarray
        Pairwise correlation matrix of the numeric features seen in ``fit``.
    numeric_columns_ : list or None
        Names of the numeric columns when fitted on a DataFrame, else None.
    dropped_features_ : list of int
        Sorted positions (within the numeric features) selected for removal.

    Notes
    -----
    The columns to drop are decided once in ``fit`` so every later
    ``transform`` removes the same features. Non-numeric DataFrame columns are
    passed through untouched.
    """

    def __init__(self, threshold=0.9):
        self.threshold = threshold

    @staticmethod
    def _features_to_drop(corr, threshold):
        """Return sorted indices of the later feature in each highly correlated pair."""
        dropped = set()
        n_features = corr.shape[0]
        for i in range(n_features):
            if i in dropped:
                # A feature that is already removed should not eliminate others.
                continue
            for j in range(i + 1, n_features):
                # NaN correlations (constant or empty columns) compare False and are kept.
                if j not in dropped and abs(corr[i, j]) > threshold:
                    dropped.add(j)
        return sorted(dropped)

    def fit(self, X, y=None):
        """Compute the correlation matrix and choose the features to drop; ``y`` is ignored."""
        if hasattr(X, 'select_dtypes'):
            # DataFrame: use numeric columns only; DataFrame.corr skips NaN pairwise.
            numeric = X.select_dtypes(include='number')
            self.numeric_columns_ = list(numeric.columns)
            corr = numeric.corr().to_numpy()
        else:
            self.numeric_columns_ = None
            with np.errstate(divide='ignore', invalid='ignore'):
                # atleast_2d: corrcoef collapses to a scalar for a single feature.
                corr = np.atleast_2d(np.corrcoef(np.asarray(X, dtype=float), rowvar=False))
        self.correlation_matrix_ = corr
        self.dropped_features_ = self._features_to_drop(corr, self.threshold)
        return self

    def transform(self, X):
        """Return X without the features selected for removal in ``fit``."""
        if self.numeric_columns_ is not None and hasattr(X, 'drop'):
            # Fitted on a DataFrame: remove by column name so non-numeric columns are unaffected.
            to_drop = [self.numeric_columns_[k] for k in self.dropped_features_]
            return X.drop(columns=to_drop)

        dropped = set(self.dropped_features_)
        selected_features = [k for k in range(X.shape[1]) if k not in dropped]
        if hasattr(X, 'iloc'):
            return X.iloc[:, selected_features]
        return X[:, selected_features]

In [57]:
# Full pipeline: remove outlier rows, drop highly correlated features,
# then impute / scale / encode with the column preprocessor.
pipeline_steps = [
    ('outlier_detector', OutlierRemover(method='zscore', threshold=3)),
    ('multicollinearity_remover', MulticollinearityRemover(threshold=0.9)),
    ('preprocessor', preprocessor),
]
pipeline = Pipeline(steps=pipeline_steps)


In [54]:
import joblib

# Fit before saving: an unfitted preprocessor can be pickled, but the reloaded
# object would raise on transform() because it has learned no imputation values,
# scaling statistics or category vocabularies.
preprocessor.fit(X)

# Save the fitted preprocessing pipeline to a file
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')


['preprocessor_pipeline.pkl']

In [55]:
# Restore the saved preprocessing pipeline from disk.
# joblib files are pickle-based: only load files you created or trust.
saved_pipeline_file = 'preprocessor_pipeline.pkl'
loaded_pipeline = joblib.load(saved_pipeline_file)

# Example: apply the restored pipeline to a new dataset
# new_data = pd.read_csv('new_data.csv')  # Load your new dataset
# new_data_preprocessed = loaded_pipeline.transform(new_data)