## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from src.utils.outliers import OutlierCapper

## Configurations

In [2]:
# Directory that holds the raw train/test CSV files.
# Defaults to the original local location; set the HOUSE_PRICE_RAW_DIR
# environment variable to run the notebook on another machine without
# editing this cell. Trailing separators are stripped so the joined paths
# never contain a doubled slash.
raw_data_dir = os.environ.get(
    'HOUSE_PRICE_RAW_DIR',
    'C:/Users/ansar/Desktop/Workspace/Personal/MLOPs/House Price Prediction/artifacts/raw',
).rstrip('/\\')

# paths for the training and testing datasets
train_data_path = f'{raw_data_dir}/train.csv'
test_data_path = f'{raw_data_dir}/test.csv'

In [3]:
# Read the raw train and test CSV files (in that order) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [4]:
# Drop the 'id' column from both the training and the testing dataset.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it once 'id' is
# already gone is a no-op rather than a KeyError.
df_train = df_train.drop(columns='id', errors='ignore')
df_test = df_test.drop(columns='id', errors='ignore')

In [5]:
# Feature names: every column of the training frame except the target.
features = list(df_train.columns)
features.remove('MedHouseVal')

# Boolean mask over columns: True where the skewness falls outside the
# inclusive range [-0.5, 0.5].
not_normal_feats = ~df_train.skew().between(left=-0.5, right=0.5)
# Names of the flagged columns, kept in column order.
iqr_feats = [name for name, flagged in not_normal_feats.items() if flagged]

In [6]:
# Exclude the MedHouseVal target from iqr_feats.
# Filtering (rather than list.remove) avoids a ValueError when MedHouseVal
# is not in the list -- either because its skewness was within the accepted
# range and it was never flagged, or because this cell is being re-run.
iqr_feats = [feat for feat in iqr_feats if feat != 'MedHouseVal']

In [7]:
# Two-stage preprocessing: OutlierCapper configured with the skewed feature
# names, followed by standard scaling.
pipeline = Pipeline(
    steps=[
        ('capper', OutlierCapper(iqr_feats)),
        ('scaler', StandardScaler()),
    ]
)

In [8]:
# Split each dataset into its feature matrix (all columns except the
# MedHouseVal target) and its target series.
X_train, y_train = df_train.drop(columns=['MedHouseVal']), df_train['MedHouseVal']
X_test, y_test = df_test.drop(columns=['MedHouseVal']), df_test['MedHouseVal']

In [9]:
# Fit the preprocessing pipeline on the training features only, then reuse
# the fitted pipeline for the test features.
X_train_processed = pipeline.fit_transform(X_train)

# The recorded run of this cell failed with "Feature names unseen at fit
# time: MedHouseVal", i.e. transform received a column the pipeline was not
# fitted on. Restrict and reorder the test features to exactly the fitted
# columns so a stale or differently-built X_test cannot leak extra columns
# into transform; a column missing from X_test surfaces as a KeyError here.
# NOTE(review): if the mismatch originates inside OutlierCapper itself,
# this guard will not resolve it -- confirm against its implementation.
X_test = X_test[X_train.columns]
X_test_processed = pipeline.transform(X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MedHouseVal


In [10]:
# Log-transform (log1p) the MedHouseVal target.
# The targets are derived from the source frames rather than from
# y_train / y_test themselves, so re-running this cell does not apply the
# logarithm a second time.
y_train = np.log1p(df_train['MedHouseVal'])
y_test = np.log1p(df_test['MedHouseVal'])

In [11]:
def _with_target(X_processed, X_raw, y):
    """Combine processed features and their target into one DataFrame.

    Parameters
    ----------
    X_processed : array-like
        Output of the preprocessing pipeline for ``X_raw``.
    X_raw : pandas.DataFrame
        The feature frame that was fed to the pipeline; supplies the column
        names and the row index for the processed values.
    y : pandas.Series
        Target values belonging to the rows of ``X_raw``.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by a ``MedHouseVal`` column.
    """
    # Reuse the raw frame's index so the column-wise concat below lines up
    # row-for-row with the target even if the index is not 0..n-1.
    features_df = pd.DataFrame(
        np.asarray(X_processed), columns=X_raw.columns, index=X_raw.index
    )
    target_df = pd.DataFrame(y, columns=['MedHouseVal'])
    return pd.concat([features_df, target_df], axis=1)


# converting the processed datasets into dataframes
df_train_processed = _with_target(X_train_processed, X_train, y_train)
df_test_processed = _with_target(X_test_processed, X_test, y_test)


In [12]:
# Ensure the output folder exists, then write both processed frames as CSV
# files with a header row and without the row index.
os.makedirs(name='data/processed', exist_ok=True)

df_train_processed.to_csv(
    path_or_buf='data/processed/train_processed.csv', header=True, index=False
)
df_test_processed.to_csv(
    path_or_buf='data/processed/test_processed.csv', header=True, index=False
)