In [1]:
# 1. Import Libraries and Load Data
# ------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the training and test tables from CSV files in the working directory.
# Both file names are relative paths, so the notebook must be started next to them.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Separate the target from the features.
# DataFrame.pop returns the 'SalePrice' column and removes it from `train`
# in place, so `train` holds only feature columns afterwards.
y_train = train.pop('SalePrice')

In [4]:
# Combine train and test so that every preprocessing step is applied to both.
# Training rows come first; later cells rely on that ordering to split the
# frame back apart by position.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# the row labels of `train` and `test` are both kept, so each label appears
# twice and any label-based drop/.loc would silently touch rows of both sets.
combined = pd.concat([train, test], axis=0, ignore_index=True)
print("Combined shape:", combined.shape)

Combined shape: (2919, 80)


In [5]:
# 2. Handle Missing Values
# ------------------------------------------

In [6]:
# Fill missing values in every object-typed (categorical) column with the
# literal string "None", which then acts as its own category.
categorical_cols = combined.select_dtypes(include='object').columns
categorical_filled = combined[categorical_cols].fillna("None")
combined[categorical_cols] = categorical_filled

In [7]:
# Fill missing values in every non-object column with that column's median.
# The medians are computed over the combined frame (train and test rows together).
numerical_cols = combined.select_dtypes(exclude='object').columns
numeric_part = combined[numerical_cols]
column_medians = numeric_part.median()
combined[numerical_cols] = numeric_part.fillna(column_medians)

In [9]:
# 3. Ordinal Encoding for Quality/Condition Features
# ------------------------------------------
# These columns hold quality ratings. LabelEncoder assigns integers in sorted
# (alphabetical) order of the labels -- Ex=0, Fa=1, Gd=2, None=3, Po=4, TA=5 --
# which does not follow the quality ranking. An explicit mapping is used so
# that a larger number always means a better rating.
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

# NOTE(review): the order Po < Fa < TA < Gd < Ex is assumed from the Ames
# Housing data dictionary -- confirm against data_description.txt.
# "None" is the placeholder written by the categorical fill step for missing
# values and is ranked lowest.
quality_rank = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for col in ordinal_cols:
    # Skip columns that are already numeric so re-running this cell is harmless.
    if pd.api.types.is_numeric_dtype(combined[col]):
        continue
    # Fail loudly on labels outside the mapping instead of turning them into NaN.
    unexpected = set(combined[col].unique()) - set(quality_rank)
    if unexpected:
        raise ValueError(
            f"Unexpected levels in {col!r}: {sorted(map(str, unexpected))}"
        )
    combined[col] = combined[col].map(quality_rank).astype(int)

In [10]:
# 4. One-Hot Encoding for Remaining Categorical Columns
# ------------------------------------------
# With no `columns` argument, get_dummies expands the columns it treats as
# categorical into indicator columns and passes numeric columns through
# unchanged. Running it on the combined frame gives train and test rows the
# same set of dummy columns.
combined = pd.get_dummies(data=combined)

In [12]:
# 5. Feature Engineering
# ------------------------------------------
# Derive three summary features from existing columns:
#   TotalBathrooms - full baths count 1, half baths count 0.5 (above grade and basement)
#   TotalSF        - basement + first floor + second floor area
#   HouseAge       - year sold minus year built
combined = combined.assign(
    TotalBathrooms=(
        combined['FullBath']
        + combined['HalfBath'].mul(0.5)
        + combined['BsmtFullBath']
        + combined['BsmtHalfBath'].mul(0.5)
    ),
    TotalSF=combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF'],
    HouseAge=combined['YrSold'].sub(combined['YearBuilt']),
)

In [13]:
# 6. Outlier Removal
# ------------------------------------------
# Work on copies of the training portion only: the first `train_rows` rows of
# `combined` are the original training rows. `combined` itself is not modified.
train_rows = train.shape[0]
X_train_temp = combined.iloc[:train_rows].copy()

# Row labels of training samples whose TotalSF exceeds 6000.
exceeds_limit = X_train_temp['TotalSF'] > 6000
outliers = X_train_temp.index[exceeds_limit]

# Drop the same labels from features and target so the two stay paired.
X_train_temp = X_train_temp.drop(index=outliers)
y_train_temp = y_train.drop(index=outliers)

In [14]:
# 7. Scaling Features (Optional)
# ------------------------------------------
# Standardise every column of the combined frame. The scaler statistics are
# fitted on all rows of `combined` (train and test together, outliers included).
scaler = StandardScaler().fit(combined)
X_scaled = scaler.transform(combined)

# Wrap the array in a DataFrame again; the result has a fresh 0..n-1 index
# and the same column names as `combined`.
combined_scaled = pd.DataFrame(data=X_scaled, columns=combined.columns)

In [15]:
# 8. Final Dataset Split and Save
# ------------------------------------------
# Split the scaled frame back into training and test features.
#
# `combined_scaled` still contains every original training row: the outlier
# step removed rows only from the temporary copies. Slicing the first
# len(y_train_temp) rows would therefore keep the outlier rows, pair features
# with the wrong targets, and push the last training rows into the test set.
# Instead, split at the original training length and keep only the training
# rows whose label survives in `y_train_temp`.
n_train_rows = train.shape[0]
train_scaled = combined_scaled.iloc[:n_train_rows, :]

# Boolean mask over the original training rows, in their original order.
kept_mask = train.index.isin(y_train_temp.index)

X_train = train_scaled[kept_mask]
X_test = combined_scaled.iloc[n_train_rows:, :]
y_train = y_train_temp

# Guard against any future misalignment between features and target.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_test) == len(test), "X_test does not match the original test set size"

In [16]:
# Write the processed features and target to CSV files in the working
# directory, without the row index.
output_files = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
}
for csv_name, table in output_files.items():
    table.to_csv(csv_name, index=False)

print("Preprocessing complete. Files saved.")

Preprocessing complete. Files saved.
