In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from scipy.stats import skew

In [33]:
# Read the raw training and test tables from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep only the training rows whose GrLivArea is strictly below 4000, then
# renumber the index from 0 (presumably an outlier filter -- confirm).
train = (
    train
    .loc[lambda frame: frame['GrLivArea'].lt(4000)]
    .reset_index(drop=True)
)

In [34]:
# Separate the target column from the training predictors.
y_train = train['SalePrice']
train_features = train.drop(columns=['SalePrice'])

# Stack training predictors on top of the test rows under a fresh 0..n-1 index,
# so the first len(train) rows of `all_data` are the training rows.
all_data = pd.concat((train_features, test), ignore_index=True)

In [35]:
# Fill missing values column by column:
#   * object-dtype columns get the literal string "None";
#   * every other column gets its own median.
# Medians are taken over `all_data`, i.e. training and test rows together.
for name, kind in all_data.dtypes.items():
    column = all_data[name]
    replacement = "None" if kind == "object" else column.median()
    all_data[name] = column.fillna(replacement)

In [36]:
# Compute the sample skewness of every numeric column, largest first.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)

# Columns whose skewness exceeds 0.75 in magnitude are replaced by log1p(value).
# NOTE(review): abs() selects left-skewed columns as well, and log1p is only
# defined for values > -1 -- confirm both are acceptable for this data.
skewness = skewed_feats[skewed_feats.abs() > 0.75]
all_data[skewness.index] = np.log1p(all_data[skewness.index])

In [37]:
# Encode the object-dtype columns according to their number of distinct values:
#   * nunique() <= 10 -> integer codes from a fresh LabelEncoder, written back
#     into the same column;
#   * nunique() > 10  -> the column is removed and replaced by indicator columns
#     named "<column>_<value>", appended after the remaining columns.
# NOTE(review): integer codes impose an ordering on the categories that is
# presumably arbitrary for nominal features -- confirm this suits the model.
object_cols = list(all_data.select_dtypes(include='object').columns)
wide_cols = [name for name in object_cols if all_data[name].nunique() > 10]

for name in object_cols:
    if name not in wide_cols:
        all_data[name] = LabelEncoder().fit_transform(all_data[name])

if wide_cols:
    # One call expands every high-cardinality column, in their original order.
    all_data = pd.get_dummies(all_data, columns=wide_cols)

In [38]:
# TotalSF = basement area + first-floor area + second-floor area.
#
# This cell runs after the skew-correction cell, which replaces every column
# listed in `skewness` with log1p(value). Adding log-scaled columns gives
# (roughly) the log of a product rather than a total area, so each component
# that was transformed is mapped back with expm1 before summing. expm1 is the
# exact inverse of log1p up to floating-point rounding.
#
# A component missing from `all_data` contributes nothing; if none are present
# the column is set to the scalar 0, as before. The result is left on the
# original square-footage scale.
all_data['TotalSF'] = sum(
    np.expm1(all_data[part]) if part in skewness.index else all_data[part]
    for part in ('TotalBsmtSF', '1stFlrSF', '2ndFlrSF')
    if part in all_data.columns
)

In [39]:
# Fit a StandardScaler on the full feature table (training and test rows
# together), then apply it to the same table. The scaled values are wrapped
# back into a DataFrame that carries the original column names.
scaler = StandardScaler().fit(all_data)
all_data_scaled = pd.DataFrame(
    scaler.transform(all_data),
    columns=all_data.columns,
)

In [40]:
# The first len(train) rows of the scaled table are the training rows (they
# were stacked first when `all_data` was built); the remainder are test rows.
X_train = all_data_scaled.iloc[:len(train)]
X_test = all_data_scaled.iloc[len(train):]

In [30]:
# Write the preprocessed predictors and the target to CSV files in the current
# working directory, leaving the pandas index out of each file.
for table, destination in (
    (X_train, 'X_train_preprocessed.csv'),
    (X_test, 'X_test_preprocessed.csv'),
    (y_train, 'y_train.csv'),
):
    table.to_csv(destination, index=False)

In [31]:
# Report the (rows, columns) shape of the training and test matrices.
print("Preprocessing complete. Shapes:", *(part.shape for part in (X_train, X_test)))

Preprocessing complete. Shapes: (1456, 136) (1459, 136)
