In [None]:
import pandas as pd
import numpy as np

# ======================
# 1) Load the dataset
# ======================
# Read the raw training CSV (path is relative to the working directory) and
# report its (rows, columns) shape as a baseline for the cleaning steps.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")
print("Shape before cleaning:", df.shape)

# ======================
# 2) Handle missing values
# ======================
# Count missing values per column once; both the report and the column
# filter below are derived from these counts.
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)
print("\nMissing values per column (top 20):\n", missing.head(20))

# Drop columns with more than 40% missing values.
# `threshold` is the largest number of missing entries a column may have and
# still be kept. (Passing it to `dropna(thresh=...)` would instead treat it as
# the minimum number of NON-missing entries, which only drops columns that
# are more than 60% missing.)
threshold = 0.4 * len(df)
df = df.loc[:, null_counts <= threshold]

# Imputation values, keyed by column name:
#   numeric columns     -> that column's median
#   categorical columns -> the string "Missing"
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns
fill_values = {col: df[col].median() for col in num_cols}
fill_values.update({col: "Missing" for col in cat_cols})

# A single fillna call returns a new frame, so nothing is written back into a
# column slice of the original data. Columns absent from the mapping are left
# untouched.
df = df.fillna(fill_values)

print("Shape after cleaning:", df.shape)

# ======================
# 3) Encode categorical variables
# ======================
# Expand the categorical columns into indicator (dummy) columns.
# drop_first=True omits the first level of each category, so k levels
# become k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, drop_first=True)
print("Shape after encoding:", df_encoded.shape)

# ======================
# 4) Feature Engineering
# ======================
# Example: derive a total square footage feature.
# "GrLivArea" must be present; "TotalBsmtSF" is optional and contributes 0
# when the column does not exist in the encoded frame.
living_area = df_encoded["GrLivArea"]
basement_area = df_encoded.get("TotalBsmtSF", 0)
df_encoded["TotalSF"] = living_area + basement_area

print("\nNew feature added: TotalSF")

# ======================
# 5) Save the cleaned dataset
# ======================
# Write the encoded frame next to the input file; index=False keeps the row
# index out of the CSV.
df_encoded.to_csv(path_or_buf="../data/train_cleaned.csv", index=False)

print("\nSaved cleaned dataset to ../data/train_cleaned.csv")
