In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import category_encoders as ce
 
# Load the raw listings. Every column that parses cleanly as numeric is
# converted; a column that raises during conversion is deliberately kept
# unchanged (best-effort conversion, not an error).
df = pd.read_csv('housePrice.csv')
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        pass

# 'Area' entries that cannot be parsed become NaN and are then replaced with
# the column median.
df['Area'] = pd.to_numeric(df['Area'], errors='coerce')
df['Area'] = df['Area'].fillna(df['Area'].median())

# Fill gaps in these feature columns with each column's most frequent value
# (first row of DataFrame.mode()).
bin_cols = ['Room', 'Parking', 'Warehouse', 'Elevator']
df[bin_cols] = df[bin_cols].fillna(df[bin_cols].mode().iloc[0])

# 'Price' is removed so that 'Price(USD)' is the only price column left.
df = df.drop(columns=['Price'])
df['Price(USD)'] = pd.to_numeric(df['Price(USD)'], errors='coerce')
# Rows whose target is missing or unparseable are dropped rather than filled
# with the median: an imputed target is a made-up label that would distort
# both the fit and the evaluation metrics.
df = df.dropna(subset=['Price(USD)'])
# The model is trained on log1p(price); predictions are mapped back with
# expm1 when metrics are computed.
df['Price(USD)'] = np.log1p(df['Price(USD)'])

# A missing address becomes its own 'Unknown' category.
df['Address'] = df['Address'].fillna('Unknown').astype(str)

# Feature matrix and target vector.
X = df[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address']]
y = df['Price(USD)']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Log-transform 'Area' in each split. assign() returns a new frame, so no
# column is written into the objects returned by train_test_split (which are
# selections from X); this avoids pandas' chained-assignment
# SettingWithCopyWarning while producing the same values.
X_train = X_train.assign(Area=np.log1p(X_train['Area']))
X_test = X_test.assign(Area=np.log1p(X_test['Area']))

# Column groups, one per preprocessing route.
binary_features = ['Room', 'Parking', 'Warehouse', 'Elevator']
categorical_features = ['Address']
numeric_features = ['Area']

# Route each group through its own transformer:
#   num    -> standardised with StandardScaler
#   cat    -> one-hot encoded; categories not seen during fit are ignored
#   binary -> passed through unchanged
# Any column not named in one of the groups is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('binary', 'passthrough', binary_features),
    ],
    remainder='drop',
)

# Fit on the training split only, then reuse the fitted transformers on the
# test split so no statistics are learned from the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Original shape: {X_train.shape}")
print(f"Processed shape: {X_train_processed.shape}")


# Fit a linear regression on the preprocessed training features and predict
# the held-out rows. The target is log1p(price), so predictions are on that
# scale as well.
reg = LinearRegression()
reg.fit(X_train_processed, y_train)
y_pred = reg.predict(X_test_processed)

# R² is computed once, on the log1p-transformed target the model was fit on.
r2 = reg.score(X_test_processed, y_test)

# Undo the log1p transform so that MAE and RMSE are expressed in the original
# price units rather than in log space.
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

# Compute metrics
mae = mean_absolute_error(y_test_exp, y_pred_exp)
rmse = np.sqrt(mean_squared_error(y_test_exp, y_pred_exp))

print(f"\n✅ R² Score:  {r2:.4f}")
print(f"✅ MAE:       {mae:,.2f} USD")
print(f"✅ RMSE:      {rmse:,.2f} USD")

Original shape: (2783, 6)
Processed shape: (2783, 186)

✅ R² Score:  0.8989
✅ MAE:       43,814.66 USD
✅ RMSE:      167,346.28 USD
