In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, optimizers, losses, metrics, callbacks

In [2]:
train_data = pd.read_csv("./datasets/train.csv")
test_data = pd.read_csv("./datasets/test.csv")

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
train_data.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [5]:
na_values = train_data.isnull().sum().sort_values(ascending = False)
na_values = na_values[na_values > 0]
na_values

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [6]:
test_data.isnull().sum().sort_values(ascending = False).head(20)

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageCond        78
GarageFinish      78
GarageQual        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
dtype: int64

In [None]:
def process_na_values(data: pd.DataFrame) -> pd.DataFrame:
    """
    Handle missing values in the given DataFrame according to specific rules.

    Steps performed:
    - Drops columns with 500 or more missing values.
    - Fills missing values in selected numerical columns with 0.
    - Fills missing values in selected categorical columns with "Unknown".
    - Fills missing values in selected categorical columns with their mode.
    - Fills missing values in "LotFrontage" with the median value per "Neighborhood".

    Args:
        data (pd.DataFrame): Input DataFrame to process.

    Returns:
        pd.DataFrame: A new DataFrame with missing values processed.
    """

    df = data.copy()

    na_values = df.isnull().sum().sort_values(ascending = False)

    df = df.drop(columns = na_values[na_values >= 500].index)

    for col in ("MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "GarageYrBlt", "GarageArea", "GarageCars"):
        df[col] = df[col].fillna(0)
    
    for col in ("GarageFinish", "GarageCond", "GarageQual", "GarageType", "BsmtFinType2", "BsmtExposure", "BsmtQual", "BsmtCond", "BsmtFinType1"):
        df[col] = df[col].fillna("Unknown")

    for col in ("Electrical", "KitchenQual", "MSZoning", "Utilities", "Functional", "Exterior2nd", "Exterior1st", "SaleType"):
        df[col] = df[col].fillna(df[col].mode()[0])

    df["LotFrontage"] = df.groupby(by = "Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

    return df

In [30]:
def feature_extraction(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()

    df["TotalSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]

    return df

In [78]:
def feature_selection(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()

    df = df.drop(columns = ["GarageCars", "1stFlrSF", "GarageYrBlt", "TotRmsAbvGrd", "Id"])
    
    for col in ("MSSubClass", "OverallQual", "OverallCond", "YrSold", "MoSold"):
        df[col] = df[col].astype(str)

    return df

In [79]:
train_df = process_na_values(train_data)
test_df = process_na_values(test_data)

In [80]:
train_df = feature_extraction(train_df)
test_df = feature_extraction(test_df)

In [81]:
train_df = feature_selection(train_df)
test_df = feature_selection(test_df)

In [82]:
train_df = pd.get_dummies(train_df, drop_first = True).drop(columns = "SalePrice")
test_df = pd.get_dummies(test_df, drop_first = True)

In [83]:
scaler = RobustScaler()
train_df_scaled = scaler.fit_transform(train_df)
test_df_scaled = scaler.fit_transform(test_df)

In [97]:
X_train, y_train = train_df_scaled, train_data["SalePrice"]
X_test = test_df_scaled

In [98]:
model = models.Sequential([
    layers.Input(shape = (train_df.shape[1],)),
    layers.Dense(units = 128, activation = "relu", name = "Dense1"),
    layers.Dropout(rate = 0.25, name = "Dropout1"),
    layers.Dense(units = 32, activation = "relu", name = "Dense2"),
    layers.Dropout(rate = 0.25, name = "Dropout2"),
    layers.Dense(units = 1, activation = "linear", name = "Output")
])

model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.003),
    loss = losses.MeanAbsoluteError,
    metrics = [
        
    ]
)

history = model.fit(
    x = X_train,
    y = y_train,
    validation_split = 0.2,
    epochs = 100,
    batch_size = 8,
    callbacks = [
        callbacks.EarlyStopping(monitor = "val_loss", patience = 5, restore_best_weights = True)
    ]
)

Epoch 1/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 176700.6719 - val_loss: 163567.5000
Epoch 2/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 155222.3906 - val_loss: 134894.6875
Epoch 3/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 128561.7891 - val_loss: 111467.7500
Epoch 4/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 101989.9219 - val_loss: 69036.0156
Epoch 5/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 60968.6289 - val_loss: 36078.6172
Epoch 6/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 40578.5195 - val_loss: 34409.6602
Epoch 7/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 40003.3516 - val_loss: 33167.9492
Epoch 8/100
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - l

In [99]:
model.summary()

In [103]:
# Align test_df columns to match train_df columns before scaling
test_df_aligned = test_df.reindex(columns=train_df.columns, fill_value=0)
test_df_scaled_aligned = scaler.fit_transform(test_df_aligned)
y_pred = model.predict(test_df_scaled_aligned, batch_size=8)
y_pred

[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


array([[118021.36 ],
       [314911.72 ],
       [172679.73 ],
       ...,
       [168041.55 ],
       [ 68127.766],
       [170366.27 ]], dtype=float32)

In [104]:
y_pred.shape

(1459, 1)

In [105]:
test_data["Id"]

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [106]:
result = pd.DataFrame(test_data["Id"])
result["SalePrice"] = y_pred
result

Unnamed: 0,Id,SalePrice
0,1461,118021.359375
1,1462,314911.718750
2,1463,172679.734375
3,1464,194981.984375
4,1465,160788.187500
...,...,...
1454,2915,80114.414062
1455,2916,63401.371094
1456,2917,168041.546875
1457,2918,68127.765625


In [63]:
result.to_csv("./results/neural_network_prediction.csv", index = False)