In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load your dataset
data = pd.read_csv('house_price_regression_dataset.csv')
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Square_Footage        1000 non-null   int64  
 1   Num_Bedrooms          1000 non-null   int64  
 2   Num_Bathrooms         1000 non-null   int64  
 3   Year_Built            1000 non-null   int64  
 4   Lot_Size              1000 non-null   float64
 5   Garage_Size           1000 non-null   int64  
 6   Neighborhood_Quality  1000 non-null   int64  
 7   House_Price           1000 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB
None


In [11]:
data

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.599637,0,5,2.623829e+05
1,4272,3,3,2016,4.753014,1,6,9.852609e+05
2,3592,1,2,2016,3.634823,0,9,7.779774e+05
3,966,1,2,1977,2.730667,1,8,2.296989e+05
4,4926,2,1,1993,4.699073,0,8,1.041741e+06
...,...,...,...,...,...,...,...,...
995,3261,4,1,1978,2.165110,2,10,7.014940e+05
996,3179,1,2,1999,2.977123,1,10,6.837232e+05
997,2606,4,2,1962,4.055067,0,2,5.720240e+05
998,4723,5,2,1950,1.930921,0,7,9.648653e+05


In [None]:
# Separate features and target variable
X = data.drop('House_Price', axis=1)
y = data['House_Price']

# Scaling the features for better performance of the neural network
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
X_scaled

array([[-1.15980323, -0.69383623, -1.18669921, ..., -1.67927849,
        -1.25465753, -0.21312613],
       [ 1.16072443,  0.00700845,  1.25255918, ...,  1.52238989,
        -0.02700828,  0.13342042],
       [ 0.61884297, -1.39468091,  0.03292999, ...,  0.66042215,
        -1.25465753,  1.17306009],
       ...,
       [-0.16688515,  0.70785312,  0.03292999, ...,  0.98437109,
        -1.25465753, -1.25276579],
       [ 1.52011933,  1.4086978 ,  0.03292999, ..., -0.65304553,
        -1.25465753,  0.47996698],
       [ 0.36065239,  0.70785312,  0.03292999, ...,  0.25492526,
         1.20064096, -1.25276579]])

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42
)

In [None]:
# Import necessary libraries for building the neural network
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout # type: ignore
from tensorflow.keras.optimizers import Adam # pyright: ignore[reportMissingImports]
from tensorflow.keras.callbacks import EarlyStopping # type: ignore

def create_model(features):
    model = Sequential([
        Dense(64, activation='relu', input_dim=features), # Input layer
        #64 neurons in the first layer, relu for my activation function, features is the number of input features
        # (input x  number of neurons) + bias
        # (10 x 64) + 64 = 704 parameters
        Dropout(0.3),
        # Input into the second layer is number of output from the first layer
        Dense(32, activation='relu'), #64 inputs, 32 neurons which is 32 output.. # First Hidden layer
        # (64 x 32) + 32 = 2080 parameters
        Dropout(0.2),
        Dense(16, activation='relu'), # Second Hidden layer
        # (32 x 16) + 16 = 528 parameters
        Dense(1, activation='linear')
        # output layer = 1
        # input layer = 16
    ])
    return model

model = create_model(X_train.shape[1])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model including the optimiser and loss function
# Adam optimizer is a popular choice for regression tasks
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mae']
)

model.summary()

In [None]:
# Early stopping to prevent redundant training
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True
)

# Train the model
# Using a batch size of 32 or less, depending on the size of the training set
history = model.fit(
    X_train, y_train,
    epochs=1000, 
    batch_size=min(32, len(X_train)//4),
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 10228658176.0000 - mae: 79211.1562 - val_loss: 1501828352.0000 - val_mae: 30678.1191
Epoch 2/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9468955648.0000 - mae: 77379.3672 - val_loss: 1506825728.0000 - val_mae: 30669.9609
Epoch 3/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9260590080.0000 - mae: 74598.7500 - val_loss: 1508766336.0000 - val_mae: 30700.6172
Epoch 4/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9521357824.0000 - mae: 77460.2969 - val_loss: 1497905280.0000 - val_mae: 30604.2461
Epoch 5/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 10221686784.0000 - mae: 79242.9297 - val_loss: 1564284672.0000 - val_mae: 31287.3027
Epoch 6/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 9277577216.000

In [19]:
#y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

#train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

#print(f"Training RMSE: {train_rmse:.4f}")
print(f"Testing RMSE: {test_rmse:.4f}")

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Testing RMSE: 30173.7528
