In [73]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

The Boston Housing dataset contains several columns that describe various characteristics of the housing data. Here's what each column name represents:

1. **`crim`**: Crime rate per capita (crime rate for the town or city).
2. **`zn`**: Proportion of residential land zoned for large lots (lots over 25,000 sq. ft.).
3. **`indus`**: Proportion of non-retail business acres per town.
4. **`chas`**: Charles River dummy variable (1 if the census tract bounds the Charles River, 0 otherwise).
5. **`nox`**: Nitrogen oxides concentration (measured in parts per 10 million). This is a measure of air pollution.
6. **`rm`**: Average number of rooms per dwelling (house).
7. **`age`**: Proportion of owner-occupied units built before 1940.
8. **`dis`**: Weighted distance to employment centers in Boston.
9. **`rad`**: Index of accessibility to radial highways (a larger index indicates better accessibility).
10. **`tax`**: Property tax rate per $10,000 of property value.
11. **`ptratio`**: Pupil-teacher ratio by town.
12. **`black`**: A transformed measure of the proportion of Black residents in the town, computed as `1000 * (Bk - 0.63)^2`, where Bk is the proportion of Black residents. Note that this is not the raw proportion itself.
13. **`lstat`**: Percentage of the population considered "lower status" (e.g., income level).
14. **`medv`**: Median value of owner-occupied homes (in thousands of dollars). This is the target variable (house price).

These columns are used to predict the price of houses in the Boston area based on the various factors (e.g., crime rates, proximity to highways, air quality, etc.).


In [75]:
# Step 1: Load the dataset
# The CSV's first column ('Unnamed: 0') is only a saved row index, so it is
# dropped in the same expression that reads the file.
data = pd.read_csv('Boston Dataset.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [77]:
# Step 2: Separate the target column from the predictor columns
y = data['medv'].to_numpy()  # Target (Price)
X = data.drop(columns=['medv']).to_numpy()  # Features: every column except 'medv'

In [79]:
# Step 3: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### `train_test_split(X, y)`:
This function splits the data into two sets: one for training the model (`X_train`, `y_train`) and one for testing the model (`X_test`, `y_test`).

### `test_size=0.2`:
This argument specifies that 20% of the data should be used for testing, and the remaining 80% will be used for training the model.  
In this case, 80% of the data will be used to train the model, and 20% will be reserved for evaluating the model's performance.

### `random_state=42`:
This is a seed for the random number generator used to shuffle the data before splitting. Setting a specific value (in this case, `42`) ensures that the split will be the same every time you run the code, making your results reproducible.  
If you don't set `random_state`, the data will be split randomly each time, leading to different results on each run.


In [81]:
# Step 4: Standardize the features
# Fit the scaler on the training split only, then apply those same statistics
# to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


### `scaler = StandardScaler()`:
This line creates an instance of the `StandardScaler` class, which is used to standardize the features (i.e., scale them to have a mean of 0 and a standard deviation of 1).

### `X_train = scaler.fit_transform(X_train)`:
This line applies the scaling to the training data (`X_train`).

- **`fit_transform()`**: The `fit()` method computes the mean and standard deviation for each feature in the training data, and the `transform()` method uses these values to scale the data.  
  The result is that the features in `X_train` are normalized, i.e., they now have a mean of 0 and a standard deviation of 1.

### `X_test = scaler.transform(X_test)`:
This line applies the same scaling to the test data (`X_test`).

- **`transform()`**: Here, the `transform()` method is used to scale the test data based on the mean and standard deviation calculated from the training data.  
  This ensures that the test data is scaled in the same way as the training data, maintaining consistency.

### Purpose of Normalization:
Normalizing the features puts them on a comparable scale, so that no feature dominates simply because of its original units. It is particularly important for models that rely on distance metrics (e.g., k-nearest neighbors, support vector machines) and for improving the convergence of gradient-based optimization methods, such as the neural network trained below.


In [83]:
# Step 5: Build the neural network model
# The input shape is declared with an explicit Input object rather than
# `input_dim=` on the first Dense layer: Keras 3 warns when `input_dim` /
# `input_shape` is passed to a layer inside a Sequential model.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # One input per feature column
    Dense(64, activation='relu'),  # First hidden layer
    Dense(32, activation='relu'),  # Second hidden layer
    Dense(1)  # Output layer (single value for price prediction)
])


In [85]:
# Step 6: Compile the model
# optimizer: the optimization algorithm used to minimize the loss; Adam adapts
#            learning rates during training.
# loss:      the quantity minimized during training; mean squared error is a
#            common choice for regression tasks.
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(),
)


In [51]:
# Step 7: Train the model
#   X_train, y_train      training features and labels
#   epochs=100            number of full passes over the training data
#   batch_size=32         samples per gradient update
#   validation_split=0.2  fraction of the training data used as validation data
#   verbose=1             display progress during training
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 615.6313 - val_loss: 525.1214
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 541.0792 - val_loss: 494.1234
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 515.1743 - val_loss: 461.1566
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 497.9662 - val_loss: 422.4959
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 440.2523 - val_loss: 376.8575
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 406.5294 - val_loss: 323.7342
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 332.8683 - val_loss: 265.2293
Epoch 8/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 287.1360 - val_loss: 203.7267
Epoch 9/100
[1

<keras.src.callbacks.history.History at 0x23429069910>

In [87]:
# Step 8: Evaluate the model -- generate predictions for the held-out test features
# NOTE(review): the saved execution counts show the training cell ran as In [51],
# while the model was rebuilt afterwards in In [83]. Re-run the training cell
# before this one so the predictions come from fitted weights -- TODO confirm.
y_pred = model.predict(x=X_test)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [89]:
# Score the test-set predictions against the true targets with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on test data: {mse}')

Mean Squared Error on test data: 532.3807939912008


In [68]:
# Run the model on the test features to obtain predicted prices
predicted_prices = model.predict(x=X_test)

# Print the first 5 true targets followed by the first 5 predictions
print("Actual Prices: ", y_test[0:5])
print("Predicted Prices: ", predicted_prices[0:5])


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Actual Prices:  [23.6 32.4 13.6 22.8 16.1]
Predicted Prices:  [[27.9978  ]
 [34.14383 ]
 [20.36575 ]
 [27.20421 ]
 [15.778932]]
