In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Step 2: Load the dataset
housing = fetch_california_housing()

# Step 3: Create a pandas DataFrame (a table) for easier use
# housing.data contains the features, and housing.feature_names contains the column names
df = pd.DataFrame(housing.data, columns=housing.feature_names)

# Step 4: Add the target variable (the house prices we want to predict) to the DataFrame
df['MedHouseVal'] = housing.target

# Step 5: Display the first 5 rows of the table to see what the data looks like
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [2]:
# Step 6: Get a quick summary of the dataset
print("----------- Data Info -----------")
df.info()

print("\n\n----------- Data Description -----------")
# Step 7: Get descriptive statistics (like mean, std, etc.)
df.describe()

----------- Data Info -----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


----------- Data Description -----------


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [3]:
# First, let's separate our features (X) from our target (y)
X = df.drop('MedHouseVal', axis=1) # X is all columns EXCEPT the house value
y = df['MedHouseVal']             # y is ONLY the house value column

# Now, let's scale the features
from sklearn.preprocessing import StandardScaler

# Create an instance of the scaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
X_scaled = scaler.fit_transform(X)

# To see the result, let's print the first 5 rows of the scaled data
print(X_scaled[:5])

[[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
   1.05254828 -1.32783522]
 [ 2.33223796 -0.60701891  0.32704136 -0.26333577  0.86143887 -0.09251223
   1.04318455 -1.32284391]
 [ 1.7826994   1.85618152  1.15562047 -0.04901636 -0.82077735 -0.02584253
   1.03850269 -1.33282653]
 [ 0.93296751  1.85618152  0.15696608 -0.04983292 -0.76602806 -0.0503293
   1.03850269 -1.33781784]
 [-0.012881    1.85618152  0.3447108  -0.03290586 -0.75984669 -0.08561576
   1.03850269 -1.33781784]]


In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and testing (20%) sets
# random_state=42 ensures that we get the same split every time we run the code
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Let's check the shape of the training and testing sets to confirm
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (16512, 8)
Shape of X_test: (4128, 8)
Shape of y_train: (16512,)
Shape of y_test: (4128,)


In [5]:
import tensorflow as tf
from tensorflow import keras

# Step 1: Define the model architecture
model = keras.Sequential([
    # Input layer and first hidden layer with 64 neurons
    keras.layers.Dense(64, activation='relu', input_shape=[X_train.shape[1]]),
    # Second hidden layer with 64 neurons
    keras.layers.Dense(64, activation='relu'),
    # Output layer with 1 neuron (since we are predicting a single value - the price)
    keras.layers.Dense(1)
])

# Step 2: Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Step 3: Display a summary of the model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Train the model
# An epoch is one full pass through the entire training dataset.
# We will train for 100 epochs.
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_split=0.2  # We'll use 20% of the training data for validation
)

Epoch 1/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1.7536 - val_loss: 0.4547
Epoch 2/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4122 - val_loss: 0.4103
Epoch 3/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.3627 - val_loss: 0.3914
Epoch 4/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.3538 - val_loss: 0.3750
Epoch 5/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3428 - val_loss: 0.3851
Epoch 6/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3793 - val_loss: 0.3414
Epoch 7/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3130 - val_loss: 0.3609
Epoch 8/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2994 - val_loss: 0.3378
Epoch 9/100
[1m413/413[0m [32

In [7]:
# Step 1: Evaluate the model on the unseen test data to get the final loss score
print("----------- Model Evaluation -----------")
test_loss = model.evaluate(X_test, y_test)
print(f"Mean Squared Error on Test Data: {test_loss:.4f}") # .4f to format the number nicely

print("\n----------- Predictions vs Actual Values -----------")

# Step 2: Use the model to predict the prices of the first 5 houses in the test set
predictions = model.predict(X_test[:5])

# Step 3: Let's compare the model's predictions with the actual prices
print("Predicted Prices:\t", predictions.flatten().round(2))
print("Actual Prices:\t\t", y_test[:5].values.round(2))

----------- Model Evaluation -----------
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2688
Mean Squared Error on Test Data: 0.2808

----------- Predictions vs Actual Values -----------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Predicted Prices:	 [0.51 0.91 4.84 2.68 2.62]
Actual Prices:		 [0.48 0.46 5.   2.19 2.78]


In [8]:
!pip install gradio



In [9]:
import gradio as gr
import numpy as np

# This is the function that will power our interface.
# It takes 8 numbers as input and returns the predicted price.
def predict_price(medinc, house_age, ave_rooms, ave_bedrms, population, ave_occup, latitude, longitude):

    # 1. Collect all inputs into a single NumPy array
    input_data = np.array([[
        medinc, house_age, ave_rooms, ave_bedrms,
        population, ave_occup, latitude, longitude
    ]])

    # 2. IMPORTANT: Scale the input data using the SAME scaler we used for training
    input_data_scaled = scaler.transform(input_data)

    # 3. Use our trained model to make a prediction
    prediction = model.predict(input_data_scaled)

    # 4. Format the output to be a readable price
    # The original data's price is in units of $100,000, so we multiply by 100000
    predicted_price = prediction[0][0] * 100000

    return f"${predicted_price:,.2f}" # Format as currency, e.g., $250,123.45


# Define the input components for the interface
inputs = [
    gr.Number(label="Median Income (in tens of thousands, e.g., 8.3)"),
    gr.Number(label="House Age (e.g., 41)"),
    gr.Number(label="Average Rooms (e.g., 6.9)"),
    gr.Number(label="Average Bedrooms (e.g., 1.0)"),
    gr.Number(label="Population (e.g., 322)"),
    gr.Number(label="Average Occupancy (e.g., 2.5)"),
    gr.Number(label="Latitude (e.g., 37.88)"),
    gr.Number(label="Longitude (e.g., -122.23)")
]

# Define the output component
outputs = gr.Textbox(label="Predicted House Price")

# Create the Gradio interface
interface = gr.Interface(
    fn=predict_price,
    inputs=inputs,
    outputs=outputs,
    title="🏡 California House Price Predictor",
    description="An AI model to predict the median value of houses in California. Fill in the details below and click Submit."
)

# Launch the interface!
interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2ae004ccf548551673.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


