
# California Housing Prices Prediction



# Required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Read Data

In [2]:
housing = pd.read_csv('housing.csv')

# Exploring Data 

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
housing.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0


# Splitting Data Into Train And Test

As we can see from the above table the most effective predictor of **median house value** is **median income**
So I will split the data with respect to this predictor to make sure the train and test sets fully cover the whole dataset.

In [7]:
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                              labels=[1, 2, 3, 4, 5])

spliter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in spliter.split(housing, housing['income_cat']):
    train_stra = housing.loc[train_index]
    test_stra = housing.loc[test_index]

In [8]:
train_stra.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 12655 to 19773
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           16512 non-null  float64 
 1   latitude            16512 non-null  float64 
 2   housing_median_age  16512 non-null  float64 
 3   total_rooms         16512 non-null  float64 
 4   total_bedrooms      16354 non-null  float64 
 5   population          16512 non-null  float64 
 6   households          16512 non-null  float64 
 7   median_income       16512 non-null  float64 
 8   median_house_value  16512 non-null  float64 
 9   ocean_proximity     16512 non-null  object  
 10  income_cat          16512 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.4+ MB


In [9]:
test_stra.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 5241 to 3965
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           4128 non-null   float64 
 1   latitude            4128 non-null   float64 
 2   housing_median_age  4128 non-null   float64 
 3   total_rooms         4128 non-null   float64 
 4   total_bedrooms      4079 non-null   float64 
 5   population          4128 non-null   float64 
 6   households          4128 non-null   float64 
 7   median_income       4128 non-null   float64 
 8   median_house_value  4128 non-null   float64 
 9   ocean_proximity     4128 non-null   object  
 10  income_cat          4128 non-null   category
dtypes: category(1), float64(9), object(1)
memory usage: 359.0+ KB


## Removing Unneeded column

In [10]:
train_stra.drop('income_cat', axis=1, inplace=True)
test_stra.drop('income_cat', axis=1, inplace=True)

# Removing Target column

In [11]:
train = train_stra.drop('median_house_value', axis=1)
train_label = train_stra['median_house_value']
test = test_stra.drop('median_house_value', axis=1)
test_label = test_stra['median_house_value']

# Data Preprocessing

## Numerical  and  Categorical Attributes

In [12]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),# Handling missing values in 'total bedrooms' column
    ("std_scaler", StandardScaler()) #feature scaling
])

In [13]:
num_attr = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income']
cat_attr = ['ocean_proximity']

data_transformer = ColumnTransformer([
    ('num', num_pipeline, num_attr),
    ('cat', OneHotEncoder(), cat_attr) # Encoding categorical data
])

## Transform Data

In [14]:
train_prepared = data_transformer.fit_transform(train)

Now the data is ready to feed into the model

# Select and Train a Model

## Experiment 1 (Linear Regression)

In [15]:
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, train_label)

LinearRegression()

In [16]:
some_data = train.iloc[:5]
some_labels = train_label.iloc[:5]
some_data_prepared = data_transformer.transform(some_data)
lin_pred = lin_reg.predict(some_data_prepared)
lin_reg_mse = mean_squared_error(some_labels,lin_pred )
lin_reg_rmse = np.sqrt(lin_reg_mse)
print(f"Linear Regression Training RMSE is: {lin_reg_rmse}")

Linear Regression Training RMSE is: 47256.69622864763


### Evaluating The Model

In [19]:
scores_lin = cross_val_score(lin_reg, train_prepared, train_label,
                        cv=10, scoring='neg_mean_squared_error')
lin_reg_rmse_score = np.sqrt(-scores_lin)
print(f"Linear Regression Validation RMSE is: {lin_reg_rmse_score.mean()}")

Linear Regression Validation RMSE is: 69204.32275494766


As we can see the validation **RMSE** is greater than the training one.
This model is **underfitting** the data.

### Saving The Model

In [25]:
joblib.dump(lin_reg, "linear.pkl")

['linear.pkl']

## Experiment 2 (Decision Tree)

In [20]:
tree_reg = DecisionTreeRegressor(max_depth=6)
tree_reg.fit(train_prepared, train_label)
tree_pred = tree_reg.predict(some_data_prepared)
tree_reg_mse = mean_squared_error(some_labels, tree_pred)
tree_reg_rmse = np.sqrt(tree_reg_mse)
print(f"Decision Tree Training RMSE is: {tree_reg_rmse}")

Decision Tree Training RMSE is: 19255.729538428423


### Evaluating The Model

In [21]:
scores_tree = cross_val_score(tree_reg, train_prepared, train_label,
                        cv=10, scoring='neg_mean_squared_error')
tree_rmse_score = np.sqrt(-scores_tree)
print(f"Decision Tree Validation RMSE is: {tree_rmse_score.mean()}")

Decision Tree Validation RMSE is: 67967.72782592665


As we can see the training **RMSE** is good but the validation is very bad.
This model is **overfitting** the data.

### Saving The Model

In [26]:
joblib.dump(tree_reg, "tree.pkl")

['tree.pkl']

## Experiment 3 (Random Forest)

In [23]:
forest_reg = RandomForestRegressor(n_estimators=40, max_features=9)
forest_reg.fit(train_prepared, train_label)
forest_pred = forest_reg.predict(some_data_prepared)
forest_reg_mse = mean_squared_error(some_labels, forest_pred)
forest_reg_rmse = np.sqrt(forest_reg_mse)
print(f"Random Forest Training RMSE is: {forest_reg_rmse}")

Random Forest Training RMSE is: 9937.379684806252


### Evaluating The Model

In [24]:
scores_forest = cross_val_score(forest_reg, train_prepared, train_label,
                        cv=10, scoring='neg_mean_squared_error')
forest_rmse_score = np.sqrt(-scores_forest)
print(f"Random Forest Validation RMSE is: {forest_rmse_score.mean()}")

Random Forest Validation RMSE is: 49186.59346557298


That's it!

# Deploying The Model

In [29]:
def prediction(properties):
    df = pd.DataFrame(properties, columns=train.columns)
    preparations = data_transformer.transform(df)
    return forest_reg.predict(preparations)

In [50]:
demo = gr.Interface(
    prediction,
    [
        gr.Dataframe(
            headers=["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms",
                     "population", "households", "median_income", "ocean_proximity"],
            datatype=["number", "number", "number", "number", "number", "number", "number", "number", "str"],
            row_count=1,
            col_count=(9, "fixed"),
        )
        
    ],
    "number",
    description="Enter The Properties Of The Home",
    title="California Housing Prices Prediction",
   # examples=[-121.46, 38.52, 29.0, 3873.0, 797.0, 2237.0, 706.0, 2.1736, 'INLAND'],
    

)

demo.launch()


Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/gradio/routes.py", line 292, in run_predict
    output = await app.blocks.process_api(
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/gradio/blocks.py", line 1007, in process_api
    result = await self.call_function(fn_index, inputs, iterator, request)
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/gradio/blocks.py", line 848, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/anyio/to_thread.py", line 28, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 818, in run_sync_in_worker_thread
    return await future
  File "/home/abdelrahman/anaconda3/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 754, in run
 