# Housing Price Prediction

## Import Libraries

In [16]:
import pandas as pd


## Load the data

In [6]:
# Load the raw housing table from the CSV file, then preview the first five
# rows as a quick sanity check of the parse.
housing_df = pd.read_csv(filepath_or_buffer="housing (1).csv")
housing_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# (rows, columns) of the loaded frame
housing_df.shape

(20640, 10)

## EDA

In [9]:
# Per-column dtype and non-null count; a non-null count below the row count flags missing values
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [12]:
# Column labels of the loaded frame
housing_df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [14]:
# Number of rows in each ocean_proximity category
housing_df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [20]:
# Summary statistics for median_house_value, the column later used as the target y
housing_df['median_house_value'].describe()

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: median_house_value, dtype: float64

In [24]:
# Partition the columns by dtype for EDA.
# "number" matches every numeric dtype (any int/float width), so the numeric
# group does not depend on the exact widths the CSV parser inferred.
# num_feat is taken from the full frame, so it still contains the target
# column median_house_value.
num_feat = housing_df.select_dtypes(include="number").columns
cat_feat = housing_df.select_dtypes(include=["object"]).columns

In [25]:
# Display both column groups side by side
num_feat, cat_feat

(Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
        'median_house_value'],
       dtype='object'),
 Index(['ocean_proximity'], dtype='object'))

## Split features (X) and target (y), then train/test split

In [59]:
# Separate the regression target from the predictors.
y = housing_df['median_house_value']
X = housing_df.drop(columns=['median_house_value'])

In [60]:
# Train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Print the shape of each split, in the order X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(16512, 9)
(16512,)
(4128, 9)
(4128,)


## Preprocessing

### Numeric Features

In [85]:
# Numeric predictors that will go through the impute + scale branch.
# "number" selects every numeric dtype. Listing only int64/float64 would leave
# an int32 or float32 column out of both column groups, and a ColumnTransformer
# with its default remainder='drop' would then discard that column silently.
num_attr = X_train.select_dtypes(include="number").columns
num_attr

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

In [86]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [87]:
# Numeric preprocessing steps: fill missing values with the column median,
# and standardize features with StandardScaler
imputer_num = SimpleImputer(strategy='median')
scaler = StandardScaler()

### Categorical Features 

In [88]:
# Categorical predictors: every object-dtype column of the training frame
cat_attr = X_train.select_dtypes(include=['object']).columns
cat_attr

Index(['ocean_proximity'], dtype='object')

In [89]:
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing steps: fill missing values with the most frequent
# category, and one-hot encode; handle_unknown='ignore' makes the encoder skip
# categories it did not see during fit instead of raising
imputer_cat = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder(handle_unknown='ignore')

## Pipeline and Column Transform

In [90]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [91]:
# Numerical pipeline: impute missing values first, then scale.
num_steps = [('imputer', imputer_num), ('scaler', scaler)]
num_pipeline = Pipeline(num_steps)

In [92]:
# Categorical pipeline: impute missing values first, then one-hot encode.
cat_steps = [('imputer', imputer_cat), ('onehot', encoder)]
cat_pipeline = Pipeline(cat_steps)

In [93]:
# Column transformer: route each column group through its own pipeline.
# remainder='drop' is the library default, spelled out here so it is visible
# that a column belonging to neither group is discarded.
column_branches = [
    ('num', num_pipeline, num_attr),
    ('cat', cat_pipeline, cat_attr),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='drop')

## Full Pipeline (Preprocessing + Model)

In [94]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator, used as the final 'model' step of the full pipeline
model = LinearRegression()

In [95]:
# End-to-end estimator: preprocessing followed by the regressor. Because the
# preprocessing lives inside the pipeline, it is fitted on exactly the rows
# that fit() receives.
pipeline_steps = [('preprocessing', preprocessor), ('model', model)]
full_pipline = Pipeline(pipeline_steps)

full_pipline.fit(X_train, y_train)

## Cross Validation

In [97]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [99]:
# 5-fold cross-validation on the training split. The scorer reports negated
# RMSE, so the values in `scores` are negative (or zero).
scores = cross_val_score(full_pipline, X_train, y_train,
                         scoring='neg_root_mean_squared_error', cv=5)

In [100]:
# Flip the sign back so the errors read as ordinary positive RMSE values,
# then report their average across the folds.
rmse_scores = np.negative(scores)
mean_rmse = rmse_scores.mean()
print("Mean RMSE : ", mean_rmse)

Mean RMSE :  68622.53528344534


#### What `cross_val_score` does — repeat 5 times (cv=5):
    1. split X_train into (train_fold, val_fold)
    2. fit a fresh clone of full_pipline on train_fold
    3. evaluate on val_fold using scoring
Finally, return all 5 scores as one array.


#### Why `scoring="neg_root_mean_squared_error"`?
    Here’s the key rule in sklearn:
      Scikit-learn assumes: “higher score = better model”
    
    This works fine for:
    Accuracy
    R²
    
    But RMSE is different:
    "Lower RMSE = better model"

    So what sklearn does

    Instead of changing the rule, sklearn does this:
    It negates loss metrics
    So they can still be “maximized”

    Internally:
        true_rmse = 25000
        returned_score = -25000

        Returning +25000 would break sklearn’s “higher = better” assumption; returning -25000 keeps it intact.
#### Why `rmse_scores = -scores`?

    Because:
    Humans think in positive RMSE
    Interviewers expect positive RMSE
    Business interpretation needs positive error
