In [1]:
import pandas as pd


In [2]:
# Load the housing table; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [3]:
# Peek at the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [4]:
# Remove the geographic coordinates; the model below is trained without them.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second in-place drop raises KeyError because
# the columns are already gone.
df = df.drop(columns=['longitude', 'latitude'], errors='ignore')

In [5]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  20637 non-null  float64
 1   total_rooms         20635 non-null  float64
 2   total_bedrooms      20429 non-null  float64
 3   population          20628 non-null  float64
 4   households          20630 non-null  float64
 5   median_income       20631 non-null  float64
 6   median_house_value  20632 non-null  float64
dtypes: float64(7)
memory usage: 1.1 MB


In [6]:
# Summary statistics, transposed so each feature is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
housing_median_age,20637.0,28.639046,12.58564,1.0,18.0,29.0,37.0,52.0
total_rooms,20635.0,2635.499443,2181.039708,2.0,1447.5,2127.0,3147.5,39320.0
total_bedrooms,20429.0,537.869352,421.38425,1.0,296.0,435.0,647.0,6445.0
population,20628.0,1425.44076,1132.601922,3.0,787.0,1166.0,1725.0,35682.0
households,20630.0,499.607465,382.356098,1.0,280.0,409.0,605.0,6082.0
median_income,20631.0,3.870831,1.900014,0.4999,2.5631,3.5349,4.7434,15.0001
median_house_value,20632.0,206870.509936,115402.351214,14999.0,119600.0,179700.0,264825.0,500001.0


In [7]:
# Number of missing values in each column before cleaning.
df.isna().sum()

housing_median_age      3
total_rooms             5
total_bedrooms        211
population             12
households             10
median_income           9
median_house_value      8
dtype: int64

In [8]:
# Discard every row that has at least one missing value (features or target).
# The result is rebound to `df` instead of mutating the frame in place, so the
# cell is a plain "input -> output" step with no hidden in-place side effect.
df = df.dropna()

In [9]:
# Re-check the per-column missing counts after the rows with gaps were dropped.
df.isna().sum()

housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [10]:
# First five rows of the cleaned table.
df.head(n=5)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [11]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [12]:
# Separate the prediction target from the predictor columns.
y = df['median_house_value']
x = df.drop(columns=['median_house_value'])

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so the test rows never influence the scaler.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [15]:
# Candidate regressors, keyed by the label used in the printed results.
# The two tree ensembles are randomised; pinning random_state (same seed as the
# train/test split) makes their RMSEs, and therefore the choice of best model,
# repeatable from one run of the notebook to the next.
models = {
    'LinearRegression': LinearRegression(),
    'ridge regression': Ridge(),
    'Lasso regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
}

In [16]:
# Fit each candidate on the scaled training split and record its RMSE on the
# scaled test split, keyed by the model's label.
model_results = {}
for model_name in models:
    model = models[model_name]
    y_pred = model.fit(x_train_scaled, y_train).predict(x_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    model_results[model_name] = rmse
    print(f'{model_name}: RMSE = {rmse:.2f}')


LinearRegression: RMSE = 77240.59
ridge regression: RMSE = 77241.45
Lasso regression: RMSE = 77240.93
ElasticNet: RMSE = 85982.29
RandomForest: RMSE = 67850.93
GradientBoosting: RMSE = 70068.28


In [17]:
# Take the (label, RMSE) pair with the smallest RMSE.
# NOTE(review): these RMSEs were measured on the test split, so the winner is
# chosen on the same data it is reported on — consider selecting via cross-validation.
best_model_name, best_model_rmse = min(model_results.items(), key=lambda item: item[1])
print(f'\nBest model: {best_model_name} with RMSE = {best_model_rmse:.2f}')


Best model: RandomForest with RMSE = 67850.93


In [19]:

# Cross-validate the selected estimator on the training split.
# The scaler lives inside the pipeline and receives the *unscaled* x_train, so it
# is re-fitted on the training part of every fold. Scoring on x_train_scaled
# instead would let each validation fold contribute to the scaling statistics.
# cross_val_score works on clones, so the already-fitted best_model is untouched.
best_model = models[best_model_name]
cv_pipeline = make_pipeline(StandardScaler(), best_model)
cv_scores = cross_val_score(cv_pipeline, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
# Scores are negated MSEs; flip the sign before taking the root.
cv_rmse = np.sqrt(-cv_scores)
print(f'\nCross-Validation RMSE for {best_model_name}: {cv_rmse.mean():.2f} ± {cv_rmse.std():.2f}')



Cross-Validation RMSE for RandomForest: 67177.15 ± 1637.35


In [22]:
# List the column names; every column except the target is a model input.
df.columns

Index(['housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
       'households', 'median_income', 'median_house_value'],
      dtype='object')

In [28]:
def pred(housing_median_age, total_rooms, total_bedrooms, population, households, median_income):
    """Predict the median house value for a single set of feature values.

    The inputs are scaled with the notebook-level ``scaler`` and passed to
    ``models[best_model_name]``, so both must already be fitted.

    Args:
        housing_median_age: value for the ``housing_median_age`` feature.
        total_rooms: value for the ``total_rooms`` feature.
        total_bedrooms: value for the ``total_bedrooms`` feature.
        population: value for the ``population`` feature.
        households: value for the ``households`` feature.
        median_income: value for the ``median_income`` feature.

    Returns:
        The model's prediction for the one input row.
    """
    # Build a one-row frame with named columns. The scaler was fitted on a
    # DataFrame, so a bare ndarray makes scikit-learn warn about missing feature
    # names and silently relies on positional order.
    input_data = pd.DataFrame([{
        'housing_median_age': housing_median_age,
        'total_rooms': total_rooms,
        'total_bedrooms': total_bedrooms,
        'population': population,
        'households': households,
        'median_income': median_income,
    }])

    # Align to the column order seen at fit time when the scaler recorded one
    # (the attribute only exists if it was fitted on named columns).
    fitted_columns = getattr(scaler, 'feature_names_in_', None)
    if fitted_columns is not None:
        input_data = input_data[list(fitted_columns)]

    # Scale with the same scaler used during training.
    input_data_scaled = scaler.transform(input_data)

    # Use the best model for prediction.
    best_model = models[best_model_name]
    prediction = best_model.predict(input_data_scaled)

    # predict() returns one value per row; there is exactly one row here.
    return prediction[0]

In [29]:
# Example call using the feature values of row 4 from the df.head() output.
predicted_value = pred(
    housing_median_age=52.0,
    total_rooms=1627.0,
    total_bedrooms=280.0,
    population=565.0,
    households=259.0,
    median_income=3.8462,
)
print(f"Predicted Median House Value: ${predicted_value:.2f}")

Predicted Median House Value: $333920.04


