In [3]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [5]:
data = fetch_california_housing(as_frame=True)
df = data
df

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude 

In [6]:
df = pd.concat([data.data, data.target.rename('MedianHouseValue')], axis=1)
# The line of code is used to create a single pandas DataFrame that combines the features (independent variables) and target (dependent variable) from the California Housing dataset for easier analysis and processing.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [9]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [11]:
# Check for missing values
print("Missing Values:\n", df.isnull().sum())

Missing Values:
 MedInc              0
HouseAge            0
AveRooms            0
AveBedrms           0
Population          0
AveOccup            0
Latitude            0
Longitude           0
MedianHouseValue    0
dtype: int64


In [13]:
# check for duplicates
df[df.duplicated()].sum()

MedInc              0.0
HouseAge            0.0
AveRooms            0.0
AveBedrms           0.0
Population          0.0
AveOccup            0.0
Latitude            0.0
Longitude           0.0
MedianHouseValue    0.0
dtype: float64

In [15]:
from sklearn.preprocessing import StandardScaler
# Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[data.feature_names])
df_scaled = pd.DataFrame(scaled_features, columns=data.feature_names)
df_scaled['MedianHouseValue'] = df['MedianHouseValue']

In [17]:
df_scaled 

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,4.526
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,3.585
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,3.521
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,3.413
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,3.422
...,...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826,0.781
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722,0.771
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713,0.923
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626,0.847


In [19]:
df['MedianHouseValue']

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedianHouseValue, Length: 20640, dtype: float64

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
# Split the data
X = df_scaled[data.feature_names]
y = df_scaled['MedianHouseValue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print("Train set size:", X_train.shape, y_train.shape)
print("Test set size:", X_test.shape, y_test.shape)

Train set size: (16512, 8) (16512,)
Test set size: (4128, 8) (4128,)


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [29]:
# Initialize and train model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [31]:
# Predict on test set
lr_predictions = lr_model.predict(X_test)

In [33]:
# Evaluate performance
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

print("Linear Regression - MSE:", lr_mse, "MAE:", lr_mae, "R²:", lr_r2)

Linear Regression - MSE: 0.5558915986952442 MAE: 0.5332001304956565 R²: 0.575787706032451


In [35]:
from sklearn.tree import DecisionTreeRegressor

# Initialize and train model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test set
dt_predictions = dt_model.predict(X_test)

# Evaluate performance
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)

print("Decision Tree Regressor - MSE:", dt_mse, "MAE:", dt_mae, "R²:", dt_r2)

Decision Tree Regressor - MSE: 0.4942716777366763 MAE: 0.4537843265503876 R²: 0.6228111330554302


In [37]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test set
rf_predictions = rf_model.predict(X_test)

# Evaluate performance
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("Random Forest Regressor - MSE:", rf_mse, "MAE:", rf_mae, "R²:", rf_r2)

Random Forest Regressor - MSE: 0.25549776668540763 MAE: 0.32761306601259704 R²: 0.805024407701793


In [38]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize and train model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Predict on test set
gb_predictions = gb_model.predict(X_test)

# Evaluate performance
gb_mse = mean_squared_error(y_test, gb_predictions)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

print("Gradient Boosting Regressor - MSE:", gb_mse, "MAE:", gb_mae, "R²:", gb_r2)

Gradient Boosting Regressor - MSE: 0.29399901242474274 MAE: 0.37165044848436773 R²: 0.7756433164710084


In [41]:
from sklearn.svm import SVR

# Initialize and train model
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

# Predict on test set
svr_predictions = svr_model.predict(X_test)

# Evaluate performance
svr_mse = mean_squared_error(y_test, svr_predictions)
svr_mae = mean_absolute_error(y_test, svr_predictions)
svr_r2 = r2_score(y_test, svr_predictions)

print("Support Vector Regressor - MSE:", svr_mse, "MAE:", svr_mae, "R²:", svr_r2)

Support Vector Regressor - MSE: 0.3551984619989419 MAE: 0.3977630963437859 R²: 0.7289407597956462


In [42]:
results = pd.DataFrame({
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVR"],
    "MSE": [lr_mse, dt_mse, rf_mse, gb_mse, svr_mse],
    "MAE": [lr_mae, dt_mae, rf_mae, gb_mae, svr_mae],
    "R²": [lr_r2, dt_r2, rf_r2, gb_r2, svr_r2]
})

print(results)

               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.397763  0.728941


In [45]:
# Identify best and worst models
best_model = results.loc[results["R²"].idxmax()]
worst_model = results.loc[results["R²"].idxmin()]

print("Best Model:\n", best_model)
print("Worst Model:\n", worst_model)

Best Model:
 Model    Random Forest
MSE           0.255498
MAE           0.327613
R²            0.805024
Name: 2, dtype: object
Worst Model:
 Model    Linear Regression
MSE               0.555892
MAE                 0.5332
R²                0.575788
Name: 0, dtype: object
