# Model Explainability

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [2]:
data_housing = pd.read_csv('housing (1).csv')
data_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Split the data_housing

In [3]:
X = data_housing.drop('median_house_value', axis = 1)
y = data_housing['median_house_value']

X.shape, y.shape

((20640, 9), (20640,))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4128, 9), (16512, 9), (4128,), (16512,))

## Preprocessing pipeline

In [5]:
data_housing_num = X_train.select_dtypes([np.number]).columns
data_housing_cat = X_train.select_dtypes(['object']).columns


data_housing_cat, data_housing_num

(Index(['ocean_proximity'], dtype='object'),
 Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income'],
       dtype='object'))

In [6]:
num_imputer = SimpleImputer(strategy='median')
num_scaler = StandardScaler()

cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OneHotEncoder(handle_unknown='ignore')

In [7]:
num_pipline = Pipeline(steps = [
    ('num_imputer' , num_imputer),
    ('num_scaler', num_scaler)
])

cat_pipeline = Pipeline(steps = [
    ('cat_imputer', cat_imputer),
    ('cat_encoder', cat_encoder)
])

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ('num_pipline', num_pipline, data_housing_num),
    ('cat_pipeline', cat_pipeline, data_housing_cat)
])

## Model 

In [9]:
ridge_model = Ridge()

## Full pipeline

In [10]:
full_pipeline = Pipeline(steps = [
    ('preprocessor' , preprocessor),
    ('ridge_model' , ridge_model)
])

## Train

In [11]:
full_pipeline.fit(X_train, y_train)

## Hyperparameter Tuning

In [12]:
param_grid = [{'ridge_model__alpha': [0.01, 0.1, 1, 10, 100, 1000] }]

In [13]:
grid_search  = GridSearchCV( full_pipeline,
                           param_grid,
                           cv = 5,
                           scoring = 'neg_root_mean_squared_error',
                           return_train_score= True)

In [14]:
grid_search.fit(X_train, y_train)

In [15]:
grid_search.best_estimator_

In [16]:
grid_search.best_score_

-72479.10445007944

In [17]:
grid_search.cv_results_

{'mean_fit_time': array([0.01331024, 0.01084056, 0.01536598, 0.01296611, 0.01123457,
        0.01411085]),
 'std_fit_time': array([0.00359092, 0.00193714, 0.00531923, 0.00376286, 0.00259808,
        0.00396943]),
 'mean_score_time': array([0.00520763, 0.00397964, 0.00533581, 0.00495796, 0.00394535,
        0.00463729]),
 'std_score_time': array([0.00135073, 0.00136368, 0.00097155, 0.00194169, 0.00064867,
        0.00092359]),
 'param_ridge_model__alpha': masked_array(data=[0.01, 0.1, 1, 10, 100, 1000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ridge_model__alpha': 0.01},
  {'ridge_model__alpha': 0.1},
  {'ridge_model__alpha': 1},
  {'ridge_model__alpha': 10},
  {'ridge_model__alpha': 100},
  {'ridge_model__alpha': 1000}],
 'split0_test_score': array([-74413.51360086, -74412.73989543, -74405.31655953, -74353.85260935,
        -74457.8763454 , -78342.89491894]),
 'split1_test_score': array([-70794.7481

## Fit on best estimator

In [22]:
ridge_best_model = grid_search.best_estimator_

In [23]:
ridge_best_model.fit(X_train, y_train)

## Error analysis

In [24]:
y_train_pred = ridge_best_model.predict(X_train)

In [25]:
y_train_pred

array([ 90166.35911145, 170986.92047192, 191566.47275218, ...,
       196527.45485572, 281991.82848605, 259524.40930316])

In [26]:
residuals = y_train - y_train_pred
abs_residuals = residuals.abs()

In [27]:
top_errors = abs_residuals.sort_values(ascending=False).head(10)
top_errors

18501    535039.285911
1914     435924.681054
15360    435137.669978
4492     364385.537022
89       351746.872558
20322    337366.899760
9168     321895.355114
4644     319438.282203
10616    316065.686284
1633     315542.658518
Name: median_house_value, dtype: float64

In [28]:
X_train.loc[top_errors.index]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
18501,-121.59,37.19,52.0,220.0,32.0,55.0,26.0,15.0001,<1H OCEAN
1914,-120.1,38.91,33.0,1561.0,282.0,30.0,11.0,1.875,INLAND
15360,-117.42,33.35,14.0,25135.0,4819.0,35682.0,4769.0,2.5729,<1H OCEAN
4492,-118.22,34.06,52.0,48.0,6.0,41.0,10.0,10.2264,<1H OCEAN
89,-122.27,37.8,52.0,249.0,78.0,396.0,85.0,1.2434,NEAR BAY
20322,-119.14,34.23,8.0,243.0,75.0,102.0,80.0,2.5714,NEAR OCEAN
9168,-118.56,34.42,2.0,966.0,270.0,233.0,169.0,1.9667,<1H OCEAN
4644,-118.31,34.06,24.0,1336.0,453.0,1268.0,426.0,2.8202,<1H OCEAN
10616,-117.81,33.69,5.0,1256.0,256.0,880.0,288.0,2.4233,<1H OCEAN
1633,-122.22,37.88,20.0,95.0,13.0,31.0,15.0,2.4444,NEAR BAY


In [29]:
y_train_pred = pd.Series(
    y_train_pred,
    index=y_train.index
)

In [30]:
top_errors_df= pd.DataFrame({
    'y_train': y_train.loc[top_errors.index],
    'y_train_pred' : y_train_pred[top_errors.index],
    'top_errors': top_errors
})

## Get feature names

In [33]:
feature_names = full_pipeline.named_steps['preprocessor'].get_feature_names_out()
feature_names

array(['num_pipline__longitude', 'num_pipline__latitude',
       'num_pipline__housing_median_age', 'num_pipline__total_rooms',
       'num_pipline__total_bedrooms', 'num_pipline__population',
       'num_pipline__households', 'num_pipline__median_income',
       'cat_pipeline__ocean_proximity_<1H OCEAN',
       'cat_pipeline__ocean_proximity_INLAND',
       'cat_pipeline__ocean_proximity_ISLAND',
       'cat_pipeline__ocean_proximity_NEAR BAY',
       'cat_pipeline__ocean_proximity_NEAR OCEAN'], dtype=object)

In [38]:
coefficients = full_pipeline.named_steps['ridge_model'].coef_
coefficients

array([-48056.17203209, -47476.20432411,  13163.45627611, -16331.52097649,
        50301.44555436, -32170.1300646 ,   3779.05276323,  75214.9209086 ,
       -10507.1536733 , -54417.88875153,  81368.69404342,  -9767.0597014 ,
        -6676.59191736])

In [46]:
coef_df = pd.DataFrame({
    'feature' : feature_names,
    'coefficient' : coefficients
}).sort_values(by ='coefficient', ascending=False)

In [48]:
coef_df

Unnamed: 0,feature,coefficient
10,cat_pipeline__ocean_proximity_ISLAND,81368.694043
7,num_pipline__median_income,75214.920909
4,num_pipline__total_bedrooms,50301.445554
2,num_pipline__housing_median_age,13163.456276
6,num_pipline__households,3779.052763
12,cat_pipeline__ocean_proximity_NEAR OCEAN,-6676.591917
11,cat_pipeline__ocean_proximity_NEAR BAY,-9767.059701
8,cat_pipeline__ocean_proximity_<1H OCEAN,-10507.153673
3,num_pipline__total_rooms,-16331.520976
5,num_pipline__population,-32170.130065


## Identify POSITIVE & NEGATIVE drivers Top 3

In [43]:
coef_df.head(3)

Unnamed: 0,feature,coefficient
10,cat_pipeline__ocean_proximity_ISLAND,81368.694043
7,num_pipline__median_income,75214.920909
4,num_pipline__total_bedrooms,50301.445554


In [49]:
coef_df.tail(3)

Unnamed: 0,feature,coefficient
1,num_pipline__latitude,-47476.204324
0,num_pipline__longitude,-48056.172032
9,cat_pipeline__ocean_proximity_INLAND,-54417.888752


## Interpretation

### What does this feature represent in the real world?
-> ```Eg : median_income . It strong positive coeficient. This means that, holding other features constant, areas with higher median income are associated with higher house prices. This likely captures neighborhood quality and purchasing power rahter than income directly causing price changes.```

-> ```ocean_proximity_ISLAND has the highest positive coefficient, meaning that—holding all other features constant—houses located on islands are predicted to be significantly more expensive than houses in other locations. This likely reflects the fact that island properties in the dataset are rare and typically high-value, such as luxury or exclusive homes. The model learns this association from the data, not because being on an island directly causes higher prices, but because island houses consistently appear with higher target values in the training set. Because island houses are rare, this coefficient may be influenced by a small number of high-priced examples, which could make predictions less reliable for similar future cases.``` 



-> ``` total_bedrooms has a strong positive coefficient, meaning that—holding other features constant—houses with more bedrooms are predicted to be more expensive. This feature likely captures both house size and usability: more bedrooms generally indicate larger homes or homes that can accommodate more occupants, which are valued higher in the market. Since the model also includes related features like total rooms and households, this coefficient reflects the additional value of bedroom count beyond overall size, rather than area alone.Because bedrooms are highly correlated with other size-related features, this coefficient should be interpreted cautiously, as its magnitude may be affected by multicollinearity.```

-> ```latitude has a strong negative coefficient, meaning that—holding other features constant—houses located further north are predicted to be cheaper. This reflects geographic price patterns in the dataset, where southern regions of California tend to have higher housing prices compared to northern inland areas. The model captures this as a spatial correlation rather than a causal effect of latitude itself.```

-> ```longitude also has a negative coefficient, indicating that as houses are located further east, the predicted price decreases. This likely captures proximity to the coast, where coastal and western regions in the dataset are associated with higher property values. Longitude acts as a proxy for location-based desirability rather than directly influencing price.```

->```Latitude and longitude together allow the model to learn spatial price gradients, rather than relying on a single location variable.```

-> ```The negative coefficient for ocean_proximity_INLAND means that, holding all other features constant, houses located inland are predicted to be cheaper than houses in coastal or island locations. This reflects the premium associated with proximity to water in the dataset, where coastal access is correlated with higher housing prices.Because ocean proximity is encoded categorically, this coefficient should be interpreted relative to the reference categories rather than as an absolute effect.``` 

## EXPLAINING ONE PREDICTION

In [56]:
test_sample = X_test.iloc[[80]]
test_sample

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
19890,-119.15,36.29,18.0,1435.0,,657.0,254.0,2.4281,INLAND


In [57]:
full_pipeline.predict(test_sample)

array([99421.40709])

### Factors pushing the price UP

1.`Moderate number of rooms (1435)`:
Indicates a reasonably sized house, which contributes positively to the price.

2.`Relatively low housing age (18 years)`:
Newer houses tend to be valued higher than very old properties.

### Factors pulling the price DOWN (STRONGER EFFECT)

1. `Low median income (2.43)`:
This is one of the strongest predictors in the model. Lower-income areas are associated with lower housing prices.

2. `INLAND location`:
Being inland carries a strong negative effect compared to coastal or island properties.

3. `Geographic location (latitude & longitude)`:
This location corresponds to inland, non-coastal regions that historically have lower prices in the dataset.

4. `Missing bedrooms value`:
The model relies on imputation, which usually pulls the prediction toward an average rather than a high value.


```The model predicted a price of around 99,000 mainly because the house is located inland in a lower-income area, which strongly reduces the predicted value. While the house has a reasonable number of rooms and is not very old, these positive factors are outweighed by the low median income of the area and the inland geographic location. As a result, the model predicts a relatively low house price compared to coastal or higher-income regions. ``` 

## WHY MODELS FAIL

In [69]:
y_pred = full_pipeline.predict(X_test)

In [70]:
errors = y_test - y_pred
errors

20046    -12572.752770
3024     -71005.132819
15663    262229.096208
20484    -50502.065669
9814      20012.057498
             ...      
20011    -23336.239994
12225     24630.772502
1051      -3339.578342
14519    -59738.297719
11410    -73188.150294
Name: median_house_value, Length: 16512, dtype: float64

In [71]:
abs_errors = errors.abs()
abs_errors

20046     12572.752770
3024      71005.132819
15663    262229.096208
20484     50502.065669
9814      20012.057498
             ...      
20011     23336.239994
12225     24630.772502
1051       3339.578342
14519     59738.297719
11410     73188.150294
Name: median_house_value, Length: 16512, dtype: float64

In [73]:
error_table = pd.DataFrame({
    'y_test' : y_test,
    'y_pred' : y_pred,
    'abs_errors' : abs_errors
}).sort_values(by='abs_errors', ascending=False)

## Top 10 worst predictions

In [75]:
error_table.head(10)

Unnamed: 0,y_test,y_pred,abs_errors
6688,500001.0,31234.265894,468766.734106
4861,500001.0,36595.817934,463405.182066
18346,375000.0,-55986.823281,430986.823281
12138,500001.0,85757.335798,414243.664202
10574,500001.0,110105.835501,389895.164499
13766,500001.0,114379.932739,385621.067261
11912,112500.0,493204.7733,380704.7733
19542,450000.0,81872.317647,368127.682353
6639,500001.0,132875.960164,367125.039836
459,500001.0,133076.724003,366924.275997


 ### What stands out immediately ?
``` Value 500001 appears repeatedly 375000, 450000, 112500 also appear, but the worst errors are dominated by 500001 This is NOT random. This is a dataset artifact.```

``` In the California housing dataset: `500001` is a price cap, not the true value`.
So many of these houses are:
Very expensive
But all clipped to the same maximum target. ```

### What type of houses does the model struggle with?
The model struggles most with very high-priced houses, especially those at or near the upper price cap of the dataset.

These houses are:
1. Rare
2. High-end
3. Often coastal or premium locations
4. Underrepresented in training data

### Why are the errors so large?
```Target limitation (PRIMARY reason) Prices above 500,000 are clippedThe model never sees true variation beyond this cap. So it cannot learn how expensive luxury houses really are. This creates systematic underprediction```

``` The largest errors occur for very expensive houses, many of which are capped at the maximum target value in the dataset. This cap prevents the model from learning meaningful differences among high-end properties, leading to severe underprediction. Additionally, these houses are relatively rare, so the model prioritizes minimizing error on more common mid-range homes. As a result, the errors are driven more by data and target limitations than by poor model fitting.```