In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import iqr
from sklearn.feature_selection import SelectKBest, f_regression, SequentialFeatureSelector

In [2]:
df = pd.read_csv("../Dataset/housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [27]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [31]:
value_counts = df['ocean_proximity'].value_counts()
value_counts

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [32]:
_1h_ocean_data = df[df["ocean_proximity"] == "<1H OCEAN"]
inland_data = df[df["ocean_proximity"] == "INLAND"]

In [33]:
inland_data = inland_data.drop(columns=["ocean_proximity"])

In [34]:
inland_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
954,-121.92,37.64,46.0,1280.0,209.0,512.0,208.0,5.1406,315600.0
957,-121.9,37.66,18.0,7397.0,1137.0,3126.0,1115.0,6.4994,323000.0
965,-121.88,37.68,23.0,2234.0,270.0,854.0,286.0,7.333,337200.0
967,-121.88,37.67,16.0,4070.0,624.0,1543.0,577.0,6.5214,311500.0
968,-121.88,37.67,25.0,2244.0,301.0,937.0,324.0,6.4524,296900.0


**Data Spliting**

In [35]:
inland_data_x = inland_data.drop(columns=["median_house_value"])
inland_data_y = inland_data["median_house_value"]

**Simple Imputing**

In [36]:
simple_imputer = SimpleImputer()
inland_data_x = simple_imputer.set_output(transform="pandas").fit_transform(inland_data_x)

**Applying KNNRegression model on simple data without any feature eng.**

In [39]:
model_knn= KNeighborsRegressor()
model_knn.fit(inland_data_x, inland_data_y)

In [42]:
cross_val_score(model_knn, inland_data_x, inland_data_y, cv=5, verbose=3, scoring="r2")

[CV] END ................................ score: (test=0.249) total time=   0.0s
[CV] END ................................ score: (test=0.109) total time=   0.0s
[CV] END ................................ score: (test=0.245) total time=   0.0s
[CV] END ................................ score: (test=0.193) total time=   0.0s
[CV] END ............................... score: (test=-0.003) total time=   0.0s


array([ 0.24911387,  0.10864558,  0.24497243,  0.19261525, -0.00273955])

**Scalling the data**

In [43]:
scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler": MinMaxScaler,
    "MaxAbsScaler": MaxAbsScaler,
    "PowerTransformer": PowerTransformer
}

In [49]:
results = []

for name, scaler in scalers.items():
    scaled_data = scaler().fit_transform(inland_data_x)
    knn = KNeighborsRegressor()
    score = cross_val_score(knn, scaled_data, inland_data_y, cv=5, verbose=3, scoring="r2")
    score_mean = np.mean(score)
    score_std = np.std(score)
    print(f"Scaler: {name} \t score_mean: {score_mean} \t score_std: {score_std}", end="\n\n")
    results.append({"Scaler": name, "score_mean": score_mean, "score_std": score_std})

[CV] END ................................ score: (test=0.616) total time=   0.0s
[CV] END ................................ score: (test=0.363) total time=   0.0s
[CV] END ................................ score: (test=0.343) total time=   0.0s
[CV] END ................................ score: (test=0.267) total time=   0.0s
[CV] END ................................ score: (test=0.360) total time=   0.0s
Scaler: StandardScaler 	 score_mean: 0.38988762692459533 	 score_std: 0.11833907462623414

[CV] END ................................ score: (test=0.594) total time=   0.0s
[CV] END ................................ score: (test=0.327) total time=   0.0s
[CV] END ................................ score: (test=0.336) total time=   0.0s
[CV] END ................................ score: (test=0.331) total time=   0.0s
[CV] END ................................ score: (test=0.367) total time=   0.0s
Scaler: MinMaxScaler 	 score_mean: 0.39080930966610633 	 score_std: 0.10242083511767384

[CV] END .

In [50]:
pd.DataFrame(results)

Unnamed: 0,Scaler,score_mean,score_std
0,StandardScaler,0.389888,0.118339
1,MinMaxScaler,0.390809,0.102421
2,MaxAbsScaler,0.334128,0.100363
3,PowerTransformer,0.370757,0.113431


**Hyper Tuning the parameter**

In [51]:
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor())
])

grid = RandomizedSearchCV(pipe, {
    "model__n_neighbors": range(5,30,2),
    "model__weights": ["uniform","distance"],
    "model__leaf_size": range(5,50,5),
    "model__p": [1,2,3,4,5]
}, scoring="r2", cv=5, n_jobs=6, verbose=3)

In [52]:
grid.fit(inland_data_x, inland_data_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [53]:
grid.best_score_

0.43653790849081486

In [54]:
grid.best_params_

{'model__weights': 'distance',
 'model__p': 3,
 'model__n_neighbors': 21,
 'model__leaf_size': 15}

**Feature Selection**

In [79]:
sfs = SequentialFeatureSelector(knn, n_features_to_select=6, scoring="r2", n_jobs=5, cv=3).set_output(transform="pandas")
sfs

In [80]:
stnd = StandardScaler()
stnd.set_output(transform="pandas")
sfs_x = stnd.fit_transform(inland_data_x)
sfs_data = sfs.fit_transform(sfs_x, inland_data_y)

In [83]:
pipe = Pipeline([
    ("model", KNeighborsRegressor(leaf_size=15, n_neighbors=21, p=1, weights='uniform'))
])

score = cross_val_score(pipe, sfs_data, inland_data_x, cv=5, scoring='r2', n_jobs=5)
print("score_mean: ", score.mean())
print("score_std: ", score.std())

score_mean:  0.8449268666431378
score_std:  0.011080944932019797
