In [14]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

In [15]:
# Root directory of the input datasets. This is a relative path, so it is
# resolved against the kernel's current working directory.
data_path = "../../data"

In [16]:
# Read the housing training CSV from the data directory and show the frame
# as the cell's output.
housing_csv = f"{data_path}/ai_silicon_valley/housing_train_data.csv"
df = pd.read_csv(housing_csv)
df

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [17]:
# Integer code for each `ocean_proximity` label: labels are numbered 0..4 in
# the order listed here.
ocean_proximity_map = dict(
    zip(
        ['INLAND', '<1H OCEAN', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'],
        range(5),
    )
)

def encode(df, col, col_map):
    """Return a copy of ``df`` with column ``col`` translated through ``col_map``.

    The input frame is left untouched, and the operation is idempotent:
    values that are already one of the codes in ``col_map`` are kept as they
    are. Re-running the calling notebook cell therefore does not turn an
    already-encoded column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    col : hashable
        Label of the column to encode.
    col_map : dict
        Mapping from raw value to code. The codes must be hashable.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``col`` holds the codes. Values that are neither
        a key nor a code of ``col_map`` become NaN.
    """
    # Let existing codes map to themselves so a second pass is a no-op.
    # `col_map` is merged last so its own entries win on any key clash.
    lookup = {code: code for code in col_map.values()}
    lookup.update(col_map)

    encoded = df.copy()
    encoded[col] = df[col].map(lookup)
    return encoded

# Swap the `ocean_proximity` labels for their integer codes.
df = encode(df, col='ocean_proximity', col_map=ocean_proximity_map)

In [18]:
# Numeric feature columns (routed to the numeric preprocessing pipeline).
df_num = [
    'longitude', 'latitude',
    'housing_median_age', 'total_rooms', 'total_bedrooms',
    'population', 'households', 'median_income',
]

# Categorical feature columns (routed to the categorical pipeline).
df_cat = ['ocean_proximity']

In [19]:

# Baseline model: a 5-nearest-neighbours regressor on min-max scaled numeric
# features plus the one-hot encoded categorical feature.

# Features / target and an 80/20 hold-out split (fixed seed).
X = df[df_cat + df_num]
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns: fill gaps with the column mean, then rescale.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2;
# confirm the installed version before upgrading.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route each column group to its pipeline; any other column is dropped.
col_trans = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, df_num),
        ('cat_pipeline', cat_pipeline, df_cat),
    ],
    remainder='drop',
    n_jobs=-1,
)

knn = KNeighborsRegressor(n_neighbors=5)
knn_pipeline = Pipeline([('col_trans', col_trans), ('model', knn)])
knn_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
score = knn_pipeline.score(X_test, y_test)
print(f"Model score: {score:.2f}")



Model score: 0.71


## Create the KNN Model and Set Up GridSearchCV

In [34]:




# Tune the number of neighbours (k = 1..24) with 5-fold cross-validation.

# Features / target and an 80/20 hold-out split (fixed seed).
X = df[df_cat + df_num]
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns: fill gaps with the column mean, then rescale.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2;
# confirm the installed version before upgrading.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route each column group to its pipeline; any other column is dropped.
col_trans = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, df_num),
        ('cat_pipeline', cat_pipeline, df_cat),
    ],
    remainder='drop',
    n_jobs=-1,
)

knn_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('knn', KNeighborsRegressor()),
])

# The `knn__` prefix addresses the pipeline step named 'knn'.
param_grid = {'knn__n_neighbors': np.arange(1, 25)}

grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['knn__n_neighbors']
print(f"Best number of neighbors: {best_k}")

best_score = grid_search.best_score_
print(f"Best cross-validation R^2 score: {best_score:.2f}")

# Evaluate the selected configuration on the held-out split.
test_score = grid_search.score(X_test, y_test)
print(f"Test set R^2 score: {test_score:.2f}")



Best number of neighbors: 10
Best cross-validation R^2 score: 0.70
Test set R^2 score: 0.71


## Improve: standardize features and also tune neighbor weighting and distance metric

In [21]:



# Wider search: standardized numeric features, and a grid over the number of
# neighbours, the neighbour weighting scheme and the distance metric.

# Numeric columns: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2;
# confirm the installed version before upgrading.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Route each column group to its pipeline; any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, df_num),
        ('cat', cat_pipeline, df_cat)
    ],
    remainder='drop'
)
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor())
])

# 30 x 2 x 2 = 120 candidates; the `knn__` prefix addresses the 'knn' step.
param_grid = {
    'knn__n_neighbors': np.arange(1, 31),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# 120 candidates x 5 folds = 600 independent fits, so spread them over all
# available cores. The selected parameters and scores are unaffected.
grid_search = GridSearchCV(
    knn_pipeline,
    param_grid,
    cv=5,
    verbose=1,
    scoring='r2',
    n_jobs=-1,
)
X = df[df_cat + df_num]
y = df['median_house_value']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated R^2 score:", grid_search.best_score_)

# Evaluate the refitted best configuration on the held-out split.
y_pred = grid_search.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print(f"Test R^2 score: {test_r2:.2f}")
print(f"Test MSE: {test_mse:.2f}")


Fitting 5 folds for each of 120 candidates, totalling 600 fits




Best parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Best cross-validated R^2 score: 0.7355659572413168
Test R^2 score: 0.75
Test MSE: 3291881548.44




## Saving the model

In [22]:
# Directory where fitted models are written and read back. This is a relative
# path, so it is resolved against the kernel's current working directory.
model_path = "../../model"

In [33]:
import os

import joblib

# GridSearchCV tunes clones of `knn_pipeline`, so the pipeline object itself
# still carries the default KNN hyper-parameters. Apply the winning
# configuration before the final fit; otherwise the saved model ignores the
# search entirely.
knn_pipeline.set_params(**grid_search.best_params_)

# Final fit on the full data set (train + hold-out) with the tuned settings.
knn_pipeline.fit(df[df_cat + df_num], df['median_house_value'])

# Make sure the target directory exists so the dump works on a fresh checkout.
os.makedirs(model_path, exist_ok=True)
joblib.dump(knn_pipeline, f"{model_path}/knn_pipe.joblib")




['../../model/knn_pipe.joblib']

## Loading the model and testing it

In [28]:
# Read the persisted pipeline back from disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
same_pipe = joblib.load(f"{model_path}/knn_pipe.joblib")

In [29]:
# Read the housing training CSV again into a separate frame and show it as
# the cell's output.
same_csv = f"{data_path}/ai_silicon_valley/housing_train_data.csv"
same_df = pd.read_csv(same_csv)
same_df

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [32]:
# Predict on the freshly loaded frame rather than on the `df` left over from
# earlier cells, so this check exercises the full load -> encode -> predict
# path. The pipeline was fitted on integer-coded `ocean_proximity`, so the
# same encoding is applied first. A copy is passed so `same_df` keeps its raw
# labels and the cell can be re-run.
predictions = same_pipe.predict(
    encode(same_df.copy(), 'ocean_proximity', ocean_proximity_map)
)
predictions.shape

(16512,)