## KNeighbors Regression

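This notebook fits a k-nearest-neighbours regression model (scikit-learn's `KNeighborsRegressor`) to the property price training data, tunes its hyperparameters with a halving grid search, and writes the test-set predictions to `submission_kneighbors.csv`.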

In [None]:
# %load_ext autoreload # this ensures modules are reloaded automatically
# %autoreload 2 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from IPython.display import display
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Cleaning methods
import cleaning
import constants
import imputation
import auxiliary

# Scikit learn
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostRegressor
from sklearn.impute import KNNImputer


from scipy.spatial import Voronoi, voronoi_plot_2d
from matplotlib.patches import Rectangle, Circle
from matplotlib.colors import ListedColormap

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# For adaboost
from sklearn.ensemble import AdaBoostClassifier


import os
# Print the path of every file found under the local 'data' directory.
data_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('data')
    for filename in filenames
)
for data_file_path in data_file_paths:
    print(data_file_path)

In [None]:
# Main dataset. The training CSV is read twice, giving two separate frames:
# df_prices_train (cleaned below) and df_prices_train_demo.
df_prices_train, df_prices_train_demo, df_prices_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/train.csv", "data/test.csv")
)

# Have a peek at the data: shape first, then the leading rows.
print("Property Prices train dataset", df_prices_train.shape)
train_preview = df_prices_train.head()
display(train_preview)

## Data Cleaning

In [None]:
# Normalise the 'bedrooms' column of both splits with the same helper.
# Per the original note, process_bedroom_sum takes "4+1" as 5; the alternative
# cleaning.process_bedroom_half takes "4+1" as 4.5 instead.
for prices_frame in (df_prices_train, df_prices_test):
    prices_frame['bedrooms'] = prices_frame['bedrooms'].apply(cleaning.process_bedroom_sum)

# Show the training rows whose bedroom value is missing after cleaning.
missing_bedrooms = df_prices_train['bedrooms'].isna()
df_prices_train[missing_bedrooms]

## Add Auxiliary Data (optional)

In [None]:
# Import auxiliary data (one CSV per kind of location).
df_commercial_centres = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-commerical-centres.csv")
df_train_stations = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-train-stations.csv")
df_shopping_malls = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-shopping-malls.csv")
df_primary_schools = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-primary-schools.csv")
df_secondary_schools = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-secondary-schools.csv")

# Have a peek at the data: print each frame's shape, then show its first rows.
for aux_label, aux_frame in (
    ("Aux: Commercial Centres", df_commercial_centres),
    ("Aux: Train Stations", df_train_stations),
    ("Aux: Shopping Malls", df_shopping_malls),
    ("Aux: Primary Schools", df_primary_schools),
    ("Aux: Secondary Schools", df_secondary_schools),
):
    print(aux_label, aux_frame.shape)
    display(aux_frame.head())

We define helper functions for computing the distance from a property to the nearest of these important locations, so that the distances can be added to our dataframes:

In [None]:
from geopy import distance
from tqdm import tqdm
tqdm.pandas()

# Helper Functions 
def distance_ll(lat1: float, lng1: float, lat2: float, lng2: float) -> "distance.Distance":
    """Return the geopy distance between two latitude/longitude points.

    Args:
        lat1: Latitude of the first point.
        lng1: Longitude of the first point.
        lat2: Latitude of the second point.
        lng2: Longitude of the second point.

    Returns:
        The object produced by ``geopy.distance.distance`` -- not a bare
        float. Callers read a unit attribute from it (``dist_to_nearest_item``
        uses ``.km``).
    """
    return distance.distance((lat1, lng1), (lat2, lng2))

def dist_to_nearest_item(expr: pd.Series, aux_df: pd.DataFrame) -> float:
    """Return the distance in kilometres from one location to the nearest row of ``aux_df``.

    Args:
        expr: A single record exposing ``'lat'`` and ``'lng'`` entries
            (for example one dataframe row).
        aux_df: Candidate locations; each row must provide ``'lat'`` and ``'lng'``.

    Returns:
        The smallest distance to any row of ``aux_df``, in kilometres rounded
        to one decimal place, or NaN when ``aux_df`` has no rows to compare
        against.
    """
    # Without candidates there is no nearest item; min() over an empty apply
    # result would otherwise fail with a confusing error.
    if aux_df.empty:
        return float("nan")

    # One distance object per candidate row; min() picks the smallest one.
    distances = aux_df.apply(
        lambda x: distance_ll(expr['lat'], expr['lng'], x['lat'], x['lng']),
        axis=1,
    )
    return round(min(distances).km, 1)

## KNeighbors Regression: Hyperparameter Search and Prediction

In [None]:
# Get data: every column except the target 'price' is used as a predictor.
# NOTE(review): KNNImputer and KNeighborsRegressor work on numeric input --
# this assumes all remaining columns are numeric; confirm against the data.
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train['price'].to_numpy()
print(X_train.shape)

# Set up pipeline with imputer for proper grid search, so the imputation is
# re-fitted inside every cross-validation split together with the model.
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
# NOTE(review): features are not scaled before the distance-based steps
# (MinMaxScaler is imported but unused) -- consider adding a scaling step.
model = KNeighborsRegressor()
pipeline = Pipeline(steps=[('imputer', KNNImputer()), ('model', model)])

# Hyperparameter grid: neighbour counts for the imputer and for the regressor
# (odd values 1, 3, ..., 31 for the latter).
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__n_neighbors': list(range(1, 32, 2)),
}

# 10-fold cross-validation with a fixed seed for reproducible splits.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

# Number of predictor columns (not used by the search below).
num_features = X_train.shape[1]

# Scorer returns negated MSE so that "greater is better" holds for the search.
mse = make_scorer(mean_squared_error, greater_is_better=False)

# Seed the halving search as well as the splitter: without random_state the
# search is not reproducible from one run to the next.
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring=mse,
    verbose=1,
    random_state=1,
)

# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

In [None]:
# The search score is negated MSE, so flip the sign and take the square root
# to report the best score as an RMSE.
best_rmse = (-grid_results.best_score_) ** 0.5
summary_args = (best_rmse, grid_results.best_params_)
print("Best: %f using %s" % summary_args)

# Per-candidate results; zip(means, stds, params) can be iterated to print
# the mean score, its standard deviation and the parameters of each candidate.
cv_results = grid_results.cv_results_
means, stds, params = (
    cv_results[result_key]
    for result_key in ('mean_test_score', 'std_test_score', 'params')
)

In [None]:
# Predict: keep every test column except 'price' (mirrors how X_train was
# built), then predict with the fitted search object.
feature_mask = df_prices_test.columns != 'price'
X_test = df_prices_test.loc[:, feature_mask]
clf = grid_results
y_pred = clf.predict(X_test)

In [None]:
# Build the submission table: one 'Predicted' column, index labelled 'Id',
# written out as CSV.
result = pd.DataFrame({"Predicted": y_pred}).rename_axis("Id")
result.to_csv("submission_kneighbors.csv")