# k-NN Algorithm - Correlation Statistics for target variable hvac_S

In [21]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

### Validation split: hold out 10 percent of the data and build the model on the remaining 90 percent

In [13]:
# One-off data preparation for the hvac_S feature table, left commented out.
# It shows how a 90/10 split (test_size=0.1, random_state=42) would be written to CSV.
# Load the CSV file
#df = pd.read_csv("feature_select_df_hvacS_1.csv")

# Perform 90:10 train-test split
#train_90, test_10 = train_test_split(df, test_size=0.1, random_state=42)

# Save the 90% split files
#train_90.to_csv("train_90.csv", index=False)
#test_10.to_csv("test_10.csv", index=False)

In [14]:
# One-off preparation of the target table, left commented out. It selects both
# targets from `all_df` (not defined in the visible cells) and applies the same
# split settings (test_size=0.1, random_state=42) as the feature split.
#target = all_df[['hvac_S', 'hvac_N']]

#target.to_csv("target.csv", index=False)

# Perform 90:10 train-test split for the target variable
#target_train_90, target_test_10 = train_test_split(target, test_size=0.1, random_state=42)

# Save the 90% split target variable
#target_train_90.to_csv("target_train_90.csv", index=False)
#target_test_10.to_csv("target_test_10.csv", index=False)

# Load the saved 90% portion of the targets; its 'hvac_S' and 'hvac_N' columns
# are used as regression targets further down.
# NOTE(review): features and targets come from separate CSV files and are paired
# purely by row position -- this assumes both files were produced by identically
# seeded splits of tables with the same row order; confirm.
target_90 = pd.read_csv("target_train_90.csv")

# Align indices of features with the 90% split target variable
#df_target_90 = df.loc[target_90.index]


In [15]:
# Feature table used for the hvac_S model (presumably the 90% portion of the data).
# NOTE(review): the commented-out split cell above writes "train_90.csv", whereas
# this cell reads "train_90_S.csv" -- confirm which step produces this file.
df_features_90 = pd.read_csv("train_90_S.csv")

In [16]:

# 70/30 train/test split of the hvac_S features and target, with a fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    df_features_90,
    target_90['hvac_S'],
    test_size=0.3,
    random_state=42,
)

In [17]:

# Standardise the features; the scaler statistics are learned from the training split only.
scaler = StandardScaler()
X1_train_scaled = scaler.fit_transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)

# Baseline k-NN regressor with k=5 (fit() returns the estimator itself).
knn = KNeighborsRegressor(n_neighbors=5).fit(X1_train_scaled, y1_train)

# Score the baseline on the held-out split.
y1_pred = knn.predict(X1_test_scaled)
mse_hvacS = mean_squared_error(y1_test, y1_pred)
print("Mean Squared Error:", mse_hvacS)


Mean Squared Error: 28.82916824489818


In [23]:
# Full metric report for the baseline k-NN model (k=5) on the held-out hvac_S split.
# `np` comes from the notebook's top import cell, so this cell also works when the
# notebook is run top to bottom on a fresh kernel.
# The MSE is computed once and reused for both the MSE line and the RMSE.
mse_hvacS = mean_squared_error(y1_test, y1_pred)
rmse_hvacS = np.sqrt(mse_hvacS)

print('k-NN R2 score for hvacS: {}'.format(r2_score(y1_test,y1_pred)))
print('k-NN Mean Absolute Error (MAE) for hvacS: {}'.format(mean_absolute_error(y1_test, y1_pred)))
print('k-NN Mean Squared Error (MSE) for hvacS: {}'.format(mse_hvacS))
print('k-NN Root Mean Squared Error (RMSE) for hvacS: {}'.format(rmse_hvacS))

k-NN R2 score for hvacS: 0.8282825654851133
k-NN Mean Absolute Error (MAE) for hvacS: 3.3433649369199396
k-NN Mean Squared Error (MSE) for hvacS: 28.82916824489818
k-NN Root Mean Squared Error (RMSE) for hvacS: 5.369280049028751


# Hyperparameter tuning in k-NN through cross-validation and gridsearch

In [19]:
# 5-fold cross-validation of a k=3 model on the standardised hvac_S training split,
# scored with negated MSE and with R squared. All names used here are imported in
# the notebook's top import cell.
# NOTE(review): X1_train_scaled was standardised with statistics from the whole
# training split before the folds are formed, so each validation fold has already
# influenced the scaler. Wrapping StandardScaler + KNeighborsRegressor in a Pipeline
# and cross-validating on X1_train would give a leakage-free estimate.
knn_cv = KNeighborsRegressor(n_neighbors=3)

cv_scores = cross_val_score(knn_cv, X1_train_scaled, y1_train, cv=5, scoring='neg_mean_squared_error')
cv_scores_r2 = cross_val_score(knn_cv, X1_train_scaled, y1_train, cv=5, scoring='r2')

# The scorer returns negated MSE (higher is better), so flip the sign before the root.
cv_rmse = np.sqrt(-cv_scores)
print("CV Scores (RMSE):", cv_rmse)
print("Mean CV Score (RMSE):", np.mean(cv_rmse))

print("CV Scores (R squared):", cv_scores_r2)
print("Mean CV Score (R squared):", np.mean(cv_scores_r2))

# Candidate values for k: the odd numbers 1, 3, ..., 23.
param_grid = {'n_neighbors': np.arange(1, 25, 2)}

# Using GridSearch to test all values for n_neighbors
knn_gscv = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

knn_gscv.fit(X1_train_scaled, y1_train)

print("Best n_neighbors value:", knn_gscv.best_params_)

# best_score_ is the mean negated MSE across folds, so this is the root of the mean MSE.
print("Best CV Score (RMSE):", np.sqrt(-knn_gscv.best_score_))

# Evaluate the grid search's selected model on the held-out split.
y_pred_tuned = knn_gscv.predict(X1_test_scaled)
rmse = np.sqrt(mean_squared_error(y1_test, y_pred_tuned))
r_squared = r2_score(y1_test, y_pred_tuned)
print("Test Set RMSE:", rmse)
print("Test Set R squared:", r_squared)


CV Scores (RMSE): [5.52224117 5.65746091 5.45893267 5.50326052 5.50503789]
Mean CV Score (RMSE): 5.529386630786256
CV Scores (R squared): [0.81794709 0.81011869 0.82995449 0.81953875 0.82222567]
Mean CV Score (R squared): 0.8199569380909955
Best n_neighbors value: {'n_neighbors': 5}
Best CV Score (RMSE): 5.524417821602885
Test Set RMSE: 5.369280049028751
Test Set R squared: 0.8282825654851133


#### The hyperparameter tuning results show that the k-NN model performs best at k = 5 (among the odd values of k from 1 to 23 that were searched)


#### Implementing k-NN with other distance metrics: Manhattan, Minkowski, Chebyshev and Mahalanobis

In [24]:
# k-NN (k=5) using Manhattan distance on the standardised hvac_S features.
knn_manhattan = KNeighborsRegressor(metric='manhattan', n_neighbors=5).fit(X1_train_scaled, y1_train)
y1_pred_manhattan = knn_manhattan.predict(X1_test_scaled)

# Held-out metrics; the RMSE is derived from the MSE computed here.
mse_hvacS_manhattan = mean_squared_error(y1_test, y1_pred_manhattan)
rmse_hvacS_manhattan = np.sqrt(mse_hvacS_manhattan)

print("\nManhattan Distance:")
print("Mean Squared Error:", mse_hvacS_manhattan)
print('k-NN R2 score for hvacS: {}'.format(r2_score(y1_test, y1_pred_manhattan)))
print('k-NN Mean Absolute Error (MAE) for hvacS: {}'.format(mean_absolute_error(y1_test, y1_pred_manhattan)))
print('k-NN Mean Squared Error (MSE) for hvacS: {}'.format(mse_hvacS_manhattan))
print('k-NN Root Mean Squared Error (RMSE) for hvacS: {}'.format(rmse_hvacS_manhattan))



Manhattan Distance:
Mean Squared Error: 27.630878360373906
k-NN R2 score for hvacS: 0.8354200334490759
k-NN Mean Absolute Error (MAE) for hvacS: 3.2773541570332063
k-NN Mean Squared Error (MSE) for hvacS: 27.630878360373906
k-NN Root Mean Squared Error (RMSE) for hvacS: 5.25650819084056


In [25]:
# k-NN (k=5) using Minkowski distance. p=2 is the estimator's default and is spelled
# out here because it is what makes the metric equivalent to Euclidean distance.
knn_minkowski = KNeighborsRegressor(n_neighbors=5, metric='minkowski', p=2).fit(X1_train_scaled, y1_train)
y1_pred_minkowski = knn_minkowski.predict(X1_test_scaled)

# Held-out metrics; the RMSE is derived from the MSE computed here.
mse_hvacS_minkowski = mean_squared_error(y1_test, y1_pred_minkowski)
rmse_hvacS_minkowski = np.sqrt(mse_hvacS_minkowski)

print("\nMinkowski Distance (p=2, equivalent to Euclidean):")
print("Mean Squared Error:", mse_hvacS_minkowski)
print('k-NN R2 score for hvacS: {}'.format(r2_score(y1_test, y1_pred_minkowski)))
print('k-NN Mean Absolute Error (MAE) for hvacS: {}'.format(mean_absolute_error(y1_test, y1_pred_minkowski)))
print('k-NN Mean Squared Error (MSE) for hvacS: {}'.format(mse_hvacS_minkowski))
print('k-NN Root Mean Squared Error (RMSE) for hvacS: {}'.format(rmse_hvacS_minkowski))



Minkowski Distance (p=2, equivalent to Euclidean):
Mean Squared Error: 28.82916824489818
k-NN R2 score for hvacS: 0.8282825654851133
k-NN Mean Absolute Error (MAE) for hvacS: 3.3433649369199396
k-NN Mean Squared Error (MSE) for hvacS: 28.82916824489818
k-NN Root Mean Squared Error (RMSE) for hvacS: 5.369280049028751


In [27]:
# Chebyshev and Mahalanobis distances, evaluated on the standardised hvac_S split.
# X1_train_scaled / X1_test_scaled are reused from the scaling cell earlier in the
# notebook; re-fitting an identical scaler on the same X1_train is unnecessary.

# Inverse covariance matrix required by the Mahalanobis metric; rowvar=False makes
# np.cov treat each column as a variable.
VI = np.linalg.inv(np.cov(X1_train_scaled, rowvar=False))

# Chebyshev distance
knn_chebyshev = KNeighborsRegressor(n_neighbors=5, metric='chebyshev')

# Mahalanobis distance
knn_mahalanobis = KNeighborsRegressor(n_neighbors=5, metric='mahalanobis', metric_params={'VI': VI})

# Train and evaluate models with different distance metrics

models = {
    "Chebyshev": knn_chebyshev,
    "Mahalanobis": knn_mahalanobis
}

for name, model in models.items():
    print(name)
    model.fit(X1_train_scaled, y1_train)
    y1_pred_dis = model.predict(X1_test_scaled)

    # Loop-specific name so the baseline `mse_hvacS` computed for the first model
    # is not overwritten by whichever metric happens to be evaluated last.
    mse_hvacS_dis = mean_squared_error(y1_test, y1_pred_dis)
    print("Mean Squared Error:", mse_hvacS_dis)
    print('k-NN R2 score for hvacS:', r2_score(y1_test, y1_pred_dis))
    print('k-NN Mean Absolute Error (MAE) for hvacS:', mean_absolute_error(y1_test, y1_pred_dis))
    print('k-NN Root Mean Squared Error (RMSE) for hvacS:', np.sqrt(mse_hvacS_dis))
    print()


Chebyshev
Mean Squared Error: 32.80170631391751
k-NN R2 score for hvacS: 0.8046206256077656
k-NN Mean Absolute Error (MAE) for hvacS: 3.633392168247137
k-NN Root Mean Squared Error (RMSE) for hvacS: 5.727277391039961

Mahalanobis
Mean Squared Error: 27.062579548351504
k-NN R2 score for hvacS: 0.8388050362077182
k-NN Mean Absolute Error (MAE) for hvacS: 3.230357714860493
k-NN Root Mean Squared Error (RMSE) for hvacS: 5.202170657365203



In [28]:
# Saving the k-NN model through pickle file
# (`pickle` is imported in the notebook's top import cell.)

Pkl_kNN_hvac_S = "Pickle_kNN_hvac_S_Model.pkl"
Pkl_scaler_hvac_S = "Pickle_kNN_hvac_S_Scaler.pkl"

with open(Pkl_kNN_hvac_S, 'wb') as file:
    pickle.dump(knn_mahalanobis, file)

# The model was trained on standardised features, so it can only be applied to new
# data together with the scaler fitted on X1_train; persist that scaler as well.
# `scaler` must still be the hvac_S scaler here -- the name is rebound for hvac_N
# further down, so run this cell before the hvac_N section.
with open(Pkl_scaler_hvac_S, 'wb') as file:
    pickle.dump(scaler, file)

# Reload the model from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(Pkl_kNN_hvac_S, 'rb') as file:
    Pickled_kNN_hvac_S_Model = pickle.load(file)

# Predict with the reloaded model on the (already standardised) test features.
Ypredict_hvac_S = Pickled_kNN_hvac_S_Model.predict(X1_test_scaled)
 

## k-NN Algorithm - Correlation Statistics for target variable hvac_N

In [29]:
# One-off data preparation for the hvac_N feature table, left commented out.
# It shows how the 90/10 split (test_size=0.1, random_state=42) would be written to
# "train_90_N.csv" / "test_10_N.csv".
# Load the CSV file
#df_N = pd.read_csv("feature_select_df_hvacN_1.csv")

# Perform 90:10 train-test split
#train_90_N, test_10_N = train_test_split(df_N, test_size=0.1, random_state=42)

# Save the 90% split files
#train_90_N.to_csv("train_90_N.csv", index=False)
#test_10_N.to_csv("test_10_N.csv", index=False)

In [30]:
# Feature table used for the hvac_N model; the file name matches the one written
# by the commented-out split cell above.
df_features_90_N = pd.read_csv("train_90_N.csv")

In [31]:
# 70/30 train/test split of the hvac_N features and target, with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    df_features_90_N,
    target_90['hvac_N'],
    test_size=0.3,
    random_state=42,
)

In [32]:
# Standardise the hvac_N features; scaler statistics come from the training split only.
# Note: this rebinds `scaler` and `knn`, which previously held the hvac_S objects.
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

# Baseline k-NN regressor with k=5 (fit() returns the estimator itself).
knn = KNeighborsRegressor(n_neighbors=5).fit(X2_train_scaled, y2_train)

# Score the baseline on the held-out split.
y2_pred = knn.predict(X2_test_scaled)
mse_hvacN = mean_squared_error(y2_test, y2_pred)
print("Mean Squared Error:", mse_hvacN)


Mean Squared Error: 23.43564496598298


In [33]:
# Full metric report for the baseline k-NN model (k=5) on the held-out hvac_N split.
# The MSE is computed once and reused for both the MSE line and the RMSE.
mse_hvacN = mean_squared_error(y2_test, y2_pred)
rmse_hvacN = np.sqrt(mse_hvacN)

print('k-NN R2 score for hvacN: {}'.format(r2_score(y2_test,y2_pred)))
print('k-NN Mean Absolute Error (MAE) for hvacN: {}'.format(mean_absolute_error(y2_test, y2_pred)))
print('k-NN Mean Squared Error (MSE) for hvacN: {}'.format(mse_hvacN))
print('k-NN Root Mean Squared Error (RMSE) for hvacN: {}'.format(rmse_hvacN))


k-NN R2 score for hvacN: 0.8749333032065396
k-NN Mean Absolute Error (MAE) for hvacN: 2.6985699008857518
k-NN Mean Squared Error (MSE) for hvacN: 23.43564496598298
k-NN Root Mean Squared Error (RMSE) for hvacN: 4.8410375918787265


## Hyperparameter tuning in k-NN through cross-validation and gridsearch

In [34]:
# 5-fold cross-validation of a k=3 model on the standardised hvac_N training split,
# scored with negated MSE and with R squared. All names used here are imported in
# the notebook's top import cell.
# NOTE(review): X2_train_scaled was standardised with statistics from the whole
# training split before the folds are formed, so each validation fold has already
# influenced the scaler. Wrapping StandardScaler + KNeighborsRegressor in a Pipeline
# and cross-validating on X2_train would give a leakage-free estimate.
knn_cv_n = KNeighborsRegressor(n_neighbors=3)

# Training model with cross-validation of 5 folds
cv_scores_n = cross_val_score(knn_cv_n, X2_train_scaled, y2_train, cv=5, scoring='neg_mean_squared_error')
cv_scores_r2_n = cross_val_score(knn_cv_n, X2_train_scaled, y2_train, cv=5, scoring='r2')

# The scorer returns negated MSE (higher is better), so flip the sign before the root.
cv_rmse_n = np.sqrt(-cv_scores_n)
print("CV Scores (RMSE):", cv_rmse_n)
print("Mean CV Score (RMSE):", np.mean(cv_rmse_n))

print("CV Scores (R squared):", cv_scores_r2_n)
print("Mean CV Score (R squared):", np.mean(cv_scores_r2_n))

# Creating a dictionary of values we want to test for n_neighbors
# (the odd numbers 1, 3, ..., 23)
param_grid_n = {'n_neighbors': np.arange(1, 25, 2)}

# Using GridSearch
knn_gscv_n = GridSearchCV(KNeighborsRegressor(), param_grid_n, cv=5, scoring='neg_mean_squared_error')

knn_gscv_n.fit(X2_train_scaled, y2_train)

print("Best n_neighbors value:", knn_gscv_n.best_params_)

# best_score_ is the mean negated MSE across folds, so this is the root of the mean MSE.
print("Best CV Score (RMSE):", np.sqrt(-knn_gscv_n.best_score_))

# Evaluating model performance on test set
# The `_n` suffix matches the rest of this cell and keeps the hvac_S results held in
# `y_pred_tuned`, `rmse` and `r_squared` from being overwritten.
y_pred_tuned_n = knn_gscv_n.predict(X2_test_scaled)
rmse_n = np.sqrt(mean_squared_error(y2_test, y_pred_tuned_n))
r_squared_n = r2_score(y2_test, y_pred_tuned_n)
print("Test Set RMSE:", rmse_n)
print("Test Set R squared:", r_squared_n)

CV Scores (RMSE): [5.0082472  4.90665337 5.0367324  5.17095444 5.16853483]
Mean CV Score (RMSE): 5.058224446998724
CV Scores (R squared): [0.86929166 0.86991346 0.87166254 0.85869447 0.86132217]
Mean CV Score (R squared): 0.8661768595282566
Best n_neighbors value: {'n_neighbors': 5}
Best CV Score (RMSE): 5.041717714193332
Test Set RMSE: 4.8410375918787265
Test Set R squared: 0.8749333032065396


#### The hyperparameter tuning results show that the k-NN model performs best at k = 5 (among the odd values of k from 1 to 23 that were searched)

#### Implementing k-NN with other distance metrics: Manhattan, Minkowski, Chebyshev and Mahalanobis

In [35]:
# k-NN (k=5) using Manhattan distance on the standardised hvac_N features.
knn_manhattan_n = KNeighborsRegressor(metric='manhattan', n_neighbors=5).fit(X2_train_scaled, y2_train)

# Predict on the testing set with Manhattan distance
y2_pred_manhattan = knn_manhattan_n.predict(X2_test_scaled)

# Held-out metrics; the RMSE is derived from the MSE computed here.
mse_hvacN_manhattan = mean_squared_error(y2_test, y2_pred_manhattan)
rmse_hvacN_manhattan = np.sqrt(mse_hvacN_manhattan)

print("\nManhattan Distance:")
print("Mean Squared Error:", mse_hvacN_manhattan)
print('k-NN R2 score for hvacN: {}'.format(r2_score(y2_test, y2_pred_manhattan)))
print('k-NN Mean Absolute Error (MAE) for hvacN: {}'.format(mean_absolute_error(y2_test, y2_pred_manhattan)))
print('k-NN Mean Squared Error (MSE) for hvacN: {}'.format(mse_hvacN_manhattan))
print('k-NN Root Mean Squared Error (RMSE) for hvacN: {}'.format(rmse_hvacN_manhattan))


Manhattan Distance:
Mean Squared Error: 21.195831600423308
k-NN R2 score for hvacN: 0.8868862944500491
k-NN Mean Absolute Error (MAE) for hvacN: 2.5691088623998137
k-NN Mean Squared Error (MSE) for hvacN: 21.195831600423308
k-NN Root Mean Squared Error (RMSE) for hvacN: 4.603893091767369


In [36]:
# k-NN (k=5) using Minkowski distance. p=2 is the estimator's default and is spelled
# out here because it is what makes the metric equivalent to Euclidean distance.
knn_minkowski_n = KNeighborsRegressor(n_neighbors=5, metric='minkowski', p=2).fit(X2_train_scaled, y2_train)

# Predict on the testing set with Minkowski distance
y2_pred_minkowski = knn_minkowski_n.predict(X2_test_scaled)

# Held-out metrics; the RMSE is derived from the MSE computed here.
mse_hvacN_minkowski = mean_squared_error(y2_test, y2_pred_minkowski)
rmse_hvacN_minkowski = np.sqrt(mse_hvacN_minkowski)

print("\nMinkowski Distance (p=2, equivalent to Euclidean):")
print("Mean Squared Error:", mse_hvacN_minkowski)
print('k-NN R2 score for hvacN: {}'.format(r2_score(y2_test, y2_pred_minkowski)))
print('k-NN Mean Absolute Error (MAE) for hvacN: {}'.format(mean_absolute_error(y2_test, y2_pred_minkowski)))
print('k-NN Mean Squared Error (MSE) for hvacN: {}'.format(mse_hvacN_minkowski))
print('k-NN Root Mean Squared Error (RMSE) for hvacN: {}'.format(rmse_hvacN_minkowski))


Minkowski Distance (p=2, equivalent to Euclidean):
Mean Squared Error: 23.43564496598298
k-NN R2 score for hvacN: 0.8749333032065396
k-NN Mean Absolute Error (MAE) for hvacN: 2.6985699008857518
k-NN Mean Squared Error (MSE) for hvacN: 23.43564496598298
k-NN Root Mean Squared Error (RMSE) for hvacN: 4.8410375918787265


In [37]:
# Chebyshev and Mahalanobis distances, evaluated on the standardised hvac_N split.
# X2_train_scaled / X2_test_scaled are reused from the hvac_N scaling cell earlier
# in the notebook; re-fitting an identical scaler on the same X2_train is unnecessary.

# Inverse covariance matrix required by the Mahalanobis metric; rowvar=False makes
# np.cov treat each column as a variable.
VI_n = np.linalg.inv(np.cov(X2_train_scaled, rowvar=False))

# Chebyshev distance
knn_chebyshev_n = KNeighborsRegressor(n_neighbors=5, metric='chebyshev')

# Mahalanobis distance
knn_mahalanobis_n = KNeighborsRegressor(n_neighbors=5, metric='mahalanobis', metric_params={'VI': VI_n})

models_n = {
    "Chebyshev": knn_chebyshev_n,
    "Mahalanobis": knn_mahalanobis_n
}

for name, model in models_n.items():
    print(name)
    model.fit(X2_train_scaled, y2_train)
    y2_pred_dis = model.predict(X2_test_scaled)

    # Evaluate the model
    # Loop-specific name so the baseline `mse_hvacN` computed for the first model
    # is not overwritten by whichever metric happens to be evaluated last.
    mse_hvacN_dis = mean_squared_error(y2_test, y2_pred_dis)
    print("Mean Squared Error:", mse_hvacN_dis)
    print('k-NN R2 score for hvacN:', r2_score(y2_test, y2_pred_dis))
    print('k-NN Mean Absolute Error (MAE) for hvacN:', mean_absolute_error(y2_test, y2_pred_dis))
    print('k-NN Root Mean Squared Error (RMSE) for hvacN:', np.sqrt(mse_hvacN_dis))
    print()


Chebyshev
Mean Squared Error: 28.001720527197847
k-NN R2 score for hvacN: 0.8505659777679013
k-NN Mean Absolute Error (MAE) for hvacN: 2.988847265791097
k-NN Root Mean Squared Error (RMSE) for hvacN: 5.2916651941707205

Mahalanobis
Mean Squared Error: 20.094745657481138
k-NN R2 score for hvacN: 0.8927623512843873
k-NN Mean Absolute Error (MAE) for hvacN: 2.4743898992965874
k-NN Root Mean Squared Error (RMSE) for hvacN: 4.482716325787428



In [38]:
# Saving the model through pickle file
# (`pickle` is imported in the notebook's top import cell.)

Pkl_kNN_hvac_N = "Pickle_kNN_hvac_N_Model.pkl"
Pkl_scaler_hvac_N = "Pickle_kNN_hvac_N_Scaler.pkl"

with open(Pkl_kNN_hvac_N, 'wb') as file:
    pickle.dump(knn_mahalanobis_n, file)

# The model was trained on standardised features, so it can only be applied to new
# data together with the scaler fitted on X2_train; persist that scaler as well.
# `scaler` is the hvac_N scaler at this point when the notebook is run top to bottom.
with open(Pkl_scaler_hvac_N, 'wb') as file:
    pickle.dump(scaler, file)

# Reload the model from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(Pkl_kNN_hvac_N, 'rb') as file:
    Pickled_kNN_hvac_N_Model = pickle.load(file)

# Predict the Labels using the reloaded Model
Ypredict_hvac_N = Pickled_kNN_hvac_N_Model.predict(X2_test_scaled)