In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA


In [3]:
# 1. Load the dataset and separate the predictors (X) from the target (y)
df = pd.read_csv("FINAL_DATA.csv")

target_col = "Altitude"
y = df[target_col]
X = df.drop(columns=[target_col])

# Partition the predictor columns by dtype: numeric columns are standardized,
# every other column is treated as categorical and one-hot encoded.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# Each branch is wrapped in its own Pipeline so extra steps can be added later
# without touching the ColumnTransformer wiring below.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# handle_unknown="ignore": a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [4]:
# 2. Baseline KNN (no PCA)
# The preprocessor is a pipeline step, so during cross-validation it is fitted
# on each training fold only and never sees the corresponding validation fold.
base_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("knn", KNeighborsRegressor()),
])

# Hyperparameters to search: neighbourhood size and neighbour weighting scheme.
# The "knn__" prefix routes each parameter to the pipeline step named "knn".
param_grid_knn = {
    "knn__n_neighbors": [3, 5, 7, 9, 11],
    "knn__weights": ["uniform", "distance"],
}

# 5-fold CV scored by negative MSE (scikit-learn maximizes scores, so the
# error is negated); n_jobs=-1 uses all available cores.
grid_base = GridSearchCV(
    estimator=base_pipe,
    param_grid=param_grid_knn,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_base.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refit on all of X_train
# (GridSearchCV's default refit=True); use it to predict the held-out rows.
best_base = grid_base.best_estimator_
y_pred_base = best_base.predict(X_test)


In [5]:
# 3. KNN with PCA (post-preprocessing)
def to_dense_array(features):
    """Return ``features`` as a dense NumPy array.

    The preprocessing ColumnTransformer does not always emit a sparse matrix:
    it returns a dense array when the stacked output's density reaches its
    ``sparse_threshold`` (default 0.3) or when no transformer produces sparse
    output. Calling ``.toarray()`` unconditionally would then raise
    ``AttributeError``, so only objects that expose ``toarray`` are converted
    that way; anything else is passed through ``np.asarray``.

    A named module-level function is used instead of a lambda so the fitted
    pipeline can be pickled with the standard library.

    Parameters
    ----------
    features : scipy sparse matrix or array-like
        Output of the preprocessing step.

    Returns
    -------
    numpy.ndarray
        Dense version of ``features``.
    """
    if hasattr(features, "toarray"):
        return features.toarray()
    return np.asarray(features)


## Helper step that hands PCA a dense array whatever the preprocessor emitted
to_dense = FunctionTransformer(to_dense_array, accept_sparse=True)

pca_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("to_dense", to_dense),
    # Fixed at 20 principal components. PCA needs at least that many columns
    # (and rows per CV training fold) after preprocessing, otherwise fit fails.
    ("pca", PCA(n_components=20, random_state=42)),
    ("knn", KNeighborsRegressor())
])

# Same KNN search space as the baseline so the two models differ only by PCA.
param_grid_pca = {
    "knn__n_neighbors": [3, 5, 7, 9, 11],
    "knn__weights": ["uniform", "distance"]
}

# 5-fold CV scored by negative MSE; n_jobs=-1 uses all available cores.
grid_pca = GridSearchCV(
    pca_pipe,
    param_grid_pca,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)
grid_pca.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refit on all of X_train
# (GridSearchCV's default refit=True).
best_pca = grid_pca.best_estimator_
y_pred_pca = best_pca.predict(X_test)

In [6]:
# 4. Compare test performance (pre vs post PCA)
def eval_model(name, y_true, y_pred):
    """Print the RMSE and R^2 of one model's predictions.

    Parameters
    ----------
    name : str
        Label printed as the heading of the report.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    # RMSE is the square root of the MSE, i.e. the error in the target's units.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # One print call; the trailing "\n" on the last field leaves a blank line
    # between consecutive reports.
    print(
        f"{name}:",
        f"  RMSE = {rmse:.3f}",
        f"  R^2  = {r2:.3f}\n",
        sep="\n",
    )

# Show the hyperparameters each grid search selected.
print("Best params (no PCA):", grid_base.best_params_)
print("Best params (with PCA):", grid_pca.best_params_)
print()

# Read the component count from the fitted PCA step instead of hard-coding it,
# so the printed label cannot drift from the model that made the predictions.
n_pcs = best_pca.named_steps["pca"].n_components_

# Score both models on the same held-out test set.
eval_model("Baseline KNN (no PCA)", y_test, y_pred_base)
eval_model(f"KNN with PCA ({n_pcs} PCs)", y_test, y_pred_pca)


Best params (no PCA): {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Best params (with PCA): {'knn__n_neighbors': 11, 'knn__weights': 'distance'}

Baseline KNN (no PCA):
  RMSE = 410.158
  R^2  = 0.304

KNN with PCA (20 PCs):
  RMSE = 436.448
  R^2  = 0.212



## Interpretation

This analysis evaluates how well a k-nearest-neighbors (KNN) regressor can predict coffee farm altitude, comparing a baseline KNN model with a version that applies PCA before the regressor. Numerical features were standardized and categorical features were one-hot encoded, and a cross-validated grid search identified the best-performing hyperparameters for each model. The baseline KNN (k = 9, distance weighting) achieved a test RMSE of approximately 410 and an R² of 0.304, which indicates modest predictive power. Introducing PCA (20 components; best k = 11, distance weighting) did not improve performance: RMSE worsened to 436 and R² dropped to 0.212. This suggests that reducing the data to 20 components removed important structure in the feature space that KNN relies on. Overall, the model without PCA provided the strongest results, suggesting that preserving the full set of engineered features leads to better altitude predictions for this dataset.