In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold


In [13]:


train_df = pd.read_csv("train_set.csv")
test_df = pd.read_csv("test_set.csv")

train_df = train_df[train_df["Suitability Score"] != 0]
test_df = test_df[test_df["Suitability Score"] != 0]

train_df = train_df.dropna()
test_df = test_df.dropna()

y_train = train_df["Suitability Score"]
y_test = test_df["Suitability Score"]


X_train = train_df.drop(columns=["AHP1", "AHP2", "Latitude", "Longitude", "Suitability Score", "Year Commissioned"])
X_test = test_df.drop(columns=["AHP1", "AHP2", "Latitude", "Longitude", "Suitability Score", "Year Commissioned"])





Random Forest Regressor

In [14]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model = RandomForestRegressor(n_estimators=200, min_samples_split=5, max_depth=None, random_state=42)
    model.fit(X_fold_train, y_fold_train)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")

final_model = RandomForestRegressor(n_estimators=200, min_samples_split=5, max_depth=None, random_state=42)
final_model.fit(X_train, y_train)

y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0078
  R²:  0.7914
Fold 2
  MSE: 0.0052
  R²:  0.8447
Fold 3
  MSE: 0.0052
  R²:  0.8486
Fold 4
  MSE: 0.0046
  R²:  0.8603
Fold 5
  MSE: 0.0061
  R²:  0.8082

Average CV Results:
Average MSE: 0.0058
Average R²:  0.8306

Final Evaluation on Test Set:
Test MSE: 0.0147
Test R²:  0.7729


svm

In [15]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model = SVR(kernel='rbf', gamma='scale', epsilon=0.01, C=100)
    model.fit(X_fold_train, y_fold_train)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")

final_model = SVR(kernel='rbf', gamma='scale', epsilon=0.01, C=100)
final_model.fit(X_train, y_train)

y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0115
  R²:  0.6926
Fold 2
  MSE: 0.0057
  R²:  0.8298
Fold 3
  MSE: 0.0070
  R²:  0.7972
Fold 4
  MSE: 0.0064
  R²:  0.8064
Fold 5
  MSE: 0.0062
  R²:  0.8080

Average CV Results:
Average MSE: 0.0073
Average R²:  0.7868

Final Evaluation on Test Set:
Test MSE: 0.0231
Test R²:  0.6421


knn

In [16]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_fold_train, y_fold_train)
    
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")

final_model = KNeighborsRegressor(n_neighbors=5, p = 1, weights='distance')  
final_model.fit(X_train, y_train)

y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0146
  R²:  0.6092
Fold 2
  MSE: 0.0104
  R²:  0.6930
Fold 3
  MSE: 0.0108
  R²:  0.6846
Fold 4
  MSE: 0.0097
  R²:  0.7059
Fold 5
  MSE: 0.0093
  R²:  0.7094

Average CV Results:
Average MSE: 0.0110
Average R²:  0.6804

Final Evaluation on Test Set:
Test MSE: 0.0201
Test R²:  0.6890


xgboost

In [18]:
from xgboost import XGBRegressor

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model = XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0.1,
        reg_lambda=2,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_fold_train, y_fold_train)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")


final_model = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=2,
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)


y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0067
  R²:  0.8225
Fold 2
  MSE: 0.0043
  R²:  0.8732
Fold 3
  MSE: 0.0043
  R²:  0.8756
Fold 4
  MSE: 0.0037
  R²:  0.8868
Fold 5
  MSE: 0.0048
  R²:  0.8488

Average CV Results:
Average MSE: 0.0048
Average R²:  0.8614

Final Evaluation on Test Set:
Test MSE: 0.0135
Test R²:  0.7915


mlp

In [19]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    
    model = MLPRegressor(hidden_layer_sizes=(100, 50),
                         activation='relu',
                         solver='adam',
                         learning_rate_init=0.001,
                         max_iter=500,
                         random_state=42)
    
    model.fit(X_fold_train, y_fold_train)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")


final_model = MLPRegressor(hidden_layer_sizes=(100, 50),
                           activation='relu',
                           solver='adam',
                           learning_rate_init=0.001,
                           max_iter=500,
                           random_state=42)
final_model.fit(X_train, y_train)

y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0137
  R²:  0.6350
Fold 2
  MSE: 0.0083
  R²:  0.7534
Fold 3
  MSE: 0.0094
  R²:  0.7269
Fold 4
  MSE: 0.0075
  R²:  0.7711
Fold 5
  MSE: 0.0085
  R²:  0.7356

Average CV Results:
Average MSE: 0.0095
Average R²:  0.7244

Final Evaluation on Test Set:
Test MSE: 0.0232
Test R²:  0.6409


In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = []
r2_scores = []

print("cross val:\n")
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")
    
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model = MLPRegressor(
        hidden_layer_sizes=(8,16),
        activation='tanh',
        solver='lbfgs',
        alpha=0.01,
        learning_rate_init=0.05,
        max_iter=5000,
        random_state=42
    )
    
    model.fit(X_fold_train, y_fold_train)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)
    
    mse_scores.append(mse)
    r2_scores.append(r2)
    
    print(f"  MSE: {mse:.4f}")
    print(f"  R²:  {r2:.4f}")

print("\nAverage CV Results:")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²:  {np.mean(r2_scores):.4f}")

final_model = MLPRegressor(
    hidden_layer_sizes=(8,16),
    activation='tanh',
    solver='lbfgs',
    alpha=0.01,
    learning_rate_init=0.05,
    max_iter=5000,
    random_state=42
)
final_model.fit(X_train, y_train)

y_test_pred = final_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Evaluation on Test Set:")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²:  {test_r2:.4f}")


cross val:

Fold 1
  MSE: 0.0113
  R²:  0.6980
Fold 2
  MSE: 0.0056
  R²:  0.8352
Fold 3
  MSE: 0.0067
  R²:  0.8058
Fold 4
  MSE: 0.0062
  R²:  0.8124
Fold 5
  MSE: 0.0062
  R²:  0.8049

Average CV Results:
Average MSE: 0.0072
Average R²:  0.7913

Final Evaluation on Test Set:
Test MSE: 0.0187
Test R²:  0.7095
