## Data preprocessing:

In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
import random

# One seed for every source of randomness in this cell, so results are repeatable.
SEED = 123
random.seed(SEED)

# Load the cleaned flights table.
dane = pd.read_excel("loty_clean.xlsx")

# Give the awkwardly named columns identifier-friendly names.
dane = dane.rename(columns={"#Layovers": "Num_Layovers", "Price [PLN]": "Price"})

# Columns that are removed before the feature matrix is built. All of them must
# be present: a missing one raises KeyError. "Flight_date" and "Extraction_Time"
# are dropped here without ever being read, so they are no longer parsed into
# datetimes first.
dane = dane.drop(columns=[
    "Extraction_Time", "Flight_date", "arr_city", "dep_city",
    "Departure_airport_name", "Destination_airport_name",
    "layover_airport", "ujemne", "low_cost1", "low_cost2",
])

# Further columns to remove; errors="ignore" skips any that are absent.
dane = dane.drop(columns=[
    "Ticket_class", "Departure_airport_code", "Destination_airport_code",
    "Flight_weekday", "Extraction_Weekday", "Airline1", "Airline2", "Is_-2",
], errors="ignore")

# Coerce every remaining column to a numeric dtype (just in case).
dane = dane.apply(pd.to_numeric)

# Target variable: flight price (PLN); everything else is a feature.
X = dane.drop(columns="Price")
y = dane["Price"]

# Train/test split, 80% : 20% (no separate validation set is created here).
# random_state is what makes the split reproducible: train_test_split does not
# use Python's `random` module, so random.seed() alone has no effect on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SEED
)

# Convert to float64 numpy matrices.
X_train_np = X_train.to_numpy().astype(np.float64)
y_train_np = y_train.to_numpy().astype(np.float64)
X_test_np = X_test.to_numpy().astype(np.float64)
y_test_np = y_test.to_numpy().astype(np.float64)

In [14]:
import pandas as pd
import numpy as np
import time
from glob import glob
from knn_regression import KNNRegressor  # 👈 your custom class

# === Output filename ===
# Use the highest run number already on disk plus one. Counting the files
# (len(files) + 1) re-uses a number that is already taken - and overwrites that
# workbook - as soon as any earlier result file has been deleted.
_prefix, _suffix = "feature_test_knn_", ".xlsx"
files = glob(f"{_prefix}*{_suffix}")
_run_numbers = [
    int(path[len(_prefix):-len(_suffix)])
    for path in files
    if path[len(_prefix):-len(_suffix)].isdecimal()  # ignore non-numeric names
]
filename = f"{_prefix}{max(_run_numbers, default=0) + 1}{_suffix}"

# List to store results (one dict per removed feature)
feature_removal_results = []

# Get feature names
feature_names = list(X_train.columns)

print("Starting custom KNN feature importance test by removing each feature...")

# Feature scaling: KNN is distance-based, so every column is put on the same scale.
def standardize(train, test):
    """Z-score both arrays using statistics computed from ``train`` only.

    Args:
        train: array whose column-wise mean and standard deviation define
            the scaling.
        test: array scaled with the statistics of ``train``.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple.
    """
    center = train.mean(axis=0)
    # The small constant keeps the division finite for constant columns.
    scale = train.std(axis=0) + 1e-8
    train_scaled = (train - center) / scale
    test_scaled = (test - center) / scale
    return train_scaled, test_scaled

# Size of the training subsample used for every ablation run.
TRAIN_SAMPLE_SIZE = 1500

# The subsample uses a fixed random_state, so it is identical for every feature:
# draw it once instead of once per iteration. min() keeps this working when
# X_train has fewer rows than the requested sample size.
X_train_small = X_train.sample(min(TRAIN_SAMPLE_SIZE, len(X_train)), random_state=42)
y_train_small = y_train.loc[X_train_small.index]

# Targets do not depend on the removed feature. They get loop-specific names so
# the X_train_np / X_test_np / y_train_np / y_test_np arrays built in the
# preprocessing cell are not overwritten by this experiment.
y_fit_np = y_train_small.to_numpy()
y_eval_np = y_test.to_numpy()

# Loop-invariant pieces of the metrics.
ss_total = np.sum((y_eval_np - np.mean(y_eval_np)) ** 2)  # R^2 denominator
mask = y_eval_np != 0  # MAPE is only computed where the true value is non-zero

for feature_to_remove in feature_names:
    print(f"\n🚫 Removing feature: {feature_to_remove}")

    # Drop one feature from the training subsample and from the full test set
    X_train_sub = X_train_small.drop(columns=[feature_to_remove])
    X_test_sub = X_test.drop(columns=[feature_to_remove])

    # Convert to numpy
    X_fit_np = X_train_sub.to_numpy()
    X_eval_np = X_test_sub.to_numpy()

    # Standardize features (statistics come from the training subsample)
    X_train_std, X_test_std = standardize(X_fit_np, X_eval_np)

    # Initialize custom KNN Regressor
    knn = KNNRegressor(n_neighbors=5, weights='distance')

    # Train model and time the fit
    start_time = time.time()
    knn.fit(X_train_std, y_fit_np)
    train_time = time.time() - start_time

    # Predict. pred_train is evaluated on the very rows the model was fitted on.
    # NOTE(review): with distance weighting train_MSE is presumably close to 0
    # and not informative - confirm against the KNNRegressor implementation.
    pred_train = knn.predict(X_train_std)
    pred_test = knn.predict(X_test_std)

    # MAPE over non-zero targets only; NaN when there are none
    if mask.any():
        mape = np.mean(np.abs((y_eval_np[mask] - pred_test[mask]) / y_eval_np[mask])) * 100
    else:
        mape = np.nan

    # Store results
    residual_test = y_eval_np - pred_test
    result = {
        "feature_removed": feature_to_remove,
        "train_MSE": np.mean((y_fit_np - pred_train) ** 2),
        "test_MSE": np.mean(residual_test ** 2),
        "test_R2": 1 - np.sum(residual_test ** 2) / ss_total,
        "test_MAE": np.mean(np.abs(residual_test)),
        "test_MAPE": mape,
        # Previously measured but discarded; same key as the small-sample cell.
        "train_time_sec": train_time,
    }

    feature_removal_results.append(result)

# Save results
result_df = pd.DataFrame(feature_removal_results)
result_df.to_excel(filename, index=False)
print(f"\n✅ KNN feature removal test completed. Results saved to: {filename}")


Starting custom KNN feature importance test by removing each feature...

🚫 Removing feature: Departure_time

🚫 Removing feature: Arrival_time

🚫 Removing feature: Flight_time

🚫 Removing feature: Num_Layovers

🚫 Removing feature: Cabin_bag

🚫 Removing feature: Checked_bag

🚫 Removing feature: Days_to_departure

🚫 Removing feature: layover_duration

✅ KNN feature removal test completed. Results saved to: feature_test_knn_3.xlsx


### Small-sample test (1000 training rows):

In [10]:
import pandas as pd
import numpy as np
import time
from glob import glob
from knn_regression import KNNRegressor  # 👈 your custom class

# === Output filename ===
# Number the workbook one past the highest run number found on disk. The
# previous len(files) + 1 scheme picks a number that already exists (and so
# overwrites that file) whenever an earlier result file has been removed.
_name_head, _name_tail = "feature_test_knn_", ".xlsx"
files = glob(f"{_name_head}*{_name_tail}")
_taken = []
for _path in files:
    _middle = _path[len(_name_head):-len(_name_tail)]
    if _middle.isdecimal():  # skip files whose suffix is not a run number
        _taken.append(int(_middle))
filename = f"{_name_head}{max(_taken, default=0) + 1}{_name_tail}"

# List to store results (one dict per removed feature)
feature_removal_results = []

# Get feature names
feature_names = list(X_train.columns)

print("Starting custom KNN feature importance test by removing each feature...")

# Standardization: KNN compares distances, so columns must share one scale.
def standardize(X_train, X_test):
    """Scale both inputs with the column-wise mean/std of ``X_train``.

    Args:
        X_train: data whose per-column mean and standard deviation are used.
        X_test: data transformed with the statistics of ``X_train``.

    Returns:
        Tuple ``(X_train_std, X_test_std)`` of the standardized inputs.
    """
    mu = np.mean(X_train, axis=0)
    # Offset so that a zero standard deviation does not divide by zero.
    sigma = np.std(X_train, axis=0) + 1e-8
    return (X_train - mu) / sigma, (X_test - mu) / sigma


# Number of training rows used in this quick, small-sample run.
TRAIN_SAMPLE_SIZE = 1000

# Same rows for every removed feature (fixed random_state), so sample once
# up front; min() avoids an error when X_train is smaller than the sample size.
X_train_small = X_train.sample(min(TRAIN_SAMPLE_SIZE, len(X_train)), random_state=42)
y_train_small = y_train.loc[X_train_small.index]

# Target arrays are the same in every iteration. Loop-specific names are used
# so that X_train_np / X_test_np / y_train_np / y_test_np from the
# preprocessing cell keep their original contents.
y_fit_np = y_train_small.to_numpy()
y_eval_np = y_test.to_numpy()

# Metric ingredients that do not change between iterations.
ss_total = np.sum((y_eval_np - np.mean(y_eval_np)) ** 2)  # R^2 denominator
mask = y_eval_np != 0  # MAPE skips rows whose true value is zero

for feature_to_remove in feature_names:
    print(f"\n🚫 Removing feature: {feature_to_remove}")

    X_train_sub = X_train_small.drop(columns=[feature_to_remove])
    X_test_sub = X_test.drop(columns=[feature_to_remove])  # Use full test set

    # Convert to numpy
    X_fit_np = X_train_sub.to_numpy()
    X_eval_np = X_test_sub.to_numpy()

    # Standardize
    try:
        X_train_std, X_test_std = standardize(X_fit_np, X_eval_np)
    except Exception as e:
        print(f"⚠️ Error standardizing after removing feature '{feature_to_remove}':", e)
        # Skip this feature: `continue` belongs in the except branch, because
        # there are no standardized inputs to fit on after a failure.
        continue

    # Train KNN and time the fit
    knn = KNNRegressor(n_neighbors=5, weights='distance')
    start_time = time.time()
    knn.fit(X_train_std, y_fit_np)
    train_time = time.time() - start_time

    # Predict. pred_train is evaluated on the very rows the model was fitted on.
    # NOTE(review): with distance weighting train_MSE is presumably close to 0
    # and not informative - confirm against the KNNRegressor implementation.
    pred_train = knn.predict(X_train_std)
    pred_test = knn.predict(X_test_std)

    # Evaluation: MAPE over non-zero targets only; NaN when there are none
    if mask.any():
        mape = np.mean(np.abs((y_eval_np[mask] - pred_test[mask]) / y_eval_np[mask])) * 100
    else:
        mape = np.nan

    residual_test = y_eval_np - pred_test
    result = {
        "feature_removed": feature_to_remove,
        "train_MSE": np.mean((y_fit_np - pred_train) ** 2),
        "test_MSE": np.mean(residual_test ** 2),
        "test_R2": 1 - np.sum(residual_test ** 2) / ss_total,
        "test_MAE": np.mean(np.abs(residual_test)),
        "test_MAPE": mape,
        "train_time_sec": train_time
    }

    feature_removal_results.append(result)

# Save results
result_df = pd.DataFrame(feature_removal_results)
result_df.to_excel(filename, index=False)
print(f"\n✅ KNN feature removal test completed. Results saved to: {filename}")


Starting custom KNN feature importance test by removing each feature...

🚫 Removing feature: Departure_time

🚫 Removing feature: Arrival_time

🚫 Removing feature: Flight_time

🚫 Removing feature: Num_Layovers

🚫 Removing feature: Cabin_bag

🚫 Removing feature: Checked_bag

🚫 Removing feature: Days_to_departure

🚫 Removing feature: layover_duration

✅ KNN feature removal test completed. Results saved to: feature_test_knn_2.xlsx
