# Projekt 1.

## Przygotowanie danych.

Domyślnie wczytuję dane ze zmiennymi, które arbitralnie wybrałem; zmienne kategoryczne zostały przekształcone do macierzy zer i jedynek (one-hot encoding). Nie ma więc zmiennej "linia lotnicza", ale są zmienne binarne "Is_Ryanair", "Is_LOT" itp. Zwykłe zmienne numeryczne oczywiście też są.

In [1]:
import pandas as pd
import numpy as np

# Final dataset used for predicting the flight price.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
dane = pd.read_pickle("dane_onehot.pkl")
# Alternative dataset, currently disabled:
#dane = pd.read_pickle("Indian_Dataset.pkl")

# Last expression of the cell: the first rows are rendered as the cell output.
dane.head()

Unnamed: 0,Departure_time,Arrival_time,Flight_time,Price,Num_Layovers,Cabin_bag,Checked_bag,Days_to_departure,layover_duration,Is_Ekonomiczna,...,Is_Wizz Air1,Is_easyJet1,Is_Air France2,Is_British Airways2,Is_KLM2,Is_LOT2,Is_Lufthansa2,Is_Ryanair2,Is_Wizz Air2,Is_easyJet2
0,16.58,17.92,1.33,1643,0,1,0,2,0.0,True,...,False,False,False,False,False,False,False,False,False,False
1,18.33,22.67,4.33,1087,1,0,0,2,1.583333,True,...,True,False,False,False,False,False,False,False,False,True
2,19.67,21.0,1.33,1749,0,1,0,2,0.0,True,...,False,False,False,False,False,False,False,False,False,False
3,6.08,11.92,5.83,1776,1,0,0,2,1.083333,True,...,True,False,False,False,False,False,False,False,False,True
4,17.0,22.67,5.67,1878,1,1,0,2,2.833333,True,...,False,False,False,False,False,False,True,False,False,False


In [2]:
from sklearn.model_selection import train_test_split

# Coerce every column to a numeric dtype (a precaution before modelling).
dane = dane.apply(pd.to_numeric, axis=0)

# Target variable: flight price (PLN). Everything else is used as a feature.
y = dane.loc[:, "Price"]
X = dane.drop("Price", axis=1)

In [3]:
import random

# One seed shared by every random number generator involved, for repeatable results.
SEED = 123
random.seed(SEED)
# NumPy's global generator is seeded as well, in case later code draws from it
# (the forest implementation is not visible here - TODO confirm which RNG it uses).
np.random.seed(SEED)

# Split into a training and a test set in an 80% : 20% proportion.
# random_state is passed explicitly: train_test_split takes its randomness from NumPy,
# not from Python's `random` module, so random.seed() alone does not make the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SEED
)

# Convert to float64 NumPy arrays.
X_train_np = X_train.to_numpy().astype(np.float64)
y_train_np = y_train.to_numpy().astype(np.float64)
X_test_np = X_test.to_numpy().astype(np.float64)
y_test_np = y_test.to_numpy().astype(np.float64)


## Testowanie parametrów

Grid Search - przeszukiwanie kombinacji parametrów. 

In [4]:
import itertools
import pandas as pd
import time
import numpy as np
from glob import glob
from rf_regressor import RandomForestRegressor, mean_absolute_error, mean_squared_error, \
r2_score, mean_absolute_percentage_error

# Grid search over Random Forest hyper-parameters.
#
# Every combination from `baseline_params` is fitted `num_repetitions` times on the
# training arrays and scored on the test arrays; the per-combination means (and the
# standard deviations of test R2 / test MSE) are collected in `results`, written to an
# Excel file and summarised at the end. Ctrl+C stops the search but still saves the
# combinations that were completed.

# Configuration
filename = None  # set a custom output file name here to override the auto-generated one
num_repetitions = 1  # number of independent fits per parameter combination

# Random Forest parameters to test
baseline_params = {
    "n_estimators": [50],  # number of trees in the forest
    "max_depth": [5],  # maximum depth of a single tree
    "min_samples_split": [2, 5, 10],  # min observations in a node required to split THAT node
    "min_samples_leaf": [1, 2, 4],  # min observations each node must hold AFTER a split for the split to happen
    # number of features randomly drawn from all features in the training set;
    # for regression this is usually about 0.33 * n, where n is the number of features
    "max_features": [int(X_train.shape[1]/3)],
    "bootstrap": [True]
}

if not filename:
    # Use the first free rf_<k>.xlsx name. Counting the existing files alone can point
    # at a name that is already taken (e.g. only rf_2.xlsx exists) and overwrite it.
    files = glob("rf_*.xlsx")
    taken_names = set(files)
    file_no = len(files) + 1
    while f"rf_{file_no}.xlsx" in taken_names:
        file_no += 1
    filename = f"rf_{file_no}.xlsx"

# Suffix test (not a substring test), so that e.g. "run.xlsx.old" still gets an extension.
if not filename.endswith(".xlsx"):
    filename += ".xlsx"

# Parameter combinations (cartesian product of all value lists)
keys = list(baseline_params.keys())
combinations = list(itertools.product(*(baseline_params[key] for key in keys)))

# Data frame with one row per combination (kept for inspection)
params_df = pd.DataFrame(data=combinations, columns=keys)

print(f"Testing {len(params_df)} parameter combinations with {num_repetitions} repetitions each")
print(f"Total experiments: {len(params_df) * num_repetitions}")

# List collecting one result dict per completed combination
results = []

print("Data shapes:")
print(f"Train: {X_train_np.shape}, {y_train_np.shape}")
print(f"Test: {X_test_np.shape}, {y_test_np.shape}")


def _print_best(title, row, metric):
    """Print one summary entry: `metric` as mean ± std and the parameters that produced it.

    `row` is a row of the results frame; it must contain `metric`, `metric + "_std"`
    and the six parameter columns.
    """
    print(f"{title}: {row[metric]:.4f} ± {row[metric + '_std']:.4f}")
    print(f"Parameters: n_estimators={row['n_estimators']}, max_depth={row['max_depth']}")
    print(f"           min_samples_split={row['min_samples_split']}, min_samples_leaf={row['min_samples_leaf']}")
    print(f"           max_features={row['max_features']}, bootstrap={row['bootstrap']}")


# Main testing loop.
# The combinations are iterated directly instead of via params_df.iterrows():
# iterrows() squeezes each row into a single-dtype Series, which can change the
# type of the parameter values before they reach the model.
for i, combo in enumerate(combinations):
    row = dict(zip(keys, combo))
    try:
        print(f"\nTesting combination {i+1}/{len(params_df)}")

        # Extract parameters
        n_estimators = row["n_estimators"]
        max_depth = row["max_depth"]
        min_samples_split = row["min_samples_split"]
        min_samples_leaf = row["min_samples_leaf"]
        max_features = row["max_features"]
        bootstrap = row["bootstrap"]

        print(f"Parameters: n_estimators={n_estimators}, max_depth={max_depth}, "
              f"min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}, "
              f"max_features={max_features}, bootstrap={bootstrap}")

        # Per-repetition metrics for this combination
        train_mse = []
        training_times = []
        test_mse = []
        test_mae = []
        test_mape = []
        test_r2 = []
        oob_r2 = []
        oob_mse = []

        # Several repetitions so that the spread of the scores can be reported
        for k in range(num_repetitions):
            print(f"  Repetition {k+1}/{num_repetitions}")

            # Create Random Forest model
            rf = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                max_features=max_features,
                bootstrap=bootstrap
            )

            # Time only the fit; perf_counter is a monotonic clock meant for measuring intervals
            start_time = time.perf_counter()
            rf.fit(X_train_np, y_train_np)
            training_time = time.perf_counter() - start_time
            training_times.append(training_time)

            # Make predictions
            pred_train = rf.predict(X_train_np)
            pred_test = rf.predict(X_test_np)

            # Calculate metrics
            train_mse.append(mean_squared_error(y_train_np, pred_train))
            test_mse.append(mean_squared_error(y_test_np, pred_test))
            test_mae.append(mean_absolute_error(y_test_np, pred_test))
            test_mape.append(mean_absolute_percentage_error(y_test_np, pred_test))
            test_r2.append(r2_score(y_test_np, pred_test))

            # Out-of-bag scores are read only when bootstrap sampling is enabled
            if bootstrap:
                oob_r2.append(rf.oob_score_)
                oob_mse.append(rf.oob_mse_)

        # Aggregate across repetitions. The *_std columns are what the summary
        # below prints after "±"; they are 0 when num_repetitions == 1.
        avg_results = {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "train_MSE": np.mean(train_mse),
            "test_MSE": np.mean(test_mse),
            "test_MSE_std": np.std(test_mse),
            "test_MAE": np.mean(test_mae),
            "test_MAPE": np.mean(test_mape),
            "test_R2": np.mean(test_r2),
            "test_R2_std": np.std(test_r2),
            "train_time": np.mean(training_times)
        }

        if bootstrap:
            avg_results["OOB_MSE"] = np.mean(oob_mse)
            avg_results["OOB_R2"] = np.mean(oob_r2)

        results.append(avg_results)

        # Print current results
        print(f"  Results: Test R² = {avg_results['test_R2']:.4f}")
        print(f"  Test MSE = {avg_results['test_MSE']:.4f}")

    except KeyboardInterrupt:
        # Stop searching; the combinations finished so far are written once by the
        # save step below (the unfinished combination is discarded).
        print("\nInterrupted by user. Saving current results...")
        break

# Save final results. Only the file write is guarded, so that an error in the
# summary below is not misreported as a save failure.
result_df = pd.DataFrame(results)
results_saved = False
try:
    result_df.to_excel(filename, index=False)
    results_saved = True
    print(f"\nFinal results saved to {filename}")
except Exception as e:
    print(f"Error saving results: {str(e)}")

# Display summary of best results
if not result_df.empty:
    print("\n" + "="*60)
    print("SUMMARY OF RESULTS")
    print("="*60)

    # Best R² score (idxmax returns an index label, hence .loc rather than .iloc)
    best_r2_idx = result_df['test_R2'].idxmax()
    best_r2_row = result_df.loc[best_r2_idx]
    _print_best("Best R² Score", best_r2_row, "test_R2")

    # Lowest MSE
    best_mse_idx = result_df['test_MSE'].idxmin()
    best_mse_row = result_df.loc[best_mse_idx]
    _print_best("\nLowest Test MSE", best_mse_row, "test_MSE")

# Report the outcome truthfully: claim a saved file only if the write succeeded.
if results_saved:
    print(f"\nTesting completed. Results saved to {filename}")
else:
    print("\nTesting completed. Results were NOT saved.")


Testing 9 parameter combinations with 1 repetitions each
Total experiments: 9
Data shapes:
Train: (115450, 37), (115450,)
Test: (28863, 37), (28863,)

Testing combination 1/9
Parameters: n_estimators=50, max_depth=5, min_samples_split=2, min_samples_leaf=1, max_features=12, bootstrap=True
  Repetition 1/1

Interrupted by user. Saving current results...
Results saved to rf_1.xlsx

Final results saved to rf_1.xlsx

Testing completed. Results saved to rf_1.xlsx
