
# Forest Fire Prediction - Comprehensive Hyperparameter Tuning

**Project**: Predicting Forest Fires using Soil and Climate Data  
**Study Area**: Algeria & Tunisia  
**Year**: 2024  

This notebook performs exhaustive hyperparameter tuning for:
1. K-Nearest Neighbors (KNN)
2. Decision Tree
3. Random Forest

Features:
- GridSearchCV with cross-validation
- Multiple scoring metrics
- Detailed results analysis
- Best parameters identification
- Results export to CSV
- Comprehensive visualizations



In [None]:

## 0. Imports and Setup


import os
import pandas as pd
import numpy as np
import time
import warnings
from datetime import datetime

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, ParameterGrid
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTETomek

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import f1_score, make_scorer

# Suppress warnings
warnings.filterwarnings('ignore')

# Configuration
DATA_PATH = "../../data/processed/Fire/final.csv"  # Ensure this matches your directory
RESULTS_FILE = "grid_search_results.csv"



‚úÖ Libraries loaded successfully


## 1. Data Preparation

In [None]:


#  (Load -> Split -> SMOTE -> Scale)

def prepare_data(filepath):
    print("loading data...")
    df = pd.read_csv(filepath)
    X = df.drop(columns=["fire"]).values
    y = df["fire"].values
    return X, y

def get_processed_data(X, y):
    # 1. Split
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # 2. SMOTE-Tomek (Balancing)
    print("Applying SMOTE-Tomek (this may take a moment)...")
    smote = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    # 3. Scaling
    print("Scaling features...")
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)
    
    print(f" Data Ready. Training shape: {X_train_scaled.shape}")
    return X_train_scaled, y_train_res

# Execution
X, y = prepare_data(DATA_PATH)
X_train, y_train = get_processed_data(X, y)



loading data...
Splitting data...
Applying SMOTE-Tomek (this may take a moment)...
Scaling features...
‚úÖ Data Ready. Training shape: (143402, 17)


## 2. Robust Grid Search Function

In [None]:

# *This function iterates through the grid manually and appends to CSV immediately.*
def robust_grid_search(model_class, param_grid, model_name, X, y, output_file):
    """
    Runs grid search manually and saves to CSV after every iteration.
    """
    print(f"\n" + "="*60)
    print(f"Starting Grid Search for: {model_name}")
    print("="*60)
    
    # Generate all combinations
    grid = ParameterGrid(param_grid)
    total_combinations = len(grid)
    print(f"Total combinations to test: {total_combinations}")
    
    # Check if results file exists to handle headers
    if not os.path.isfile(output_file):
        # Create new file with headers
        pd.DataFrame(columns=[
            'timestamp', 'model', 'params', 'mean_f1', 'std_f1', 'time_taken'
        ]).to_csv(output_file, index=False)
        print(f"Created new results file: {output_file}")
    else:
        print(f"Appending to existing file: {output_file}")

    # Create Scorer
    scorer = make_scorer(f1_score)

    # Iterate
    for i, params in enumerate(grid):
        start_time = time.time()
        try:
            # Initialize model with current params
            model = model_class(**params)
            
            # Cross-validation (5-fold)
            # n_jobs=-1 uses all processors
            scores = cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1)
            
            mean_score = np.mean(scores)
            std_score = np.std(scores)
            
        except Exception as e:
            print(f"Error with params {params}: {e}")
            mean_score = np.nan
            std_score = np.nan
            
        elapsed_time = time.time() - start_time
        
        # Prepare result row
        result_row = {
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model': model_name,
            'params': str(params),  # Save as string to avoid CSV issues
            'mean_f1': mean_score,
            'std_f1': std_score,
            'time_taken': elapsed_time
        }
        
        # Save immediately to CSV
        pd.DataFrame([result_row]).to_csv(output_file, mode='a', header=False, index=False)
        
        # Print progress every 10 iterations or if it's slow
        if (i + 1) % 1 == 0: # Print every step (change to % 10 for less noise)
            print(f"[{i+1}/{total_combinations}] F1: {mean_score:.4f} | {params}")

    print(f" {model_name} Grid Search Complete!")


## 3. Define Parameter Grids

In [None]:

# 1. K-Nearest Neighbors
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'p': [1, 2]
}

# 2. Decision Tree
dt_params = {
    'max_depth': [5, 10, 15, 20, 25, None],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'criterion': ['gini', 'entropy']
}


# 3. Random Forest
# Note: RF Grid Search was very slow. I reduced it slightly for feasibility.
rf_params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}



## 4. Run the Grid Search

In [None]:

# Run KNN
robust_grid_search(
    KNeighborsClassifier, 
    knn_params, 
    "KNN", 
    X_train, 
    y_train, 
    RESULTS_FILE
)


Starting Grid Search for: KNN
Total combinations to test: 48
Created new results file: grid_search_results.csv
[1/48] F1: 0.9698 | {'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
[2/48] F1: 0.9724 | {'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
[3/48] F1: 0.9698 | {'metric': 'euclidean', 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
[4/48] F1: 0.9724 | {'metric': 'euclidean', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
[5/48] F1: 0.9648 | {'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
[6/48] F1: 0.9704 | {'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
[7/48] F1: 0.9648 | {'metric': 'euclidean', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
[8/48] F1: 0.9704 | {'metric': 'euclidean', 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
[9/48] F1: 0.9613 | {'metric': 'euclidean', 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
[10/48] F1: 0.9685 | {'metric': 'euclidean', 'n_neighbor

In [9]:


# Run Decision Tree
robust_grid_search(
    DecisionTreeClassifier,
    dt_params,
    "DecisionTree",
    X_train,
    y_train,
    RESULTS_FILE
)




Starting Grid Search for: DecisionTree
Total combinations to test: 240
Appending to existing file: grid_search_results.csv
[1/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42}
[2/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 42}
[3/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 6, 'random_state': 42}
[4/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 8, 'random_state': 42}
[5/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 42}
[6/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 42}
[7/240] F1: 0.8778 | {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 4, 'random_state

In [None]:

# Run Random Forest
robust_grid_search(
    RandomForestClassifier,
    rf_params,
    "RandomForest",
    X_train,
    y_train,
    RESULTS_FILE
)




Starting Grid Search for: RandomForest
Total combinations to test: 64
Appending to existing file: grid_search_results.csv
[1/64] F1: 0.9473 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 50, 'n_jobs': 1, 'random_state': 42}
[2/64] F1: 0.9470 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': 1, 'random_state': 42}
[3/64] F1: 0.9471 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 150, 'n_jobs': 1, 'random_state': 42}
[4/64] F1: 0.9470 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': 1, 'random_state': 42}
[5/64] F1: 0.9463 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50, 'n_jobs': 1, 'random_state': 42}
[6/64] F1: 0.9469 | {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'mi

## 5. Analyze Results

In [None]:
# Read the CSV to find the best parameters.
def get_best_params(results_file):
    if not os.path.exists(results_file):
        print("No results file found.")
        return

    df_results = pd.read_csv(results_file)
    
    # Sort by Score (descending)
    df_sorted = df_results.sort_values(by="mean_f1", ascending=False)
    
    print("** Top 5 Best Models Found:")
    print(df_sorted[['model', 'mean_f1', 'params']].head(5))
    
    print("\nBest Parameters per Algorithm:")
    for model in df_sorted['model'].unique():
        best_row = df_sorted[df_sorted['model'] == model].iloc[0]
        print(f"\n-- {model}:")
        print(f"   F1-Score: {best_row['mean_f1']:.4f}")
        print(f"   Params: {best_row['params']}")

# Check results
get_best_params(RESULTS_FILE)

üèÜ Top 5 Best Models Found:
             model   mean_f1  \
1674  RandomForest  0.980265   
1666  RandomForest  0.980265   
1667  RandomForest  0.980231   
1675  RandomForest  0.980231   
1665  RandomForest  0.980140   

                                                 params  
1674  {'bootstrap': True, 'max_depth': None, 'max_fe...  
1666  {'bootstrap': True, 'max_depth': None, 'max_fe...  
1667  {'bootstrap': True, 'max_depth': None, 'max_fe...  
1675  {'bootstrap': True, 'max_depth': None, 'max_fe...  
1665  {'bootstrap': True, 'max_depth': None, 'max_fe...  

Best Parameters per Algorithm:

‚û°Ô∏è RandomForest:
   F1-Score: 0.9803
   Params: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 150, 'n_jobs': 1, 'random_state': 42}

‚û°Ô∏è KNN:
   F1-Score: 0.9740
   Params: {'metric': 'manhattan', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

‚û°Ô∏è DecisionTree:
   F1-Score: 0.9727
   Params: {'criterion': 'entropy', 'max_dep