In [None]:

import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:

# Load the pre-cleaned dataset from disk.
df = pd.read_csv("../data/processed/Fire/fnf_cleaned.csv")

# "fire" is the label column; every remaining column is used as a feature.
y = df["fire"].values
X = df.drop(columns="fire").values


In [None]:

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Learn the per-feature min/max on the training rows only, then apply that
# same mapping to both splits so nothing about the test rows influences the
# scaling.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

class Node:
    """One node of a binary decision tree.

    An internal node holds a split rule (``feature`` index and ``threshold``)
    together with its ``left`` and ``right`` children. A leaf holds only a
    predicted ``value``, which must be passed by keyword.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split rule; left as None on leaves.
        self.feature, self.threshold = feature, threshold
        # Child subtrees; left as None on leaves.
        self.left, self.right = left, right
        # Prediction stored at a leaf; stays None on internal nodes.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node carries a prediction (``value`` is not None)."""
        return self.value is not None

In [None]:

def gini(y):
    """Gini impurity of a collection of class labels: ``1 - sum_c p_c**2``.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.asarray`` accepts (ndarray, list, ...).

    Returns
    -------
    float
        0.0 when every label is the same, growing towards 1.0 as the labels
        spread over more classes. An empty input has no classes to subtract
        and therefore yields 1.0.
    """
    # Coerce first: on a plain list, ``y == c`` is a single False rather than
    # an element-wise mask, which would make every class proportion zero.
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        return 1.0

    # Count every class in one pass instead of rescanning the labels per class.
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / labels.size
    return 1.0 - np.sum(proportions ** 2)


def split_dataset(X, y, feature, threshold):
    """Partition samples on a single feature.

    Rows whose value in column ``feature`` is ``<= threshold`` go left, rows
    whose value is ``> threshold`` go right. Returns the tuple
    ``(X_left, y_left, X_right, y_right)`` with the original row order kept
    inside each part.
    """
    column = X[:, feature]
    goes_left = column <= threshold
    goes_right = column > threshold
    return X[goes_left], y[goes_left], X[goes_right], y[goes_right]


def best_split(X, y, feature_indices):
    """Find the (feature, threshold) pair with the largest Gini gain.

    Every distinct value of every candidate feature is tried as a threshold;
    samples with ``value <= threshold`` go left and the rest go right.

    Parameters
    ----------
    X : ndarray, indexed as ``X[:, feature]``
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X``.
    feature_indices : iterable of int
        Column indices to search.

    Returns
    -------
    (best_feature, best_threshold, best_gain)
        ``(None, None, -1)`` when no threshold separates the samples into two
        non-empty parts. Ties keep the first candidate found, scanning
        features in the given order and thresholds in ascending order.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    best_feature, best_threshold = None, None
    best_gain = -1
    parent_impurity = gini(y)
    n_samples = len(y)

    for feature in feature_indices:
        column = X[:, feature]
        # Nothing is greater than the largest distinct value, so using it as a
        # threshold always leaves the right side empty; drop it up front.
        thresholds = np.unique(column)[:-1]

        for threshold in thresholds:
            # Only the labels are needed to score a split, so partition y
            # alone instead of copying both halves of X for every candidate.
            y_left = y[column <= threshold]
            y_right = y[column > threshold]

            if len(y_left) == 0 or len(y_right) == 0:
                continue

            # Children's impurities weighted by their share of the samples.
            child_impurity = (
                (len(y_left) / n_samples) * gini(y_left)
                + (len(y_right) / n_samples) * gini(y_right)
            )
            gain = parent_impurity - child_impurity

            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, best_gain



In [None]:

class DecisionTree:
    """Classification tree grown greedily on Gini gain.

    At every internal node only a random subset of the features is searched
    for the best threshold.

    Parameters
    ----------
    max_depth : int
        Nodes reaching this depth become leaves.
    min_samples_split : int
        Nodes holding fewer samples than this become leaves.
    max_features : int or None
        Number of features sampled at each split. ``None`` (or 0) means
        ``int(sqrt(n_features))``. Values larger than the number of features
        are capped at the number of features.
    random_state : int or None
        Seed for the per-split feature sampling. ``None`` keeps drawing from
        NumPy's global RNG, so ``np.random.seed`` still governs the result.
    """

    def __init__(self, max_depth=10, min_samples_split=2, max_features=None, random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features  # number of random features sampled at each split
        self.random_state = random_state
        self.root = None

    def fit(self, X, y):
        """Grow the tree on feature matrix ``X`` and labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not a 2-D array with at least one column, if there are
            no samples, or if ``X`` and ``y`` disagree on the sample count.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError("X must be a 2-D array with at least one feature column")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {len(y)} labels"
            )

        self.n_features = X.shape[1]
        # Created here rather than in __init__ so that refitting a seeded tree
        # reproduces the same tree. Both objects expose the same .choice() API.
        self._rng = (
            np.random
            if self.random_state is None
            else np.random.default_rng(self.random_state)
        )
        self.root = self._grow_tree(X, y)

    def _make_leaf(self, y):
        """Build a leaf predicting the most frequent label in ``y``."""
        return Node(value=Counter(y).most_common(1)[0][0])

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        num_samples = len(y)
        num_labels = len(np.unique(y))

        # Stopping conditions: depth limit, pure node, or too few samples.
        if (depth >= self.max_depth
                or num_labels == 1
                or num_samples < self.min_samples_split):
            return self._make_leaf(y)

        # Choose a random subset of features for this split. Sampling without
        # replacement cannot return more items than exist, so cap the request
        # at the number of features instead of letting it raise.
        requested = self.max_features or int(np.sqrt(self.n_features))
        max_feats = min(requested, self.n_features)
        feature_indices = self._rng.choice(self.n_features, max_feats, replace=False)

        feature, threshold, gain = best_split(X, y, feature_indices)

        # No usable split: best_split reports gain -1 when nothing separates
        # the samples, and a near-zero gain is not worth another level.
        if gain < 1e-7:
            return self._make_leaf(y)

        X_left, y_left, X_right, y_right = split_dataset(X, y, feature, threshold)

        left_child = self._grow_tree(X_left, y_left, depth + 1)
        right_child = self._grow_tree(X_right, y_right, depth + 1)

        return Node(feature, threshold, left_child, right_child)

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._predict_single(x, self.root) for x in X])

    def _predict_single(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        while not node.is_leaf_node():
            # Same rule as training: values <= threshold go left.
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value


In [8]:
# Random Forest: a bagged ensemble built from the DecisionTree class above

In [None]:
# Bootstrap resampling helper: gives each tree its own training sample
def bootstrap_sample(X, y):
    """Draw a bootstrap resample of ``(X, y)``.

    As many rows as the input are drawn with replacement, and the same row
    indices are applied to ``X`` and ``y`` so features and labels stay
    aligned. The draw uses NumPy's global random state.
    """
    row_count = X.shape[0]
    picked = np.random.choice(row_count, size=row_count, replace=True)
    X_boot, y_boot = X[picked], y[picked]
    return X_boot, y_boot


In [10]:
# Random Forest Class
# %%
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Parameters
    ----------
    n_trees : int
        Number of trees to grow.
    max_depth, min_samples_split, max_features
        Passed unchanged to every ``DecisionTree``.
    random_state : int or None
        When given, ``fit`` seeds NumPy's global RNG with it for the duration
        of training and restores the previous state afterwards, so the same
        seed and data produce the same forest. ``None`` leaves the global RNG
        untouched.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, max_features=None,
                 random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_trees`` trees, each on its own bootstrap resample of ``(X, y)``."""
        self.trees = []

        # bootstrap_sample and the trees' feature sampling draw from NumPy's
        # global RNG, so reproducibility is achieved by seeding that RNG for
        # the duration of training and then restoring the caller's state.
        saved_state = None
        if self.random_state is not None:
            saved_state = np.random.get_state()
            np.random.seed(self.random_state)

        try:
            for _ in range(self.n_trees):
                X_sample, y_sample = bootstrap_sample(X, y)

                tree = DecisionTree(
                    max_depth=self.max_depth,
                    min_samples_split=self.min_samples_split,
                    max_features=self.max_features
                )
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
        finally:
            if saved_state is not None:
                np.random.set_state(saved_state)

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Ties go to the label voted for by the earliest tree among those tied.

        Raises
        ------
        RuntimeError
            If the forest holds no trees (``fit`` has not been called, or
            ``n_trees`` was 0).
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict() called before fit()")

        # One row per tree, one column per sample.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Transposing yields each sample's votes in tree order.
        return np.array([
            Counter(votes).most_common(1)[0][0]
            for votes in tree_preds.T
        ])


In [11]:
# Train a 10-tree forest and predict on the held-out test split.
# The forest draws its bootstrap rows and per-split feature subsets from
# NumPy's global RNG, so seed it here; otherwise the metrics reported below
# change on every Restart & Run All.
np.random.seed(42)

rf = RandomForest(
    n_trees=10,
    max_depth=10,
    min_samples_split=4,
    max_features=int(np.sqrt(X_train.shape[1])),
)

rf.fit(X_train, y_train)
preds = rf.predict(X_test)


In [12]:

# Overall accuracy first, then the per-class breakdown and the raw confusion
# counts for the test-set predictions.
print(f"Accuracy: {accuracy_score(y_test, preds)!s}")
print("\nClassification Report:", classification_report(y_test, preds), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, preds), sep="\n")


Accuracy: 0.9599430018999366

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     18032
           1       0.76      0.25      0.38       916

    accuracy                           0.96     18948
   macro avg       0.86      0.62      0.68     18948
weighted avg       0.95      0.96      0.95     18948


Confusion Matrix:
[[17960    72]
 [  687   229]]


In [None]:
# Sweep the ensemble size to see how test accuracy responds to n_trees.
# NOTE: this rebinds `rf` and `preds`, and it uses the default
# min_samples_split (2) rather than the 4 used in the training cell, so the
# n=10 row is not the same model as the one evaluated above.
for n in [5, 10, 20, 30]:
    # Re-seed before every run so the rows differ only in the number of trees.
    np.random.seed(42)
    rf = RandomForest(n_trees=n, max_depth=10)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    print(f"Trees={n} → Accuracy={accuracy_score(y_test, preds):.4f}")
