# Random Forest Classifier on Iris Dataset

## Import

In [42]:
import os

# Work from the repository root: the dataset path used later in this notebook
# is relative to it, and (presumably) the local `si` package is imported from
# there as well, which is why the directory change precedes the `si` imports.
# Set the SI_PROJECT_ROOT environment variable to point at a different
# checkout; when it is unset or empty, the original location is used.
os.chdir(os.environ.get('SI_PROJECT_ROOT') or '/home/arog/Documents/GitHub/si/')

import numpy as np
from si.io.csv_file import read_csv
from si.model_selection.split import train_test_split
from si.models.random_forest_classifier import RandomForestClassifier
from si.metrics.accuracy import accuracy

## Load Iris CSV

In [43]:
# Build the relative path from its components so os.path.join actually does
# the joining (a single pre-joined argument is returned unchanged). The path
# is resolved against the current working directory.
iris_file = os.path.join('datasets', 'iris', 'iris.csv')

# features=True / label=True are forwarded to read_csv; presumably they request
# the header row as feature names and a label column as y - confirm against
# si.io.csv_file.read_csv.
iris_dataset = read_csv(filename=iris_file, features=True, label=True)

## Display basic dataset info

In [44]:
# Gather the three summary values first, then report them.
feature_matrix_shape = iris_dataset.X.shape
feature_names = iris_dataset.features
class_labels = np.unique(iris_dataset.y)

print(f"Dataset shape: {feature_matrix_shape}")
print(f"Features: {feature_names}")
print(f"Classes: {class_labels}")

Dataset shape: (150, 4)
Features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
Classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


## Split the dataset into training and testing sets

In [45]:
# Reserve 30% of the samples for testing. random_state=42 is passed through to
# train_test_split, presumably to make the split repeatable.
split_result = train_test_split(iris_dataset, test_size=0.3, random_state=42)
train_dataset, test_dataset = split_result

# Row counts of each partition's feature matrix.
n_train_samples = train_dataset.X.shape[0]
n_test_samples = test_dataset.X.shape[0]

print(f"Training samples: {n_train_samples}")
print(f"Testing samples: {n_test_samples}")

Training samples: 105
Testing samples: 45


## Initialize and train the Random Forest classifier

In [46]:
# Hyper-parameters of the forest, collected in one place.
rf_params = dict(
    n_estimators=50,      # Number of trees
    max_features=2,       # Number of features to consider at each split
    min_sample_split=2,   # Minimum samples required to split a node
    max_depth=5,          # Maximum depth of each tree
    mode='gini',          # Split criterion
    seed=42,              # For reproducibility
)
rf = RandomForestClassifier(**rf_params)

# Kept as the cell's final expression so the notebook still displays the
# object returned by fit().
rf.fit(train_dataset)

<si.models.random_forest_classifier.RandomForestClassifier at 0x70c323ee2fd0>

## Make predictions on the test set, then calculate and display the accuracy

In [None]:
# Predict the held-out split with the fitted forest and score the result
# against the labels stored on the test dataset.
predictions = rf.predict(test_dataset)
acc = accuracy(test_dataset.y, predictions)

rf_accuracy_report = f"Random Forest Accuracy: {acc:.4f}"
print(rf_accuracy_report)

Random Forest Accuracy: 1.0000


## Compare with a single decision tree

In [49]:
from si.models.decision_tree_classifier import DecisionTreeClassifier

# Single-tree baseline configured with the same split settings as the forest.
dt = DecisionTreeClassifier(min_sample_split=2, max_depth=5, mode='gini')
dt.fit(train_dataset)

# Score the baseline on the same held-out split used for the forest.
dt_predictions = dt.predict(test_dataset)
dt_acc = accuracy(test_dataset.y, dt_predictions)

# Positive values mean the forest scored higher than the single tree.
accuracy_gain = acc - dt_acc

print(f"Decision Tree Accuracy: {dt_acc:.4f}")
print(f"Random Forest Improvement: {accuracy_gain:.4f}")

Decision Tree Accuracy: 0.9556
Random Forest Improvement: 0.0444


## Feature importance analysis (how often each feature appears in a tree's feature subset)

In [50]:
# Tally, for every feature column, how many trees list it in their feature
# subset. This is a simplified stand-in: a real implementation would track how
# often each feature is used for splits and how much each split contributes.
n_features = iris_dataset.X.shape[1]
selection_counts = np.zeros(n_features)
for selected_columns, _fitted_tree in rf.trees:
    # Increment one index at a time so a repeated index is counted each time.
    for column in selected_columns:
        selection_counts[column] += 1

# Normalize by the number of trees in the forest.
feature_importance = selection_counts / len(rf.trees)

print("\nFeature importance:")
for feature_name, share in zip(iris_dataset.features, feature_importance):
    print(f"{feature_name}: {share:.3f}")


Feature importance:
sepal_length: 0.520
sepal_width: 0.380
petal_length: 0.620
petal_width: 0.480


## Confusion matrix

In [51]:
from collections import defaultdict

def confusion_matrix(true_labels, pred_labels):
    """Print a confusion matrix of true versus predicted labels.

    Rows are true classes and columns are predicted classes, both in the
    sorted order returned by ``np.unique`` over the union of the two inputs.
    Every column is as wide as the longest class label (and never narrower
    than 10 characters), so the table stays aligned for long labels.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class labels.
    pred_labels : array-like
        Predicted class labels, paired element-wise with ``true_labels``.

    Returns
    -------
    None
        The table is written to standard output.
    """
    classes = np.unique(np.concatenate([true_labels, pred_labels]))
    matrix = defaultdict(lambda: defaultdict(int))

    # matrix[true][pred] counts how often `true` was predicted as `pred`.
    for true, pred in zip(true_labels, pred_labels):
        matrix[true][pred] += 1

    # A fixed width of 10 overflows for labels such as 'Iris-versicolor';
    # derive the width from the labels actually present instead.
    width = max([10] + [len(str(cls)) for cls in classes])

    # Print the confusion matrix
    print("\nConfusion Matrix:")
    # The header is padded by `width + 1` to match the row-label column plus
    # the single space that follows it in every data row.
    print(" " * (width + 1) + " ".join(f"{str(cls):>{width}}" for cls in classes))
    for true in classes:
        row = [matrix[true][pred] for pred in classes]
        print(f"{str(true):>{width}} " + " ".join(f"{count:>{width}}" for count in row))

# Tabulate the forest's test-set predictions against the test labels
# (rows: true class, columns: predicted class).
confusion_matrix(test_dataset.y, predictions)


Confusion Matrix:
          Iris-setosa Iris-versicolor Iris-virginica
Iris-setosa         19          0          0
Iris-versicolor          0         13          0
Iris-virginica          0          0         13
