# Setup

Imports

In [31]:
# data manipulation libs
import pandas as pd
import numpy as np
# data visualization lib
import matplotlib.pyplot as plt
# dataset lib
from sklearn.datasets import load_iris
# data preprocessing libs
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# model implementation libs
from sklearn.ensemble import RandomForestClassifier
import joblib
# model evaluation lib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Loading dataset

In [24]:
# Load the Iris dataset (load_iris is imported in the Imports cell at the top,
# so this cell no longer carries its own import)
iris = load_iris()
print("Iris dataset loaded successfully.")

# Pull the feature matrix and the label vector out of the loaded dataset object
X = iris.data  # Features (sepal length, sepal width, petal length, petal width)
y = iris.target  # Target variable (species: 0 for setosa, 1 for versicolor, 2 for virginica)

Iris dataset loaded successfully.


# Data exploration

In [25]:
# Build a labelled DataFrame (features + target) in a single expression so the
# cell is idempotent and never exposes a half-built frame.
iris_df = pd.DataFrame(data=X, columns=iris.feature_names).assign(target=y)

# DataFrame.info() writes its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None


In [26]:
# Show the names and the first few values of the features, then of the targets
preview_rows = 5
for heading, names, values in (
    ("Feature names:", iris.feature_names, X),
    ("Target names:", iris.target_names, y),
):
    print(heading, names)
    print(values[:preview_rows])  # leading rows only, to keep the output short

Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
Target names: ['setosa' 'versicolor' 'virginica']
[0 0 0 0 0]


In [27]:
# Lookup table from the integer class id to the species name
label_map = dict(enumerate(['setosa', 'versicolor', 'virginica']))

In [50]:
# Split features and labels into train/test parts (test_size=0.3);
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=43,
)

# Modeling
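The dataset is small (150 records, four numeric features, three species), so a RandomForest classifier is a reasonable fit for this use case: it trains quickly on data of this size and needs no feature scaling or encoding.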

In [60]:
# train function
def train_model(model, x_train, x_test, y_train, y_test, target_names=None):
    """
    Fit ``model`` on the training split and print an evaluation summary.

    Prints the accuracy on the training and test splits, followed by a
    classification report for the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, x_test : feature data for the training and test splits.
    y_train, y_test : true labels for the training and test splits.
    target_names : sequence of str, optional
        Class display names for the classification report. When omitted, the
        notebook-level ``iris.target_names`` is used, so existing calls keep
        their previous output.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    # Fit the model to the training data.
    model.fit(x_train, y_train)

    # Predict on both splits so train and test accuracy can be compared
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    # Evaluate the model's performance
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print(f"Training Accuracy: {train_accuracy:.2f}")
    print(f"Testing Accuracy: {test_accuracy:.2f}")

    if target_names is None:
        # Backward-compatible default: fall back to the global Iris dataset
        # object that callers relied on before this parameter existed.
        target_names = iris.target_names

    print("\n", "-" * 50)
    print("Classification Report:\n", classification_report(y_test, y_pred_test, target_names=target_names))
    print("-" * 50, "\n")
    return model

In [40]:
# Create the random forest classifier; random_state is pinned so repeated
# fits on the same data give the same model.
rf_model = RandomForestClassifier(
    random_state=0,
)

In [61]:
# Fit the classifier and print its train/test metrics
trained_model = train_model(
    model=rf_model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

# Persist the fitted model to disk with joblib
print("Saving the trained model...")
joblib.dump(trained_model, 'trained_rf_model.pkl')

Training Accuracy: 1.00
Testing Accuracy: 0.98

 --------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       1.00      0.94      0.97        17
   virginica       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

-------------------------------------------------- 

Saving the trained model...


['trained_rf_model.pkl']

# Testing the model

In [65]:
# Make predictions on the test set
y_pred = trained_model.predict(x_test)

# Preview up to the first 10 predictions next to the true labels.
print("Predictions on the test set")
# Slicing and zipping (rather than indexing over range(10)) means a test split
# with fewer than 10 samples shortens the preview instead of raising IndexError.
for i, (pred_id, true_id) in enumerate(zip(y_pred[:10], y_test[:10])):
    predicted_class = label_map[pred_id]
    actual_class = label_map[true_id]
    em = "✅" if predicted_class == actual_class else "❌"
    print(f"{em} Sample {i+1}: Predicted: {predicted_class}, Actual: {actual_class}")


Predictions on the test set
✅ Sample 1: Predicted: setosa, Actual: setosa
✅ Sample 2: Predicted: setosa, Actual: setosa
✅ Sample 3: Predicted: virginica, Actual: virginica
✅ Sample 4: Predicted: versicolor, Actual: versicolor
✅ Sample 5: Predicted: virginica, Actual: virginica
✅ Sample 6: Predicted: setosa, Actual: setosa
✅ Sample 7: Predicted: virginica, Actual: virginica
✅ Sample 8: Predicted: versicolor, Actual: versicolor
✅ Sample 9: Predicted: versicolor, Actual: versicolor
✅ Sample 10: Predicted: versicolor, Actual: versicolor
