In [1]:
import numpy as np
import pandas as pd

In [5]:
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Extract features and labels
X = iris.data  # Features
y = iris.target  # Labels

# Access additional information if needed
feature_names = iris.feature_names
target_names = iris.target_names

# Display the dataset description
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [7]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
pd.DataFrame(y).head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [10]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state= 42
)

print(X_train.shape, y_train.shape, X_test.shape, X_test.shape)

(120, 4) (120,) (30, 4) (30, 4)


In [11]:
# I am using Random Forest to solve the task
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

predicted = classifier.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(predicted, y_test)

1.0

## Do the hyper parameter tuning

In [None]:
'''
class sklearn.ensemble.RandomForestClassifier(
    n_estimators=100, *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True, oob_score=False,
    n_jobs=None, random_state=None,
    verbose=0, warm_start=False, class_weight=None,
    ccp_alpha=0.0, max_samples=None, monotonic_cst=None)

'''

## Install the Optuna library

In [12]:
!pip install optuna



In [14]:
import optuna

In [20]:
# Objective function for Optuna
def objective(trial):
    # Suggest values for hyperparameters
    n_estimators = trial.suggest_categorical("n_estimators", [100, 200])  # corrected to use categorical
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    
    # Initialize the model with suggested hyperparameters
    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_features=max_features
    )
    
    # Fit the model on training data
    classifier.fit(X_train, y_train)
    
    # Predict on the test data
    predicted = classifier.predict(X_test)
    
    # Calculate and return the accuracy on the test set
    return accuracy_score(y_test, predicted)

In [34]:
# Create the study
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(func=objective, n_trials=20)

[I 2024-11-06 09:59:11,472] A new study created in memory with name: no-name-061f724b-27e0-46f1-b718-770279c1ef85


[I 2024-11-06 09:59:11,789] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 200, 'criterion': 'entropy', 'max_features': None}. Best is trial 0 with value: 1.0.
[I 2024-11-06 09:59:11,924] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 100, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 1.0.
[I 2024-11-06 09:59:12,183] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 200, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 1.0.
[I 2024-11-06 09:59:12,439] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 200, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 0 with value: 1.0.
[I 2024-11-06 09:59:12,683] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 200, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 1.0.
[I 2024-11-06 09:59:12,807] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 100, 'crite

In [35]:
study.directions

[<StudyDirection.MAXIMIZE: 2>]

In [36]:
study.sampler

<optuna.samplers._tpe.sampler.TPESampler at 0x7f9c8d46d600>

In [37]:
study.metric_names

In [38]:
study.best_params

{'n_estimators': 200, 'criterion': 'entropy', 'max_features': None}

In [39]:
study.best_value

1.0

<bound method Study.get_trials of <optuna.study.study.Study object at 0x7f9c8c662ef0>>

In [44]:
study.best_trial.params

{'n_estimators': 200, 'criterion': 'entropy', 'max_features': None}

In [45]:
# Use this parameter
classifier = RandomForestClassifier(
    **study.best_params
)

classifier.fit(X_train, y_train)

predicted = classifier.predict(X_test)

accuracy_score(predicted, y_test)

1.0

### Using the range

In [46]:
def objective(trail):
    n_estimators = trail.suggest_int("n_estimators", 100, 500)
    criterion = trail.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    max_features = trail.suggest_categorical("max_features", ["sqrt", "log2", None])
    
    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_features=max_features
    )
    
    classifier.fit(X_train, y_train)
    
    predicted = classifier.predict(X_test)
    
    return accuracy_score(y_test, predicted)

In [47]:
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(func=objective, n_trials=20)

[I 2024-11-06 10:26:06,247] A new study created in memory with name: no-name-f8bcf22e-1984-47e6-81d5-a8ae5369af93
[I 2024-11-06 10:26:06,998] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 426, 'criterion': 'log_loss', 'max_features': None}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:26:07,165] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 127, 'criterion': 'gini', 'max_features': None}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:26:07,715] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 434, 'criterion': 'gini', 'max_features': None}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:26:08,258] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 435, 'criterion': 'entropy', 'max_features': 'log2'}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:26:08,577] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 259, 'criterion': 'entropy', 'max_features': None}. Best is trial 0 with value:

In [49]:
study.best_params

{'n_estimators': 426, 'criterion': 'log_loss', 'max_features': None}

In [51]:
study.best_value

1.0

In [52]:
classifier = RandomForestClassifier(
    **study.best_params
)

classifier.fit(X_train, y_train)

predicted = classifier.predict(X_test)

accuracy_score(predicted, y_test)

1.0

### Is it possible to use multiple Algorithms

In [54]:
from sklearn.tree import DecisionTreeClassifier


def objective(trails):
    classifiers = trails.suggest_categorical("classifier", ["RF", "DT"])
    n_estimators = trails.suggest_categorical("n_estimators", [100, 200, 300])
    criterion = trails.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    splitter = trails.suggest_categorical("splitter", ["best", "random"])
    max_depth = trails.suggest_int("max_depth", 3, 10)
    
    if classifiers == "RF":
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            criterion = criterion
        )
        
    elif classifiers == "DT":
        model = DecisionTreeClassifier(
            max_depth=max_depth,
            splitter=splitter
        )
        
    model.fit(X_train, y_train)
    
    predicted = model.predict(X_test)
    
    return accuracy_score(y_test, predicted)

In [55]:
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(func=objective, n_trials=20)

[I 2024-11-06 10:35:33,762] A new study created in memory with name: no-name-026718a8-36ec-4081-a383-8151ee5830b2
[I 2024-11-06 10:35:34,346] Trial 0 finished with value: 1.0 and parameters: {'classifier': 'RF', 'n_estimators': 300, 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:35:34,476] Trial 1 finished with value: 1.0 and parameters: {'classifier': 'RF', 'n_estimators': 100, 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:35:34,484] Trial 2 finished with value: 0.9666666666666667 and parameters: {'classifier': 'DT', 'n_estimators': 200, 'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 7}. Best is trial 0 with value: 1.0.
[I 2024-11-06 10:35:34,487] Trial 3 finished with value: 1.0 and parameters: {'classifier': 'DT', 'n_estimators': 200, 'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 9}. Best is trial 0 with value: 1.0.
[I 2024-11-06

In [56]:
study.best_params

{'classifier': 'RF',
 'n_estimators': 300,
 'criterion': 'entropy',
 'splitter': 'best',
 'max_depth': 10}