In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Iris Dataset

In [2]:
# Load the Iris dataset and separate the feature matrix (X) from the class labels (y).
data = load_iris()
X, y = data.data, data.target

In [3]:
# Data Split: hold out 30% of the samples as a test set; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Candidate classifiers keyed by display name; the same keys index `param_grids`
# and label the result tables further down.
# The tree-based models are randomized, so they get a fixed random_state
# (the same seed as the train/test split) to make Restart & Run All reproduce identical scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

In [5]:
# Baseline: train every model with its default settings and score it on the held-out test set.
results = {}
for model_name, estimator in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained.
    predictions = estimator.fit(X_train, y_train).predict(X_test)

    scores = {"Accuracy": accuracy_score(y_test, predictions)}
    # The remaining metrics are per-class, so they are averaged weighted by class support.
    for label, scorer in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    ):
        scores[label] = scorer(y_test, predictions, average='weighted')

    results[model_name] = scores

In [6]:
# Hyperparameter grids: one search space per model, keyed by the same display names as `models`.
param_grids = {}
param_grids["Logistic Regression"] = dict(C=[0.1, 1, 10])
param_grids["Decision Tree"] = dict(max_depth=[3, 5, 10])
param_grids["Random Forest"] = dict(n_estimators=[10, 50, 100])
# The SVM grid is the cross product of C and kernel (3 x 2 = 6 candidates).
param_grids["SVM"] = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])

In [7]:
# GridSearchCV for hyperparameter tuning:
# run a 3-fold cross-validated grid search per model on the training data
# and keep the best estimator found for each one.
best_models = {}
for name in models:
    search = GridSearchCV(estimator=models[name], param_grid=param_grids[name], cv=3)
    best_models[name] = search.fit(X_train, y_train).best_estimator_

In [8]:
# Evaluation: score each tuned model on the held-out test set with the same metrics as the baseline.
best_results = {}
for model_name, tuned_model in best_models.items():
    tuned_pred = tuned_model.predict(X_test)

    scores = {"Accuracy": accuracy_score(y_test, tuned_pred)}
    # Per-class metrics are averaged weighted by class support.
    scores.update(
        (label, scorer(y_test, tuned_pred, average='weighted'))
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    )

    best_results[model_name] = scores

Results

In [9]:
# Print the baseline and tuned metric tables one after the other, one line per model.
for heading, table in (
    ("Initial Model Performance:", results),
    ("\nBest Model Performance after GridSearchCV:", best_results),
):
    print(heading)
    for name, metrics in table.items():
        print(f"{name}: {metrics}")

Initial Model Performance:
Logistic Regression: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
Decision Tree: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
Random Forest: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
SVM: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}

Best Model Performance after GridSearchCV:
Logistic Regression: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
Decision Tree: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
Random Forest: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
SVM: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}


Why is the accuracy so high (1.0)?

1. Simplicity of the Iris Dataset
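- The Iris dataset is small (150 samples) and well-structured; with a 30% hold-out the test set contains only 45 samples, so a perfect score is easy to reach on a favourable split.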
- It contains only 4 features and 3 classes that are largely linearly separable: setosa is fully separable from the other two, while versicolor and virginica overlap only slightly.
- Many models can easily find decision boundaries that perfectly separate the classes.

2. Powerful Models
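- Models like Random Forest and SVM are very flexible and can easily fit (or even overfit) a small dataset like Iris perfectly. Overfitting alone would lower the test score, so the 1.0 test accuracy here reflects how easy this particular split is rather than model power alone.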
- Even Logistic Regression performs well due to the linear separability of the data.