In [4]:
"""
Task 1: Classical Machine Learning with Scikit-learn
Dataset: Iris Species (Kaggle)
Goal:
1. Preprocess data (handle missing values, encode labels)
2. Train Decision Tree Classifier
3. Evaluate using Accuracy, Precision, and Recall
"""

# === Step 1: Import Libraries ===
%pip install kagglehub scikit-learn pandas numpy
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np

# === Step 2: Download Dataset from Kaggle ===
# dataset_download returns the local location of the downloaded "uciml/iris" files.
path = kagglehub.dataset_download("uciml/iris")
print("Path to dataset files:", path)

# The CSV shipped with this dataset is named "Iris.csv" (capital "I").
iris_csv = f"{path}/Iris.csv"
df = pd.read_csv(iris_csv)

# === Step 3: Inspect and Preprocess Data ===
print("\nDataset Preview:")
print(df.head())

# Discard the 'Id' column when the file has one; errors='ignore' makes this a
# no-op when the column is absent, so no explicit membership check is needed.
df = df.drop(columns=['Id'], errors='ignore')

# Check for missing values
print("\nMissing values before preprocessing:")
print(df.isnull().sum())

# Impute missing values (if any).
# At this point `df` still contains the text 'Species' column, so the column
# means must be restricted to numeric columns: a plain df.mean() raises
# TypeError on pandas >= 2.0 when a string column is present.
if df.isnull().values.any():
    # Numeric feature gaps are replaced by the mean of their own column.
    df = df.fillna(df.mean(numeric_only=True))
    # A mean cannot stand in for a missing label, and a row without a species
    # cannot be used for supervised training, so such rows are removed.
    df = df.dropna(subset=['Species'])

# Encode species labels: learn the set of species names first, then replace
# each name in the column with the integer code the encoder assigned to it.
le = LabelEncoder()
le.fit(df['Species'])
df['Species'] = le.transform(df['Species'])

# === Step 4: Split Data ===
# Target is the encoded 'Species' column; features are every remaining column.
y = df['Species']
X = df.drop(columns='Species')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both parts; the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# === Step 5: Train Decision Tree Classifier ===
# fit() returns the fitted estimator itself, so construction and training are
# chained; random_state is fixed so repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# === Step 6: Model Evaluation ===
# Predict on the held-out rows only.
y_pred = clf.predict(X_test)

# Precision and recall share the same settings: per-class scores averaged with
# weights proportional to class support, and 0 reported (rather than a warning)
# when a class has an empty denominator.
weighted_avg = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)

print("\nModel Evaluation Results:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")

# Per-class breakdown, labelled with the original species names kept by the encoder.
print("\nDetailed Classification Report:")
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

# === Step 7: Save Model (optional) ===
import joblib

# The classifier and the label encoder are stored together as one tuple --
# presumably so integer predictions can later be mapped back to species names.
model_bundle = (clf, le)
joblib.dump(model_bundle, "iris_decision_tree_model.joblib")
print("\nModel saved as iris_decision_tree_model.joblib")


Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting pyyaml (from kagglehub)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting tqdm (from kagglehub)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     --------------------- ------------------ 30.7/57.7 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 57.7/57.7 kB 1.0 MB/s eta 0:00:00
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collectin


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/iris?dataset_version_number=2...


100%|██████████| 3.60k/3.60k [00:00<?, ?B/s]

Extracting files...
Path to dataset files: C:\Users\DELL\.cache\kagglehub\datasets\uciml\iris\versions\2






Dataset Preview:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Missing values before preprocessing:
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Model Evaluation Results:
Accuracy : 0.9333
Precision: 0.9333
Recall   : 0.9333

Detailed Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.90      0.90      0.90        10
 Iris-virginica       0.90      0.90      0.90        10

       accuracy               