In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [12]:
# Step 1: Read the dataset
try:
    # Try reading as .xls file
    df = pd.read_excel('iris.xls')
except:
    # If the above fails, use the built-in iris dataset
    from sklearn.datasets import load_iris
    iris = load_iris()
    df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                      columns=iris['feature_names'] + ['target'])
    print("Using built-in iris dataset as fallback")

# Display first few rows to understand the structure
print("Dataset Preview:")
print(df.head())
print("\nDataset Shape:", df.shape)

Dataset Preview:
    SL   SW   PL   PW Classification
0  5.1  3.5  1.4  0.2    Iris-setosa
1  4.9  3.0  1.4  0.2    Iris-setosa
2  NaN  3.2  1.3  0.2    Iris-setosa
3  4.6  3.1  1.5  0.2    Iris-setosa
4  5.0  3.6  1.4  0.2    Iris-setosa

Dataset Shape: (150, 5)


In [13]:
# Step 2: Preprocessing
# Check for missing values
print("\nMissing Values:", df.isnull().sum().sum())
print("Missing values:\n", df.isnull().sum())

# Handle target column appropriately
if 'target' in df.columns:
    target_col = 'target'
elif 'Species' in df.columns:
    target_col = 'Species'
elif 'Classification' in df.columns:
    target_col = 'Classification'
else:
    # Assume last column is target
    target_col = df.columns[-1]

X = df.drop(target_col, axis=1)
y = df[target_col]

# Convert target to numeric if it's categorical
if y.dtype == 'object':
    y = pd.factorize(y)[0]

# Split the dataset - do this before imputation to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Missing Values: 19
Missing values:
 SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64


In [14]:
# Step 3: Create pipelines with imputation for each model
models = {
    'Logistic Regression': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=200))
    ]),
    'Decision Tree': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', DecisionTreeClassifier())
    ]),
    'K-Nearest Neighbors': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ]),
    'Naive Bayes': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', GaussianNB())
    ]),
    'Random Forest': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', RandomForestClassifier())
    ]),
    'Support Vector Machine': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', SVC())
    ]),
    'Hist Gradient Boosting': HistGradientBoostingClassifier()  # Handles NaN natively
}

# Dictionary to store results
results = {}

print("\nModel Evaluation:")
print("-" * 50)

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5)
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'CV Average': np.mean(cv_scores),
        'CV Std': np.std(cv_scores)
    }
    
    # Print results
    print(f"{name}:")
    print(f"  Test Accuracy: {accuracy:.4f}")
    print(f"  Cross-Val Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print("-" * 50)

# Step 4: Find the best model
best_model = max(results, key=lambda x: results[x]['CV Average'])
print("\nBest Model:", best_model)
print(f"Cross-Val Accuracy: {results[best_model]['CV Average']:.4f}")
print(f"Test Accuracy: {results[best_model]['Accuracy']:.4f}")

# Generate detailed report for the best model
best_clf = models[best_model]
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

print("\nDetailed Classification Report for Best Model:")
print(classification_report(y_test, y_pred))


Model Evaluation:
--------------------------------------------------
Logistic Regression:
  Test Accuracy: 0.9667
  Cross-Val Accuracy: 0.9533 ± 0.0340
--------------------------------------------------
Decision Tree:
  Test Accuracy: 1.0000
  Cross-Val Accuracy: 0.9467 ± 0.0400
--------------------------------------------------
K-Nearest Neighbors:
  Test Accuracy: 0.9667
  Cross-Val Accuracy: 0.9733 ± 0.0533
--------------------------------------------------
Naive Bayes:
  Test Accuracy: 1.0000
  Cross-Val Accuracy: 0.9533 ± 0.0267
--------------------------------------------------
Random Forest:
  Test Accuracy: 1.0000
  Cross-Val Accuracy: 0.9600 ± 0.0249
--------------------------------------------------
Support Vector Machine:
  Test Accuracy: 0.9667
  Cross-Val Accuracy: 0.9667 ± 0.0211
--------------------------------------------------


[WinError 2] The system cannot find the file specified
  File "c:\Users\ajayv\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ajayv\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ajayv\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ajayv\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Hist Gradient Boosting:
  Test Accuracy: 0.9333
  Cross-Val Accuracy: 0.9600 ± 0.0327
--------------------------------------------------

Best Model: K-Nearest Neighbors
Cross-Val Accuracy: 0.9733
Test Accuracy: 0.9667

Detailed Classification Report for Best Model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30

