## Iris Data

In [3]:
import pandas as pd

In [4]:
file_path = 'iris.data'
df = pd.read_csv(file_path)

In [5]:
print("Dataset preview;")
print(df.head())

Dataset preview;
   5.1  3.5  1.4  0.2  Iris-setosa
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa


In [6]:
df.columns = ['sepal_length' , 'sepal_width', 'petal_length', 'petal_width', 'species']

In [7]:
df['Label'] = df['species'].apply(lambda x: 1 if x == 'Iris-setosa' else 0)

df = df.drop(columns=['species'])

# Separate features
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Display 
print("\nProcessed dataset preview:")
print(df.head())


Processed dataset preview:
   sepal_length  sepal_width  petal_length  petal_width  Label
0           4.9          3.0           1.4          0.2      1
1           4.7          3.2           1.3          0.2      1
2           4.6          3.1           1.5          0.2      1
3           5.0          3.6           1.4          0.2      1
4           5.4          3.9           1.7          0.4      1


In [8]:
from sklearn.model_selection import train_test_split

# Separate features and labels
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Create 80/20 split
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42)
print("80/20 Split:")
print(f"Training samples: {len(X_train_80)}, Testing samples: {len(X_test_80)}")

# Create 50/50 split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.5, random_state=42)
print("\n50/50 Split:")
print(f"Training samples: {len(X_train_50)}, Testing samples: {len(X_test_50)}")

# Create 20/80 split
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(X, y, test_size=0.8, random_state=42)
print("\n20/80 Split:")
print(f"Training samples: {len(X_train_20)}, Testing samples: {len(X_test_20)}")

# Organize splits into a dictionary 
splits = {
    "80/20": (X_train_80, X_test_80, y_train_80, y_test_80),
    "50/50": (X_train_50, X_test_50, y_train_50, y_test_50),
    "20/80": (X_train_20, X_test_20, y_train_20, y_test_20)
}

80/20 Split:
Training samples: 119, Testing samples: 30

50/50 Split:
Training samples: 74, Testing samples: 75

20/80 Split:
Training samples: 29, Testing samples: 120


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Initialize classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', C=1, gamma='scale', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}


In [10]:
from sklearn.metrics import accuracy_score
import numpy as np

# Number of trials
num_trials = 3

# Store results
results = {}

for partition_name, (X_train_orig, X_test_orig, y_train_orig, y_test_orig) in splits.items():
    results[partition_name] = {clf_name: [] for clf_name in classifiers.keys()}
    print(f"\nConducting trials for {partition_name} partition:")
    
    for trial in range(num_trials):
        print(f"  Trial {trial + 1}:")
        # Shuffle the dataset with a different random state
        X_train, X_test, y_train, y_test = train_test_split(
            np.vstack((X_train_orig, X_test_orig)),
            np.hstack((y_train_orig, y_test_orig)),
            test_size=len(X_test_orig) / (len(X_train_orig) + len(X_test_orig)),
            random_state=trial
        )
        
        for clf_name, clf in classifiers.items():
            # Train the classifier
            clf.fit(X_train, y_train)
            
            # Test the classifier
            y_pred = clf.predict(X_test)
            
            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            results[partition_name][clf_name].append(accuracy)
            print(f"    {clf_name} - Accuracy: {accuracy:.4f}")



Conducting trials for 80/20 partition:
  Trial 1:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 2:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 3:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000

Conducting trials for 50/50 partition:
  Trial 1:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 2:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 3:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000

Conducting trials for 20/80 partition:
  Trial 1:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 2:
    Random Forest - Accuracy: 1.0000
    SVM - Accuracy: 1.0000
    KNN - Accuracy: 1.0000
  Trial 3:
    Random Forest - Accuracy: 1.0000
    SVM - Accura

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define hyperparameter grids
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    },
    'KNN': {
        'n_neighbors': range(3, 11),
        'weights': ['uniform', 'distance'],
        'p': [1, 2]  # Manhattan (1) and Euclidean (2)
    }
}

In [12]:
# Store results for hyperparameter tuning
best_params = {}
partition_results = {}

# Perform GridSearchCV for each partition and classifier
for partition_name, (X_train, X_test, y_train, y_test) in splits.items():
    print(f"\nHyperparameter tuning for {partition_name} partition:")
    partition_results[partition_name] = {}
    best_params[partition_name] = {}
    
    for clf_name, clf in {'Random Forest': RandomForestClassifier(random_state=42),
                          'SVM': SVC(random_state=42),
                          'KNN': KNeighborsClassifier()}.items():
        
        print(f"\nTuning {clf_name}...")
        
        # Initialize GridSearchCV
        grid_search = GridSearchCV(
            clf,
            param_grid=param_grids[clf_name],
            cv=5,  # 5-fold cross-validation
            scoring='accuracy',
            n_jobs=-1  # Use all processors
        )
        
        # Fit GridSearchCV
        grid_search.fit(X_train, y_train)
        
        # Store the best parameters and corresponding accuracy
        best_params[partition_name][clf_name] = grid_search.best_params_
        partition_results[partition_name][clf_name] = grid_search.best_score_
        
        print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy for {clf_name}: {grid_search.best_score_:.4f}")


Hyperparameter tuning for 80/20 partition:

Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy for Random Forest: 1.0000

Tuning SVM...
Best parameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation accuracy for SVM: 1.0000

Tuning KNN...
Best parameters for KNN: {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best cross-validation accuracy for KNN: 1.0000

Hyperparameter tuning for 50/50 partition:

Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy for Random Forest: 1.0000

Tuning SVM...
Best parameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation accuracy for SVM: 1.0000

Tuning KNN...
Best parameters for KNN: {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best cross-validation accuracy for KNN: 1.0000

Hyperparame