In [1]:
# Import required libraries
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For plotting
import pandas as pd  # For data manipulation and analysis

In [2]:
# Load the Iris dataset from GitHub repository
# This dataset contains measurements of iris flowers including sepal length, sepal width, petal length, and petal width
# The target variable is the species of iris (setosa, versicolor, or virginica)
dataset = pd.read_csv('https://raw.githubusercontent.com/mk-gurucharan/Classification/master/IrisDataset.csv')

In [None]:
# Step 3: Data Exploration and Analysis
# This step involves examining the dataset's statistical properties and dimensions
# - describe(): Generates descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max)
# - shape: Shows the number of rows and columns in the dataset
# - head(): Displays the first few rows to understand the data structure
dataset.describe()

In [None]:
# Display the first 5 rows of the dataset to understand its structure
# This shows the features (sepal length, sepal width, petal length, petal width) and target variable (species)
dataset.head()

In [None]:
# Get the dimensions of the dataset
# Returns a tuple containing (number of rows, number of columns)
# This helps understand the size and structure of our dataset
dataset.shape

In [None]:
# Extract features (X) from the dataset
# Using iloc to select all rows (:) and first 4 columns (:4)
# This selects the numerical features: sepal length, sepal width, petal length, and petal width
# .values converts the pandas DataFrame to a numpy array for machine learning
X = dataset.iloc[:,:4].values
X

In [None]:
# Extract target variable (y) from the dataset
# Using the 'species' column which contains the iris species labels
# .values converts the pandas Series to a numpy array for machine learning
y = dataset['species'].values
y


In [8]:
# Import train_test_split from scikit-learn's model_selection module
# This function splits arrays or matrices into random train and test subsets
# - Useful for evaluating model performance on unseen data
# - Returns X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
# Import StandardScaler from scikit-learn's preprocessing module
# StandardScaler standardizes features by removing the mean and scaling to unit variance
# - Centers the data by removing the mean of each feature
# - Scales the data to unit variance (standard deviation = 1)
# - This helps improve model performance by normalizing the feature scales
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train

In [None]:
# Display the standardized test features (X_test)
# This shows the scaled test data after applying StandardScaler
# Each row represents a test sample with 4 standardized features
# Values are centered around 0 with unit variance
X_test

In [None]:
# 1--Simple Naïve Bayes Classification
# Naïve Bayes is a probabilistic classifier based on Bayes’ Theorem
# with a strong (naïve) assumption that features are independent 
# given the class.Import GaussianNB from scikit-learn's 
# naive_bayes module
# Bayes' Theorem describes the probability of an event based on prior 
# knowledge of conditions related to the event.
# GaussianNB implements the Gaussian Naive Bayes algorithm for classification
# - Assumes features follow a normal distribution
# - Simple and efficient for small to medium-sized datasets
# - Works well with continuous features
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
# Use the trained classifier to make predictions on the test data
# - classifier.predict() returns predicted class labels for X_test
# - Each prediction corresponds to a sample in X_test
# - Returns numpy array of predicted class labels
y_pred = classifier.predict(X_test) 
y_pred


In [None]:
# 2--Import confusion_matrix from scikit-learn's metrics module
# confusion_matrix computes the confusion matrix to evaluate classification accuracy
# - Shows true positives, false positives, true negatives, and false negatives
# - Helps visualize model performance and identify misclassifications
# - Returns a 2D array where rows represent actual classes and columns represent predicted classes
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Import accuracy_score from scikit-learn's metrics module
# accuracy_score calculates the accuracy of the classifier's predictions
# - Compares predicted labels (y_pred) with true labels (y_test)
# - Returns a float between 0 and 1 representing the proportion of correct predictions
# - 1.0 means perfect accuracy, 0.0 means all predictions were wrong
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test, y_pred))
Accuracy :  1.0

In [None]:
# Create a DataFrame to compare actual vs predicted values
# - Uses pandas DataFrame to organize the results
# - 'Real Values' column contains the true labels from y_test
# - 'Predicted Values' column contains the model's predictions from y_pred
# - Makes it easy to visually inspect model performance
# - Helps identify where predictions match or differ from actual values
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred})
df

In [None]:
# 2---Import necessary libraries for classification metrics
# - classification_report provides detailed metrics for each class
# - numpy is used for array operations and unique value extraction
from sklearn.metrics import classification_report
import numpy as np

# Compute confusion matrix
# - Creates a 2D array showing true vs predicted class distributions
# - Helps visualize model's performance across different classes
cm = confusion_matrix(y_test, y_pred)

# Calculate accuracy and error rate
# - accuracy_score computes proportion of correct predictions
# - error_rate is complement of accuracy (1 - accuracy)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
print(f"Accuracy: {accuracy:.4f}")
print(f"Error Rate: {error_rate:.4f}")

# Print classification report
# - Provides precision, recall, F1-score for each class
# - Includes macro and weighted averages
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Compute and display confusion matrix breakdown
# - Extracts unique class labels from test and prediction data
# - For each class, calculates:
#   * True Positives (TP): Correctly predicted instances
#   * False Positives (FP): Incorrectly predicted as this class
#   * False Negatives (FN): This class incorrectly predicted as others
#   * True Negatives (TN): Correctly predicted as not this class
print("Confusion Matrix Breakdown:")
labels = np.unique(np.concatenate((y_test, y_pred)))
for i, label in enumerate(labels):
    TP = cm[i, i]
    FP = cm[:, i].sum() - TP
    FN = cm[i, :].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    
    print(f"\nClass: {label}")
    print(f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")



Accuracy: 0.9333
Error Rate: 0.0667

Classification Report:

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       0.80      1.00      0.89         8
   virginica       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30

Confusion Matrix Breakdown:

Class: setosa
TP: 11, FP: 0, FN: 0, TN: 19

Class: versicolor
TP: 8, FP: 2, FN: 0, TN: 20

Class: virginica
TP: 9, FP: 0, FN: 2, TN: 19
