In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [8]:
# Load the raw Iris table from disk.
data = pd.read_csv("data/Iris.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must be called bare: wrapping it in print() emits a stray "None" line.
data.info()

# "Id" is presumably just a row counter with no predictive value; drop it so it
# cannot leak into the feature matrix built later.
data = data.drop(columns=["Id"])

# Tukey's rule on sepal width: anything further than 1.5 * IQR outside the
# quartiles is treated as an outlier.
sepal_width = data["SepalWidthCm"]

Q1 = sepal_width.quantile(0.25)
Q3 = sepal_width.quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Report the rows that fall outside the fences before discarding them.
outliers = data[(sepal_width < lower_bound) | (sepal_width > upper_bound)]
print(f"Outliers: {outliers}")

# Keep only rows inside the (inclusive) fences; .copy() detaches the result
# from the original frame so later assignments do not warn about views.
data = data[(sepal_width >= lower_bound) & (sepal_width <= upper_bound)].copy()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None
Outliers:     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm          Species
15            5.7           4.4            1.5           0.4      Iris-setosa
32            5.2           4.1            1.5           0.1      Iris-setosa
33            5.5           4.2            1.4           0.2      Iris-setosa
60            5.0           2.0            3.5           1.0  Iris-versicolor


(146, 5)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeRegressor


In [11]:
# Features are every column except the label; drop() already returns a new
# frame, so `data` itself is left untouched.
X = data.drop(columns=["Species"])
y = data["Species"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)

# Echo each partition, separated by a blank line, in the order
# train features, train labels, test features, test labels.
partitions = (
    ("X train", X_train),
    ("y train", y_train),
    ("X test", X_test),
    ("y test", y_test),
)
print(*(f"{label} {part}" for label, part in partitions), sep="\n\n")

X train      SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
131            7.9           3.8            6.4           2.0
135            7.7           3.0            6.1           2.3
101            5.8           2.7            5.1           1.9
88             5.6           3.0            4.1           1.3
6              4.6           3.4            1.4           0.3
..             ...           ...            ...           ...
129            7.2           3.0            5.8           1.6
59             5.2           2.7            3.9           1.4
149            5.9           3.0            5.1           1.8
96             5.7           2.9            4.2           1.3
130            7.4           2.8            6.1           1.9

[116 rows x 4 columns]
y train 131     Iris-virginica
135     Iris-virginica
101     Iris-virginica
88     Iris-versicolor
6          Iris-setosa
            ...       
129     Iris-virginica
59     Iris-versicolor
149     Iris-virginica
96     Ir

In [14]:
# Build the classifier and train it in one expression: fit() returns the
# fitted estimator, so `model` ends up bound to the trained instance.
# max_iter=1000 presumably gives the solver headroom to converge — TODO confirm.
model = LogisticRegression(random_state=9, max_iter=1000).fit(X_train, y_train)

# Predicted species label for every held-out test row.
y_pred = model.predict(X_test)

In [20]:
# Fraction of test rows whose predicted species equals the true species.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [21]:
# Show the species label the model predicted for each row of the test set.
print(f"yPred: {y_pred}")

yPred: ['Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa']


In [22]:
# Show the true species labels of the test set for comparison with the predictions.
print(f"yTest: {y_test}")

yTest: 26         Iris-setosa
137     Iris-virginica
102     Iris-virginica
28         Iris-setosa
148     Iris-virginica
145     Iris-virginica
48         Iris-setosa
109     Iris-virginica
34         Iris-setosa
74     Iris-versicolor
27         Iris-setosa
98     Iris-versicolor
29         Iris-setosa
116     Iris-virginica
115     Iris-virginica
126     Iris-virginica
40         Iris-setosa
21         Iris-setosa
118     Iris-virginica
61     Iris-versicolor
89     Iris-versicolor
3          Iris-setosa
92     Iris-versicolor
71     Iris-versicolor
147     Iris-virginica
107     Iris-virginica
42         Iris-setosa
7          Iris-setosa
18         Iris-setosa
8          Iris-setosa
Name: Species, dtype: object


In [23]:
# 5-fold cross-validation over the full filtered data set (X, y): a less
# split-dependent estimate than the single hold-out accuracy.
# cross_val_score comes from the notebook's sklearn import cell.
scores = cross_val_score(model, X, y, cv=5)

# One score per fold, followed by their mean.
print(f"Cross-Validation Scores: {scores}")
print(f"Average Cross-Validation Score: {scores.mean()}")

Cross-Validation Scores: [0.96666667 1.         0.93103448 0.96551724 1.        ]
Average Cross-Validation Score: 0.9726436781609195


In [26]:
# Per-class precision, recall and F1-score on the held-out test split.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix: rows are the true classes, columns the predicted classes,
# so off-diagonal entries count misclassifications.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        13
Iris-versicolor       1.00      1.00      1.00         6
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
 [[13  0  0]
 [ 0  6  0]
 [ 0  0 11]]
