In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw Iris file. header=None tells pandas the file has no header row,
# so the first line is kept as data and the columns are numbered 0..4.
df = pd.read_csv(
    'Iris.csv',
    header=None,
)
df.head(5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Replace the integer column labels with descriptive names:
# four measurements (in cm) followed by the species label.
df.columns = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
    'Species',
]
df.head(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# List the distinct class labels present in the Species column.
df.loc[:, 'Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

## Data cleaning (remove NA, '?', negative values, etc.)

In [5]:
# Clean the data: drop rows whose measurements are '?' placeholders, missing,
# unparseable, or negative.
measurement_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

df = df.replace('?', np.nan)  # Replace '?' values with NaN
# A column that contained '?' is not numeric after the replace; coerce the
# measurements to numbers so later comparisons work. Anything that cannot be
# parsed becomes NaN and is removed by the dropna() below.
df[measurement_cols] = df[measurement_cols].apply(pd.to_numeric, errors='coerce')
df = df.dropna()  # Remove rows with missing values
# Lengths and widths cannot be negative, so discard rows with any negative measurement.
df = df[(df[measurement_cols] >= 0).all(axis=1)]

In [6]:
# Preview the first five rows after cleaning.
df.head(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Data transformation

In [7]:
# Encode the species names as integer class ids so the target is numeric.
# Series.map with a dict turns any label missing from the dict into NaN.
species_to_code = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}
df['Species'] = df['Species'].map(species_to_code)

## Error correction (outlier detection and removal)

In [8]:
# Remove outliers with the 1.5 * IQR rule, applied to the four measurement
# columns only. The encoded 'Species' column holds class ids, not measurements,
# so it must not influence which rows are kept.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
features = df[feature_cols]

Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR outside the
# interquartile range of its own column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((features < lower_fence) | (features > upper_fence)).any(axis=1)

# Keep only the rows with no outlying measurement in any column.
df = df[~is_outlier]

## Build data models using logistic regression and Naïve Bayes, and compare their accuracy for Iris species prediction

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [10]:
# Separate the four measurements (features) from the encoded species (target),
# then hold out 40% of the rows for testing. random_state fixes the shuffle so
# the split is reproducible.
feature_names = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X, y = df[feature_names], df['Species']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [11]:
# Fit a logistic regression classifier, with scikit-learn's default settings,
# on the training split. fit() returns the estimator, which the cell displays.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression()

In [12]:
# Predict species class ids for the held-out test rows with the logistic regression model.
y_pred_logreg = logreg.predict(X=X_test)

In [13]:
# Fraction of test rows the logistic regression model classified correctly.
accuracy_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
accuracy_logreg

0.9491525423728814

In [14]:
# Fit a Gaussian Naïve Bayes classifier on the same training split.
# fit() returns the estimator, which the cell displays.
naive_bayes = GaussianNB()
naive_bayes.fit(X=X_train, y=y_train)

GaussianNB()

In [15]:
# Predict species class ids for the held-out test rows with the Naïve Bayes model.
y_pred_nb = naive_bayes.predict(X=X_test)

In [16]:
# Fraction of test rows the Naïve Bayes model classified correctly.
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
accuracy_nb

0.9661016949152542

In [17]:
# Print both test-set accuracies, one per line, for a direct comparison.
for label, score in (
    ("Accuracy - Logistic Regression:", accuracy_logreg),
    ("Accuracy - Naïve Bayes:", accuracy_nb),
):
    print(label, score)

Accuracy - Logistic Regression: 0.9491525423728814
Accuracy - Naïve Bayes: 0.9661016949152542
