In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Iris.csv',header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
df.columns = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

## Data cleaning(Remove NA, ?, Negative values etc.)

In [5]:
df.shape

(150, 5)

In [6]:
df[df == '?'].sum()

SepalLengthCm    0.0
SepalWidthCm     0.0
PetalLengthCm    0.0
PetalWidthCm     0.0
Species            0
dtype: object

In [7]:
df.isnull().sum().sum()

0

In [8]:
df = df.replace('?', np.nan)  # Replace '?' values with NaN
df = df.dropna()  # Remove rows with missing values

In [9]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Data transformation

In [10]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [11]:
le = LabelEncoder()
le.fit(df['Species'])
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [12]:
df['Species'] = le.transform(df['Species'])

In [13]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [14]:
# Manual Way
# Convert the target variable to numerical values
# df['Species'] = df['Species'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

## Error correcting(Outlier detection and removal)

In [15]:
df.shape

(150, 5)

In [16]:
# The Z-score, also known as the standard score, is a statistical measure that quantifies 
# how many standard deviations a data point is away from the mean of a distribution. 
# It is a way to standardize and compare values across different distributions.

def remove_outliers_zscore(data, threshold=3):
    z_scores = np.abs((data - data.mean()) / data.std())  # Calculate Z-scores
    
    out = z_scores > 3
    
    data = data[~out.any(axis=1)]
    
    return data
    
filtered_data = remove_outliers_zscore(df)

In [17]:
filtered_data.shape

(149, 5)

## Build Data model using regression and Naïve Bayes methods and compare accuracy of Iris Species Prediction.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [19]:
# Split the data into training and testing sets
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = df['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Build a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [21]:
# Make predictions on the test set using the logistic regression model
y_pred_logreg = logreg.predict(X_test)

In [22]:
# Calculate accuracy for logistic regression model
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

accuracy_logreg

1.0

In [23]:
# Build a Naïve Bayes model
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

GaussianNB()

In [24]:
# Make predictions on the test set using the Naïve Bayes model
y_pred_nb = naive_bayes.predict(X_test)

In [25]:
# Calculate accuracy for Naïve Bayes model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
accuracy_nb

0.9777777777777777

In [26]:
# Compare the accuracies of the two models
print("Accuracy - Logistic Regression:", accuracy_logreg)
print("Accuracy - Naïve Bayes:", accuracy_nb)

Accuracy - Logistic Regression: 1.0
Accuracy - Naïve Bayes: 0.9777777777777777
