In [4]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset
# Columns used by this notebook:
#   'sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety'
#
# The CSV location can be overridden with the IRIS_CSV environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original local path is used, so existing runs behave exactly as before.
DEFAULT_IRIS_CSV = r"C:\Users\waghb\OneDrive\Desktop\dsbdal\DSBDALExam DataSets\Iris\Iris.csv"
iris_csv_path = os.environ.get("IRIS_CSV", DEFAULT_IRIS_CSV)

df = pd.read_csv(iris_csv_path)
df.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [6]:


# e. Data cleaning: Remove NA, ?, and negative values
numeric_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# Replace '?' with NaN (returns a new frame instead of mutating in place, so
# re-running this cell always starts from whatever `df` currently is).
df = df.replace('?', np.nan)

# A column that contained '?' was parsed as text, and replacing the '?' cells
# does not change its dtype. Coerce the measurement columns to numbers so the
# sign check below compares numbers rather than strings; anything that still
# cannot be parsed becomes NaN and is dropped together with the other NaNs.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values
df = df.dropna()

# Remove negative values (e.g., in numeric columns).
# .copy() makes the result an independent frame, so later column assignments
# do not write into a slice of the pre-filter data.
df = df[(df[numeric_cols] >= 0).all(axis=1)].copy()

# f. Error correcting: Outlier detection and removal using IQR
# Columns are processed one after another: each column's quartiles are taken
# from the rows that survived the filters of the columns before it.
for col in numeric_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    # Tukey fences: keep values within 1.5 * IQR of the quartiles (inclusive).
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df = df[df[col].between(lower_bound, upper_bound)]

# g. Data transformation: Normalize numerical features and encode target
# Work on an explicit copy: `df` may be a filtered slice of an earlier frame,
# and assigning columns into such a slice triggers SettingWithCopyWarning.
df = df.copy()

# Encode the species labels
le = LabelEncoder()
df['variety'] = le.fit_transform(df['variety'])

# Decide which rows are train / test BEFORE scaling. The split depends only on
# the number of rows and random_state, so these are the same rows that
# train_test_split(X, y, test_size=0.2, random_state=42) would select.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, so the mean/std of the held-out
# rows never influence the features the models are trained on (no leakage).
# The fitted scaler is then applied to every row.
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Prepare features (X) and target (y)
X = df[numeric_cols]
y = df['variety']

# Split the data into training and testing sets
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# h. Build models and compare accuracy
# Both classifiers are fitted on the training split and scored on the
# held-out split with plain accuracy.

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

# Naive Bayes
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

# Print results: one line per model, then the signed gap between them
for label, value in (
    ("Logistic Regression Accuracy:", logreg_accuracy),
    ("Naive Bayes Accuracy:", nb_accuracy),
    ("Difference (Logistic Regression - Naive Bayes):", logreg_accuracy - nb_accuracy),
):
    print(label, value)

Logistic Regression Accuracy: 1.0
Naive Bayes Accuracy: 1.0
Difference (Logistic Regression - Naive Bayes): 0.0
