# Iris end-to-end project : Classification

## Environment setup

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

## Exploratory Data Analysis (EDA)

In [2]:
# Fixed seed so the run is repeatable
rs = 147

# Load Iris as pandas objects, then join the feature columns with the target
# column and give every column a short snake_case name in one expression.
iris = load_iris(as_frame=True)
cols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "iris_type",
]
df = pd.concat([iris.data, iris.target], axis=1).set_axis(cols, axis=1)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


*TODO: Check feature histograms and correlations (Seaborn pair plot)*

## Data preparation

In [3]:
# The last column holds the label; every column before it is a feature.
*feature_cols, target_col = df.columns
X = df[feature_cols]
y = df[target_col]

# Hold out 20% of the rows, stratified on the label, using the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=rs,
)

print(f"Train set shape - X : {X_train.shape}, y : {y_train.shape}")
print(f"Test  set shape - X : {X_test.shape}, y : {y_test.shape}")

Train set shape - X : (120, 4), y : (120,)
Test  set shape - X : (30, 4), y : (30,)


In [4]:
# Fit the scaler on the training split only, then transform that same split.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled array back into a DataFrame that keeps the original
# column names and row index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_train_scaled.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
13,-1.868877,-0.150756,-1.504066,-1.442948
3,-1.508322,0.075378,-1.27718,-1.310668
26,-1.027582,0.753778,-1.220459,-1.046109
8,-1.748692,-0.376889,-1.333902,-1.310668
102,1.496303,-0.150756,1.218568,1.20264


In [5]:
# Transform the test features with the scaler that was fitted on the training split.
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
df_test = pd.concat([X_test_scaled, y_test], axis=1).round(6)

# Persist the prepared test set: the "Model training" section loads
# ../data/prepared/test.csv, so the file has to exist after a fresh
# Restart & Run All. The index is not written because the training section
# selects features by position (every column except the last).
prepared_dir = Path("../data/prepared")
prepared_dir.mkdir(parents=True, exist_ok=True)
df_test.to_csv(prepared_dir / "test.csv", index=False)

df_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
83,0.174268,-0.829156,0.764795,0.541243,1
101,-0.066102,-0.829156,0.764795,0.938081,2
55,-0.186287,-0.603023,0.424466,0.144405,1
71,0.294453,-0.603023,0.140859,0.144405,1
105,2.097228,-0.150756,1.615619,1.20264,2


In [6]:
# Scaled training features plus the label, rounded to 6 decimals.
df_train = pd.concat([X_train_scaled, y_train], axis=1).round(6)

# Persist the prepared training set: the "Model training" section loads
# ../data/prepared/train.csv, so the file has to exist after a fresh
# Restart & Run All. The index is not written because the training section
# selects features by position (every column except the last).
prepared_dir = Path("../data/prepared")
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / "train.csv", index=False)

df_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
13,-1.868877,-0.150756,-1.504066,-1.442948,0
3,-1.508322,0.075378,-1.27718,-1.310668,0
26,-1.027582,0.753778,-1.220459,-1.046109,0
8,-1.748692,-0.376889,-1.333902,-1.310668,0
102,1.496303,-0.150756,1.218568,1.20264,2


In [7]:
# Unscaled test features next to their labels, for inspection only.
# Kept under its own name so it does not overwrite the scaled `df_test`.
df_test_raw = pd.concat([X_test, y_test], axis=1)
df_test_raw.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
83,6.0,2.7,5.1,1.6,1
101,5.8,2.7,5.1,1.9,2
55,5.7,2.8,4.5,1.3,1
71,6.1,2.8,4.0,1.3,1
105,7.6,3.0,6.6,2.1,2


## Model training

In [8]:
# Reload the prepared training set from disk.
train_csv = "../data/prepared/train.csv"
df_train = pd.read_csv(train_csv)
df_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
0,-1.868877,-0.150756,-1.504066,-1.442948,0
1,-1.508322,0.075378,-1.27718,-1.310668,0
2,-1.027582,0.753778,-1.220459,-1.046109,0
3,-1.748692,-0.376889,-1.333902,-1.310668,0
4,1.496303,-0.150756,1.218568,1.20264,2


In [9]:
# Reload the prepared test set from disk.
test_csv = "../data/prepared/test.csv"
df_test = pd.read_csv(test_csv)
df_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_type
0,0.174268,-0.829156,0.764795,0.541243,1
1,-0.066102,-0.829156,0.764795,0.938081,2
2,-0.186287,-0.603023,0.424466,0.144405,1
3,0.294453,-0.603023,0.140859,0.144405,1
4,2.097228,-0.150756,1.615619,1.20264,2


In [10]:
# In both prepared frames the last column is the label and the rest are features.
train_label = df_train.columns[-1]
y_train = df_train.iloc[:, -1]
X_train = df_train.drop(columns=train_label)

test_label = df_test.columns[-1]
y_test = df_test.iloc[:, -1]
X_test = df_test.drop(columns=test_label)

print(f"Train set shapes - X : {X_train.shape} , y : {y_train.shape}")
print(f"Test  set shapes - X : {X_test.shape}  , y : {y_test.shape}")

Train set shapes - X : (120, 4) , y : (120,)
Test  set shapes - X : (30, 4)  , y : (30,)


In [11]:
# Seed for the solver, so repeated fits give the same model
rs = 147

# Baseline: logistic regression with the "saga" solver, fitted on the
# training features and labels. `fit` returns the fitted estimator.
lr = LogisticRegression(random_state=rs, solver="saga")
trained = lr.fit(X=X_train, y=y_train)

# TODO: Add tuning logic to get a better model

# TODO: Integrate with MLFlow and log params, model and metrics

## Model evaluation

In [12]:
# Predict on the test features, then score against the true labels:
# plain accuracy and macro-averaged F1.
y_pred = trained.predict(X_test)
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="macro"),
)

print(f"Baseline accuracy (default LR) : {accuracy:.4f}")
print(f"Baseline F1 score (default LR) : {f1:.4f}")

Baseline accuracy (default LR) : 0.9333
Baseline F1 score (default LR) : 0.9333
