In [57]:
import pandas as pd
from sklearn import preprocessing

In [58]:
# Location of the raw UCI Iris data file.
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Measurement columns, listed in the order they are assigned to the file's columns.
FEATURES = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Target column and the class names it takes.
LABEL = "species"
CLASSES = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]

# Full header: every feature followed by the label.
COLUMNS = [*FEATURES, LABEL]
COLUMNS

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [59]:
# Read the raw file straight from the URL. The first line of the file is
# already a data row, so the column names are supplied from COLUMNS.
data = pd.read_csv(DATASET_URL, header=None, names=COLUMNS)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [60]:
# Data Types Before
# Column dtypes exactly as loaded, before any casting is applied.
data.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [61]:
# Float64 -> Float32 for the features, object -> category for the label.
# A single astype call with a column -> dtype mapping converts everything at
# once instead of reassigning the columns one by one.
data = data.astype({**dict.fromkeys(FEATURES, "float32"), LABEL: "category"})

In [62]:
# Data Types After
# Column dtypes once the float32 / category casts have been applied.
data.dtypes

sepal_length     float32
sepal_width      float32
petal_length     float32
petal_width      float32
species         category
dtype: object

In [63]:
# Pre-preprocessing Stats
# Summary statistics of the numeric columns before scaling and label encoding.
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [64]:
# Missing values
# Number of NaN entries in each column.
data.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [65]:
# Class Balance
# Number of rows for each distinct value of the label column.
data[LABEL].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: species, dtype: int64

In [66]:
# Normalise Features
# Min-max scale every feature column to the [0, 1] range. MinMaxScaler scales
# each column independently, so one scaler fitted on all features produces the
# same values as a separate scaler per column, and the single fitted object
# remains usable afterwards (e.g. inverse_transform, or scaling new samples).
# Use preprocessing.StandardScaler() here instead for zero-mean,
# unit-variance scaling.
min_max_scaler = preprocessing.MinMaxScaler()
data[FEATURES] = min_max_scaler.fit_transform(data[FEATURES])

data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,Iris-setosa
1,0.166667,0.416667,0.067797,0.041667,Iris-setosa
2,0.111111,0.5,0.050847,0.041667,Iris-setosa
3,0.083333,0.458333,0.084746,0.041667,Iris-setosa
4,0.194444,0.666667,0.067797,0.041667,Iris-setosa


In [67]:
# Label Encode
# Learn the class -> integer mapping from the label column first, then
# overwrite that column with the resulting integer codes.
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(data[LABEL])
data[LABEL] = labelencoder.transform(data[LABEL])
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0


In [68]:
# Sanity check: number of distinct values in the encoded label column.
data[LABEL].nunique()

3

In [69]:
# Post-preprocessing Stats
# Summary statistics after feature scaling and label encoding.
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150.0
mean,0.428704,0.439167,0.467571,0.457778,1.0
std,0.230018,0.180664,0.299054,0.317984,0.819232
min,0.0,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333,0.0
50%,0.416667,0.416667,0.567797,0.5,1.0
75%,0.583333,0.541667,0.694915,0.708333,2.0
max,1.0,1.0,1.0,1.0,2.0


In [70]:
# Write the preprocessed frame to iris.csv, leaving out the row index column.
data.to_csv(path_or_buf="iris.csv", index=False)