In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import os
import joblib

In [2]:
# Read the raw Iris table from disk and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV; it is carried through to the
# processed file as-is. Consider index_col=0 if nothing downstream needs it.
raw_csv_path = '../data/raw/iris.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Check for missing values and, if any exist, impute them.
if df.isnull().any().any():
    # Fill gaps in the numeric columns with that column's median.
    # numeric_only=True is required: 'species' holds strings, and on
    # pandas >= 2.0 a plain df.median() raises TypeError for such frames.
    # Missing values in non-numeric columns are left untouched.
    # Assigning the result (instead of inplace=True) keeps the cell safe to re-run.
    df = df.fillna(df.median(numeric_only=True))

In [4]:
# Standardize the four measurement columns with scikit-learn's StandardScaler.
# The scaler is fitted on every row of df and the scaled values overwrite
# the original columns.
# NOTE(review): the fitted scaler is not saved in the visible cells (unlike
# the label encoder) - persist it if later inference must reuse the same scaling.
features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[features])
df[features] = scaled_values

In [5]:
# Encode the 'species' column as integer class ids using LabelEncoder
encoder = LabelEncoder()
df['species'] = encoder.fit_transform(df['species'])

# Make sure the output directory exists before writing into it; without this
# the dump below fails on a clean checkout, because the directory is otherwise
# only created by the final "save" cell.
os.makedirs('../data/processed', exist_ok=True)

# Save the fitted LabelEncoder so the integer ids can be mapped back to
# species names later. joblib.dump returns the list of written file names,
# which the notebook displays as this cell's output.
joblib.dump(encoder, '../data/processed/label_encoder.joblib')

['../data/processed/label_encoder.joblib']

In [7]:
# Show the fitted encoder, the class names it learned, and the integer id
# each of those class names is mapped to.
class_names = encoder.classes_
print(encoder)
print('Classes:', class_names)
print('Encoded labels:', encoder.transform(class_names))

LabelEncoder()
Classes: ['setosa' 'versicolor' 'virginica']
Encoded labels: [0 1 2]


In [6]:
# Persist the scaled and label-encoded frame as CSV, creating the target
# directory first if it is missing. index=False keeps the pandas row index
# out of the written file.
processed_dir = '../data/processed'
processed_csv = '../data/processed/iris_preprocessed.csv'
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(processed_csv, index=False)