# Iris Dataset - Exploratory Data Analysis

This notebook performs exploratory data analysis on the Iris dataset for our MLOps pipeline.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set style
# Applies the 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl"
# colour palette globally, so every figure created below inherits them.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a fairly
# recent matplotlib release - confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Suppress warnings
# The 'ignore' action is installed with no category or module restriction, so
# it silences every warning raised after this point - including deprecation
# notices from the plotting and ML libraries. Narrow the filter when debugging.
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Explore the Dataset

In [None]:
# Fetch the bundled Iris data and wrap it in a DataFrame with one column per
# measurement, the integer class code ('target') and a readable 'species' label.
iris = load_iris()

species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)
df['species'] = df['target'].map(species_by_code)

# Quick orientation: overall shape plus the names exposed by the loader.
print("Dataset shape:", df.shape)
print("\nFeature names:", iris.feature_names)
print("\nTarget names:", iris.target_names)

# Last expression of the cell: the notebook renders the first five rows.
df.head()

In [None]:
# Basic information about the dataset: column dtypes / non-null counts,
# followed by summary statistics for the numeric columns.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - that would emit a stray "None" line.
df.info()
print("\nDataset Description:")
print(df.describe())

In [None]:
# Data-quality checks: number of null entries per column, then how many rows
# belong to each species.
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

species_counts = df['species'].value_counts()
print("\nClass distribution:")
print(species_counts)

## 2. Data Visualization

In [None]:
# Overlaid per-species histograms, one panel per feature on a 2x2 grid.
# `features` is reused by later cells, so it is defined here at top level.
features = iris.feature_names
species_names = df['species'].unique()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Iris Dataset - Feature Distributions by Species', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, so the i-th feature lands on the panel
# at (i // 2, i % 2).
for ax, feature in zip(axes.flat, features):
    for name in species_names:
        values = df.loc[df['species'] == name, feature]
        ax.hist(values, alpha=0.7, label=name, bins=15)

    ax.set_title(feature.replace(' (cm)', '').title(), fontweight='bold')
    ax.set_xlabel('Value (cm)')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# One box plot per feature (2x2 grid), with the species on the x-axis so the
# per-class spread of each measurement can be compared side by side.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Iris Dataset - Feature Box Plots by Species', fontsize=16, fontweight='bold')

# ravel() flattens the grid row by row, matching the (i // 2, i % 2) layout.
for panel, feature in zip(axes.ravel(), features):
    panel_title = feature.replace(' (cm)', '').title()

    sns.boxplot(data=df, x='species', y=feature, ax=panel)
    panel.set_title(panel_title, fontweight='bold')
    panel.set_xlabel('Species')
    panel.set_ylabel('Value (cm)')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the four measurements, drawn as an annotated
# heatmap and then printed as a table.
correlation_matrix = df[features].corr()

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,          # anchor the diverging colormap at zero correlation
    square=True,
    linewidths=0.5,
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

print("Correlation Matrix:")
print(correlation_matrix)

In [None]:
# Pair plot of the four measurements, coloured by species.
# - sns.pairplot builds its own figure, so no plt.figure() call precedes it
#   (a preceding call would only leave an empty extra figure behind).
# - `vars=features` restricts the grid to the measurements; otherwise the
#   numeric 'target' label column would be plotted as if it were a feature.
pair_grid = sns.pairplot(df, vars=features, hue='species', height=2.5)
# Title the pairplot's own figure explicitly instead of relying on whichever
# figure pyplot currently considers active.
pair_grid.figure.suptitle('Iris Dataset - Pairwise Feature Relationships', y=1.02, fontsize=16, fontweight='bold')
plt.show()

## 3. Feature Analysis

In [None]:
# Print describe() statistics of the measurement columns separately for each
# species, rounded to two decimals.
print("Statistical Summary by Species:")
print("=" * 50)

for name in df['species'].unique():
    per_species = df.loc[df['species'] == name, features]
    print(f"\n{name.upper()}:")
    print(per_species.describe().round(2))

In [None]:
# Rank the measurements by their overall variance (largest first), show the
# ranking as a bar chart with value labels, then print it as a numbered list.
feature_variance = df[features].var().sort_values(ascending=False)

bar_positions = range(len(feature_variance))
tick_names = [name.replace(' (cm)', '') for name in feature_variance.index]

plt.figure(figsize=(10, 6))
bars = plt.bar(bar_positions, feature_variance.values)
plt.xlabel('Features')
plt.ylabel('Variance')
plt.title('Feature Variance Analysis', fontweight='bold')
plt.xticks(bar_positions, tick_names, rotation=45)

# Write each bar's height just above its top edge, centred horizontally.
for bar in bars:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    plt.text(bar_center, bar_top + 0.01, f'{bar_top:.2f}', ha='center', va='bottom')

plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Feature Variance Ranking:")
for rank, (name, value) in enumerate(feature_variance.items(), start=1):
    print(f"{rank}. {name}: {value:.4f}")

## 4. Data Preprocessing Preview

In [None]:
# Split the data: 80/20 hold-out, stratified on the class label so both parts
# keep the original class proportions; random_state pins the shuffle.
X = df[features]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Features: {X_train.shape[1]}")

# Check class distribution in splits.
# The class id is read from the Series index (via .items()) rather than from
# the loop position, so each count stays attached to the right label even if
# a class were absent from a split or the labels were not 0..k-1.
print("\nClass distribution in training set:")
train_dist = y_train.value_counts().sort_index()
for class_id, count in train_dist.items():
    print(f"Class {class_id} ({iris.target_names[class_id]}): {count}")

print("\nClass distribution in test set:")
test_dist = y_test.value_counts().sort_index()
for class_id, count in test_dist.items():
    print(f"Class {class_id} ({iris.target_names[class_id]}): {count}")

In [None]:
# Feature scaling demonstration.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compare before and after scaling
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
short_names = [col.replace(' (cm)', '') for col in X_train.columns]

# Tick labels are set through set_xticklabels() instead of boxplot's `labels`
# keyword: that keyword is deprecated in newer Matplotlib releases, and this
# form works on old and new versions alike.

# Before scaling (a 2D array yields one box per column)
axes[0].boxplot(X_train.to_numpy())
axes[0].set_xticklabels(short_names)
axes[0].set_title('Before Scaling', fontweight='bold')
axes[0].set_ylabel('Value')
axes[0].grid(True, alpha=0.3)

# After scaling
axes[1].boxplot(X_train_scaled)
axes[1].set_xticklabels(short_names)
axes[1].set_title('After Scaling', fontweight='bold')
axes[1].set_ylabel('Scaled Value')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# pandas' std() defaults to the sample estimate (ddof=1) while NumPy's std()
# defaults to the population one (ddof=0). ddof=0 is requested explicitly for
# the original data so both "Std" lines use the same definition.
print("Scaling Statistics:")
print(f"Original data - Mean: {X_train.mean().values}")
print(f"Original data - Std: {X_train.std(ddof=0).values}")
print(f"Scaled data - Mean: {X_train_scaled.mean(axis=0)}")
print(f"Scaled data - Std: {X_train_scaled.std(axis=0)}")

## 5. Key Insights

### Dataset Characteristics:
- **Size**: 150 samples, 4 features
- **Classes**: 3 balanced classes (50 samples each)
- **Missing Values**: None
- **Feature Types**: All numerical (continuous)

### Feature Analysis:
- **Petal length** shows the highest variance and best separability
- **Petal width** also shows good class separation
- **Sepal features** have more overlap between classes
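- Most features are positively correlated, especially the petal measurements (petal length vs. petal width); sepal width is the exception and correlates negatively with the other three features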

### Class Separability:
- **Setosa** is clearly separable from other classes
- **Versicolor** and **Virginica** have some overlap
- Petal measurements are most discriminative

### Preprocessing Needs:
- Features share a unit (cm) but span different ranges and variances - standardization recommended
- No missing values to handle
- Balanced classes - no resampling needed

In [None]:
# Persist the outputs of this notebook for the downstream pipeline: the raw
# table as CSV and the scaled train/test splits as .npy arrays.
import os

# Paths are relative, so they resolve against the current working directory.
os.makedirs('../data', exist_ok=True)

# Raw table (measurements + target code + species label), without the index.
df.to_csv('../data/iris_raw.csv', index=False)
print("Raw data saved to '../data/iris_raw.csv'")

# Scaled feature matrices and their label vectors, one .npy file each.
processed_arrays = [
    ('../data/X_train.npy', X_train_scaled),
    ('../data/X_test.npy', X_test_scaled),
    ('../data/y_train.npy', y_train.values),
    ('../data/y_test.npy', y_test.values),
]
for target_path, array in processed_arrays:
    np.save(target_path, array)
print("Processed data saved to '../data/' directory")

print("\nEDA completed successfully!")