# Iris Dataset Exploratory Data Analysis (EDA)

### Author: Christopher Braun

Purpose: Exploratory Data Analysis of the Iris dataset using pandas and visualization tools

Date: September 2025

### **1. Imports**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Shared look for every figure in this notebook: seaborn's "whitegrid" style
# and a 10x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

### **2. Load Data**

In [None]:
# Read seaborn's 'iris' dataset into a pandas DataFrame
iris_df: pd.DataFrame = sns.load_dataset('iris')

# Show the available column names (label and list on separate lines),
# followed by blank lines as a visual separator
print("Column names:", iris_df.columns.tolist(), sep="\n")
print("\n")

# Preview the top of the table; head() is the cell's final expression,
# so the notebook renders its result
print("First 5 rows:")
iris_df.head()

### **3. Initial Data Inspection**

In [None]:
# Preview a specific number of rows.
# The preview must be printed explicitly: a bare expression that is not the
# last statement of a cell is evaluated and discarded, so the label would
# otherwise be followed by no output at all.
print("First 10 rows:")
print(iris_df.head(10))

# Inspect the shape of the DataFrame with the shape attribute: (rows, columns)
print(f"DataFrame shape: {iris_df.shape}")
print(f"Number of rows: {iris_df.shape[0]}")
print(f"Number of columns: {iris_df.shape[1]}")

# Inspect the data type of each column with the dtypes attribute
print("Data types:")
print(iris_df.dtypes)

# Inspect the columns again with the info() method, which prints its own report
iris_df.info()

### **4. Initial Descriptive Statistics**

In [None]:
# Summary statistics for the numerical columns; the table is left as the
# cell's final expression so the notebook renders it
summary_stats = iris_df.describe()
summary_stats

### **5. Initial Data Distribution for Numerical Columns**

In [None]:
# Histogram of a single numerical column, labelled through the Axes that
# hist() hands back
sepal_length_ax = iris_df['sepal_length'].hist()
sepal_length_ax.set(
    title='Distribution of Sepal Length',
    xlabel='Sepal Length (cm)',
    ylabel='Frequency',
)
plt.show()

# Histograms for ALL numerical columns, one subplot per column
iris_df.hist()
all_features_fig = plt.gcf()  # the figure hist() has just drawn on
all_features_fig.suptitle('Distribution of All Numerical Features', y=1.02)
all_features_fig.tight_layout()
plt.show()

### **6. Initial Data Distribution for Categorical Columns**

In [None]:
# Tally the rows per species (label and counts on separate lines),
# followed by blank lines as a visual separator
print("Species value counts:", iris_df['species'].value_counts(), sep="\n")
print("\n")

# Bar chart of the same per-species counts
species_ax = sns.countplot(data=iris_df, x='species')
species_ax.set(
    title='Distribution of Iris Species',
    xlabel='Species',
    ylabel='Count',
)
plt.show()

### **7. Initial Data Transformation and Feature Engineering**

In [None]:
# Build the analysis table as a single method chain. rename() and assign()
# each return a new DataFrame, so the original iris_df is preserved.
iris_analysis_df = (
    iris_df
    # Add the unit of measurement to each column name
    .rename(columns={
        'sepal_length': 'sepal_length_cm',
        'sepal_width': 'sepal_width_cm',
        'petal_length': 'petal_length_cm',
        'petal_width': 'petal_width_cm',
    })
    # Engineered features: length x width products for sepal and petal, and
    # the ratio of sepal length to petal length
    .assign(
        sepal_area_cm2=lambda df: df['sepal_length_cm'] * df['sepal_width_cm'],
        petal_area_cm2=lambda df: df['petal_length_cm'] * df['petal_width_cm'],
        sepal_to_petal_ratio=lambda df: df['sepal_length_cm'] / df['petal_length_cm'],
    )
)

# Preview the enhanced dataset; head() is the cell's final expression, so the
# notebook renders its result
print("Enhanced dataset with new features:")
iris_analysis_df.head()

### **8. Initial Visualizations**

In [None]:
# Grid of pairwise plots over the analysis table, coloured by species, with
# histograms on the diagonal
sns.pairplot(data=iris_analysis_df, hue='species', diag_kind='hist')
plt.gcf().suptitle('Pairplot of Iris Features by Species', y=1.02)
plt.show()

# Scatter plot of sepal length against the engineered sepal area, one colour
# per species
scatter_plt = sns.scatterplot(
    x="sepal_length_cm",
    y="sepal_area_cm2",
    hue="species",
    s=100,
    data=iris_analysis_df,
)

# Axis labels, chart title and legend title are set directly on the returned Axes
scatter_plt.set(
    xlabel="Sepal Length (cm)",
    ylabel="Sepal Area (cm²)",
    title="Chart 1. Iris Sepal Length vs. Sepal Area (by Species)",
)
scatter_plt.legend(title='Species')
plt.show()

# Create boxplots to compare distributions across species: a 2x2 grid with
# one panel per raw measurement
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Parallel lists: the column to plot and the human-readable name used in its labels
features = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']
titles = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

# axes.flat walks the 2x2 grid as a flat sequence, pairing each panel with
# one feature/title; no positional index is needed, so zip alone suffices
for ax, feature, title in zip(axes.flat, features, titles):
    sns.boxplot(x='species', y=feature, data=iris_analysis_df, ax=ax)
    ax.set_title(f'{title} by Species')
    ax.set_xlabel('Species')
    ax.set_ylabel(f'{title} (cm)')

# The panels are laid out first; the figure-level title is then added at
# y=1.02 so it sits above them
plt.tight_layout()
plt.suptitle('Comparison of Iris Measurements by Species', y=1.02, fontsize=16)
plt.show()

# Correlation heatmap
# Correlations are computed over the numeric columns only (the species label
# is left out)
numeric_df = iris_analysis_df.select_dtypes(include="number")
correlation_matrix = numeric_df.corr()

# Draw the annotated matrix on its own 10x8 figure, with the diverging colour
# map centred on zero
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Matrix of Iris Features')
plt.tight_layout()
plt.show()
