# **1. Perkenalan Dataset**

Dataset yang digunakan: **Iris Dataset**
- Sumber: UCI Machine Learning Repository (dimuat melalui `sklearn.datasets.load_iris`)
- Jumlah sampel: 150
- Jumlah fitur: 4 (`sepal length (cm)`, `sepal width (cm)`, `petal length (cm)`, `petal width (cm)`)
- Target: 3 kelas (setosa, versicolor, virginica)
- Tipe data: Numerik untuk fitur, kategorikal untuk target

# **2. Import Library**

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utilities
import warnings
# Silence every warning category for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this also hides deprecation and plotting warnings that may
# point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for plots: matplotlib's 'default' style sheet plus seaborn's
# "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

# Visible confirmation that the setup cell ran to completion
print("Libraries imported successfully!")

# **3. Memuat Dataset**

In [None]:
# Load the Iris dataset that ships with scikit-learn
from sklearn.datasets import load_iris

# Build a DataFrame with one column per feature plus the integer class code
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Map target numbers to species names
species_mapping = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['species_name'] = df['species'].map(species_mapping)

# Save raw data (before any cleaning or scaling) for reference
df.to_csv('iris_raw.csv', index=False)

print(f"Dataset shape: {df.shape}")
print(f"\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print(f"\nFirst 5 rows:")
# Last expression of the cell: rendered by the notebook as the cell output
df.head()

# **4. Exploratory Data Analysis (EDA)**

In [None]:
# Summary statistics for the numeric columns
print("=== BASIC STATISTICS ===")
summary_stats = df.describe()
print(summary_stats)

# Number of null entries in each column
print("\n=== MISSING VALUES ===")
null_counts = df.isnull().sum()
print(null_counts)

# Number of samples per species label
print("\n=== TARGET DISTRIBUTION ===")
class_counts = df['species_name'].value_counts()
print(class_counts)

In [None]:
# Overview figure: a 2x2 grid with one summary plot per panel
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Distribution of features
# DataFrame.hist() creates one subplot per column; when it is handed a single
# Axes for several columns, pandas clears the whole figure, which detaches the
# other three panels. DataFrame.plot(kind='hist') overlays all columns on the
# given Axes instead, so the grid stays intact.
df[iris.feature_names].plot(kind='hist', bins=20, alpha=0.7, ax=axes[0, 0])
axes[0, 0].set_title('Feature Distributions')
axes[0, 0].set_xlabel('Measurement (cm)')

# 2. Species distribution (number of samples per class)
df['species_name'].value_counts().plot(kind='bar', ax=axes[0, 1], color='skyblue')
axes[0, 1].set_title('Species Distribution')
axes[0, 1].set_xlabel('Species')
axes[0, 1].set_ylabel('Count')

# 3. Correlation heatmap of the four features
correlation_matrix = df[iris.feature_names].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axes[1, 0])
axes[1, 0].set_title('Feature Correlation Matrix')

# 4. Scatter plot of the two sepal features, coloured by species code
scatter = axes[1, 1].scatter(df['sepal length (cm)'], df['sepal width (cm)'],
                             c=df['species'], cmap='viridis', alpha=0.7)
axes[1, 1].set_xlabel('Sepal Length (cm)')
axes[1, 1].set_ylabel('Sepal Width (cm)')
axes[1, 1].set_title('Sepal Length vs Width by Species')
# legend_elements() yields one handle per distinct colour value in ascending
# order (0, 1, 2), which matches the order of iris.target_names.
legend_handles, _ = scatter.legend_elements()
axes[1, 1].legend(legend_handles, list(iris.target_names), title='Species')

plt.tight_layout()
plt.show()

In [None]:
# Per-species breakdown: describe() of the feature columns for each class,
# in order of first appearance in the data
print("=== FEATURE STATISTICS BY SPECIES ===")
for class_name in df['species_name'].unique():
    print(f"\n{class_name.upper()}:")
    belongs_to_class = df['species_name'] == class_name
    class_features = df.loc[belongs_to_class, iris.feature_names]
    print(class_features.describe())

In [None]:
# One box plot per feature, grouped by species, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Pair each feature with its own panel
for panel, feature_name in zip(axes, iris.feature_names):
    sns.boxplot(data=df, x='species_name', y=feature_name, ax=panel)
    panel.set_title(f'{feature_name} by Species')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# **5. Data Preprocessing**

In [None]:
print("=== DATA PREPROCESSING STEPS ===")

# Step 1: count null entries per column and overall
print("\n1. Checking for missing values:")
missing_values = df.isnull().sum()
print(missing_values)
print(f"Total missing values: {missing_values.sum()}")

# Step 2: count fully identical rows
print("\n2. Checking for duplicates:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# df_clean is always a new object, so later steps never modify df itself
if duplicates == 0:
    df_clean = df.copy()
    print("No duplicates found.")
else:
    print("Removing duplicates...")
    df_clean = df.drop_duplicates()
    print(f"Shape after removing duplicates: {df_clean.shape}")

In [None]:
# 3. Outlier detection using IQR method
print("\n3. Outlier Detection using IQR method:")

def detect_outliers_iqr(data, column):
    """Return the index labels of rows whose `column` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of the column.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the column to test.

    Returns:
        The index of the rows of `data` flagged as outliers.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    below_fence = values < first_quartile - 1.5 * spread
    above_fence = values > third_quartile + 1.5 * spread
    return data[below_fence | above_fence].index

# Run the IQR check once per feature, then report the per-feature counts
outliers_per_feature = [
    (name, detect_outliers_iqr(df_clean, name)) for name in iris.feature_names
]
for name, flagged_rows in outliers_per_feature:
    print(f"{name}: {len(flagged_rows)} outliers")

# A row flagged for several features is only counted once
outlier_indices = set().union(*(rows for _, rows in outliers_per_feature))

n_flagged = len(outlier_indices)
print(f"\nTotal unique outlier indices: {n_flagged}")
print(f"Outlier percentage: {n_flagged/len(df_clean)*100:.2f}%")

# Outliers are only reported, never dropped: df_clean is left untouched
print("\nKeeping outliers as they may be important for species classification.")

In [None]:
# 4. Feature scaling
print("\n4. Feature Scaling:")

# Feature matrix and integer target, copied so df_clean stays unmodified
X = df_clean[iris.feature_names].copy()
y = df_clean['species'].copy()

print(f"Original feature ranges:")
for feature in iris.feature_names:
    lowest, highest = X[feature].min(), X[feature].max()
    print(f"{feature}: {lowest:.2f} - {highest:.2f}")

# Standardise the features; fit_transform returns a plain array, so it is
# wrapped back into a DataFrame with the original column names
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=iris.feature_names)

print(f"\nAfter scaling - feature ranges:")
for feature in iris.feature_names:
    lowest, highest = X_scaled_df[feature].min(), X_scaled_df[feature].max()
    print(f"{feature}: {lowest:.2f} - {highest:.2f}")

In [None]:
# 5. Target encoding (optional, for consistency)
print("\n5. Target Encoding:")

# Fit the encoder on the target and obtain the encoded labels in one step
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Original target classes: {np.unique(y)}")
print(f"Encoded target classes: {np.unique(y_encoded)}")

# Pair every class known to the encoder with the code it is assigned
known_classes = label_encoder.classes_
class_codes = label_encoder.transform(known_classes)
print(f"Label mapping: {dict(zip(known_classes, class_codes))}")

In [None]:
# 6. Create final preprocessed dataset
print("\n6. Creating Final Preprocessed Dataset:")

# Combine scaled features with encoded target. y_encoded is a plain array, so
# it is attached by position rather than by index label.
processed_df = X_scaled_df.copy()
processed_df['target'] = y_encoded

print(f"Final preprocessed dataset shape: {processed_df.shape}")
print(f"\nPreprocessed dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
processed_df.info()

print(f"\nFirst 5 rows of preprocessed data:")
print(processed_df.head())

# Class counts listed in ascending order of the target code
print(f"\nTarget distribution in preprocessed data:")
print(processed_df['target'].value_counts().sort_index())

In [None]:
# 7. Save preprocessed data
print("\n7. Saving Preprocessed Data:")

# Write the processed frame to disk without the index column
output_csv = 'iris_preprocessing.csv'
processed_df.to_csv(output_csv, index=False)
print("Preprocessed data saved as 'iris_preprocessing.csv'")

# Read the file back and report the shape of what was loaded
verification_df = pd.read_csv(output_csv)
loaded_shape = verification_df.shape
print(f"\nVerification - loaded file shape: {loaded_shape}")
print("Preprocessing completed successfully!")

In [None]:
# 8. Quick model validation to ensure preprocessing quality
print("\n8. Quick Model Validation:")

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): X_scaled_df was standardised on the full dataset before this
# split, so the held-out rows contributed to the scaler statistics; treat the
# score as a sanity check rather than an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Train a simple model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out part and score them
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Quick validation accuracy: {accuracy:.4f}")
print(f"\nClassification Report:")
# classification_report needs string display names, but label_encoder.classes_
# holds the integer class codes here and passing them raises a TypeError.
# Translate each code through species_mapping, falling back to the label's own
# string form when no name is registered for it.
class_names = [
    str(species_mapping.get(label, label)) for label in label_encoder.classes_
]
print(classification_report(y_test, y_pred, target_names=class_names))

# Recap of the whole preprocessing run, assembled first and printed line by line
summary_lines = [
    "\n=== PREPROCESSING SUMMARY ===",
    f"✓ Original dataset shape: {df.shape}",
    f"✓ Final preprocessed shape: {processed_df.shape}",
    f"✓ Missing values handled: {df.isnull().sum().sum()} → 0",
    f"✓ Duplicates removed: {duplicates}",
    f"✓ Features scaled: ✓",
    f"✓ Target encoded: ✓",
    f"✓ Quick validation accuracy: {accuracy:.4f}",
    f"✓ Data saved successfully: ✓",
]
for summary_line in summary_lines:
    print(summary_line)