In [None]:
# Complete One-Hot Encoding Example
# This shows how to convert categories to AI-friendly format

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Title banner for the walkthrough.
print("🎯 One-Hot Encoding: Converting Categories to Numbers for AI")
print("=" * 60)

# ============================================================================
# STEP 1: Build a small sample table that mixes numeric and categorical data
# ============================================================================

# Column-oriented records, shaped like what a database query or CSV load
# would hand us: one key per column, one list entry per customer.
raw_customer_data = dict(
    name=['John Smith', 'Maria Garcia', 'Ahmed Al-Rahman', 'Sarah Johnson'],
    age=[25, 35, 42, 28],
    city=['New York', 'Los Angeles', 'Chicago', 'New York'],
    education=['Bachelor', 'High School', 'PhD', 'Master'],
    loan_purpose=['Home', 'Car', 'Business', 'Home'],
    risk_level=['Low', 'Medium', 'High', 'Low'],
)

# Every later step starts from this untouched frame.
df = pd.DataFrame(raw_customer_data)

# Show the table, then the dtype pandas inferred for each column.
print("📊 Original Data (Human-Readable):")
print(df)
print("\nData types:", df.dtypes, sep="\n")

# ============================================================================
# STEP 2: Manual One-Hot Encoding (to understand the concept)
# ============================================================================

print(f"\n{'=' * 60}")
print("🔧 MANUAL One-Hot Encoding Process")
print("=" * 60)

# Hand-roll the encoding for 'city' so the mechanics are visible:
# one 0/1 indicator column per distinct city.
unique_cities = df['city'].unique()

print("\n🏙️  Converting 'city' column manually:")
print("Original cities:", unique_cities)

# Work on a copy so the original frame stays pristine for the later steps.
df_manual = df.copy()
print("Unique cities found:", unique_cities)

for city in unique_cities:
    # Column names use underscores instead of spaces.
    column_name = "city_" + city.replace(' ', '_')
    # 1 where the row's city matches this one, 0 everywhere else.
    indicator = df['city'].eq(city).astype(int)
    df_manual[column_name] = indicator
    print(f"Created column '{column_name}': {indicator.tolist()}")

print("\n📋 Result of manual one-hot encoding for cities:")
# Pick up every generated indicator column by its 'city_' prefix.
city_columns = list(filter(lambda col: col.startswith('city_'), df_manual.columns))
print(df_manual[['name', 'city'] + city_columns])

# ============================================================================
# STEP 3: Using pandas get_dummies() - Easy Way
# ============================================================================

print("\n" + "="*60)
print("🚀 PANDAS get_dummies() - The Easy Way")
print("="*60)

# Start fresh with original data
df_pandas = df.copy()

# Columns whose text values are replaced by indicator columns
categorical_columns = ['city', 'education', 'loan_purpose', 'risk_level']

print("🎯 Applying one-hot encoding to:", categorical_columns)

# Use pandas get_dummies for automatic one-hot encoding.
# dtype=int is explicit on purpose: recent pandas versions emit boolean
# (True/False) indicator columns by default, which would contradict the
# "1 (present) or 0 (absent)" explanation and the manual step's int columns.
df_encoded = pd.get_dummies(
    df_pandas,
    columns=categorical_columns,
    prefix=categorical_columns,
    dtype=int,
)

print("\n📊 After One-Hot Encoding:")
print(df_encoded)

# Each categorical column is replaced by one column per distinct value,
# so the frame gets wider while the row count stays the same.
print(f"\n📈 Shape change: {df.shape} → {df_encoded.shape}")
print(f"Columns increased from {df.shape[1]} to {df_encoded.shape[1]}")

# ============================================================================
# STEP 4: Using sklearn OneHotEncoder - Professional Way
# ============================================================================

print("\n" + "="*60)
print("🏭 SKLEARN OneHotEncoder - Professional Method")
print("="*60)

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Prepare data
df_sklearn = df.copy()

# Define which columns to encode and which to pass through untouched
categorical_features = ['city', 'education', 'loan_purpose', 'risk_level']
numerical_features = ['age']

# Create the encoder
# drop='first' avoids multicollinearity; sparse_output=False returns a dense
# array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform the categorical columns
encoded_features = encoder.fit_transform(df_sklearn[categorical_features])

# Get feature names for the encoded columns
feature_names = encoder.get_feature_names_out(categorical_features)

print("🏷️  Generated feature names:")
for i, name in enumerate(feature_names, start=1):
    print(f"  {i:2d}. {name}")

# Create DataFrame with encoded features.
# Reuse the source frame's index: pd.concat(axis=1) aligns rows by index
# label, so a default RangeIndex here would misalign (and introduce NaNs)
# whenever the input frame has been filtered or re-indexed.
df_sklearn_encoded = pd.DataFrame(
    encoded_features,
    columns=feature_names,
    index=df_sklearn.index,
)

# Add back the name and the numerical features (driven by numerical_features
# so the pass-through columns are declared in one place).
df_sklearn_final = pd.concat(
    [df_sklearn[['name'] + numerical_features], df_sklearn_encoded],
    axis=1,
)

print("\n📊 sklearn OneHotEncoder Result:")
print(df_sklearn_final)

# ============================================================================
# STEP 5: Comparison and Explanation
# ============================================================================

print("\n" + "="*60)
print("🔍 DETAILED EXPLANATION: Why One-Hot Encoding?")
print("="*60)

print("\n❌ What happens WITHOUT one-hot encoding:")
print("If we just convert categories to numbers: New York=1, Los Angeles=2, Chicago=3")
print("AI might think: Chicago (3) > Los Angeles (2) > New York (1)")
print("This creates a false ordering where none should exist!")

print("\n✅ What happens WITH one-hot encoding:")
print("Each category gets its own column with 1 (present) or 0 (absent)")
print("AI understands these are separate, equal categories")

# Show the encoding for one example: select Maria's row from the
# get_dummies() result and list the value of every city indicator column.
print("\n🏷️  Example breakdown for 'Maria Garcia' from Los Angeles:")
maria_row = df_encoded[df_encoded['name'] == 'Maria Garcia']
print("Original: city = 'Los Angeles'")
print("Encoded as:")
city_cols = [col for col in df_encoded.columns if col.startswith('city_')]
for col in city_cols:
    # int() keeps the display at 1/0, matching the explanation above, even
    # when the indicator columns are stored as booleans (True/False).
    value = int(maria_row[col].iloc[0])
    print(f"  {col}: {value}")

# ============================================================================
# STEP 6: Real-world considerations
# ============================================================================

print(f"\n{'=' * 60}")
print("⚠️  REAL-WORLD CONSIDERATIONS")
print("=" * 60)

# Each entry pairs a section heading with the bullet lines printed below it.
guidance = (
    (
        "\n🎯 When to use one-hot encoding:",
        (
            "✅ Categorical data with no natural order (cities, colors, products)",
            "✅ Small number of unique categories (< 10-20)",
            "✅ When you want to preserve all categories",
        ),
    ),
    (
        "\n🚫 When NOT to use one-hot encoding:",
        (
            "❌ High-cardinality categories (thousands of unique values)",
            "❌ Ordinal data with natural order (small < medium < large)",
            "❌ When you have limited memory/computational resources",
        ),
    ),
    (
        "\n🔧 Alternative encoding methods:",
        (
            "• Label Encoding: For ordinal data (education levels)",
            "• Target Encoding: For high-cardinality categories",
            "• Binary Encoding: For medium-cardinality categories",
            "• Frequency Encoding: Based on category frequency",
        ),
    ),
)

# Emit every section in order: heading first, then its bullets.
for heading, bullets in guidance:
    print(heading)
    for bullet in bullets:
        print(bullet)

# ============================================================================
# STEP 7: Performance comparison
# ============================================================================

print(f"\n{'=' * 60}")
print("📊 BEFORE vs AFTER Comparison")
print("=" * 60)

# Column dtypes before and after encoding, one listing under each heading.
print("\n🔢 Original Data Types:", df.dtypes, sep="\n")
print("\n🔢 After One-Hot Encoding:", df_encoded.dtypes, sep="\n")

# deep=True also counts the Python string objects held in object columns,
# not just the pointers to them.
original_memory = df.memory_usage(deep=True).sum()
encoded_memory = df_encoded.memory_usage(deep=True).sum()
growth_pct = (encoded_memory / original_memory - 1) * 100

print("\n📈 Memory usage:")
print(f"Original: {original_memory:,} bytes")
print(f"Encoded:  {encoded_memory:,} bytes")
print(f"Increase: {growth_pct:.1f}%")

# Closing summary: the benefits list followed by the key takeaway.
print("\n✅ Benefits achieved:")
for benefit in (
    "• All data is now numeric (AI-friendly)",
    "• No false ordering between categories",
    "• Each category is treated independently",
    "• Ready for machine learning algorithms",
):
    print(benefit)

print("\n🎯 Key Takeaway:")
print(
    "One-hot encoding transforms categorical text into binary columns,",
    "making data AI-ready without creating false relationships!",
    sep="\n",
)