In [5]:
#!/usr/bin/env python3
"""
Data Collection for AI: Practical Examples
==========================================

This script demonstrates various data collection strategies and techniques
for AI projects, designed for students without strong mathematical backgrounds.

Topics covered:
1. Understanding different data types and their characteristics
2. Working with existing datasets vs. creating your own
3. Automated vs. manual data collection methods
4. Data quality assessment and validation
5. Real-world examples in Healthcare and Finance domains

Author: AI Education Assistant
Date: 2024
"""

# Install required packages (Google Colab compatible)
import subprocess
import sys

def install_packages():
    """Install the third-party packages this tutorial depends on.

    Each package is installed with ``pip`` through the running interpreter
    (``sys.executable -m pip install <package> -q``). A failure for one
    package is reported and does not stop the remaining installs. The final
    status line says whether every install succeeded or names the packages
    that could not be installed.
    """
    packages = ['requests', 'beautifulsoup4', 'pandas', 'numpy', 'matplotlib', 'seaborn']
    failed = []

    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
        except (subprocess.CalledProcessError, OSError):
            # CalledProcessError: pip exited with a non-zero status.
            # OSError: the interpreter could not be launched at all.
            # Either way, keep going with the remaining packages.
            print(f"Warning: Could not install {package}")
            failed.append(package)

    # Only claim success when nothing failed
    if failed:
        print(f"⚠️ Package installation finished with errors: {', '.join(failed)}")
    else:
        print("✅ Package installation complete!")

# Run installation now, before the third-party imports below, so that
# packages missing from the environment get installed first
install_packages()

# Import all necessary libraries (only the plotting style set up below has fallbacks)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime, timedelta
import warnings
import random
import string

# Silence all warnings so the tutorial output stays readable.
# NOTE: this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Set up plotting style with fallback: prefer the newer seaborn style name,
# then the older one, then matplotlib's default. Only OSError is caught
# (raised for an unknown style name), so Ctrl+C and real bugs still surface.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    try:
        plt.style.use('seaborn')
    except OSError:
        plt.style.use('default')

# The palette is purely cosmetic, so a failure here is deliberately ignored,
# but without swallowing KeyboardInterrupt/SystemExit as a bare except would.
try:
    sns.set_palette("husl")
except Exception:
    pass

def generate_fake_name():
    """Return a random "First Last" name without needing the faker library.

    One first name and one last name are picked with ``random.choice`` from
    two fixed lists, so results are repeatable after ``random.seed(...)``.
    """
    given_pool = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily', 'Chris', 'Lisa', 'Robert', 'Maria']
    family_pool = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez']

    # First name is drawn before the last name; the order matters for seeded runs
    given = random.choice(given_pool)
    family = random.choice(family_pool)
    return " ".join((given, family))

def generate_fake_date(start_date='-5y', end_date='today'):
    """Return a random datetime in the half-open range [start_date, end_date).

    Works without the faker library.

    Args:
        start_date: Lower bound. One of ``'today'``, a relative offset of the
            form ``'-Ny'`` (N years back, counted as N * 365 days), or an
            absolute date string in ``YYYY-MM-DD`` format.
        end_date: Upper bound, in the same formats as ``start_date``.

    Returns:
        The start bound plus a whole number of days chosen with
        ``random.randrange``. When both bounds fall on the same day the
        start bound itself is returned.

    Raises:
        ValueError: If a bound cannot be parsed, or the end bound is earlier
            than the start bound.
    """
    # One reference instant, so relative bounds are measured from the same "now"
    now = datetime.now()

    def _resolve(spec):
        """Turn 'today', '-Ny' or 'YYYY-MM-DD' into a datetime."""
        if spec == 'today':
            return now
        years = spec[1:-1]
        if spec.startswith('-') and spec.endswith('y') and years.isdecimal():
            return now - timedelta(days=int(years) * 365)
        return datetime.strptime(spec, '%Y-%m-%d')

    start = _resolve(start_date)
    end = _resolve(end_date)

    days_between = (end - start).days
    if days_between < 0:
        raise ValueError(
            f"end_date {end_date!r} is earlier than start_date {start_date!r}"
        )
    if days_between == 0:
        # randrange(0) would raise; the only possible day is the start itself
        return start

    return start + timedelta(days=random.randrange(days_between))

def main():
    """Run the whole tutorial: welcome banner, sections 1-6 in order, closing message."""
    welcome_lines = (
        "🎯 DATA COLLECTION FOR AI: PRACTICAL EXAMPLES",
        "=" * 60,
        "Welcome to the comprehensive data collection tutorial!",
        "This script will guide you through different data collection strategies.\n",
    )
    for welcome_line in welcome_lines:
        print(welcome_line)

    # Run all sections, in tutorial order
    tutorial_sections = (
        section_1_data_types,
        section_2_existing_datasets,
        section_3_automated_collection,
        section_4_manual_collection,
        section_5_data_quality,
        section_6_real_world_examples,
    )
    for run_section in tutorial_sections:
        run_section()

    print("\n🎉 TUTORIAL COMPLETE!")
    print("You've learned the fundamentals of data collection for AI!")

def section_1_data_types():
    """Section 1: show small synthetic samples of each major data type.

    Prints numerical, categorical, text and time-series examples for the
    healthcare and finance domains. NumPy's and Python's random generators
    are both seeded with 42 up front, so the printed tables are the same on
    every run. Nothing is returned.
    """
    banner = "=" * 60
    rule = "-" * 40
    sample_size = 10  # Smaller sample for display
    id_column = range(1, sample_size + 1)

    print("\n" + banner)
    print("📊 SECTION 1: UNDERSTANDING DATA TYPES")
    print(banner)

    # Set random seed for reproducible results
    np.random.seed(42)
    random.seed(42)

    # Short aliases only; the draws below happen in a fixed order, which is
    # what keeps the seeded output stable.
    normal = np.random.normal
    pick = np.random.choice

    # ------------------------------------------------------------------
    # 1.1 NUMERICAL DATA EXAMPLES
    # ------------------------------------------------------------------
    print("\n🔢 1.1 NUMERICAL DATA EXAMPLES")
    print(rule)

    # Healthcare numerical data
    print("🏥 Healthcare Numerical Data:")
    patients = pd.DataFrame({
        'patient_id': id_column,
        'age': normal(45, 15, sample_size).astype(int),
        'systolic_bp': normal(120, 20, sample_size).astype(int),
        'diastolic_bp': normal(80, 10, sample_size).astype(int),
        'heart_rate': normal(72, 12, sample_size).astype(int),
        'cholesterol': normal(200, 40, sample_size).astype(int),
        'bmi': normal(25, 5, sample_size).round(1),
    })
    print(patients)
    mean_age = patients['age'].mean()
    mean_bmi = patients['bmi'].mean()
    print(f"\nAverage age: {mean_age:.1f} years")
    print(f"Average BMI: {mean_bmi:.1f}")

    # Finance numerical data
    print("\n💰 Finance Numerical Data:")
    customers = pd.DataFrame({
        'customer_id': id_column,
        'annual_income': np.random.lognormal(10.5, 0.5, sample_size).astype(int),
        'credit_score': normal(700, 100, sample_size).astype(int),
        'monthly_spending': normal(2500, 800, sample_size).astype(int),
        'savings_balance': np.random.exponential(5000, sample_size).astype(int),
        'loan_amount': normal(15000, 8000, sample_size).astype(int),
    })
    print(customers)
    mean_credit = customers['credit_score'].mean()
    mean_spending = customers['monthly_spending'].mean()
    print(f"\nAverage credit score: {mean_credit:.0f}")
    print(f"Average monthly spending: ${mean_spending:.2f}")

    # ------------------------------------------------------------------
    # 1.2 CATEGORICAL DATA EXAMPLES
    # ------------------------------------------------------------------
    print("\n🏷️ 1.2 CATEGORICAL DATA EXAMPLES")
    print(rule)

    # Healthcare categorical data
    print("🏥 Healthcare Categorical Data:")
    blood_groups = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
    conditions = ['Healthy', 'Hypertension', 'Diabetes', 'Heart Disease', 'Asthma']

    patient_labels = pd.DataFrame({
        'patient_id': id_column,
        'blood_type': pick(blood_groups, sample_size),
        'diagnosis': pick(conditions, sample_size),
        'gender': pick(['Male', 'Female'], sample_size),
        'insurance_type': pick(['Private', 'Medicare', 'Medicaid'], sample_size),
    })
    print(patient_labels)
    print("\nBlood type distribution:")
    print(patient_labels['blood_type'].value_counts())

    # Finance categorical data
    print("\n💰 Finance Categorical Data:")
    card_brands = ['Visa', 'MasterCard', 'American Express', 'Discover']
    risk_tiers = ['Low', 'Medium', 'High']

    customer_labels = pd.DataFrame({
        'customer_id': id_column,
        'card_type': pick(card_brands, sample_size),
        'risk_level': pick(risk_tiers, sample_size),
        'employment_status': pick(['Full-time', 'Part-time', 'Self-employed'], sample_size),
        'education_level': pick(['High School', 'Bachelor', 'Master', 'PhD'], sample_size),
    })
    print(customer_labels)
    print("\nCard type distribution:")
    print(customer_labels['card_type'].value_counts())

    # ------------------------------------------------------------------
    # 1.3 TEXT DATA EXAMPLES
    # ------------------------------------------------------------------
    print("\n📝 1.3 TEXT DATA EXAMPLES")
    print(rule)

    # Healthcare text data
    print("🏥 Healthcare Text Data (Clinical Notes):")
    clinical_notes = [
        "Patient complains of chest pain after exercise, lasting 5-10 minutes",
        "Treatment with medication has significantly reduced symptoms",
        "Regular blood pressure monitoring shows improvement over 3 months",
        "Patient reports feeling more energetic since starting new diet",
        "Recommended follow-up appointment in 6 weeks to assess progress",
    ]
    for number, note in enumerate(clinical_notes, start=1):
        print(f"Patient {number}: {note}")

    # Finance text data
    print("\n💰 Finance Text Data (Customer Reviews):")
    customer_reviews = [
        "Excellent customer service, quick loan approval process",
        "High fees make this credit card less attractive than competitors",
        "Mobile banking app is user-friendly and reliable",
        "Investment advisor provided helpful guidance on portfolio diversification",
        "Long wait times for customer support calls are frustrating",
    ]
    for number, review in enumerate(customer_reviews, start=1):
        print(f"Customer {number}: {review}")

    # ------------------------------------------------------------------
    # 1.4 TIME SERIES DATA EXAMPLES
    # ------------------------------------------------------------------
    print("\n📈 1.4 TIME SERIES DATA EXAMPLES")
    print(rule)

    # Ten consecutive calendar days shared by both series
    days = pd.date_range(start='2024-01-01', end='2024-01-10', freq='D')

    # Healthcare time series (blood glucose readings around a base of 120)
    print("🏥 Healthcare Time Series (Daily Glucose Readings):")
    glucose = 120 + normal(0, 15, len(days))
    print(pd.DataFrame({
        'date': days,
        'glucose_level': glucose.round(1),
    }))

    # Finance time series (daily spending): higher base on weekends
    # (weekday 5 and 6), one noise draw per day, never below zero.
    print("\n💰 Finance Time Series (Daily Spending Pattern):")
    spending_by_day = [
        max(0, (150 if day.weekday() >= 5 else 80) + normal(0, 30))
        for day in days
    ]
    print(pd.DataFrame({
        'date': days,
        'daily_spending': np.array(spending_by_day).round(2),
    }))

    print("\n✅ Section 1 Complete: You now understand different data types!")

def section_2_existing_datasets():
    """Section 2: catalogue well-known datasets and score dataset quality.

    Prints a small catalogue of public healthcare and finance datasets, then
    builds one clean and one deliberately degraded synthetic DataFrame
    (NumPy seeded with 42) and prints a quality evaluation for each.
    Nothing is returned.
    """

    def evaluate_dataset(data, dataset_name):
        """Print a quality report for *data* and return its score.

        The score is the mean of completeness (percentage of non-missing
        cells) and the percentage of rows that are not duplicates.
        """
        row_count = len(data)

        print(f"\n🔬 Evaluating: {dataset_name}")
        print("-" * 30)

        # Basic info
        print(f"📊 Shape: {data.shape[0]} rows, {data.shape[1]} columns")
        print(f"📝 Columns: {', '.join(data.columns.tolist())}")

        # Missing values per column, as counts and as a percentage of rows
        null_counts = data.isnull().sum()
        null_percent = (null_counts / row_count) * 100
        total_nulls = null_counts.sum()

        print("\n❌ Missing Data Analysis:")
        if total_nulls != 0:
            for column in null_counts[null_counts > 0].index:
                print(f"  {column}: {null_counts[column]} ({null_percent[column]:.1f}%)")
        else:
            print("  ✅ No missing values found!")

        # Data types
        print("\n📋 Data Types:")
        print(data.dtypes.to_string())

        # Quality score calculation
        completeness_score = (1 - total_nulls / (row_count * len(data.columns))) * 100
        duplicate_rows = data.duplicated().sum()
        duplicate_percent = (duplicate_rows / row_count) * 100
        quality_score = (completeness_score + (100 - duplicate_percent)) / 2

        print("\n🎯 Quality Assessment:")
        print(f"  Completeness: {completeness_score:.1f}%")
        print(f"  Duplicates: {duplicate_rows} ({duplicate_percent:.1f}%)")
        print(f"  Overall Quality Score: {quality_score:.1f}/100")

        # Recommendations
        if quality_score >= 90:
            advice = "  ✅ Excellent quality! Ready for AI training."
        elif quality_score >= 70:
            advice = "  ⚠️ Good quality, minor cleaning needed."
        else:
            advice = "  ❌ Significant quality issues, extensive cleaning required."
        print("\n💡 Recommendations:")
        print(advice)

        return quality_score

    print("\n" + "="*60)
    print("🗂️ SECTION 2: WORKING WITH EXISTING DATASETS")
    print("="*60)

    # 2.1 DATASET CATALOG
    print("\n📊 2.1 POPULAR EXISTING DATASETS")
    print("-" * 40)

    # One tuple per dataset, expanded into a dict per row below
    catalog_fields = ('name', 'domain', 'size', 'quality', 'description')
    catalog_rows = [
        ('MIMIC-III Clinical Database', 'Healthcare', '40,000+ patients', 95,
         'ICU patient data including vital signs and outcomes'),
        ('NIH Chest X-ray Dataset', 'Healthcare', '100,000+ images', 92,
         'Chest X-ray images with disease classifications'),
        ('Credit Card Fraud Detection', 'Finance', '280,000+ transactions', 90,
         'Credit card transactions with fraud labels'),
        ('Yahoo Finance Stock Data', 'Finance', '10+ years daily data', 88,
         'Historical stock prices and trading volumes'),
    ]
    catalog_df = pd.DataFrame([dict(zip(catalog_fields, row)) for row in catalog_rows])
    print("Available Datasets:")
    print(catalog_df.to_string(index=False))

    # 2.2 DATASET EVALUATION FUNCTION
    print("\n🔍 2.2 DATASET EVALUATION TOOL")
    print("-" * 40)

    # Demonstrate dataset evaluation on two sample datasets of different quality
    np.random.seed(42)

    # High quality dataset: complete, no repeated rows
    clean = pd.DataFrame({
        'feature_1': np.random.normal(100, 15, 100),
        'feature_2': np.random.normal(50, 10, 100),
        'feature_3': np.random.choice(['A', 'B', 'C'], 100),
        'target': np.random.choice([0, 1], 100),
    })

    # Low quality dataset: blank out two label ranges (inclusive on both ends
    # with .loc), then append the first five rows again as duplicates
    degraded = clean.copy()
    for label_range, column in ((slice(0, 10), 'feature_1'), (slice(15, 20), 'feature_2')):
        degraded.loc[label_range, column] = np.nan
    degraded = pd.concat([degraded, degraded.iloc[:5]])

    # Evaluate both datasets
    evaluate_dataset(clean, "High Quality Dataset")
    evaluate_dataset(degraded, "Low Quality Dataset")

    print("\n✅ Section 2 Complete: You can now evaluate existing datasets!")

def section_3_automated_collection():
    """Section 3: simulate three automated data-collection channels.

    Prints simulated output for web scraping (news headlines with sentiment
    scores), an API feed (daily stock prices) and IoT sensors (hourly patient
    vitals), followed by an automated-vs-manual comparison table. All data is
    generated locally with NumPy; no network request is made. The generator
    is seeded with 42, like the other sections, so the output is the same on
    every run. Nothing is returned.
    """

    print("\n" + "="*60)
    print("🤖 SECTION 3: AUTOMATED DATA COLLECTION")
    print("="*60)

    # Seed here as every other section does, so this section is reproducible
    # regardless of what ran before it
    np.random.seed(42)

    # 3.1 WEB SCRAPING SIMULATION
    print("\n🕷️ 3.1 WEB SCRAPING DEMONSTRATION")
    print("-" * 40)

    def simulate_web_scraping():
        """Return a DataFrame of made-up headlines with random sentiment scores."""
        print("🔍 Simulating web scraping of financial news...")

        # Simulate scraped financial news headlines
        simulated_headlines = [
            "Tech Stocks Rally on Strong Earnings Reports",
            "Federal Reserve Considers Interest Rate Changes",
            "Healthcare Sector Shows Promising Growth",
            "Energy Prices Fluctuate Amid Market Uncertainty",
            "Banking Stocks Respond to Regulatory Updates"
        ]

        # Simulate sentiment scores in [-1, 1) (in reality, you'd use NLP)
        sentiment_scores = np.random.uniform(-1, 1, len(simulated_headlines))

        scraped_data = pd.DataFrame({
            'headline': simulated_headlines,
            'sentiment_score': sentiment_scores.round(3),
            'timestamp': pd.date_range('2024-01-01', periods=len(simulated_headlines), freq='D')
        })

        return scraped_data

    # Demonstrate web scraping
    news_data = simulate_web_scraping()
    print("📰 Scraped Financial News Data:")
    print(news_data)

    print(f"\n📊 Average sentiment: {news_data['sentiment_score'].mean():.3f}")
    print("(Positive values indicate optimistic news, negative values indicate pessimistic news)")

    # 3.2 API DATA COLLECTION SIMULATION
    print("\n🔌 3.2 API DATA COLLECTION")
    print("-" * 40)

    def simulate_api_data_collection():
        """Return a DataFrame of simulated daily open/high/low/close/volume data."""
        print("📡 Simulating API data collection...")

        # Simulate stock price data (like from Yahoo Finance API)
        dates = pd.date_range('2024-01-01', '2024-01-10', freq='D')
        base_price = 150

        # Random-walk the opening price. The first change is unused because
        # day one opens at base_price.
        price_changes = np.random.normal(0, 2, len(dates))
        prices = [base_price]

        for change in price_changes[1:]:
            new_price = prices[-1] + change
            prices.append(max(new_price, 1))  # Ensure price stays positive

        highs = [p + abs(np.random.normal(0, 1)) for p in prices]
        lows = [p - abs(np.random.normal(0, 1)) for p in prices]
        closes = [p + np.random.normal(0, 0.5) for p in prices]

        # The close is drawn independently of high/low, so it can land outside
        # them. A close above the high or below the low is impossible in real
        # data, so stretch the day's range to contain it.
        highs = [max(high, close) for high, close in zip(highs, closes)]
        lows = [min(low, close) for low, close in zip(lows, closes)]

        api_data = pd.DataFrame({
            'date': dates,
            'open_price': prices,
            'high_price': highs,
            'low_price': lows,
            'close_price': closes,
            'volume': np.random.randint(1000000, 5000000, len(dates))
        })

        return api_data

    # Demonstrate API data collection
    stock_data = simulate_api_data_collection()
    print("📈 Stock Market Data from API:")
    print(stock_data.round(2))

    print(f"\n📊 Price range: ${stock_data['low_price'].min():.2f} - ${stock_data['high_price'].max():.2f}")
    print(f"📊 Average volume: {stock_data['volume'].mean():,.0f} shares")

    # 3.3 SENSOR DATA COLLECTION SIMULATION
    print("\n📡 3.3 SENSOR DATA COLLECTION")
    print("-" * 40)

    def simulate_sensor_data():
        """Return a DataFrame of 24 hourly simulated patient vital-sign readings."""
        print("🌡️ Simulating IoT sensor data collection...")

        # Simulate patient monitoring sensors: one reading per hour.
        # A Timedelta is used for the step instead of the 'H' alias, which
        # newer pandas releases deprecate.
        timestamps = pd.date_range('2024-01-01 08:00:00', periods=24,
                                   freq=pd.Timedelta(hours=1))

        # Simulate vital signs that follow a daily (24-hour) sine cycle
        hours = np.array([t.hour for t in timestamps])

        # Heart rate varies with time of day
        base_hr = 70 + 10 * np.sin(2 * np.pi * (hours - 6) / 24)
        heart_rate = base_hr + np.random.normal(0, 5, len(timestamps))

        # Blood pressure follows similar pattern
        base_systolic = 120 + 5 * np.sin(2 * np.pi * (hours - 8) / 24)
        systolic_bp = base_systolic + np.random.normal(0, 8, len(timestamps))

        temperature = np.random.normal(98.6, 0.5, len(timestamps))
        # Saturation is a percentage, so cap the noisy draw at 100
        oxygen_saturation = np.minimum(np.random.normal(98, 1, len(timestamps)), 100)

        sensor_data = pd.DataFrame({
            'timestamp': timestamps,
            'heart_rate': heart_rate.round(0).astype(int),
            'systolic_bp': systolic_bp.round(0).astype(int),
            'temperature': temperature.round(1),
            'oxygen_saturation': oxygen_saturation.round(1)
        })

        return sensor_data

    # Demonstrate sensor data collection
    sensor_data = simulate_sensor_data()
    print("🏥 Patient Monitoring Sensor Data:")
    print(sensor_data.head(10))

    print(f"\n📊 24-hour averages:")
    print(f"  Heart rate: {sensor_data['heart_rate'].mean():.1f} bpm")
    print(f"  Blood pressure: {sensor_data['systolic_bp'].mean():.1f} mmHg")
    print(f"  Temperature: {sensor_data['temperature'].mean():.1f}°F")
    print(f"  Oxygen saturation: {sensor_data['oxygen_saturation'].mean():.1f}%")

    # 3.4 AUTOMATION BENEFITS AND CHALLENGES
    print("\n⚖️ 3.4 AUTOMATION: BENEFITS vs CHALLENGES")
    print("-" * 40)

    # Illustrative scores chosen for the tutorial, not measured values
    automation_analysis = {
        'Aspect': ['Speed', 'Cost', 'Consistency', 'Scalability', 'Quality Control', 'Flexibility'],
        'Automated_Score': [9, 8, 9, 10, 6, 4],
        'Manual_Score': [3, 4, 5, 3, 9, 9]
    }

    comparison_df = pd.DataFrame(automation_analysis)
    print("📊 Automated vs Manual Collection Comparison (1-10 scale):")
    print(comparison_df)

    print("\n💡 Key Insights:")
    print("✅ Use automation for: High volume, consistent format, continuous data")
    print("✅ Use manual collection for: Quality control, complex judgments, unique contexts")
    print("🎯 Best approach: Hybrid - combine both methods strategically")

    print("\n✅ Section 3 Complete: You understand automated data collection!")

def section_4_manual_collection():
    """Section 4: illustrate manual data collection with a survey example.

    Prints a five-question healthcare survey design, then simulates 50
    responses (NumPy seeded with 42) and prints the first rows plus a short
    analysis. Nothing is returned.
    """
    health_levels = ['Excellent', 'Very Good', 'Good', 'Fair', 'Poor']
    exercise_levels = ['Never', '1-2 times', '3-4 times', '5+ times']

    def create_sample_survey():
        """Return the survey design as a list of question dictionaries."""
        return [
            {
                'question': 'What is your age?',
                'type': 'numerical',
                'validation': 'Must be between 18-120',
                'purpose': 'Demographic analysis and age-related health patterns',
            },
            {
                'question': 'How would you rate your overall health?',
                'type': 'categorical',
                'options': list(health_levels),
                'purpose': 'Self-reported health status assessment',
            },
            {
                'question': 'How many hours of sleep do you get per night on average?',
                'type': 'numerical',
                'validation': 'Must be between 1-24',
                'purpose': 'Sleep pattern analysis for health correlations',
            },
            {
                'question': 'Describe any chronic health conditions you have:',
                'type': 'text',
                'purpose': 'Qualitative health information and condition tracking',
            },
            {
                'question': 'How often do you exercise per week?',
                'type': 'categorical',
                'options': list(exercise_levels),
                'purpose': 'Activity level assessment for health predictions',
            },
        ]

    def simulate_survey_responses(num_responses=50):
        """Return a DataFrame of *num_responses* simulated survey answers."""
        np.random.seed(42)

        def draw_response(respondent_id):
            # The field order fixes the order of the random draws, which is
            # what makes the seeded output repeatable; keep it as is.
            return {
                'respondent_id': respondent_id,
                'age': np.random.randint(18, 80),
                'health_rating': np.random.choice(health_levels,
                                                  p=[0.2, 0.3, 0.3, 0.15, 0.05]),
                'sleep_hours': np.random.normal(7.5, 1.2),
                'exercise_frequency': np.random.choice(exercise_levels,
                                                       p=[0.2, 0.4, 0.3, 0.1]),
                'response_date': datetime.now() - timedelta(days=np.random.randint(0, 30)),
            }

        return pd.DataFrame(
            [draw_response(respondent_id) for respondent_id in range(1, num_responses + 1)]
        )

    print("\n" + "="*60)
    print("👥 SECTION 4: MANUAL DATA COLLECTION")
    print("="*60)

    # 4.1 SURVEY DESIGN EXAMPLE
    print("\n📋 4.1 SURVEY DESIGN FOR DATA COLLECTION")
    print("-" * 40)

    print("🏥 Healthcare Survey Design Example:")
    for number, item in enumerate(create_sample_survey(), start=1):
        # Collect the lines for one question, then print them together;
        # options and validation are shown only when the question has them
        lines = [f"\nQuestion {number}: {item['question']}", f"  Type: {item['type']}"]
        if 'options' in item:
            lines.append(f"  Options: {', '.join(item['options'])}")
        if 'validation' in item:
            lines.append(f"  Validation: {item['validation']}")
        lines.append(f"  Purpose: {item['purpose']}")
        print("\n".join(lines))

    # 4.2 SIMULATE SURVEY RESPONSES
    print("\n📊 4.2 SIMULATED SURVEY RESPONSES")
    print("-" * 40)

    answers = simulate_survey_responses()
    print("📋 Sample Survey Response Data:")
    print(answers.head())

    # Analyze survey data
    mean_age = answers['age'].mean()
    mean_sleep = answers['sleep_hours'].mean()
    print("\n📊 Survey Analysis:")
    print(f"  Total responses: {len(answers)}")
    print(f"  Average age: {mean_age:.1f} years")
    print(f"  Average sleep: {mean_sleep:.1f} hours")
    print("\n  Health rating distribution:")
    print(answers['health_rating'].value_counts())

    print("\n✅ Section 4 Complete: You understand manual data collection methods!")

def section_5_data_quality():
    """Section 5: compute and print data-quality metrics for two datasets.

    Builds a clean synthetic patient table (NumPy seeded with 42) and a
    degraded copy with missing values and duplicated rows, then prints
    completeness, uniqueness, consistency and overall scores for each.
    Nothing is returned.
    """

    print("\n" + "="*60)
    print("🔍 SECTION 5: DATA QUALITY ASSESSMENT")
    print("="*60)

    # 5.1 QUALITY METRICS
    print("\n📊 5.1 DATA QUALITY METRICS")
    print("-" * 40)

    def calculate_quality_metrics(data, dataset_name):
        """Print quality metrics for *data* and return the overall score.

        Completeness is the percentage of non-missing cells, uniqueness the
        percentage of non-duplicated rows, and consistency the average (over
        numerical columns) percentage of rows that are not IQR outliers.
        The overall score is the mean of the three. A dataset with no cells
        is reported as empty and scored 0.0.
        """

        print(f"\n🔬 Quality Assessment: {dataset_name}")
        print("-" * 30)

        # Basic metrics
        total_cells = data.shape[0] * data.shape[1]
        if total_cells == 0:
            # Nothing to score; the ratios below would divide by zero
            print("  ⚠️ Dataset is empty - nothing to assess.")
            return 0.0

        missing_cells = data.isnull().sum().sum()
        duplicate_rows = data.duplicated().sum()

        # Calculate quality scores
        completeness = ((total_cells - missing_cells) / total_cells) * 100
        uniqueness = ((data.shape[0] - duplicate_rows) / data.shape[0]) * 100

        # Consistency check (for numerical columns)
        numerical_cols = data.select_dtypes(include=[np.number]).columns
        consistency_scores = []

        for col in numerical_cols:
            if len(data[col].dropna()) > 0:
                # Check for outliers using IQR method: values more than
                # 1.5 * IQR outside the quartiles count as outliers
                Q1 = data[col].quantile(0.25)
                Q3 = data[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = data[(data[col] < Q1 - 1.5*IQR) | (data[col] > Q3 + 1.5*IQR)][col]
                consistency = ((len(data[col]) - len(outliers)) / len(data[col])) * 100
                consistency_scores.append(consistency)

        # With no usable numerical column there is nothing to flag
        avg_consistency = np.mean(consistency_scores) if consistency_scores else 100

        # Overall quality score
        overall_quality = (completeness + uniqueness + avg_consistency) / 3

        # Display results
        quality_metrics = {
            'Metric': ['Completeness', 'Uniqueness', 'Consistency', 'Overall Quality'],
            'Score': [completeness, uniqueness, avg_consistency, overall_quality],
            'Status': []
        }

        # Assign status based on scores
        for score in quality_metrics['Score']:
            if score >= 95:
                quality_metrics['Status'].append('Excellent ✅')
            elif score >= 85:
                quality_metrics['Status'].append('Good ⚠️')
            elif score >= 70:
                quality_metrics['Status'].append('Fair ⚠️')
            else:
                quality_metrics['Status'].append('Poor ❌')

        metrics_df = pd.DataFrame(quality_metrics)
        metrics_df['Score'] = metrics_df['Score'].round(1)

        print(metrics_df.to_string(index=False))

        return overall_quality

    # Create datasets with different quality levels for demonstration
    np.random.seed(42)

    # High quality dataset
    high_quality = pd.DataFrame({
        'patient_id': range(1, 101),
        'age': np.random.normal(50, 15, 100).astype(int),
        'blood_pressure': np.random.normal(120, 20, 100).astype(int),
        'cholesterol': np.random.normal(200, 40, 100).astype(int),
        'diagnosis': np.random.choice(['Healthy', 'At Risk', 'Condition Present'], 100)
    })

    # Low quality dataset (many issues). Integer columns cannot hold NaN, so
    # cast the ones about to receive missing values to float explicitly
    # rather than relying on pandas' deprecated silent upcasting.
    low_quality = high_quality.astype(
        {'age': 'float64', 'blood_pressure': 'float64', 'cholesterol': 'float64'}
    )
    low_quality.loc[0:15, 'cholesterol'] = np.nan
    low_quality.loc[20:30, 'blood_pressure'] = np.nan
    low_quality.loc[5:8, 'age'] = np.nan
    # Add duplicates
    low_quality = pd.concat([low_quality, low_quality.iloc[0:10]])

    # Assess datasets
    calculate_quality_metrics(high_quality, "High Quality Dataset")
    calculate_quality_metrics(low_quality, "Low Quality Dataset")

    print("\n✅ Section 5 Complete: You can now assess and improve data quality!")

def section_6_real_world_examples():
    """Section 6: print two simulated case-study datasets.

    Generates 1000 synthetic patient demographic records (readmission case
    study) and 1000 synthetic card transactions (fraud-detection case study),
    re-seeding NumPy with 42 before each, and prints the first rows of both.
    Nothing is returned.
    """
    banner = "=" * 60
    rule = "-" * 60

    print("\n" + banner)
    print("🌟 SECTION 6: REAL-WORLD CASE STUDIES")
    print(banner)

    # ---------------------------------------------------------------
    # 6.1 HEALTHCARE CASE STUDY
    # ---------------------------------------------------------------
    print("\n🏥 6.1 HEALTHCARE CASE STUDY: PREDICTING PATIENT READMISSIONS")
    print(rule)

    print("📋 SCENARIO:")
    print("A hospital wants to predict which patients are likely to be readmitted")
    print("within 30 days of discharge to improve care coordination.\n")

    # Simulate the patient demographics table; the columns are drawn in the
    # order they appear so the seeded values stay the same
    np.random.seed(42)
    n_patients = 1000

    patient_ages = np.random.normal(65, 15, n_patients).astype(int)
    patient_genders = np.random.choice(['Male', 'Female'], n_patients)
    patient_insurance = np.random.choice(['Medicare', 'Private', 'Medicaid'],
                                         n_patients, p=[0.6, 0.3, 0.1])

    demographics = pd.DataFrame({
        'patient_id': range(1, n_patients + 1),
        'age': patient_ages,
        'gender': patient_genders,
        'insurance': patient_insurance,
    })

    print(f"✅ Generated dataset with {len(demographics)} patients")
    print("Sample data:")
    print(demographics.head())

    # ---------------------------------------------------------------
    # 6.2 FINANCE CASE STUDY
    # ---------------------------------------------------------------
    print("\n💰 6.2 FINANCE CASE STUDY: FRAUD DETECTION SYSTEM")
    print(rule)

    print("📋 SCENARIO:")
    print("A bank wants to build an AI system to detect fraudulent credit card")
    print("transactions in real-time to protect customers.\n")

    # Generate the transaction table, again with a fixed draw order
    np.random.seed(42)
    n_transactions = 1000

    customer_ids = np.random.randint(1, 200, n_transactions)
    amounts = np.random.lognormal(3, 1.5, n_transactions)  # Log-normal: many small, few large amounts
    categories = np.random.choice(['grocery', 'gas', 'restaurant', 'online', 'retail'],
                                  n_transactions, p=[0.3, 0.15, 0.2, 0.25, 0.1])
    hours = np.random.randint(0, 24, n_transactions)

    transactions = pd.DataFrame({
        'transaction_id': range(1, n_transactions + 1),
        'customer_id': customer_ids,
        'amount': amounts,
        'merchant_category': categories,
        'hour_of_day': hours,
    })

    print(f"✅ Generated {len(transactions):,} transactions")
    print("Sample transaction data:")
    print(transactions.head())

    print("\n✅ Section 6 Complete: You've seen real-world data collection in action!")

def create_summary_report():
    """Print a recap table of the six sections followed by the key takeaways."""
    wide_rule = "=" * 80

    print("\n" + wide_rule)
    print("📋 COMPREHENSIVE SUMMARY REPORT")
    print(wide_rule)

    # (section title, key concepts) pairs, in tutorial order
    outline = [
        ('1. Data Types',
         'Numerical, Categorical, Text, Time Series data types'),
        ('2. Existing Datasets',
         'Dataset evaluation, quality assessment, licensing'),
        ('3. Automated Collection',
         'Web scraping, APIs, sensors, automation benefits'),
        ('4. Manual Collection',
         'Surveys, interviews, expert annotation, human insight'),
        ('5. Data Quality',
         'Completeness, consistency, validation, cleaning'),
        ('6. Real-World Examples',
         'Healthcare readmission, fraud detection case studies'),
    ]
    summary_df = pd.DataFrame({
        'Section': [title for title, _ in outline],
        'Key Concepts': [concepts for _, concepts in outline],
    })
    print(summary_df.to_string(index=False))

    print("\n🎯 KEY TAKEAWAYS FOR AI DATA COLLECTION:")
    print("-" * 50)

    key_takeaways = (
        "✅ Data quality is more important than data quantity",
        "✅ Start with existing datasets when possible, supplement with custom collection",
        "✅ Use automation for scale, manual collection for quality and nuance",
        "✅ Always validate and clean your data before AI training",
        "✅ Document everything - collection methods, quality issues, decisions made",
        "✅ Consider legal and ethical implications from the beginning",
        "✅ Implement quality control throughout the collection process",
        "✅ Plan for data maintenance and updates over time",
    )
    # Each takeaway is printed on its own line with a two-space indent
    for point in key_takeaways:
        print("  " + point)

# Main execution: run the tutorial and the summary when executed as a script
if __name__ == "__main__":
    try:
        main()
        create_summary_report()

        # Closing banner, reached only when everything above ran without error
        wide_rule = "=" * 80
        closing_lines = (
            "\n" + wide_rule,
            "🎓 CONGRATULATIONS!",
            wide_rule,
            "You have completed the comprehensive data collection tutorial!",
            "You now have the knowledge and tools to:",
            "• Choose the right data collection strategy for any AI project",
            "• Implement quality control and validation procedures",
            "• Balance automated and manual collection methods",
            "• Apply these concepts in healthcare and finance domains",
            "\n🚀 You're ready to start collecting data for your own AI projects!",
        )
        for closing_line in closing_lines:
            print(closing_line)

    except Exception as error:
        # Top-level boundary: report the problem instead of showing a traceback
        for failure_line in (
            f"❌ An error occurred: {str(error)}",
            "This is a learning script - errors are part of the learning process!",
            "Try running individual sections to identify and fix issues.",
        ):
            print(failure_line)

    finally:
        # Shown whether or not the run succeeded
        print("\n📚 Remember to check the references in the Deep Dive Document")
        print("for additional resources and further learning opportunities!")

# These two lines are printed every time the module is loaded, after the run above
for ready_line in (
    "📚 DATA COLLECTION FOR AI - READY TO RUN!",
    "Execute main() to start the tutorial or run individual sections.",
):
    print(ready_line)

✅ Package installation complete!
🎯 DATA COLLECTION FOR AI: PRACTICAL EXAMPLES
Welcome to the comprehensive data collection tutorial!
This script will guide you through different data collection strategies.


📊 SECTION 1: UNDERSTANDING DATA TYPES

🔢 1.1 NUMERICAL DATA EXAMPLES
----------------------------------------
🏥 Healthcare Numerical Data:
   patient_id  age  systolic_bp  diastolic_bp  heart_rate  cholesterol   bmi
0           1   52          110            94          64          229  26.6
1           2   42          110            77          94          206  23.1
2           3   54          124            80          71          195  21.6
3           4   67           81            65          59          187  28.1
4           5   41           85            74          81          140  30.2
5           6   41          108            81          57          171  29.7
6           7   68           99            68          74          181  20.8
7           8   56          126      