# Exploratory Data Analysis Notebook

## Assignment Task 3: Exploratory Data Analysis (EDA)

This notebook covers detailed exploratory data analysis including:
- Comprehensive statistical analysis
- Advanced visualization techniques
- Feature relationship exploration
- Outlier detection and analysis
- Key insights extraction

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import sys
import os

# Add src directory to path
sys.path.append('../src')

# Import our custom modules
from data.data_loader import DataLoader
from analysis.eda import ExploratoryDataAnalysis, AdvancedVisualizations
from visualization.plots import DataVisualizer, InteractiveVisualizer

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this also hides deprecation and numerical warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# IPython magic (not plain Python): draw matplotlib figures inline in the cell output.
%matplotlib inline
# Apply the 'seaborn-v0_8' matplotlib style sheet to subsequent figures.
# NOTE(review): this style name presumably requires a recent matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
# Make "husl" the default seaborn colour palette.
sns.set_palette("husl")

## 1. Load and Prepare Dataset

In [None]:
# Initialize the data loader, pointing it at the raw-data directory
loader = DataLoader('../data/raw')

# Load dataset (replace the synthetic sample below with your actual dataset)
try:
    # Example: dataset = loader.load_dataset('your_dataset.csv')
    print("Please load your dataset using loader.load_dataset('filename.csv')")
    print("For now, using sample data for demonstration.")

    # Synthetic sample dataset; the fixed seed makes every run reproducible
    n_rows = 1000
    np.random.seed(42)
    sample_data = {
        'id': range(1, n_rows + 1),
        'age': np.random.normal(35, 10, n_rows),
        'income': np.random.lognormal(10, 1, n_rows),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_rows),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_rows),
        'experience': np.random.normal(8, 5, n_rows),
        'satisfaction': np.random.uniform(1, 10, n_rows),
        'target': np.random.choice([0, 1], n_rows, p=[0.7, 0.3]),
    }

    # Blank out 50 distinct rows for realism: the first 25 drawn indices lose
    # 'income', the remaining 25 lose 'satisfaction'. Both columns are float
    # arrays, so NaN can be assigned through an index array in one step.
    missing_indices = np.random.choice(n_rows, 50, replace=False)
    sample_data['income'][missing_indices[:25]] = np.nan
    sample_data['satisfaction'][missing_indices[25:]] = np.nan

    dataset = pd.DataFrame(sample_data)
    # Attach the frame to the loader; the private helper presumably refreshes
    # the loader's cached dataset info — TODO confirm and expose a public API
    loader.dataset = dataset
    loader._extract_dataset_info()

    print(f"Dataset loaded with {len(dataset)} rows and {len(dataset.columns)} columns")

except Exception as e:
    print(f"Error loading dataset: {e}")
    # Every later cell needs the dataset, so stop here instead of letting the
    # failure resurface as an unrelated error further down the notebook.
    raise

## 2. Initialize EDA Components

In [None]:
# Column treated as the target variable — change this to match your dataset
target_column = 'target'

# Build the analysis and plotting helpers on top of the loaded dataset
eda = ExploratoryDataAnalysis(loader.dataset, target_column)
visualizer = DataVisualizer(loader.dataset)
interactive_visualizer = InteractiveVisualizer(loader.dataset)

# Confirm the setup and echo the chosen target
print(
    "EDA components initialized successfully.",
    f"Target variable: {target_column}",
    sep="\n",
)

## 3. Comprehensive Statistical Analysis

In [None]:
# Cell banner
print("=" * 60, "COMPREHENSIVE STATISTICAL ANALYSIS", "=" * 60, sep="\n")

# Section 1: basic statistics as reported by the EDA helper
print("\n1. BASIC STATISTICS", "-" * 20, sep="\n")
basic_stats = eda.get_basic_statistics()
print(basic_stats)

# Section 2: data types analysis
print("\n2. DATA TYPES ANALYSIS", "-" * 23, sep="\n")
dtypes_analysis = eda.analyze_data_types()
print(dtypes_analysis)

# Section 3: missing data — print the result, or a notice when it is empty
print("\n3. MISSING DATA ANALYSIS", "-" * 25, sep="\n")
missing_data = eda.analyze_missing_data()
print("No missing data found." if missing_data.empty else missing_data)

# Section 4: outliers — print the result, or a notice when it is empty
print("\n4. OUTLIER DETECTION", "-" * 20, sep="\n")
outliers = eda.detect_outliers()
print("No outliers detected using IQR method." if outliers.empty else outliers)

## 4. Advanced Visualizations

In [None]:
# Cell banner
print("=" * 40, "ADVANCED VISUALIZATIONS", "=" * 40, sep="\n")

# Section 1: distribution plots, skipped when there are no numerical columns
print("\n1. DISTRIBUTION PLOTS", "-" * 20, sep="\n")
if eda.numeric_columns:
    fig1 = eda.plot_distribution(figsize=(15, 10))
    plt.show()

# Section 2: categorical distributions, skipped when there are no categorical columns
print("\n2. CATEGORICAL DISTRIBUTIONS", "-" * 30, sep="\n")
if eda.categorical_columns:
    fig2 = eda.plot_categorical_distributions(figsize=(15, 8))
    plt.show()

# Section 3: static correlation matrix — needs at least two numerical columns
print("\n3. CORRELATION MATRIX", "-" * 20, sep="\n")
if len(eda.numeric_columns) > 1:
    fig3 = eda.plot_correlation_matrix(figsize=(12, 10))
    plt.show()

# Section 4: interactive correlation matrix, under the same two-column requirement
print("\n4. INTERACTIVE CORRELATION MATRIX", "-" * 35, sep="\n")
if len(eda.numeric_columns) > 1:
    interactive_fig = interactive_visualizer.interactive_correlation_matrix()
    interactive_fig.show()

## 5. Target Variable Analysis

In [None]:
print("=" * 40)
print("TARGET VARIABLE ANALYSIS")
print("=" * 40)

# The whole analysis is skipped (with a notice) when the target column is absent
if target_column in loader.dataset.columns:
    # Target distribution: count of rows per target value
    print("\n1. TARGET DISTRIBUTION")
    print("-" * 22)
    target_counts = loader.dataset[target_column].value_counts()
    print(target_counts)

    # Bar chart of the same counts
    fig, ax = plt.subplots(figsize=(8, 6))
    target_counts.plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'Distribution of {target_column}')
    ax.set_xlabel(target_column)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

    # Relationship analysis between each feature and the target
    print("\n2. FEATURE-TARGET RELATIONSHIPS")
    print("-" * 32)
    relationships = eda.analyze_target_relationships()

    # NOTE: the loop variable must not be called `stats` — that name is the
    # notebook-wide `scipy.stats` import, and rebinding it to a result dict
    # would break every later `stats.<function>` call.
    if 'numerical_correlations' in relationships:
        print("Top numerical feature correlations with target:")
        for feature, feature_stats in relationships['numerical_correlations'].items():
            significance = "significant" if feature_stats['significant'] else "not significant"
            print(f"  {feature}: {feature_stats['correlation']:.3f} ({significance})")

    if 'categorical_anova' in relationships:
        print("\nCategorical feature relationships with target:")
        for feature, feature_stats in relationships['categorical_anova'].items():
            significance = "significant" if feature_stats['significant'] else "not significant"
            print(f"  {feature}: F={feature_stats['f_statistic']:.3f} ({significance})")

    # Boxplots by target, limited to the first six numerical columns
    print("\n3. NUMERICAL FEATURES BY TARGET")
    print("-" * 32)
    if len(eda.numeric_columns) > 0:
        fig4 = AdvancedVisualizations.plot_boxplots_by_target(
            loader.dataset, target_column, eda.numeric_columns[:6], figsize=(15, 10)
        )
        plt.show()
else:
    print(f"Target column '{target_column}' not found in dataset.")

## 6. Key Insights and Findings

In [None]:
print("=" * 40)
print("KEY INSIGHTS AND FINDINGS")
print("=" * 40)

# Generate EDA report
eda_report = eda.generate_eda_report()
print(eda_report)

# Additional insights
print("ADDITIONAL INSIGHTS:")
print("-" * 18)

# Feature counts and the missing-value total describe the whole dataset, so
# they are reported even when the dataset has no numerical columns.
numeric_count = len(eda.numeric_columns)
print(f"1. Dataset contains {numeric_count} numerical features.")
print(f"2. Dataset contains {len(eda.categorical_columns)} categorical features.")
print(f"3. Total missing values: {loader.dataset.isnull().sum().sum()}")

# Pairwise correlations need at least two numerical columns
if numeric_count > 1:
    corr_matrix = loader.dataset[eda.numeric_columns].corr()
    column_names = corr_matrix.columns

    # Walk the upper triangle only, so each pair is visited once and the
    # diagonal (self-correlation) is skipped; keep pairs with |r| > 0.5.
    strong_corrs = []
    for i in range(len(column_names)):
        for j in range(i + 1, len(column_names)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.5:
                strong_corrs.append((column_names[i], column_names[j], corr_val))

    # Rank by absolute strength so the slice below really is the top five
    # rather than the first five in column order.
    strong_corrs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    if strong_corrs:
        print("4. Strong correlations found:")
        for var1, var2, corr in strong_corrs[:5]:  # Show top 5
            print(f"   {var1} ↔ {var2}: {corr:.3f}")
    else:
        print("4. No strong correlations (|r| > 0.5) found.")

print("\nRECOMMENDATIONS:")
print("-" * 15)
print("1. Address missing values in the data cleaning phase.")
print("2. Consider feature engineering for highly correlated variables.")
print("3. Investigate outliers in detail during data cleaning.")
print("4. Use the identified target relationships for model development.")

## 7. Summary Statistics Dashboard

In [None]:
# Cell banner for the interactive dashboard
print("=" * 40, "INTERACTIVE DASHBOARD", "=" * 40, sep="\n")

# Best-effort: any failure while building or showing the dashboard is
# reported to the cell output and not raised.
try:
    dashboard = AdvancedVisualizations.create_eda_dashboard(loader.dataset, target_column)
    dashboard.show()
except Exception as error:
    print(f"Could not create dashboard: {error}")
    print("Dashboard creation requires more complex setup.")

## 8. Next Steps

In [None]:
# Closing banner
print("=" * 30, "NEXT STEPS", "=" * 30, sep="\n")

# Follow-up work, one line per step
print(
    "1. Proceed to data cleaning and preprocessing notebook",
    "2. Address identified data quality issues",
    "3. Implement feature engineering strategies",
    "4. Prepare data for machine learning models",
    "5. Document all findings in the final report",
    sep="\n",
)

# Final status line, preceded by a blank line
print("\nEDA COMPLETED SUCCESSFULLY!")