# Data Overview and Initial Exploration

This notebook provides a comprehensive overview of the Bangladesh student data, including:
- Data structure and dimensions
- Missing value analysis
- Data type distributions
- Initial statistical summaries
- Data quality assessment

**Data Sources:**
- BANBEIS (Bangladesh Bureau of Educational Information and Statistics)
- Education Board Results
- DSHE (Directorate of Secondary and Higher Education)
- DPE (Directorate of Primary Education)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
from pathlib import Path

# Add project root to Python path
sys.path.append('../..')
from src.data_processing.data_processor import DataProcessor

# Notebook-wide display defaults: show every column, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Base matplotlib style first, so the rcParams override below is not reset by it.
plt.style.use('seaborn-v0_8')

# Silence all warnings to keep cell output readable.
warnings.filterwarnings('ignore')

# Default figure size and seaborn colour palette for every plot that follows.
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette('viridis')

## 1. Data Loading and Initial Inspection

In [None]:
# Initialize data processor
processor = DataProcessor()

# Load the 'academic' dataset (modify source as needed). If loading fails for
# any reason, fall back to a synthetic sample so the remaining cells still run.
try:
    student_data = processor.load_student_data('academic')
    print(f"Data loaded successfully: {student_data.shape}")
except Exception as e:  # deliberately broad: any load failure triggers the demo fallback
    print(f"Error loading data: {e}")

    # Seeded generator so the demonstration sample is identical on every run;
    # the sample size lives in one place instead of being repeated per column.
    n_samples = 1000
    rng = np.random.default_rng(42)
    region_names = ['Dhaka', 'Chittagong', 'Khulna', 'Rajshahi', 'Sylhet']
    sample_numbers = range(1, n_samples + 1)

    # Create sample data for demonstration
    student_data = pd.DataFrame({
        'student_id': [f'S{i:04d}' for i in sample_numbers],
        'name': [f'Student {i}' for i in sample_numbers],
        'division': rng.choice(region_names, n_samples),
        # NOTE: district is drawn independently of division, from the same names.
        'district': rng.choice(region_names, n_samples),
        'gender': rng.choice(['Male', 'Female'], n_samples),
        'age': rng.integers(15, 25, n_samples),  # upper bound exclusive: ages 15-24
        'gpa': rng.uniform(2.0, 5.0, n_samples),
        'attendance_rate': rng.uniform(0.6, 1.0, n_samples),
        'socioeconomic_status': rng.choice(['Low', 'Medium', 'High'], n_samples),
    })
    print("Sample data created for demonstration")

# Display basic information
print(f"\nDataset Shape: {student_data.shape}")
print(f"Number of students: {len(student_data):,}")
print(f"Number of features: {len(student_data.columns)}")

# Show first few rows (rendered as the cell's output)
student_data.head()

## 2. Data Structure Analysis

In [None]:
# Data types of every column
print("Data Types:")
print(student_data.dtypes)
print("\n")

# Memory usage. DataFrame.info() writes its report to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("Memory Usage:")
student_data.info(memory_usage='deep')

# Numbered list of column names
print(f"\nColumn Names ({len(student_data.columns)}):")
for i, col in enumerate(student_data.columns, 1):
    print(f"{i:2d}. {col}")

## 3. Missing Value Analysis

In [None]:
# Missing value analysis: per-column null counts, computed once and reused
# for the percentage column.
null_counts = student_data.isnull().sum()
missing_data = pd.DataFrame({
    'Column': student_data.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(student_data)) * 100
})

# Order by share of missing values, then keep only columns that have gaps.
missing_data = missing_data.sort_values('Missing_Percentage', ascending=False)
has_gaps = missing_data['Missing_Count'] > 0
missing_data = missing_data[has_gaps]

if missing_data.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing Values Summary:")
    print(missing_data)

    # Two panels side by side: cell-level heatmap and per-column percentages.
    fig, (heatmap_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: one cell per value, highlighted where the value is null.
    sns.heatmap(student_data.isnull(), yticklabels=False, cbar=True, ax=heatmap_ax)
    heatmap_ax.set_title('Missing Values Heatmap')

    # Right panel: percentage of missing values for each affected column.
    missing_data.plot(x='Column', y='Missing_Percentage', kind='bar', ax=bar_ax)
    bar_ax.set_title('Missing Values Percentage by Column')
    bar_ax.set_ylabel('Missing Percentage (%)')
    bar_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 4. Statistical Summary

In [None]:
# Numeric columns: descriptive statistics from describe().
numerical_cols = student_data.select_dtypes(include=[np.number]).columns
print("Numerical Columns Summary:")
numeric_summary = student_data[numerical_cols].describe()
print(numeric_summary)

# Object-dtype columns: frequency table plus distinct-value count for each.
categorical_cols = student_data.select_dtypes(include=['object']).columns
print("\nCategorical Columns Summary:")
for col in categorical_cols:
    column_values = student_data[col]
    print(f"\n{col}:")
    print(column_values.value_counts())
    print(f"Unique values: {column_values.nunique()}")

## 5. Data Distribution Visualization

In [None]:
# Distribution plots for numerical variables: one histogram (with KDE overlay)
# per numeric column, laid out on a grid at most three panels wide.
if len(numerical_cols) > 0:
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes subplots() always return a 2-D array of Axes, so the
    # single-panel, single-row and multi-row cases are handled the same way.
    # (Previously a single numeric column ended up passing a list as the Axes.)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        # Histogram with KDE
        sns.histplot(data=student_data, x=col, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_ylabel('Frequency')

    # Hide grid cells left over when the column count does not fill the last row.
    for unused_ax in axes[len(numerical_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 6. Geographic Distribution

In [None]:
# Geographic distribution: student counts per division, shown as a bar chart,
# a pie chart, and an interactive plotly bar chart.
if 'division' in student_data.columns:
    division_counts = student_data['division'].value_counts()

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: absolute counts.
    division_counts.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Student Distribution by Division')
    bar_ax.set_ylabel('Number of Students')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: share of each division, labelled to one decimal place.
    pie_ax.pie(
        division_counts.values,
        labels=division_counts.index,
        autopct='%1.1f%%',
    )
    pie_ax.set_title('Student Distribution by Division (Percentage)')

    plt.tight_layout()
    plt.show()

    # Same counts as an interactive plotly figure.
    axis_labels = {'x': 'Division', 'y': 'Number of Students'}
    fig_plotly = px.bar(
        x=division_counts.index,
        y=division_counts.values,
        title='Interactive Student Distribution by Division',
        labels=axis_labels,
    )
    fig_plotly.show()

## 7. Correlation Analysis

In [None]:
# Pairwise correlation of the numeric columns (needs at least two of them).
if len(numerical_cols) > 1:
    correlation_matrix = student_data[numerical_cols].corr()

    # Annotated heatmap, colour scale centred on zero.
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
    )
    plt.title('Correlation Matrix of Numerical Variables')
    plt.tight_layout()
    plt.show()

    # Walk the upper triangle only, so each pair is visited once and the
    # diagonal (self-correlation) is skipped.
    variable_names = correlation_matrix.columns
    high_corr = []
    for i, first_var in enumerate(variable_names):
        for j, second_var in enumerate(variable_names[i + 1:], start=i + 1):
            coefficient = correlation_matrix.iloc[i, j]
            if abs(coefficient) > 0.7:
                high_corr.append((first_var, second_var, coefficient))

    if high_corr:
        print("\nHighly Correlated Variable Pairs (|r| > 0.7):")
        for var1, var2, corr in high_corr:
            print(f"{var1} - {var2}: {corr:.3f}")

## 8. Data Quality Assessment

In [None]:
# Data quality assessment
def assess_data_quality(df):
    """Summarize basic data quality metrics for a DataFrame.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A dict with the keys:
            total_records: number of rows.
            total_features: number of columns.
            missing_values_total: count of null cells across the frame.
            missing_percentage: null cells as a percentage of all cells
                (0.0 when the frame has no cells).
            duplicate_records: rows flagged by ``df.duplicated()``.
            duplicate_percentage: duplicate rows as a percentage of all rows
                (0.0 when the frame has no rows).
            data_types: mapping of dtype to the number of columns with it.
    """
    n_records = len(df)
    n_features = len(df.columns)
    n_cells = n_records * n_features

    # Compute each aggregate once; int() converts numpy scalars to plain ints.
    missing_total = int(df.isnull().sum().sum())
    duplicate_total = int(df.duplicated().sum())

    # Guard the ratios so an empty frame reports 0% instead of dividing by zero.
    missing_pct = (missing_total / n_cells) * 100 if n_cells else 0.0
    duplicate_pct = (duplicate_total / n_records) * 100 if n_records else 0.0

    return {
        # Basic metrics
        'total_records': n_records,
        'total_features': n_features,
        'missing_values_total': missing_total,
        'missing_percentage': missing_pct,
        # Duplicates
        'duplicate_records': duplicate_total,
        'duplicate_percentage': duplicate_pct,
        # Data types
        'data_types': df.dtypes.value_counts().to_dict(),
    }

# Build the report for the loaded dataset and print it as a short text summary.
quality_report = assess_data_quality(student_data)

missing_total = quality_report['missing_values_total']
missing_pct = quality_report['missing_percentage']
duplicate_total = quality_report['duplicate_records']
duplicate_pct = quality_report['duplicate_percentage']

print("DATA QUALITY REPORT")
print("=" * 50)
print(f"Total Records: {quality_report['total_records']:,}")
print(f"Total Features: {quality_report['total_features']}")
print(f"Missing Values: {missing_total:,} ({missing_pct:.2f}%)")
print(f"Duplicate Records: {duplicate_total:,} ({duplicate_pct:.2f}%)")

# One line per dtype with the number of columns that use it.
print("\nData Types Distribution:")
for dtype, count in quality_report['data_types'].items():
    print(f"  {dtype}: {count} columns")

## 9. Key Insights Summary

In [None]:
# Print a short, human-readable summary of the findings from the cells above.
print("KEY INSIGHTS FROM EXPLORATORY ANALYSIS")
print("=" * 50)

# Dataset overview
print(f"📊 Dataset contains {len(student_data):,} student records with {len(student_data.columns)} features")

# Geographic distribution
if 'division' in student_data.columns:
    division_modes = student_data['division'].mode()
    # mode() is empty when the column holds only missing values; skip the
    # line in that case instead of failing on the first-element lookup.
    if not division_modes.empty:
        most_represented = division_modes.iloc[0]
        print(f"🗺️  Most represented division: {most_represented}")

# Performance insights
if 'gpa' in student_data.columns:
    avg_gpa = student_data['gpa'].mean()
    print(f"📈 Average GPA: {avg_gpa:.2f}")

# Gender distribution
if 'gender' in student_data.columns:
    gender_dist = student_data['gender'].value_counts(normalize=True) * 100
    # Convert to plain floats so the dict prints as {'Male': 51.2, ...} rather
    # than showing numpy scalar wrappers such as np.float64(51.2).
    gender_pct = {label: float(pct) for label, pct in gender_dist.round(1).items()}
    print(f"👥 Gender distribution: {gender_pct}")

# Data quality: under 5% missing cells is treated as good.
if quality_report['missing_percentage'] < 5:
    print(f"✅ Data quality is good - only {quality_report['missing_percentage']:.1f}% missing values")
else:
    print(f"⚠️  Data quality needs attention - {quality_report['missing_percentage']:.1f}% missing values")

print("\n📝 Recommendations for next steps:")
print("   • Proceed with demographic analysis")
print("   • Investigate performance patterns")
print("   • Analyze geographic variations")
print("   • Examine socioeconomic factors")