# Academic Performance Analysis

This notebook analyzes student performance metrics across Bangladeshi educational institutions:
- GPA distributions and trends
- Subject-wise performance analysis
- Performance benchmarking
- Attendance impact analysis
- Grade progression patterns
- Performance ranking systems

**Key Performance Indicators:**
- Overall GPA distributions
- Subject performance scores
- Attendance rates
- Pass/fail rates
- Grade improvement trajectories

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
import sys
from pathlib import Path

# Add project root to Python path
sys.path.append('../..')
from src.data_processing.data_processor import DataProcessor

# Configure display options
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# 'seaborn-v0_8' is the style-sheet name used by matplotlib 3.6 and later.
plt.style.use('seaborn-v0_8')
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation notices from pandas, seaborn and scikit-learn.
warnings.filterwarnings('ignore')

# Set up plotting
# Default size for figures that do not pass their own figsize; set after
# style.use() so the style sheet cannot override it.
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette('RdYlBu_r')

## 1. Data Loading and Performance Calculation

In [None]:
# Load and process data
processor = DataProcessor()

# Create comprehensive sample data with performance metrics
# The records below are synthetic. Seeding NumPy's global RNG makes them
# reproducible, but only as long as the random draws inside the DataFrame
# literal stay in their current order.
np.random.seed(42)
n_students = 2000

performance_data = pd.DataFrame({
    # Identifiers: 'S0001' ... 'S2000' and 'Student 1' ... 'Student 2000'.
    'student_id': [f'S{i:04d}' for i in range(1, n_students + 1)],
    'name': [f'Student {i}' for i in range(1, n_students + 1)],
    # Categorical attributes; choices without p= are drawn uniformly.
    'division': np.random.choice(['Dhaka', 'Chittagong', 'Khulna', 'Rajshahi', 'Sylhet', 'Barishal', 'Rangpur', 'Mymensingh'], n_students),
    'institution_type': np.random.choice(['Government', 'Private', 'Madrasa'], n_students, p=[0.6, 0.3, 0.1]),
    'grade_level': np.random.choice([6, 7, 8, 9, 10, 11, 12], n_students),
    # Subject scores: independent normal draws (mean, std) clipped to the 0-5 scale.
    'bangla': np.random.normal(3.5, 0.8, n_students).clip(0, 5),
    'english': np.random.normal(3.2, 0.9, n_students).clip(0, 5),
    'mathematics': np.random.normal(3.0, 1.0, n_students).clip(0, 5),
    'science': np.random.normal(3.3, 0.9, n_students).clip(0, 5),
    'social_studies': np.random.normal(3.6, 0.7, n_students).clip(0, 5),
    # Beta(8, 2) yields fractions between 0 and 1 with mean 8 / (8 + 2) = 0.8.
    'attendance_rate': np.random.beta(8, 2, n_students),
    # Placeholder that only fixes the column position; filled in below.
    'days_present': None,
    'total_school_days': 200,
    'gender': np.random.choice(['Male', 'Female'], n_students),
    'socioeconomic_status': np.random.choice(['Low', 'Medium', 'High'], n_students, p=[0.4, 0.4, 0.2])
})

# Calculate derived metrics
subject_cols = ['bangla', 'english', 'mathematics', 'science', 'social_studies']
# GPA here is the unweighted mean of the five subject scores for each student.
performance_data['gpa'] = performance_data[subject_cols].mean(axis=1)
# astype(int) truncates, so fractional days are dropped rather than rounded.
performance_data['days_present'] = (performance_data['attendance_rate'] * performance_data['total_school_days']).astype(int)

# Apply performance calculation from processor
# NOTE(review): project helper not visible here; it presumably adds the
# 'performance_level' column that the next section reads - confirm against
# DataProcessor.calculate_performance_metrics.
performance_data = processor.calculate_performance_metrics(performance_data)

print(f"Performance data loaded: {performance_data.shape}")
# Last expression of the cell: the notebook renders the first five rows.
performance_data.head()

## 2. Overall Performance Distribution Analysis

In [None]:
# Overview dashboard: four panels on a single 2x2 figure.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Panel 1: GPA histogram (30 bins) with a KDE curve. Dashed vertical rules
# mark the mean (red) and the median (green); their values go in the legend.
gpa_mean = performance_data['gpa'].mean()
gpa_median = performance_data['gpa'].median()
sns.histplot(x='gpa', data=performance_data, bins=30, kde=True, ax=ax1)
ax1.axvline(gpa_mean, linestyle='--', color='red', label=f'Mean: {gpa_mean:.2f}')
ax1.axvline(gpa_median, linestyle='--', color='green', label=f'Median: {gpa_median:.2f}')
ax1.set(title='Overall GPA Distribution')
ax1.legend()

# Panel 2: share of students per performance level. The 'performance_level'
# column must already exist on performance_data at this point.
performance_counts = performance_data['performance_level'].value_counts()
ax2.pie(
    performance_counts.values,
    autopct='%1.1f%%',
    labels=performance_counts.index,
)
ax2.set(title='Performance Level Distribution')

# Panel 3: attendance against GPA, with a red regression line drawn over the
# points (regplot contributes only the line because scatter=False).
sns.scatterplot(x='attendance_rate', y='gpa', data=performance_data, alpha=0.6, ax=ax3)
sns.regplot(x='attendance_rate', y='gpa', data=performance_data, scatter=False, color='red', ax=ax3)
ax3.set(title='Attendance Rate vs GPA', xlabel='Attendance Rate', ylabel='GPA')

# Panel 4: GPA spread per institution type.
sns.boxplot(x='institution_type', y='gpa', data=performance_data, ax=ax4)
ax4.set(title='GPA Distribution by Institution Type')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Numeric summary of the GPA column.
print("\nGPA Statistics:")
print(performance_data['gpa'].describe())

# Correlation coefficient between attendance and GPA (Series.corr default method).
attendance = performance_data['attendance_rate']
correlation = attendance.corr(performance_data['gpa'])
print(f"\nCorrelation between Attendance and GPA: {correlation:.3f}")

## 3. Subject-wise Performance Analysis

In [None]:
# Per-subject summary table: describe() transposed so each subject is a row,
# plus the coefficient of variation (std relative to mean).
subject_scores = performance_data[subject_cols]
subject_stats = subject_scores.describe().T
subject_stats['coefficient_of_variation'] = subject_stats['std'] / subject_stats['mean']

print("Subject-wise Performance Statistics:")
print(subject_stats.round(3))

# Subject dashboard (2x2 grid).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Panel 1: mean score per subject as horizontal bars, sorted ascending.
subject_means = performance_data[subject_cols].mean().sort_values(ascending=True)
subject_means.plot.barh(ax=ax1, color='skyblue')
ax1.set(title='Average Performance by Subject', xlabel='Average Score')

# Panel 2: pairwise correlation between subject scores, colour scale centred on 0.
subject_corr = performance_data[subject_cols].corr()
sns.heatmap(subject_corr, center=0, cmap='coolwarm', annot=True, ax=ax2)
ax2.set(title='Subject Performance Correlations')

# Panel 3: score distribution per subject. melt() reshapes to long form with
# its default column names 'variable' (subject) and 'value' (score).
subject_melted = performance_data[subject_cols].melt()
sns.violinplot(x='variable', y='value', data=subject_melted, ax=ax3)
ax3.set(title='Subject Score Distributions', xlabel='Subject', ylabel='Score')
ax3.tick_params(axis='x', rotation=45)

# Panel 4: coefficient of variation per subject, sorted ascending.
subject_std = performance_data[subject_cols].std()
subject_cv = (subject_std / performance_data[subject_cols].mean()).sort_values()
subject_cv.plot.bar(ax=ax4, color='orange')
ax4.set(
    title='Subject Performance Variability (Coefficient of Variation)',
    ylabel='Coefficient of Variation',
)
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Interactive radar chart for top performers.
# nlargest() returns the ten highest-GPA rows, highest first; only the subject
# scores and the student id are kept.
top_performers = performance_data.nlargest(10, 'gpa')[subject_cols + ['student_id']]

fig_radar = go.Figure()

# Draw one filled polygon per student for the first three rows only.
for _, student in top_performers.iloc[:3].iterrows():
    polygon = go.Scatterpolar(
        r=student[subject_cols].values,
        theta=subject_cols,
        fill='toself',
        name=student['student_id'],
        opacity=0.6,
    )
    fig_radar.add_trace(polygon)

# Radial axis fixed to the 0-5 score scale.
radial_axis = dict(visible=True, range=[0, 5])
fig_radar.update_layout(
    title="Top 3 Performers - Subject Comparison",
    showlegend=True,
    polar=dict(radialaxis=radial_axis),
)

fig_radar.show()

## 4. Performance Benchmarking and Ranking

In [None]:
# Performance benchmarking
def calculate_percentile_ranks(df, score_col):
    """Attach a percentile-rank column for ``score_col`` to ``df``.

    The new column is named ``'<score_col>_percentile'`` and holds the
    fractional rank of each value (``rank(pct=True)``) scaled to 0-100.
    ``df`` is modified in place and the same object is returned.
    """
    percentile_col = f'{score_col}_percentile'
    fractional_rank = df[score_col].rank(pct=True)
    df[percentile_col] = fractional_rank * 100
    return df

# Calculate percentile ranks
# Adds the 'gpa_percentile' column (percentile rank of each student's GPA on a
# 0-100 scale), which the performance categories below are derived from.
performance_data = calculate_percentile_ranks(performance_data, 'gpa')

# Performance categories
def categorize_performance(percentile):
    """Translate a percentile rank into a named performance band.

    Cut-offs are checked from the highest down, and a value sitting exactly
    on a cut-off belongs to the higher band. Any value that clears no
    cut-off falls into 'Bottom 25%'.
    """
    bands = (
        (90, 'Top 10%'),
        (75, 'Top 25%'),
        (50, 'Above Average'),
        (25, 'Below Average'),
    )
    for cutoff, label in bands:
        if percentile >= cutoff:
            return label
    return 'Bottom 25%'

# Label every student with the band that matches their GPA percentile.
performance_data['performance_category'] = performance_data['gpa_percentile'].apply(categorize_performance)

# Benchmarking dashboard (2x2 grid).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Panel 1: share of students in each performance band.
category_counts = performance_data['performance_category'].value_counts()
ax1.pie(
    category_counts.values,
    autopct='%1.1f%%',
    labels=category_counts.index,
)
ax1.set(title='Performance Category Distribution')

# Panel 2: how many 'Top 10%' students each division has.
in_top_band = performance_data['performance_category'] == 'Top 10%'
top_10_percent = performance_data[in_top_band]
division_top_performers = top_10_percent['division'].value_counts()
division_top_performers.plot.bar(ax=ax2, color='gold')
ax2.set(title='Top 10% Performers by Division', ylabel='Number of Top Performers')
ax2.tick_params(axis='x', rotation=45)

# Panel 3: GPA spread per grade level.
sns.boxplot(x='grade_level', y='gpa', data=performance_data, ax=ax3)
ax3.set(title='GPA Distribution by Grade Level', xlabel='Grade Level', ylabel='GPA')

# Panel 4: histogram of the percentile ranks themselves (20 bins).
sns.histplot(x='gpa_percentile', data=performance_data, bins=20, ax=ax4)
ax4.set(title='GPA Percentile Distribution', xlabel='Percentile', ylabel='Frequency')

plt.tight_layout()
plt.show()

# Per institution type: GPA mean / std / count and mean attendance, rounded
# to three decimals.
institution_aggregations = {
    'gpa': ['mean', 'std', 'count'],
    'attendance_rate': 'mean',
}
institution_performance = (
    performance_data
    .groupby('institution_type')
    .agg(institution_aggregations)
    .round(3)
)

print("\nPerformance Summary by Institution Type:")
print(institution_performance)

## 5. Performance Clustering Analysis

In [None]:
# Prepare data for clustering
# NOTE(review): 'gpa' is computed earlier as the mean of the subject columns,
# so including it alongside them adds a redundant feature - confirm this
# extra weight on overall score level is intended.
clustering_features = ['gpa', 'attendance_rate'] + subject_cols
# Missing values are replaced with the column mean before scaling.
clustering_data = performance_data[clustering_features].fillna(performance_data[clustering_features].mean())

# Standardize features
# Puts GPA/subject scores (0-5) and attendance (0-1) on a comparable scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(clustering_data)

# Perform K-means clustering
n_clusters = 4
# random_state pins the centroid initialisation for repeatable runs.
# NOTE(review): n_init is left at the library default, which has differed
# between scikit-learn releases - consider pinning it if cluster assignments
# must be stable across environments.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Store each student's cluster id (0 .. n_clusters - 1) on the main frame.
performance_data['cluster'] = clusters

# Analyze clusters
# Means are taken on the original, unscaled columns so they read in score units.
cluster_analysis = performance_data.groupby('cluster')[clustering_features].mean()

print("Performance Clusters Analysis:")
print(cluster_analysis.round(3))

# Cluster dashboard (2x2 grid).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Panel 1: students in the GPA / attendance plane, coloured by cluster id.
scatter = ax1.scatter(
    performance_data['gpa'],
    performance_data['attendance_rate'],
    c=performance_data['cluster'],
    alpha=0.6,
    cmap='viridis',
)
ax1.set(
    xlabel='GPA',
    ylabel='Attendance Rate',
    title='Performance Clusters (GPA vs Attendance)',
)
plt.colorbar(scatter, ax=ax1, label='Cluster')

# Panel 2: share of students per cluster, in cluster-id order.
cluster_counts = performance_data['cluster'].value_counts().sort_index()
slice_labels = [f'Cluster {cluster_id}' for cluster_id in cluster_counts.index]
ax2.pie(cluster_counts.values, labels=slice_labels, autopct='%1.1f%%')
ax2.set(title='Cluster Size Distribution')

# Panel 3: mean subject score per cluster. Transposing puts subjects on the
# x axis with one bar per cluster.
cluster_subject_means = performance_data.groupby('cluster')[subject_cols].mean()
cluster_subject_means.T.plot.bar(ax=ax3)
ax3.set(title='Average Subject Performance by Cluster', ylabel='Average Score')
ax3.tick_params(axis='x', rotation=45)
ax3.legend(title='Cluster', loc='upper left', bbox_to_anchor=(1.05, 1))

# Panel 4: heatmap of the per-cluster feature means, colour scale centred on
# the grand mean of that table.
heatmap_center = cluster_analysis.mean().mean()
sns.heatmap(cluster_analysis, annot=True, cmap='RdYlBu_r', center=heatmap_center, ax=ax4)
ax4.set(title='Cluster Characteristics Heatmap', ylabel='Cluster')

plt.tight_layout()
plt.show()

# Cluster interpretation
# NOTE(review): K-means cluster ids are arbitrary, and nothing here ties an id
# to the characteristics of its cluster, so these names are not derived from
# the data. Check them against the cluster means printed above before relying
# on them; the mapping also has to be kept in step with n_clusters by hand.
cluster_labels = {
    0: 'Struggling Students',
    1: 'Average Performers',
    2: 'High Achievers',
    3: 'Inconsistent Performers'
}

print("\nCluster Interpretation:")
# For each labelled cluster: size, share of all students, mean GPA and mean
# attendance rate.
for cluster_id, label in cluster_labels.items():
    cluster_data = performance_data[performance_data['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} - {label}:")
    print(f"  Size: {len(cluster_data)} students ({len(cluster_data)/len(performance_data)*100:.1f}%)")
    print(f"  Average GPA: {cluster_data['gpa'].mean():.2f}")
    print(f"  Average Attendance: {cluster_data['attendance_rate'].mean():.2f}")

## 6. Performance Improvement Recommendations

In [None]:
# Identify students needing intervention
def identify_intervention_students(df):
    """Return the rows of ``df`` that meet at least one intervention criterion.

    A student is selected when any of the following holds:
    GPA below 2.5, attendance rate below 0.7, or a performance category of
    'Bottom 25%' or 'Below Average'. ``df`` must provide the columns
    'gpa', 'attendance_rate' and 'performance_category'.
    """
    low_gpa = df['gpa'] < 2.5
    poor_attendance = df['attendance_rate'] < 0.7
    low_category = df['performance_category'].isin(['Bottom 25%', 'Below Average'])

    needs_intervention = low_gpa | poor_attendance | low_category
    return df[needs_intervention]

intervention_students = identify_intervention_students(performance_data)

flagged_count = len(intervention_students)
flagged_share = flagged_count / len(performance_data) * 100
print(f"Students needing intervention: {flagged_count} ({flagged_share:.1f}%)")

# Head-count per individual risk factor. 'Multiple Risk Factors' counts
# students who have both a low GPA and poor attendance.
low_gpa_mask = performance_data['gpa'] < 2.5
poor_attendance_mask = performance_data['attendance_rate'] < 0.7
low_category_mask = performance_data['performance_category'].isin(['Bottom 25%', 'Below Average'])

intervention_analysis = {
    'Low GPA (< 2.5)': int(low_gpa_mask.sum()),
    'Poor Attendance (< 70%)': int(poor_attendance_mask.sum()),
    'Low Performance Category': int(low_category_mask.sum()),
    'Multiple Risk Factors': int((low_gpa_mask & poor_attendance_mask).sum()),
}

# Intervention dashboard (2x2 grid).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Panel 1: number of students per risk factor.
intervention_counts = pd.Series(intervention_analysis)
intervention_counts.plot.bar(ax=ax1, color='red', alpha=0.7)
ax1.set(title='Students Needing Different Types of Intervention', ylabel='Number of Students')
ax1.tick_params(axis='x', rotation=45)

# Panel 2: per subject, how many students score below 2.5.
below_threshold = performance_data[subject_cols] < 2.5
subject_weaknesses = below_threshold.sum()
subject_weaknesses.plot.bar(ax=ax2, color='orange')
ax2.set(title='Number of Students with Low Performance by Subject', ylabel='Number of Students')
ax2.tick_params(axis='x', rotation=45)

# Panel 3: distance from the maximum GPA of 5.0 against attendance, coloured
# by socioeconomic status. The gap is stored as a new column on the frame.
performance_data['improvement_potential'] = 5.0 - performance_data['gpa']
sns.scatterplot(
    x='attendance_rate',
    y='improvement_potential',
    hue='socioeconomic_status',
    data=performance_data,
    alpha=0.6,
    ax=ax3,
)
ax3.set(
    title='Performance Improvement Potential vs Attendance',
    xlabel='Attendance Rate',
    ylabel='Improvement Potential (5.0 - Current GPA)',
)

# Panel 4: flagged students per division.
division_intervention = intervention_students['division'].value_counts()
division_intervention.plot.bar(ax=ax4, color='purple', alpha=0.7)
ax4.set(title='Students Needing Intervention by Division', ylabel='Number of Students')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Recommendations
# Plain-text summary: headline figures computed from performance_data, the
# subject with the most students scoring below 2.5 (from subject_weaknesses),
# and a fixed list of suggested actions.
print("\n🎯 PERFORMANCE IMPROVEMENT RECOMMENDATIONS")
print("=" * 50)
# The leading blank line is written as an explicit "\n" escape: a string in
# single double-quotes cannot span two physical lines, and doing so made the
# whole cell fail to parse.
print("\n📊 Overall Performance Insights:")
print(f"   • Average GPA: {performance_data['gpa'].mean():.2f}")
print(f"   • Students below 2.5 GPA: {len(performance_data[performance_data['gpa'] < 2.5])}")
print(f"   • Average attendance rate: {performance_data['attendance_rate'].mean():.2%}")

print("📚 Subject-specific Actions:")
# idxmax() picks the subject with the largest count of low scorers.
worst_subject = subject_weaknesses.idxmax()
print(f"   • Focus on {worst_subject}: {subject_weaknesses[worst_subject]} students struggling")
# The remaining bullet points are static text, not derived from the data.
print("   • Strengthen mathematics and science programs")
print("   • Enhance language instruction methods")

print("🏫 Institutional Recommendations:")
print("   • Implement attendance monitoring systems")
print("   • Develop targeted tutoring programs")
print("   • Create performance tracking dashboards")
print("   • Establish early warning systems for at-risk students")