In [1]:
# Advanced JEE Dropout Analysis

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix, 
                           accuracy_score, roc_auc_score, roc_curve, 
                           precision_recall_curve, average_precision_score)
from sklearn.feature_selection import SelectKBest, f_classif, RFECV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import shap
import eli5
from eli5.sklearn import PermutationImportance
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn')
%matplotlib inline
pd.set_option('display.max_columns', None)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\14AC0036AU\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\14AC0036AU\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\14AC0036AU\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\14AC0036AU\AppData\Local\Programs\Python\Python310\lib

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [2]:
import sys
!{sys.executable} -m pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Collecting sklearn-compat<1,>=0.1
  Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3



[notice] A new release of pip available: 22.2.2 -> 25.1.1
[notice] To update, run: C:\Users\14AC0036AU\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [3]:
# Set style for plots
plt.style.use('seaborn')
%matplotlib inline
pd.set_option('display.max_columns', None)

## 1. Advanced Data Loading and Initial Exploration

# Load the dataset
df = pd.read_csv('JEE_Dropout_After_Class_12.csv')

# Advanced data profiling function
def advanced_data_profile(df):
    """Summarise every column of ``df`` in one table.

    The returned DataFrame is indexed by column name and holds the dtype,
    the count and percentage of missing values, the number of distinct
    values, the ratio of distinct values to row count, and the skewness and
    kurtosis (computed with ``numeric_only=True``, so columns left out of
    that computation show NaN).
    """
    row_count = len(df)
    null_mask = df.isnull()
    distinct_counts = df.nunique()

    columns = {}
    columns['dtype'] = df.dtypes
    columns['missing_values'] = null_mask.sum()
    columns['missing_%'] = (null_mask.mean() * 100).round(2)
    columns['unique_values'] = distinct_counts
    columns['cardinality'] = distinct_counts / row_count
    columns['skewness'] = df.skew(numeric_only=True)
    columns['kurtosis'] = df.kurt(numeric_only=True)
    return pd.DataFrame(columns)

# Show the per-column profile table
print("Advanced Data Profile:")
data_profile = advanced_data_profile(df)
display(data_profile)

# Interactive correlation heatmap (Plotly) over the int64/float64 columns
import plotly.express as px

numeric_frame = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_frame.corr()
heatmap_options = dict(
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title='Interactive Correlation Matrix',
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.show()

## 2. Advanced Data Cleaning and Feature Engineering

# Advanced outlier detection using IQR and Z-score
def detect_outliers(df, col):
    """Count the outliers in ``df[col]`` under two common rules.

    Returns a dict with:
      * 'IQR_outliers'     - values more than 1.5 * IQR below Q1 or above Q3
      * 'Z_score_outliers' - values more than 3 standard deviations from the mean
    """
    values = df[col]

    # Tukey fences around the inter-quartile range
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    below_fence = values < first_quartile - fence_width
    above_fence = values > third_quartile + fence_width

    # Distance from the mean in units of the standard deviation
    standardised = (values - values.mean()) / values.std()

    return {
        'IQR_outliers': (below_fence | above_fence).sum(),
        'Z_score_outliers': (standardised.abs() > 3).sum()
    }

# Run the outlier counts over every int64/float64 column
numeric_column_names = df.select_dtypes(include=['float64', 'int64']).columns
outlier_report = {
    column_name: detect_outliers(df, column_name)
    for column_name in numeric_column_names
}

print("\nOutlier Report:")
# One row per column, one column per detection rule
outlier_table = pd.DataFrame(outlier_report).T
display(outlier_table)

# Advanced feature engineering
# Weighted blend of the two exam scores (40% Main, 60% Advanced)
df['total_score'] = df['jee_main_score'] * 0.4 + df['jee_advanced_score'] * 0.6
# 1 minus the absolute gap between JEE Main and the mock average, scaled by 100
df['consistency'] = 1 - (abs(df['jee_main_score'] - df['mock_test_score_avg']) / 100)
df['effort_score'] = df['daily_study_hours'] * df['attempt_count']


def _ratio_or_zero(numerator, denominator):
    """Element-wise numerator / denominator, giving 0 where the denominator is 0.

    Missing inputs still produce NaN; only the division-by-zero case is
    replaced, so no infinite values reach the scaler or the models.
    """
    quotient = numerator / denominator.where(denominator != 0)
    return quotient.mask(denominator == 0, 0.0)


# 'performance_ratio' and 'study_efficiency' are listed as model inputs further
# down but are never created in this notebook. Derive them here when the CSV
# does not already provide them, using the definitions printed in the
# recommendations section (JEE Adv / JEE Main, mock scores / study hours).
if 'performance_ratio' not in df.columns:
    df['performance_ratio'] = _ratio_or_zero(df['jee_advanced_score'], df['jee_main_score'])
if 'study_efficiency' not in df.columns:
    df['study_efficiency'] = _ratio_or_zero(df['mock_test_score_avg'], df['daily_study_hours'])

# Ordinal encodings: new column -> (source column, category -> rank).
# Categories missing from a mapping become NaN in the new column.
ordinal_encodings = {
    'parent_edu_score': ('parent_education', {
        'Upto 10th': 1,
        '12th': 2,
        'Graduate': 3,
        'PG': 4
    }),
    # Create peer pressure score
    'peer_pressure_score': ('peer_pressure_level', {
        'Low': 1,
        'Medium': 2,
        'High': 3
    }),
    # Create location score (urban = higher)
    'location_score': ('location_type', {
        'Rural': 1,
        'Semi-Urban': 2,
        'Urban': 3
    }),
}
for score_column, (source_column, ranking) in ordinal_encodings.items():
    df[score_column] = df[source_column].map(ranking)

# Create coaching institute tier: 1 = listed top institute, 2 = 'Local',
# 3 = anything else (including a missing institute)
top_institutes = ['FIITJEE', 'Allen']
df['coaching_tier'] = np.select(
    [df['coaching_institute'].isin(top_institutes), df['coaching_institute'] == 'Local'],
    [1, 2],
    default=3)

## 3. Advanced Exploratory Data Analysis (EDA)

# Overlaid histograms (with a marginal box plot) of each score, split by dropout
score_columns = ['jee_main_score', 'jee_advanced_score', 'mock_test_score_avg', 'class_12_percent']
for col in score_columns:
    histogram_title = f'Distribution of {col} by Dropout Status'
    fig = px.histogram(
        df,
        x=col,
        color='dropout',
        marginal="box",
        title=histogram_title,
        barmode='overlay',
    )
    fig.show()

# Scatter matrix of the numeric features, coloured by the target
num_cols = [
    'jee_main_score',
    'jee_advanced_score',
    'mock_test_score_avg',
    'class_12_percent',
    'daily_study_hours',
    'total_score',
]
fig = px.scatter_matrix(
    df,
    dimensions=num_cols,
    color='dropout',
    title='Pairplot of Numerical Features by Dropout Status',
)
fig.show()

# Sunburst: family income -> parent education -> dropout
sunburst_path = ['family_income', 'parent_education', 'dropout']
fig = px.sunburst(
    df,
    path=sunburst_path,
    title='Dropout Distribution by Family Income and Parent Education',
)
fig.show()

# Mean of the dropout column for each attempt count, drawn as a line
dropout_by_attempt = df.groupby('attempt_count')['dropout'].mean()
attempt_dropout = dropout_by_attempt.reset_index()
fig = px.line(
    attempt_dropout,
    x='attempt_count',
    y='dropout',
    title='Dropout Rate by Attempt Count',
    markers=True,
)
# Render the y axis as whole percentages
fig.update_layout(yaxis_tickformat=".0%")
fig.show()

## 4. Advanced Feature Selection and Preprocessing

# Columns that are one-hot encoded
categorical_cols = [
    'school_board',
    'family_income',
    'parent_education',
    'location_type',
    'peer_pressure_level',
    'mental_health_issues',
    'admission_taken',
    'coaching_institute',
    'coaching_tier',
]
# Columns that are standardised
numerical_cols = [
    'jee_main_score',
    'jee_advanced_score',
    'mock_test_score_avg',
    'class_12_percent',
    'attempt_count',
    'daily_study_hours',
    'performance_ratio',
    'study_efficiency',
    'total_score',
    'consistency',
    'effort_score',
    'parent_edu_score',
    'peer_pressure_score',
    'location_score',
]

# Numeric branch: standard scaling only
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Oversampler used to handle class imbalance
smote = SMOTE(random_state=42)

## 5. Advanced Modeling with Hyperparameter Tuning

# Hyper-parameter grids; the 'model__' prefix targets the pipeline step named 'model'
random_forest_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__class_weight': ['balanced', None],
}
gradient_boosting_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1, 0.05],
    'model__max_depth': [3, 5],
}
logistic_regression_grid = {
    'model__C': [0.1, 1, 10],
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['liblinear'],
    'model__class_weight': ['balanced', None],
}

# Candidate estimators, each paired with the grid to search
models = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': random_forest_grid,
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': gradient_boosting_grid,
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': logistic_regression_grid,
    },
}

# Prepare data
X = df[numerical_cols + categorical_cols]
y = df['dropout']

# Split data (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# SMOTE is a sampler (fit_resample only, no transform), so scikit-learn's
# Pipeline refuses it as an intermediate step. imbalanced-learn's Pipeline
# accepts samplers and applies them during fit only, never at predict time.
from imblearn.pipeline import Pipeline as ImbPipeline

# The same seeded stratified folds are used for every model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model training and evaluation
results = {}
for name, config in models.items():
    print(f"\nTraining {name}...")

    # Create pipeline: preprocess -> oversample -> classifier
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('model', config['model'])
    ])

    # Grid search with stratified K-fold, scored by ROC AUC
    grid = GridSearchCV(pipeline, config['params'], cv=cv, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Store results
    results[name] = {
        'model': grid.best_estimator_,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_
    }

    # Evaluation on the held-out test split
    y_pred = grid.predict(X_test)
    y_proba = grid.predict_proba(X_test)[:, 1]

    print(f"Best parameters: {grid.best_params_}")
    print(f"Best CV AUC: {grid.best_score_:.4f}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Test AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Select best model by cross-validated AUC (not by test-set AUC)
best_model_name = max(results, key=lambda x: results[x]['best_score'])
best_model = results[best_model_name]['model']
print(f"\nBest model: {best_model_name} with AUC: {results[best_model_name]['best_score']:.4f}")

## 6. Advanced Model Interpretation

### 6.1 SHAP Values Analysis
fitted_preprocessor = best_model.named_steps['preprocessor']
fitted_model = best_model.named_steps['model']

# Prepare data for SHAP: the explainer works on the encoded feature matrix
preprocessed = fitted_preprocessor.transform(X_train)
if hasattr(preprocessed, 'toarray'):  # if sparse matrix from one-hot encoding
    preprocessed = preprocessed.toarray()

# Get feature names in the same order as the ColumnTransformer output
numeric_features = numerical_cols
categorical_features = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
all_features = np.concatenate([numeric_features, categorical_features])

# Create SHAP explainer. TreeExplainer only handles tree ensembles, so a
# winning linear model (it exposes coef_) gets LinearExplainer instead.
if hasattr(fitted_model, 'coef_'):
    explainer = shap.LinearExplainer(fitted_model, preprocessed)
else:
    explainer = shap.TreeExplainer(fitted_model)
shap_values = explainer.shap_values(preprocessed)

# The return layout depends on the model and the shap version: a list with one
# array per class, a 3-D (samples, features, classes) array, or a single 2-D
# array. Reduce all of them to a 2-D matrix for the positive class.
if isinstance(shap_values, list):
    positive_shap = np.asarray(shap_values[1])
else:
    positive_shap = np.asarray(shap_values)
    if positive_shap.ndim == 3:
        positive_shap = positive_shap[:, :, 1]
# expected_value is a scalar or one entry per class; take the last (positive) one
positive_base_value = np.ravel(explainer.expected_value)[-1]

# Summary plot; show=False keeps the figure open so the title is applied to it
plt.figure(figsize=(10, 8))
shap.summary_plot(positive_shap, preprocessed, feature_names=list(all_features),
                  plot_type="bar", show=False)
plt.title('SHAP Feature Importance')
plt.show()

# Force plot for a specific prediction. The matplotlib backend is used because
# the default JS widget is only rendered when it is the last value in a cell.
sample_idx = 0
shap.force_plot(positive_base_value, positive_shap[sample_idx, :],
               preprocessed[sample_idx, :], feature_names=list(all_features),
               matplotlib=True)

### 6.2 Permutation Importance
# Computed on the encoded training matrix with the already-fitted model step
perm = PermutationImportance(best_model.named_steps['model'], random_state=42).fit(preprocessed, y_train)
# display() is needed because this is not the last expression in the cell
display(eli5.show_weights(perm, feature_names=list(all_features)))

### 6.3 Partial Dependence Plots
from sklearn.inspection import PartialDependenceDisplay

top_feature_idx = np.argsort(-perm.feature_importances_)[:3]
top_features = [all_features[i] for i in top_feature_idx]
print(f"\nTop 3 most important features: {top_features}")

# The importances refer to columns of the encoded matrix, so the plots must use
# the fitted model step together with that matrix. The full pipeline expects
# the raw columns, where one-hot names such as '<column>_<category>' do not exist.
fig, ax = plt.subplots(figsize=(12, 8))
PartialDependenceDisplay.from_estimator(
    best_model.named_steps['model'], preprocessed,
    [int(i) for i in top_feature_idx],
    feature_names=list(all_features),
    ax=ax)
plt.suptitle('Partial Dependence Plots for Top Features')
plt.tight_layout()
plt.show()

## 7. Advanced Model Evaluation

### 7.1 ROC and Precision-Recall Curves
# Positive-class probabilities from the selected pipeline on the test split
y_proba = best_model.predict_proba(X_test)[:, 1]

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)

roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

# Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)

pr_fig, pr_ax = plt.subplots(figsize=(10, 8))
pr_ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
plt.show()

### 7.2 Calibration Plot
from sklearn.calibration import calibration_curve

# Observed positive fraction against mean predicted probability, in 10 bins
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)

cal_fig, cal_ax = plt.subplots(figsize=(10, 8))
cal_ax.plot(prob_pred, prob_true, marker='o', label='Model')
cal_ax.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
cal_ax.set_xlabel('Mean predicted probability')
cal_ax.set_ylabel('Fraction of positives')
cal_ax.set_title('Calibration Plot')
cal_ax.legend()
plt.show()

## 8. Deployment-Ready Pipeline

# Create final pipeline with best model.
# clone() gives an unfitted copy with the tuned hyper-parameters and the same
# steps that were cross-validated. Refitting the estimator object held inside
# best_model would overwrite the model that was evaluated above and that the
# SHAP explainer wraps.
from sklearn.base import clone

final_pipeline = clone(best_model)

# Train on full data
final_pipeline.fit(X, y)

# Save the model (loading it needs the same library versions installed)
import joblib
joblib.dump(final_pipeline, 'jee_dropout_predictor.pkl')

# Example prediction on the first row of the feature table
sample_data = X.iloc[0:1].copy()
print("\nSample prediction:")
print(f"Input features:\n{sample_data}")
print(f"Prediction: {final_pipeline.predict(sample_data)[0]}")
print(f"Probability: {final_pipeline.predict_proba(sample_data)[0][1]:.2f}")

## 9. Actionable Insights and Recommendations

# Get top risk factors
shap_values = explainer.shap_values(preprocessed)
# The return layout depends on the model and the shap version: a list with one
# array per class, a 3-D (samples, features, classes) array, or a single 2-D
# array. Reduce all of them to a 2-D matrix for the positive class.
if isinstance(shap_values, list):
    risk_shap = np.asarray(shap_values[1])
else:
    risk_shap = np.asarray(shap_values)
    if risk_shap.ndim == 3:
        risk_shap = risk_shap[:, :, 1]

# Rank features by mean absolute SHAP value
shap_df = pd.DataFrame({
    'feature': all_features,
    'importance': np.abs(risk_shap).mean(axis=0)
}).sort_values('importance', ascending=False)

top_risk_factors = shap_df.head(5)['feature'].tolist()

print("\nAdvanced Insights and Recommendations:")
print("1. Top 5 Risk Factors for Dropout:")
for i, factor in enumerate(top_risk_factors, 1):
    print(f"   {i}. {factor}")

# NOTE(review): the thresholds and advice printed below are fixed text; nothing
# in this notebook derives them from the data or the fitted model.
print("\n2. Intervention Strategies:")
print("   - Targeted academic support for students with:")
print("     * JEE Main scores below 65")
print("     * Mock test averages below 60")
print("     * Study hours less than 4 per day")
print("   - Mental health counseling programs")
print("   - Peer mentoring for students with high peer pressure")
print("   - Special coaching for repeat attempt students")

print("\n3. Early Warning System:")
print("   - Implement predictive model to flag at-risk students")
print("   - Monitor key metrics monthly:")
print("     * Performance ratio (JEE Adv/JEE Main)")
print("     * Study efficiency (mock scores/study hours)")
print("     * Consistency between mock and actual scores")

print("\n4. Parental Engagement:")
print("   - Workshops for parents with lower education levels")
print("   - Regular progress reports")
print("   - Guidance on creating supportive home environments")

print("\n5. Institutional Recommendations:")
print("   - Coaching institutes should track these risk factors")
print("   - Develop tiered intervention programs")
print("   - Focus on building consistency in performance")

NameError: name 'plt' is not defined