# Prosit 5: Bringing It All Together
## Comprehensive Student Journey Analysis

**9 Research Questions** with ML models and visualizations

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
from scipy.stats import f_oneway, ttest_ind
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

print('✅ Libraries loaded!')

✅ Libraries loaded!


## 2. Data Loading

In [2]:
df_students = pd.read_csv('../data/merged_cleaned_encoded.csv')
print(f'Students: {df_students.shape}, Unique: {df_students["StudentRef"].nunique()}')
df_students.head()

Students: (24648, 19), Unique: 3718


Unnamed: 0,Yeargroup,StudentRef,Admission Year,Program,Semester/Year,Academic Year,Student Status,Gender,Nationality,Application Category,GPA,CGPA,Gender_App,Nationality_App,Disadvantaged background,Latest Education Level,Language: native,Intended_Major,Needs_Financial_Aid
0,2017,Sb01f8b2a9888be6f,6,5,0,0,0,1,0,4,3.63,3.26,2,34,,1,,,0
1,2017,Sb01f8b2a9888be6f,6,5,1,0,0,1,0,4,2.7,3.15,2,34,,1,,,0
2,2017,S87f7615365ccf796,6,2,0,0,0,1,0,4,3.6,3.66,2,34,,1,,,0
3,2017,S87f7615365ccf796,6,2,1,0,0,1,0,4,3.4,3.61,2,34,,1,,,0
4,2018,Sfea019092a0d1158,7,0,0,0,3,0,1,4,3.45,3.11,2,34,,1,,,0


In [3]:
import glob
admissions_files = glob.glob('../data/prosit 5/*_C2023-C2028-anon.csv')
admissions_dfs = {}
for file in admissions_files:
    exam_type = file.split('/')[-1].split('_')[0]
    admissions_dfs[exam_type] = pd.read_csv(file)
print(f'Loaded {len(admissions_dfs)} admissions files')

Loaded 6 admissions files


In [4]:
df_ajc = pd.read_csv('../data/prosit 5/anon_AJC.csv')
print(f'AJC: {df_ajc.shape}, Students: {df_ajc["StudentRef"].nunique()}')

AJC: (143, 8), Students: 134


## 3. Data Processing

In [5]:
# Grade maps
wassce_map = {'A1': 100, 'B2': 90, 'B3': 85, 'C4': 80, 'C5': 75, 'C6': 70, 'D7': 65, 'E8': 60, 'F9': 50}
ib_map = {7: 100, 6: 90, 5: 80, 4: 70, 3: 60, 2: 50, 1: 40}
olevel_map = {'A': 100, 'B': 85, 'C': 70, 'D': 60, 'E': 50}

def std_wassce(g):
    return wassce_map.get(str(g).strip().upper(), np.nan) if pd.notna(g) else np.nan
def std_ib(g):
    try:
        return ib_map.get(int(g), np.nan) if pd.notna(g) else np.nan
    except:
        return np.nan
def std_olevel(g):
    return olevel_map.get(str(g).strip().upper(), np.nan) if pd.notna(g) else np.nan

print('✅ Functions defined')

✅ Functions defined


In [6]:
# Process WASSCE
wassce = admissions_dfs['WASSCE'].copy()
wassce['math_score'] = wassce['Elective Math'].fillna(wassce['Mathematics']).apply(std_wassce)
wassce['english_score'] = wassce['English Language'].apply(std_wassce)
wassce['science_score'] = wassce[['Physics', 'Chemistry', 'Biology']].apply(lambda r: r.apply(std_wassce).mean(), axis=1)
wassce['exam_type'] = 'WASSCE'

# Process IB
ib = admissions_dfs['IB'].copy()
math_cols = [c for c in ib.columns if 'Math' in c or 'math' in c]
ib['math_score'] = ib[math_cols].apply(lambda r: r.apply(std_ib).max(), axis=1)
eng_cols = [c for c in ib.columns if 'English' in c or 'english' in c]
ib['english_score'] = ib[eng_cols].apply(lambda r: r.apply(std_ib).max(), axis=1)
sci_cols = [c for c in ib.columns if any(s in c for s in ['Physics', 'Chemistry', 'Biology'])]
ib['science_score'] = ib[sci_cols].apply(lambda r: r.apply(std_ib).mean(), axis=1)
ib['exam_type'] = 'IB'

# Process O&A
olevel = admissions_dfs['O&A'].copy()
olevel['math_score'] = olevel['Mathematics'].apply(std_olevel)
olevel['english_score'] = olevel['English'].apply(std_olevel)
olevel['science_score'] = olevel[['Physics', 'Chemistry', 'Biology']].apply(lambda r: r.apply(std_olevel).mean(), axis=1)
olevel['exam_type'] = 'O&A'

print('✅ Admissions processed')

✅ Admissions processed


In [7]:
common_cols = ['StudentRef', 'Yeargroup', 'Proposed Major', 'High School', 'Exam Type', 'math_score', 'english_score', 'science_score', 'exam_type']
df_admissions = pd.concat([wassce[common_cols], ib[common_cols], olevel[common_cols]], ignore_index=True)
df_admissions['composite_score'] = df_admissions[['math_score', 'english_score', 'science_score']].mean(axis=1)
print(f'Admissions: {df_admissions.shape}, Students: {df_admissions["StudentRef"].nunique()}')

Admissions: (1748, 10), Students: 1717


In [8]:
df_ajc_features = df_ajc.groupby('StudentRef').agg({
    'Verdict': 'count',
    'Type of Misconduct': lambda x: (x.str.contains('Academic', na=False)).sum()
}).reset_index()
df_ajc_features.columns = ['StudentRef', 'ajc_case_count', 'ajc_academic_count']
df_ajc_features['ajc_social_count'] = df_ajc_features['ajc_case_count'] - df_ajc_features['ajc_academic_count']
df_ajc_features['has_ajc_case'] = 1
print(f'AJC features: {len(df_ajc_features)} students')

AJC features: 134 students


In [9]:
df_master = df_students.merge(df_admissions, on='StudentRef', how='left', suffixes=('', '_adm'))
df_master = df_master.merge(df_ajc_features, on='StudentRef', how='left')
df_master[['ajc_case_count', 'ajc_academic_count', 'ajc_social_count', 'has_ajc_case']] = df_master[['ajc_case_count', 'ajc_academic_count', 'ajc_social_count', 'has_ajc_case']].fillna(0)
print(f'Master: {df_master.shape}, Students: {df_master["StudentRef"].nunique()}')
print(f'With admissions: {df_master["math_score"].notna().sum()} records, {df_master[df_master["math_score"].notna()]["StudentRef"].nunique()} students')

Master: (24784, 32), Students: 3718
With admissions: 9494 records, 1509 students


## 4. Feature Engineering

In [10]:
student_summary = df_master.groupby('StudentRef').agg({
    'CGPA': 'last',
    'GPA': ['mean', 'last'],
    'Semester/Year': 'max',
    'Program': 'last',
    'Intended_Major': 'first',
    'math_score': 'first',
    'english_score': 'first',
    'composite_score': 'first',
    'has_ajc_case': 'first',
    'Yeargroup': 'first'
}).reset_index()
student_summary.columns = ['StudentRef', 'final_cgpa', 'avg_gpa', 'last_gpa', 'total_semesters', 'final_major', 'proposed_major', 'math_score', 'english_score', 'composite_score', 'has_ajc_case', 'yeargroup']
print(f'Student summary: {student_summary.shape}')

Student summary: (3718, 12)


In [11]:
student_summary['struggling'] = (student_summary['final_cgpa'] < 2.0).astype(int)
student_summary['successful'] = (student_summary['final_cgpa'] >= 3.0).astype(int)
student_summary['major_changed'] = (student_summary['proposed_major'] != student_summary['final_major']).astype(int)
student_summary['delayed_grad'] = (student_summary['total_semesters'] > 8).astype(int)
print('✅ Targets created')
print(f'With admissions: {student_summary["math_score"].notna().sum()}')
print(f'With CGPA: {student_summary["final_cgpa"].notna().sum()}')
print(f'Both: {student_summary[["math_score", "final_cgpa"]].notna().all(axis=1).sum()}')

✅ Targets created
With admissions: 1509
With CGPA: 3718
Both: 1509


## 5. Research Questions

### Question 1: Academic Struggle

In [12]:
# Q1: Academic Struggle
print(f'\n============================================================')
print(f'Q1: Academic Struggle')
print(f'============================================================')

q1_data = student_summary[student_summary[['math_score', 'final_cgpa']].notna().all(axis=1)].copy()
print(f'Data: {len(q1_data)} students')

if len(q1_data) > 20:
    X = q1_data[['math_score', 'english_score', 'composite_score']]
    y = q1_data['struggling']
    
    print(f'Target distribution: {y.value_counts().to_dict()}')
    
    # Check if we have multiple classes
    if len(y.unique()) < 2:
        print('⚠️ Only one class in target - cannot train classifier')
        q1_results = None
    else:
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale
        scaler_q1 = StandardScaler()
        X_train_scaled = scaler_q1.fit_transform(X_train)
        X_test_scaled = scaler_q1.transform(X_test)
        
        # Train
        model_q1 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        model_q1.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model_q1.predict(X_test_scaled)
        y_pred_proba = model_q1.predict_proba(X_test_scaled)
        
        # Handle single vs multi-class probability
        if y_pred_proba.shape[1] > 1:
            y_pred_proba_pos = y_pred_proba[:, 1]
        else:
            y_pred_proba_pos = y_pred_proba[:, 0]
        
        acc = accuracy_score(y_test, y_pred)
        print(f'\nAccuracy: {acc:.3f}')
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feat_imp = pd.DataFrame({'feature': ['math_score', 'english_score', 'composite_score'], 'importance': model_q1.feature_importances_})
        feat_imp = feat_imp.sort_values('importance', ascending=False)
        
        # === VISUALIZATIONS ===
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Confusion Matrix', 'ROC Curve', 'Feature Importance', 'Prediction Distribution'),
            specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )
        
        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        fig.add_trace(
            go.Heatmap(z=cm, x=['Pred 0', 'Pred 1'], y=['True 0', 'True 1'],
                       colorscale='Blues', showscale=False, text=cm, texttemplate='%{text}'),
            row=1, col=1
        )
        
        # 2. ROC Curve (only if binary classification)
        if len(model_q1.classes_) == 2:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_pos)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, name=f'AUC={roc_auc:.3f}', mode='lines',
                           line=dict(color='blue', width=2)),
                row=1, col=2
            )
            fig.add_trace(
                go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines',
                           line=dict(dash='dash', color='gray')),
                row=1, col=2
            )
        else:
            roc_auc = 0
        
        # 3. Feature Importance
        fig.add_trace(
            go.Bar(x=feat_imp['importance'], y=feat_imp['feature'], orientation='h',
                   marker=dict(color='steelblue')),
            row=2, col=1
        )
        
        # 4. Prediction Distribution
        pred_counts = pd.Series(y_pred).value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values,
                   marker=dict(color='lightcoral')),
            row=2, col=2
        )
        
        fig.update_layout(height=800, title_text=f'Q1: Academic Struggle', showlegend=False)
        fig.show()
        
        # Store results
        q1_results = {
            'accuracy': acc,
            'auc': roc_auc,
            'model': model_q1,
            'scaler': scaler_q1,
            'features': ['math_score', 'english_score', 'composite_score']
        }
else:
    print('⚠️ Insufficient data')
    q1_results = None



Q1: Academic Struggle
Data: 1509 students
Target distribution: {0: 1436, 1: 73}

Accuracy: 0.815
              precision    recall  f1-score   support

           0       0.96      0.84      0.90       287
           1       0.08      0.27      0.12        15

    accuracy                           0.81       302
   macro avg       0.52      0.55      0.51       302
weighted avg       0.91      0.81      0.86       302



### Question 2: AJC Involvement

In [13]:
# Q2: AJC Involvement
print(f'\n============================================================')
print(f'Q2: AJC Involvement')
print(f'============================================================')

q2_data = student_summary[student_summary['math_score'].notna()].copy()
print(f'Data: {len(q2_data)} students')

if len(q2_data) > 20:
    X = q2_data[['math_score', 'english_score', 'composite_score']]
    y = q2_data['has_ajc_case']
    
    print(f'Target distribution: {y.value_counts().to_dict()}')
    
    # Check if we have multiple classes
    if len(y.unique()) < 2:
        print('⚠️ Only one class in target - cannot train classifier')
        q2_results = None
    else:
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale
        scaler_q2 = StandardScaler()
        X_train_scaled = scaler_q2.fit_transform(X_train)
        X_test_scaled = scaler_q2.transform(X_test)
        
        # Train
        model_q2 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        model_q2.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model_q2.predict(X_test_scaled)
        y_pred_proba = model_q2.predict_proba(X_test_scaled)
        
        # Handle single vs multi-class probability
        if y_pred_proba.shape[1] > 1:
            y_pred_proba_pos = y_pred_proba[:, 1]
        else:
            y_pred_proba_pos = y_pred_proba[:, 0]
        
        acc = accuracy_score(y_test, y_pred)
        print(f'\nAccuracy: {acc:.3f}')
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feat_imp = pd.DataFrame({'feature': ['math_score', 'english_score', 'composite_score'], 'importance': model_q2.feature_importances_})
        feat_imp = feat_imp.sort_values('importance', ascending=False)
        
        # === VISUALIZATIONS ===
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Confusion Matrix', 'ROC Curve', 'Feature Importance', 'Prediction Distribution'),
            specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )
        
        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        fig.add_trace(
            go.Heatmap(z=cm, x=['Pred 0', 'Pred 1'], y=['True 0', 'True 1'],
                       colorscale='Blues', showscale=False, text=cm, texttemplate='%{text}'),
            row=1, col=1
        )
        
        # 2. ROC Curve (only if binary classification)
        if len(model_q2.classes_) == 2:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_pos)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, name=f'AUC={roc_auc:.3f}', mode='lines',
                           line=dict(color='blue', width=2)),
                row=1, col=2
            )
            fig.add_trace(
                go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines',
                           line=dict(dash='dash', color='gray')),
                row=1, col=2
            )
        else:
            roc_auc = 0
        
        # 3. Feature Importance
        fig.add_trace(
            go.Bar(x=feat_imp['importance'], y=feat_imp['feature'], orientation='h',
                   marker=dict(color='steelblue')),
            row=2, col=1
        )
        
        # 4. Prediction Distribution
        pred_counts = pd.Series(y_pred).value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values,
                   marker=dict(color='lightcoral')),
            row=2, col=2
        )
        
        fig.update_layout(height=800, title_text=f'Q2: AJC Involvement', showlegend=False)
        fig.show()
        
        # Store results
        q2_results = {
            'accuracy': acc,
            'auc': roc_auc,
            'model': model_q2,
            'scaler': scaler_q2,
            'features': ['math_score', 'english_score', 'composite_score']
        }
else:
    print('⚠️ Insufficient data')
    q2_results = None



Q2: AJC Involvement
Data: 1509 students
Target distribution: {0.0: 1466, 1.0: 43}

Accuracy: 0.841
              precision    recall  f1-score   support

         0.0       0.98      0.86      0.91       293
         1.0       0.07      0.33      0.11         9

    accuracy                           0.84       302
   macro avg       0.52      0.59      0.51       302
weighted avg       0.95      0.84      0.89       302



### Question 3: Academic Success

In [14]:
# Q3: Academic Success
print(f'\n============================================================')
print(f'Q3: Academic Success')
print(f'============================================================')

q3_data = student_summary[student_summary[['math_score', 'avg_gpa', 'final_cgpa']].notna().all(axis=1)].copy()
print(f'Data: {len(q3_data)} students')

if len(q3_data) > 20:
    X = q3_data[['math_score', 'english_score', 'composite_score', 'avg_gpa']]
    y = q3_data['successful']
    
    print(f'Target distribution: {y.value_counts().to_dict()}')
    
    # Check if we have multiple classes
    if len(y.unique()) < 2:
        print('⚠️ Only one class in target - cannot train classifier')
        q3_results = None
    else:
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale
        scaler_q3 = StandardScaler()
        X_train_scaled = scaler_q3.fit_transform(X_train)
        X_test_scaled = scaler_q3.transform(X_test)
        
        # Train
        model_q3 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        model_q3.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model_q3.predict(X_test_scaled)
        y_pred_proba = model_q3.predict_proba(X_test_scaled)
        
        # Handle single vs multi-class probability
        if y_pred_proba.shape[1] > 1:
            y_pred_proba_pos = y_pred_proba[:, 1]
        else:
            y_pred_proba_pos = y_pred_proba[:, 0]
        
        acc = accuracy_score(y_test, y_pred)
        print(f'\nAccuracy: {acc:.3f}')
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feat_imp = pd.DataFrame({'feature': ['math_score', 'english_score', 'composite_score', 'avg_gpa'], 'importance': model_q3.feature_importances_})
        feat_imp = feat_imp.sort_values('importance', ascending=False)
        
        # === VISUALIZATIONS ===
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Confusion Matrix', 'ROC Curve', 'Feature Importance', 'Prediction Distribution'),
            specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )
        
        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        fig.add_trace(
            go.Heatmap(z=cm, x=['Pred 0', 'Pred 1'], y=['True 0', 'True 1'],
                       colorscale='Blues', showscale=False, text=cm, texttemplate='%{text}'),
            row=1, col=1
        )
        
        # 2. ROC Curve (only if binary classification)
        if len(model_q3.classes_) == 2:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_pos)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, name=f'AUC={roc_auc:.3f}', mode='lines',
                           line=dict(color='blue', width=2)),
                row=1, col=2
            )
            fig.add_trace(
                go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines',
                           line=dict(dash='dash', color='gray')),
                row=1, col=2
            )
        else:
            roc_auc = 0
        
        # 3. Feature Importance
        fig.add_trace(
            go.Bar(x=feat_imp['importance'], y=feat_imp['feature'], orientation='h',
                   marker=dict(color='steelblue')),
            row=2, col=1
        )
        
        # 4. Prediction Distribution
        pred_counts = pd.Series(y_pred).value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values,
                   marker=dict(color='lightcoral')),
            row=2, col=2
        )
        
        fig.update_layout(height=800, title_text=f'Q3: Academic Success', showlegend=False)
        fig.show()
        
        # Store results
        q3_results = {
            'accuracy': acc,
            'auc': roc_auc,
            'model': model_q3,
            'scaler': scaler_q3,
            'features': ['math_score', 'english_score', 'composite_score', 'avg_gpa']
        }
else:
    print('⚠️ Insufficient data')
    q3_results = None



Q3: Academic Success
Data: 1509 students
Target distribution: {1: 836, 0: 673}

Accuracy: 0.964
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       135
           1       0.98      0.96      0.97       167

    accuracy                           0.96       302
   macro avg       0.96      0.96      0.96       302
weighted avg       0.96      0.96      0.96       302



### Question 4: Major Change

In [15]:
# Q4: Major Change
print(f'\n============================================================')
print(f'Q4: Major Change')
print(f'============================================================')

q4_data = student_summary[student_summary[['math_score', 'avg_gpa', 'proposed_major', 'final_major']].notna().all(axis=1)].copy()
print(f'Data: {len(q4_data)} students')

if len(q4_data) > 20:
    X = q4_data[['math_score', 'english_score', 'composite_score', 'avg_gpa']]
    y = q4_data['major_changed']
    
    print(f'Target distribution: {y.value_counts().to_dict()}')
    
    # Check if we have multiple classes
    if len(y.unique()) < 2:
        print('⚠️ Only one class in target - cannot train classifier')
        q4_results = None
    else:
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale
        scaler_q4 = StandardScaler()
        X_train_scaled = scaler_q4.fit_transform(X_train)
        X_test_scaled = scaler_q4.transform(X_test)
        
        # Train
        model_q4 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        model_q4.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model_q4.predict(X_test_scaled)
        y_pred_proba = model_q4.predict_proba(X_test_scaled)
        
        # Handle single vs multi-class probability
        if y_pred_proba.shape[1] > 1:
            y_pred_proba_pos = y_pred_proba[:, 1]
        else:
            y_pred_proba_pos = y_pred_proba[:, 0]
        
        acc = accuracy_score(y_test, y_pred)
        print(f'\nAccuracy: {acc:.3f}')
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feat_imp = pd.DataFrame({'feature': ['math_score', 'english_score', 'composite_score', 'avg_gpa'], 'importance': model_q4.feature_importances_})
        feat_imp = feat_imp.sort_values('importance', ascending=False)
        
        # === VISUALIZATIONS ===
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Confusion Matrix', 'ROC Curve', 'Feature Importance', 'Prediction Distribution'),
            specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )
        
        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        fig.add_trace(
            go.Heatmap(z=cm, x=['Pred 0', 'Pred 1'], y=['True 0', 'True 1'],
                       colorscale='Blues', showscale=False, text=cm, texttemplate='%{text}'),
            row=1, col=1
        )
        
        # 2. ROC Curve (only if binary classification)
        if len(model_q4.classes_) == 2:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_pos)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, name=f'AUC={roc_auc:.3f}', mode='lines',
                           line=dict(color='blue', width=2)),
                row=1, col=2
            )
            fig.add_trace(
                go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines',
                           line=dict(dash='dash', color='gray')),
                row=1, col=2
            )
        else:
            roc_auc = 0
        
        # 3. Feature Importance
        fig.add_trace(
            go.Bar(x=feat_imp['importance'], y=feat_imp['feature'], orientation='h',
                   marker=dict(color='steelblue')),
            row=2, col=1
        )
        
        # 4. Prediction Distribution
        pred_counts = pd.Series(y_pred).value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values,
                   marker=dict(color='lightcoral')),
            row=2, col=2
        )
        
        fig.update_layout(height=800, title_text=f'Q4: Major Change', showlegend=False)
        fig.show()
        
        # Store results
        q4_results = {
            'accuracy': acc,
            'auc': roc_auc,
            'model': model_q4,
            'scaler': scaler_q4,
            'features': ['math_score', 'english_score', 'composite_score', 'avg_gpa']
        }
else:
    print('⚠️ Insufficient data')
    q4_results = None



Q4: Major Change
Data: 0 students
⚠️ Insufficient data


### Question 5: Delayed Graduation

In [16]:
# Q5: Delayed Graduation
print(f'\n============================================================')
print(f'Q5: Delayed Graduation')
print(f'============================================================')

q5_data = student_summary[student_summary[['math_score', 'avg_gpa', 'total_semesters']].notna().all(axis=1)].copy()
print(f'Data: {len(q5_data)} students')

if len(q5_data) > 20:
    X = q5_data[['math_score', 'english_score', 'composite_score', 'avg_gpa', 'has_ajc_case']]
    y = q5_data['delayed_grad']
    
    print(f'Target distribution: {y.value_counts().to_dict()}')
    
    # Check if we have multiple classes
    if len(y.unique()) < 2:
        print('⚠️ Only one class in target - cannot train classifier')
        q5_results = None
    else:
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale
        scaler_q5 = StandardScaler()
        X_train_scaled = scaler_q5.fit_transform(X_train)
        X_test_scaled = scaler_q5.transform(X_test)
        
        # Train
        model_q5 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        model_q5.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model_q5.predict(X_test_scaled)
        y_pred_proba = model_q5.predict_proba(X_test_scaled)
        
        # Handle single vs multi-class probability
        if y_pred_proba.shape[1] > 1:
            y_pred_proba_pos = y_pred_proba[:, 1]
        else:
            y_pred_proba_pos = y_pred_proba[:, 0]
        
        acc = accuracy_score(y_test, y_pred)
        print(f'\nAccuracy: {acc:.3f}')
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feat_imp = pd.DataFrame({'feature': ['math_score', 'english_score', 'composite_score', 'avg_gpa', 'has_ajc_case'], 'importance': model_q5.feature_importances_})
        feat_imp = feat_imp.sort_values('importance', ascending=False)
        
        # === VISUALIZATIONS ===
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Confusion Matrix', 'ROC Curve', 'Feature Importance', 'Prediction Distribution'),
            specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )
        
        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        fig.add_trace(
            go.Heatmap(z=cm, x=['Pred 0', 'Pred 1'], y=['True 0', 'True 1'],
                       colorscale='Blues', showscale=False, text=cm, texttemplate='%{text}'),
            row=1, col=1
        )
        
        # 2. ROC Curve (only if binary classification)
        if len(model_q5.classes_) == 2:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_pos)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, name=f'AUC={roc_auc:.3f}', mode='lines',
                           line=dict(color='blue', width=2)),
                row=1, col=2
            )
            fig.add_trace(
                go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines',
                           line=dict(dash='dash', color='gray')),
                row=1, col=2
            )
        else:
            roc_auc = 0
        
        # 3. Feature Importance
        fig.add_trace(
            go.Bar(x=feat_imp['importance'], y=feat_imp['feature'], orientation='h',
                   marker=dict(color='steelblue')),
            row=2, col=1
        )
        
        # 4. Prediction Distribution
        pred_counts = pd.Series(y_pred).value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=pred_counts.index.astype(str), y=pred_counts.values,
                   marker=dict(color='lightcoral')),
            row=2, col=2
        )
        
        fig.update_layout(height=800, title_text=f'Q5: Delayed Graduation', showlegend=False)
        fig.show()
        
        # Store results
        q5_results = {
            'accuracy': acc,
            'auc': roc_auc,
            'model': model_q5,
            'scaler': scaler_q5,
            'features': ['math_score', 'english_score', 'composite_score', 'avg_gpa', 'has_ajc_case']
        }
else:
    print('⚠️ Insufficient data')
    q5_results = None



Q5: Delayed Graduation
Data: 1509 students
Target distribution: {0: 1509}
⚠️ Only one class in target - cannot train classifier


### Questions 6-9: Additional Analyses

In [19]:
# Q6-9: Additional Analyses
print('\n============================================================')
print('Q6-9: Additional Analyses')
print('============================================================\n')

# ==================== Q6: Admissions Score Impact on Success ====================
print('Q6: Admissions Score Impact on Academic Success')
print('='*60)

# Analyze relationship between admissions scores and academic success
q6_data = student_summary[student_summary[['math_score', 'english_score', 'composite_score', 'final_cgpa']].notna().all(axis=1)].copy()

if len(q6_data) > 30:
    print(f'\nAnalyzing {len(q6_data)} students with complete admissions and academic data')
    
    # Correlation analysis
    from scipy.stats import pearsonr
    
    correlations = {}
    for score_type in ['math_score', 'english_score', 'composite_score']:
        corr, p_value = pearsonr(q6_data[score_type], q6_data['final_cgpa'])
        correlations[score_type] = {'correlation': corr, 'p_value': p_value}
        print(f'\n{score_type.replace("_", " ").title()} vs Final CGPA:')
        print(f'  Correlation: {corr:.3f}, p-value: {p_value:.4f}')
        if p_value < 0.05:
            print(f'  ✅ Significant correlation')
        else:
            print(f'  ❌ No significant correlation')
    
    # Visualization
    fig = make_subplots(rows=1, cols=3,
                       subplot_titles=('Math Score vs CGPA', 'English Score vs CGPA', 'Composite Score vs CGPA'))
    
    for idx, score_type in enumerate(['math_score', 'english_score', 'composite_score'], 1):
        fig.add_trace(
            go.Scatter(x=q6_data[score_type], y=q6_data['final_cgpa'],
                      mode='markers', name=score_type.replace('_', ' ').title(),
                      marker=dict(size=5, opacity=0.6)),
            row=1, col=idx
        )
        # Add trend line
        z = np.polyfit(q6_data[score_type], q6_data['final_cgpa'], 1)
        p = np.poly1d(z)
        x_trend = np.linspace(q6_data[score_type].min(), q6_data[score_type].max(), 100)
        fig.add_trace(
            go.Scatter(x=x_trend, y=p(x_trend), mode='lines',
                      name='Trend', line=dict(color='red', dash='dash')),
            row=1, col=idx
        )
    
    fig.update_layout(height=400, showlegend=False, title_text='Admissions Scores vs Final CGPA')
    fig.update_yaxes(title_text='Final CGPA')
    fig.show()
else:
    print('⚠️ Insufficient data for admissions score analysis')

# ==================== Q7: Program-Specific Success Patterns ====================
print('\n\nQ7: Program-Specific Success Patterns')
print('='*60)

q7_data = student_summary[student_summary['final_cgpa'].notna() & student_summary['final_major'].notna()].copy()

if len(q7_data) > 50:
    # Calculate success rate by program (CGPA >= 3.0)
    program_stats = q7_data.groupby('final_major').agg({
        'successful': 'mean',
        'final_cgpa': ['mean', 'std', 'count']
    }).round(3)
    
    program_stats.columns = ['Success_Rate', 'Mean_CGPA', 'Std_CGPA', 'Count']
    program_stats = program_stats[program_stats['Count'] >= 10]  # Filter programs with at least 10 students
    program_stats = program_stats.sort_values('Success_Rate', ascending=False)
    
    print(f'\nProgram Performance Summary (n={len(q7_data)} students):')
    print(program_stats)
    
    # Visualization
    fig = px.bar(program_stats.reset_index(), 
                 x='final_major', y='Success_Rate',
                 title='Success Rate by Program (CGPA >= 3.0)',
                 labels={'final_major': 'Program', 'Success_Rate': 'Success Rate'},
                 text='Success_Rate')
    fig.update_traces(texttemplate='%{text:.1%}', textposition='outside')
    fig.update_layout(xaxis_tickangle=-45, yaxis_range=[0, 1.1])
    fig.show()
else:
    print('⚠️ Insufficient data for program-specific analysis')

# ==================== Q8: Major Change Analysis ====================
print('\n\nQ8: Major Change Patterns and Impact')
print('='*60)

q8_data = student_summary[student_summary[['proposed_major', 'final_major', 'final_cgpa']].notna().all(axis=1)].copy()

if len(q8_data) > 30:
    # Calculate major change rate
    change_rate = q8_data['major_changed'].mean()
    print(f'\nAnalyzing {len(q8_data)} students')
    print(f'Major Change Rate: {change_rate:.1%}')
    
    # Compare performance between students who changed vs didn't change
    changed_cgpa = q8_data[q8_data['major_changed'] == 1]['final_cgpa']
    not_changed_cgpa = q8_data[q8_data['major_changed'] == 0]['final_cgpa']
    
    # Statistical comparison
    t_stat, p_value = ttest_ind(changed_cgpa, not_changed_cgpa)
    
    print(f'\nPerformance Comparison:')
    print(f'Changed Major - Mean CGPA: {changed_cgpa.mean():.3f}, Std: {changed_cgpa.std():.3f}, n={len(changed_cgpa)}')
    print(f'Did Not Change - Mean CGPA: {not_changed_cgpa.mean():.3f}, Std: {not_changed_cgpa.std():.3f}, n={len(not_changed_cgpa)}')
    print(f'\nT-test Results: t-statistic={t_stat:.3f}, p-value={p_value:.4f}')
    
    if p_value < 0.05:
        print('✅ Significant difference found between groups')
    else:
        print('❌ No significant difference between groups')
    
    # Visualization
    fig = make_subplots(rows=1, cols=2,
                       subplot_titles=('CGPA Distribution by Major Change Status', 'Major Change Rate'))
    
    # Box plot
    for status, label in [(0, 'No Change'), (1, 'Changed')]:
        status_data = q8_data[q8_data['major_changed'] == status]['final_cgpa']
        fig.add_trace(go.Box(y=status_data, name=label), row=1, col=1)
    
    # Pie chart for change rate
    fig.add_trace(go.Pie(labels=['No Change', 'Changed'], 
                        values=[len(not_changed_cgpa), len(changed_cgpa)],
                        hole=0.3),
                 row=1, col=2)
    
    fig.update_layout(height=400, title_text='Major Change Analysis')
    fig.show()
else:
    print('⚠️ Insufficient data for major change analysis')

# ==================== Q9: AJC Impact on Academic Performance ====================
print('\n\nQ9: AJC Impact on Academic Performance')
print('='*60)

q9_data = student_summary[student_summary['final_cgpa'].notna()].copy()

if len(q9_data) > 30:
    # Compare students with and without AJC cases
    ajc_students = q9_data[q9_data['has_ajc_case'] == 1]['final_cgpa']
    no_ajc_students = q9_data[q9_data['has_ajc_case'] == 0]['final_cgpa']
    
    print(f'\nAnalyzing {len(q9_data)} students')
    print(f'Students with AJC cases: {len(ajc_students)} ({len(ajc_students)/len(q9_data):.1%})')
    print(f'Students without AJC cases: {len(no_ajc_students)} ({len(no_ajc_students)/len(q9_data):.1%})')
    
    if len(ajc_students) > 5 and len(no_ajc_students) > 5:
        # Statistical comparison
        t_stat, p_value = ttest_ind(ajc_students, no_ajc_students)
        
        print(f'\nPerformance Statistics:')
        print(f'With AJC - Mean CGPA: {ajc_students.mean():.3f}, Std: {ajc_students.std():.3f}')
        print(f'Without AJC - Mean CGPA: {no_ajc_students.mean():.3f}, Std: {no_ajc_students.std():.3f}')
        print(f'\nT-test Results: t-statistic={t_stat:.3f}, p-value={p_value:.4f}')
        
        if p_value < 0.05:
            print('✅ Significant difference found - AJC cases impact academic performance')
        else:
            print('❌ No significant difference found')
        
        # Visualization
        fig = make_subplots(rows=1, cols=2,
                           subplot_titles=('CGPA Distribution by AJC Status', 'Academic Risk by AJC Status'))
        
        # Box plot
        for status, label in [(0, 'No AJC'), (1, 'Has AJC')]:
            status_data = q9_data[q9_data['has_ajc_case'] == status]['final_cgpa']
            fig.add_trace(go.Box(y=status_data, name=label), row=1, col=1)
        
        # Struggling rate bar chart
        struggling_by_ajc = q9_data.groupby('has_ajc_case')['struggling'].mean()
        fig.add_trace(go.Bar(x=['No AJC', 'Has AJC'], y=struggling_by_ajc.values,
                            text=struggling_by_ajc.values, texttemplate='%{text:.1%}',
                            marker=dict(color=['green', 'red'])),
                     row=1, col=2)
        
        fig.update_layout(height=400, showlegend=False, title_text='AJC Impact Analysis')
        fig.update_yaxes(title_text='Final CGPA', row=1, col=1)
        fig.update_yaxes(title_text='Struggling Rate (CGPA < 2.0)', row=1, col=2, range=[0, 1])
        fig.show()
    else:
        print('⚠️ Insufficient AJC cases for statistical comparison')
else:
    print('⚠️ Insufficient data for AJC impact analysis')

print('\n✅ Additional analyses (Q6-9) complete!')


Q6-9: Additional Analyses

Q6: Admissions Score Impact on Academic Success

Analyzing 1364 students with complete admissions and academic data

Math Score vs Final CGPA:
  Correlation: 0.235, p-value: 0.0000
  ✅ Significant correlation

English Score vs Final CGPA:
  Correlation: 0.238, p-value: 0.0000
  ✅ Significant correlation

Composite Score vs Final CGPA:
  Correlation: 0.314, p-value: 0.0000
  ✅ Significant correlation




Q7: Program-Specific Success Patterns

Program Performance Summary (n=3718 students):
             Success_Rate  Mean_CGPA  Std_CGPA  Count
final_major                                          
10                  0.725      2.922     1.441     40
4                   0.669      3.121     0.514    314
3                   0.641      3.091     0.569     39
6                   0.613      3.056     0.655    222
1                   0.580      3.063     0.527    219
2                   0.580      3.036     0.586    893
7                   0.556      3.097     0.462     36
0                   0.525      2.952     0.602   1396
5                   0.483      2.918     0.547    520
9                   0.395      2.789     0.665     38




Q8: Major Change Patterns and Impact
⚠️ Insufficient data for major change analysis


Q9: AJC Impact on Academic Performance

Analyzing 3718 students
Students with AJC cases: 134 (3.6%)
Students without AJC cases: 3584 (96.4%)

Performance Statistics:
With AJC - Mean CGPA: 2.788, Std: 0.565
Without AJC - Mean CGPA: 3.003, Std: 0.600

T-test Results: t-statistic=-4.095, p-value=0.0000
✅ Significant difference found - AJC cases impact academic performance



✅ Additional analyses (Q6-9) complete!


## 6. Save Models for API

In [20]:
import os
os.makedirs('../models/prosit_5', exist_ok=True)

# Save all trained models and scalers
saved_models = {}
for q in [1, 2, 3, 4, 5]:
    var_name = f'q{q}_results'
    if var_name in locals() and locals()[var_name] is not None:
        results = locals()[var_name]
        
        # Save model
        model_path = f'../models/prosit_5/q{q}_model.pkl'
        joblib.dump(results['model'], model_path)
        
        # Save scaler
        scaler_path = f'../models/prosit_5/q{q}_scaler.pkl'
        joblib.dump(results['scaler'], scaler_path)
        
        # Save metadata
        saved_models[f'q{q}'] = {
            'model_path': model_path,
            'scaler_path': scaler_path,
            'features': results['features'],
            'accuracy': results['accuracy'],
            'auc': results['auc']
        }
        
        print(f'✅ Q{q}: Saved model and scaler')

# Save metadata JSON
with open('../models/prosit_5/metadata.json', 'w') as f:
    json.dump(saved_models, f, indent=2)

print(f'\n✅ Saved {len(saved_models)} models to models/prosit_5/')
print('Metadata saved to models/prosit_5/metadata.json')

✅ Q1: Saved model and scaler
✅ Q2: Saved model and scaler
✅ Q3: Saved model and scaler

✅ Saved 3 models to models/prosit_5/
Metadata saved to models/prosit_5/metadata.json


In [21]:
# Save summary results for Q6-9 analyses
print('\n' + '='*60)
print('Saving Q6-9 Analysis Results')
print('='*60)

# Create results directory if it doesn't exist
results_dir = '../results/prosit_5'
os.makedirs(results_dir, exist_ok=True)

# 1. Save student_summary.csv
student_summary_path = f'{results_dir}/student_summary.csv'
student_summary.to_csv(student_summary_path, index=False)
print(f'✅ Saved student summary: {student_summary_path}')

# 2. Update performance metrics to include Q6-9
from datetime import datetime

# Load existing metrics if they exist, or create new
metrics_path = f'{results_dir}/performance_metrics.json'
try:
    with open(metrics_path, 'r') as f:
        performance_metrics = json.load(f)
except FileNotFoundError:
    performance_metrics = {
        'timestamp': datetime.now().isoformat(),
        'dataset_info': {
            'total_students': int(len(student_summary)),
            'students_with_admissions_data': int(student_summary['math_score'].notna().sum())
        }
    }

# Add Q6-9 results to performance metrics
if len(q6_data) > 30:
    performance_metrics['q6_admissions_impact'] = {
        'analysis_type': 'Correlation Analysis',
        'n_samples': int(len(q6_data)),
        'correlations': {
            score_type: {
                'correlation': float(correlations[score_type]['correlation']),
                'p_value': float(correlations[score_type]['p_value']),
                'significant': bool(correlations[score_type]['p_value'] < 0.05)
            }
            for score_type in ['math_score', 'english_score', 'composite_score']
        },
        'interpretation': 'Admissions scores show correlation with final academic performance'
    }

if len(q7_data) > 50:
    performance_metrics['q7_program_success'] = {
        'analysis_type': 'Program Comparison',
        'n_samples': int(len(q7_data)),
        'programs_analyzed': int(len(program_stats)),
        'top_program': str(program_stats.index[0]),
        'top_success_rate': float(program_stats.iloc[0]['Success_Rate']),
        'interpretation': 'Success rates vary significantly across programs'
    }

if len(q8_data) > 30:
    performance_metrics['q8_major_change'] = {
        'analysis_type': 'T-test Comparison',
        'n_samples': int(len(q8_data)),
        'change_rate': float(change_rate),
        'changed_mean_cgpa': float(changed_cgpa.mean()),
        'not_changed_mean_cgpa': float(not_changed_cgpa.mean()),
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'significant': bool(p_value < 0.05),
        'interpretation': 'Major change impact on academic performance analyzed'
    }

if len(q9_data) > 30 and len(ajc_students) > 5:
    performance_metrics['q9_ajc_impact'] = {
        'analysis_type': 'T-test Comparison',
        'n_samples': int(len(q9_data)),
        'ajc_cases': int(len(ajc_students)),
        'ajc_percentage': float(len(ajc_students)/len(q9_data) * 100),
        'ajc_mean_cgpa': float(ajc_students.mean()),
        'no_ajc_mean_cgpa': float(no_ajc_students.mean()),
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'significant': bool(p_value < 0.05),
        'interpretation': 'AJC involvement shows impact on academic performance'
    }

# Save updated metrics
with open(metrics_path, 'w') as f:
    json.dump(performance_metrics, f, indent=2)

print(f'✅ Updated performance metrics: {metrics_path}')

# 3. Update findings summary
findings_path = f'{results_dir}/findings_summary.txt'

# Create or append Q6-9 findings
q6_9_findings = f"""

## Additional Analyses (Q6-9)

### Q6: Admissions Score Impact
- Analyzed {len(q6_data) if len(q6_data) > 30 else 0} students with complete admissions data
"""

if len(q6_data) > 30:
    for score_type in ['math_score', 'english_score', 'composite_score']:
        corr_info = correlations[score_type]
        sig = "✅ Significant" if corr_info['p_value'] < 0.05 else "❌ Not significant"
        q6_9_findings += f"- {score_type.replace('_', ' ').title()}: r={corr_info['correlation']:.3f}, p={corr_info['p_value']:.4f} ({sig})\n"

q6_9_findings += f"""
### Q7: Program-Specific Success
"""
if len(q7_data) > 50:
    q6_9_findings += f"""- Analyzed {len(program_stats)} programs with ≥10 students
- Highest success rate: {program_stats.index[0]} ({program_stats.iloc[0]['Success_Rate']:.1%})
- Mean CGPA range: {program_stats['Mean_CGPA'].min():.2f} - {program_stats['Mean_CGPA'].max():.2f}
"""
else:
    q6_9_findings += "- Insufficient data for analysis\n"

q6_9_findings += f"""
### Q8: Major Change Impact
"""
if len(q8_data) > 30:
    sig_text = "significant" if p_value < 0.05 else "not significant"
    q6_9_findings += f"""- Major change rate: {change_rate:.1%}
- Changed major - Mean CGPA: {changed_cgpa.mean():.3f}
- Did not change - Mean CGPA: {not_changed_cgpa.mean():.3f}
- Difference is {sig_text} (p={p_value:.4f})
"""
else:
    q6_9_findings += "- Insufficient data for analysis\n"

q6_9_findings += f"""
### Q9: AJC Impact on Performance
"""
if len(q9_data) > 30 and len(ajc_students) > 5:
    sig_text = "significant" if p_value < 0.05 else "not significant"
    q6_9_findings += f"""- Students with AJC cases: {len(ajc_students)} ({len(ajc_students)/len(q9_data):.1%})
- With AJC - Mean CGPA: {ajc_students.mean():.3f}
- Without AJC - Mean CGPA: {no_ajc_students.mean():.3f}
- Impact is {sig_text} (p={p_value:.4f})
"""
else:
    q6_9_findings += "- Insufficient data for analysis\n"

# Append to existing findings or create new
try:
    with open(findings_path, 'r') as f:
        existing_findings = f.read()
    # Append Q6-9 findings
    with open(findings_path, 'w') as f:
        f.write(existing_findings + q6_9_findings)
except FileNotFoundError:
    # Create new findings file
    with open(findings_path, 'w') as f:
        f.write(f"""# Prosit 5: Key Findings Summary
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Overview
- Total Students: {len(student_summary)}
- Students with Admissions Data: {student_summary['math_score'].notna().sum()}
""" + q6_9_findings)

print(f'✅ Updated findings summary: {findings_path}')

print('\n✅ All results saved successfully!')
print(f'   - Student summary CSV: {student_summary_path}')
print(f'   - Performance metrics JSON: {metrics_path}')
print(f'   - Findings summary TXT: {findings_path}')


Saving Q6-9 Analysis Results
✅ Saved student summary: ../results/prosit_5/student_summary.csv
✅ Updated performance metrics: ../results/prosit_5/performance_metrics.json
✅ Updated findings summary: ../results/prosit_5/findings_summary.txt

✅ All results saved successfully!
   - Student summary CSV: ../results/prosit_5/student_summary.csv
   - Performance metrics JSON: ../results/prosit_5/performance_metrics.json
   - Findings summary TXT: ../results/prosit_5/findings_summary.txt


## 7. Summary

In [None]:
results = []
for q in [1, 2, 3, 4, 5]:
    var_name = f'q{q}_results'
    if var_name in locals() and locals()[var_name] is not None:
        results.append({
            'Question': f'Q{q}',
            'Accuracy': locals()[var_name]['accuracy'],
            'AUC': locals()[var_name]['auc']
        })

if results:
    df_results = pd.DataFrame(results)
    fig = px.bar(df_results, x='Question', y='Accuracy', title='Model Performance',
                 text='Accuracy', color='Accuracy', color_continuous_scale='Blues')
    fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
    fig.update_layout(yaxis_range=[0, 1.1])
    fig.show()
    print('\n✅ Complete!')
    print(df_results)
else:
    print('No results')