In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -----------------------------
# Stage A: Numeric-Only Screening
# -----------------------------

# 1) Load the raw data
df_raw = pd.read_csv('output/langchain_student_params_1500.csv')

# 2) Identify purely numeric columns
numeric_cols = df_raw.select_dtypes(include=[np.number]).columns.tolist()
print('No of numeric columns:', len(numeric_cols))
print("Numeric columns detected:\n", numeric_cols, "\n")

# 3) Compute pairwise correlation among numeric columns
corr_numeric = df_raw[numeric_cols].corr().abs()
upper_tri_numeric = corr_numeric.where(np.triu(np.ones(corr_numeric.shape), k=1).astype(bool))

# Define a correlation threshold for "high correlation"
corr_threshold = 0.99

high_corr_pairs_numeric = [
    (c1, c2, upper_tri_numeric.loc[c1, c2])
    for c1, c2 in combinations(numeric_cols, 2)
    if upper_tri_numeric.loc[c1, c2] > corr_threshold
]

# Display high-correlation pairs
print("Stage A: Numeric-Only Screening,number of high correlation pairs:", len(high_corr_pairs_numeric))
print("Stage A: Highly correlated numeric column pairs (|ρ| > {}):".format(corr_threshold))
for col1, col2, corr_val in high_corr_pairs_numeric:
    print(f"  - {col1} and {col2}: correlation = {corr_val:.2f}")
if not high_corr_pairs_numeric:
    print("  None found.\n")
else:
    print()


No of numeric columns: 27
Numeric columns detected:
 ['conceptual_clarity_level', 'attention_span_category', 'retention_strength', 'problem_solving_speed_sec', 'growth_slope', 'response_to_feedback', 'revisions_per_week', 'days_between_revisions', 'question_asking_nature', 'self_assessment_accuracy', 'exploration_tendency', 'teacher_relationship_quality', 'peer_learning_behavior', 'communication_clarity', 'discussion_engagement', 'test_anxiety_level', 'resilience_after_failure', 'motivation_intrinsic_vs_extrinsic', 'achievement_orientation', 'emotional_self_awareness', 'device_access_type', 'digital_distraction_level', 'study_space_quality', 'academic_pressure_at_home', 'family_responsibilities_hrs', 'support_system_strength', 'metacognitive_skill_level'] 

Stage A: Numeric-Only Screening,number of high correlation pairs: 45
Stage A: Highly correlated numeric column pairs (|ρ| > 0.99):
  - attention_span_category and question_asking_nature: correlation = 1.00
  - attention_span_categor

In [7]:

# 4) Prompt user to select numeric columns to drop based on the above list
drop_list_A = ['attention_span_category','response_to_feedback','question_asking_nature','exploration_tendency','teacher_relationship_quality','peer_learning_behavior','communication_clarity','discussion_engagement','test_anxiety_level','resilience_after_failure','motivation_intrinsic_vs_extrinsic','achievement_orientation''device_access_type','digital_distraction_level','study_space_quality','academic_pressure_at_home']
# if to_drop_numeric:
#     drop_list_A = [col.strip() for col in to_drop_numeric.split(',') if col.strip() in numeric_cols]
# else:
# drop_list_A = []

good_numeric_cols = [c for c in numeric_cols if c not in drop_list_A]
print("\nStage A: Columns retained after user drop:\n", good_numeric_cols, "\n")
print('No of numeric columns retained:', len(good_numeric_cols))



Stage A: Columns retained after user drop:
 ['conceptual_clarity_level', 'retention_strength', 'problem_solving_speed_sec', 'growth_slope', 'revisions_per_week', 'days_between_revisions', 'self_assessment_accuracy', 'achievement_orientation', 'emotional_self_awareness', 'device_access_type', 'family_responsibilities_hrs', 'support_system_strength', 'metacognitive_skill_level'] 

No of numeric columns retained: 13


In [8]:

# 5) Compute VIF iteratively on the retained numeric columns
df_numeric_stageA = df_raw[good_numeric_cols].copy()
scaler_A = StandardScaler()
X_scaled_A = scaler_A.fit_transform(df_numeric_stageA)
df_scaled_A = pd.DataFrame(X_scaled_A, columns=good_numeric_cols)

def compute_vif(df_features):
    vif_data = pd.DataFrame()
    vif_data['feature'] = df_features.columns
    vif_data['VIF'] = [
        variance_inflation_factor(df_features.values, i)
        for i in range(df_features.shape[1])
    ]
    return vif_data

vif_threshold = 10.0
iteration = 1

while True:
    vif_df_A = compute_vif(df_scaled_A[good_numeric_cols])
    high_vif_A = vif_df_A[vif_df_A['VIF'] > vif_threshold]
    if high_vif_A.empty:
        break
    # Identify feature with the highest VIF
    feature_to_drop = high_vif_A.sort_values(by='VIF', ascending=False).iloc[0]['feature']
    max_vif_val = high_vif_A.sort_values(by='VIF', ascending=False).iloc[0]['VIF']
    print(f"Stage A – Iteration {iteration}: Dropping '{feature_to_drop}' with VIF = {max_vif_val:.2f}")
    good_numeric_cols.remove(feature_to_drop)
    iteration += 1

print("\nStage A: Final numeric columns after VIF filtering:\n", good_numeric_cols, "\n")
print('No of numeric columns after VIF filtering:', len(good_numeric_cols))

Stage A – Iteration 1: Dropping 'support_system_strength' with VIF = 274.09
Stage A – Iteration 2: Dropping 'emotional_self_awareness' with VIF = 101.32
Stage A – Iteration 3: Dropping 'self_assessment_accuracy' with VIF = 85.29
Stage A – Iteration 4: Dropping 'growth_slope' with VIF = 56.52
Stage A – Iteration 5: Dropping 'device_access_type' with VIF = 40.18
Stage A – Iteration 6: Dropping 'achievement_orientation' with VIF = 35.93
Stage A – Iteration 7: Dropping 'problem_solving_speed_sec' with VIF = 28.66
Stage A – Iteration 8: Dropping 'revisions_per_week' with VIF = 21.50
Stage A – Iteration 9: Dropping 'days_between_revisions' with VIF = 17.40

Stage A: Final numeric columns after VIF filtering:
 ['conceptual_clarity_level', 'retention_strength', 'family_responsibilities_hrs', 'metacognitive_skill_level'] 

No of numeric columns after VIF filtering: 4


In [4]:

# -----------------------------
# Stage B: One-Hot Encode + Full Collinearity Check
# -----------------------------

# 6) Start from the original DataFrame again
df_stageB = df_raw.copy()

# Define single-choice Literal columns and list-of-options columns
single_choice_literal_cols = [
    'error_pattern',
    'input_method_preference',
    'highest_academic_level'
]
list_of_options_cols = [
    'method_of_revision',
    'preferred_edtech_apps',
    'content_type_preference',
    'knowledge_graph_nodes_covered'
]

# 7) One-Hot encode single-choice literals (dropping first level)
df_stageB = pd.get_dummies(df_stageB, columns=single_choice_literal_cols, drop_first=True)

# 8) Parse and multi-hot encode List-of-options columns
# We'll use ast.literal_eval to convert string representations of lists into actual lists.
for col in list_of_options_cols:
    # Convert string to Python list if necessary
    df_stageB[col] = df_stageB[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# method_of_revision
revision_methods = [
    'active_recall',
    'spaced_repetition',
    'flashcards',
    'practice_tests',
    'summarization',
    'group_study',
    'mnemonics'
]
for method in revision_methods:
    df_stageB[f"method_{method}"] = df_stageB['method_of_revision'].apply(lambda lst: 1 if method in lst else 0)
df_stageB.drop(columns=['method_of_revision'], inplace=True)

# preferred_edtech_apps (you can keep all or select top N; here we'll encode all unique values)
unique_apps = set(app for sublist in df_stageB['preferred_edtech_apps'] if isinstance(sublist, list) for app in sublist)
for app in unique_apps:
    df_stageB[f"app_{app}"] = df_stageB['preferred_edtech_apps'].apply(lambda lst: 1 if app in lst else 0)
df_stageB.drop(columns=['preferred_edtech_apps'], inplace=True)

# content_type_preference
unique_content_types = set(ct for sublist in df_stageB['content_type_preference'] if isinstance(sublist, list) for ct in sublist)
for ct in unique_content_types:
    df_stageB[f"content_{ct}"] = df_stageB['content_type_preference'].apply(lambda lst: 1 if ct in lst else 0)
df_stageB.drop(columns=['content_type_preference'], inplace=True)

# knowledge_graph_nodes_covered
unique_nodes = set(
    node for sublist in df_stageB['knowledge_graph_nodes_covered'] if isinstance(sublist, list) for node in sublist
)
for node in unique_nodes:
    df_stageB[f"node_{node.replace(' ', '_')}"] = df_stageB['knowledge_graph_nodes_covered'].apply(lambda lst: 1 if node in lst else 0)
df_stageB.drop(columns=['knowledge_graph_nodes_covered'], inplace=True)

# 9) Combine good_numeric_cols with all dummy columns we just created
all_candidate_cols = good_numeric_cols + [c for c in df_stageB.columns if c not in df_raw.columns or c.startswith(('error_pattern_','input_method_','highest_academic_level_','method_','app_','content_','node_'))]
print("Stage B: Candidate feature set (numeric + dummies):\n", all_candidate_cols, "\n")
print('No of candidate columns:', len(all_candidate_cols))
# 10) Compute pairwise correlation on the full set
corr_full = df_stageB[all_candidate_cols].corr().abs()
upper_tri_full = corr_full.where(np.triu(np.ones(corr_full.shape), k=1).astype(bool))

corr_threshold = 0.99

high_corr_pairs_full = [
    (c1, c2, upper_tri_full.loc[c1, c2])
    for c1, c2 in combinations(all_candidate_cols, 2)
    if upper_tri_full.loc[c1, c2] > corr_threshold
]

# Display high-correlation pairs in Stage B
print("Stage B: Highly correlated feature pairs (|ρ| > {}):".format(corr_threshold))
print("Stage B: Number of high correlation pairs:", len(high_corr_pairs_full))
for col1, col2, corr_val in high_corr_pairs_full:
    print(f"  - {col1} and {col2}: correlation = {corr_val:.2f}")
if not high_corr_pairs_full:
    print("  None found.\n")
else:
    print()


Stage B: Candidate feature set (numeric + dummies):
 ['conceptual_clarity_level', 'retention_strength', 'resilience_after_failure', 'family_responsibilities_hrs', 'error_pattern_careless', 'error_pattern_conceptual', 'input_method_preference_voice', 'input_method_preference_writing', 'highest_academic_level_high_school', 'highest_academic_level_master', 'highest_academic_level_middle_school', 'highest_academic_level_none', 'highest_academic_level_phd', 'highest_academic_level_primary_school', 'method_active_recall', 'method_spaced_repetition', 'method_flashcards', 'method_practice_tests', 'method_summarization', 'method_group_study', 'method_mnemonics', 'app_Brilliant', 'app_StackOverflow', 'app_YouTube', 'app_Edpuzzle', 'app_Wikipedia', 'app_Coursera', 'app_Edmodo', 'app_Duolingo', 'app_Quizlet', 'app_Overleaf', 'app_Notion', 'app_Anki', 'app_KhanAcademy', 'app_Brainly', 'content_video', 'content_simulation', 'content_gamified', 'content_text', 'content_practice_tests', 'node_Candle_C

In [5]:

# 11) Prompt user to select columns to drop in Stage B
# to_drop_full = input(
#     "Stage B: Enter comma-separated columns to drop (numeric or dummy) based on above list (or press Enter to keep all):\n"
# ).strip()
# if to_drop_full:
#     drop_list_B = [col.strip() for col in to_drop_full.split(',') if col.strip() in all_candidate_cols]
# else:
#     drop_list_B = []
drop_list_B = ['error_pattern_careless','error_pattern_conceptual','input_method_preference_voice','input_method_preference_writing','content_text','']
#All with correlation 1 will be handled by vif later
good_full_cols = [c for c in all_candidate_cols if c not in drop_list_B]
print("\nStage B: Columns retained after user drop:\n", good_full_cols, "\n")
print('No of columns retained after user drop:', len(good_full_cols))



Stage B: Columns retained after user drop:
 ['conceptual_clarity_level', 'retention_strength', 'resilience_after_failure', 'family_responsibilities_hrs', 'highest_academic_level_high_school', 'highest_academic_level_master', 'highest_academic_level_middle_school', 'highest_academic_level_none', 'highest_academic_level_phd', 'highest_academic_level_primary_school', 'method_active_recall', 'method_spaced_repetition', 'method_flashcards', 'method_practice_tests', 'method_summarization', 'method_group_study', 'method_mnemonics', 'app_Brilliant', 'app_StackOverflow', 'app_YouTube', 'app_Edpuzzle', 'app_Wikipedia', 'app_Coursera', 'app_Edmodo', 'app_Duolingo', 'app_Quizlet', 'app_Overleaf', 'app_Notion', 'app_Anki', 'app_KhanAcademy', 'app_Brainly', 'content_video', 'content_simulation', 'content_gamified', 'content_practice_tests', 'node_Candle_Clock', 'node_Sundial', 'node_SI_Unit_of_Time', 'node_Speed', 'node_Simple_Pendulum', 'node_Non-uniform_Linear_Motion', 'node_Hourglass', 'node_Uni

In [6]:

# 12) Compute VIF iteratively on the full set
df_full_stageB = df_stageB[good_full_cols].copy()
scaler_B = StandardScaler()
X_scaled_B = scaler_B.fit_transform(df_full_stageB)
df_scaled_B = pd.DataFrame(X_scaled_B, columns=good_full_cols)

iteration = 1
while True:
    vif_df_B = compute_vif(df_scaled_B[good_full_cols])
    high_vif_B = vif_df_B[vif_df_B['VIF'] > vif_threshold]
    if high_vif_B.empty:
        break
    # Identify feature with the highest VIF
    feature_to_drop_B = high_vif_B.sort_values(by='VIF', ascending=False).iloc[0]['feature']
    max_vif_val_B = high_vif_B.sort_values(by='VIF', ascending=False).iloc[0]['VIF']
    print(f"Stage B – Iteration {iteration}: Dropping '{feature_to_drop_B}' with VIF = {max_vif_val_B:.2f}")
    good_full_cols.remove(feature_to_drop_B)
    iteration += 1

print("\nStage B: Final feature set after VIF filtering:\n", good_full_cols)
print('No of columns after VIF filtering:', len(good_full_cols))


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


Stage B – Iteration 1: Dropping 'node_Water_Clock' with VIF = inf
Stage B – Iteration 2: Dropping 'node_Time_Period_of_Pendulum' with VIF = inf


  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


Stage B – Iteration 3: Dropping 'node_Uniform_Linear_Motion' with VIF = inf
Stage B – Iteration 4: Dropping 'node_Hourglass' with VIF = inf


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


Stage B – Iteration 5: Dropping 'node_SI_Unit_of_Time' with VIF = inf
Stage B – Iteration 6: Dropping 'node_Candle_Clock' with VIF = inf


  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


Stage B – Iteration 7: Dropping 'node_Speed' with VIF = inf
Stage B – Iteration 8: Dropping 'highest_academic_level_high_school' with VIF = inf


  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


Stage B – Iteration 9: Dropping 'node_Simple_Pendulum' with VIF = inf
Stage B – Iteration 10: Dropping 'content_practice_tests' with VIF = inf


  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss


Stage B – Iteration 11: Dropping 'content_gamified' with VIF = inf
Stage B – Iteration 12: Dropping 'content_simulation' with VIF = inf
Stage B – Iteration 13: Dropping 'app_Quizlet' with VIF = 1353.53


  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss


Stage B – Iteration 14: Dropping 'highest_academic_level_none' with VIF = 569.93
Stage B – Iteration 15: Dropping 'content_video' with VIF = 398.76
Stage B – Iteration 16: Dropping 'app_Wikipedia' with VIF = 301.90
Stage B – Iteration 17: Dropping 'highest_academic_level_primary_school' with VIF = 177.89


  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss


Stage B – Iteration 18: Dropping 'method_flashcards' with VIF = 112.76
Stage B – Iteration 19: Dropping 'resilience_after_failure' with VIF = 76.21
Stage B – Iteration 20: Dropping 'node_Non-uniform_Linear_Motion' with VIF = 31.26
Stage B – Iteration 21: Dropping 'app_Anki' with VIF = 24.25
Stage B – Iteration 22: Dropping 'app_Brainly' with VIF = 19.92
Stage B – Iteration 23: Dropping 'method_summarization' with VIF = 16.89
Stage B – Iteration 24: Dropping 'app_Notion' with VIF = 13.16
Stage B – Iteration 25: Dropping 'family_responsibilities_hrs' with VIF = 12.40

Stage B: Final feature set after VIF filtering:
 ['conceptual_clarity_level', 'retention_strength', 'highest_academic_level_master', 'highest_academic_level_middle_school', 'highest_academic_level_phd', 'method_active_recall', 'method_spaced_repetition', 'method_practice_tests', 'method_group_study', 'method_mnemonics', 'app_Brilliant', 'app_StackOverflow', 'app_YouTube', 'app_Edpuzzle', 'app_Coursera', 'app_Edmodo', 'app_D

  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
  return 1 - self.ssr/self.uncentered_tss
