In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
df = pd.read_csv('langchain_student_params_1300.csv')

In [3]:
df.columns

Index(['conceptual_clarity_level', 'attention_span_category',
       'retention_strength', 'problem_solving_speed_sec', 'error_pattern',
       'growth_slope', 'response_to_feedback', 'revisions_per_week',
       'days_between_revisions', 'method_of_revision',
       'question_asking_nature', 'self_assessment_accuracy',
       'exploration_tendency', 'teacher_relationship_quality',
       'peer_learning_behavior', 'communication_clarity',
       'discussion_engagement', 'test_anxiety_level',
       'resilience_after_failure', 'motivation_intrinsic_vs_extrinsic',
       'achievement_orientation', 'emotional_self_awareness',
       'device_access_type', 'preferred_edtech_apps',
       'digital_distraction_level', 'input_method_preference',
       'highest_academic_level', 'study_space_quality',
       'academic_pressure_at_home', 'family_responsibilities_hrs',
       'support_system_strength', 'content_type_preference',
       'knowledge_graph_nodes_covered', 'ongoing_concept',
       'm

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   conceptual_clarity_level           1300 non-null   int64  
 1   attention_span_category            1300 non-null   int64  
 2   retention_strength                 1300 non-null   float64
 3   problem_solving_speed_sec          1300 non-null   int64  
 4   error_pattern                      1300 non-null   object 
 5   growth_slope                       1300 non-null   float64
 6   response_to_feedback               1300 non-null   int64  
 7   revisions_per_week                 1300 non-null   int64  
 8   days_between_revisions             1300 non-null   int64  
 9   method_of_revision                 1300 non-null   object 
 10  question_asking_nature             1300 non-null   int64  
 11  self_assessment_accuracy           1300 non-null   float

In [5]:
X = df.drop(columns = 'next_section')
y = df['next_section']

# One-Hot Encoding Columns

In [6]:
for i in X.columns:
    if X[i].dtype == 'object':
        print(f"Column {i} is of type object")

Column error_pattern is of type object
Column method_of_revision is of type object
Column preferred_edtech_apps is of type object
Column input_method_preference is of type object
Column highest_academic_level is of type object
Column content_type_preference is of type object
Column knowledge_graph_nodes_covered is of type object
Column ongoing_concept is of type object


In [7]:
X = pd.get_dummies(X, columns=['error_pattern', 'input_method_preference', 'highest_academic_level', 'ongoing_concept'])

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 57 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   conceptual_clarity_level                   1300 non-null   int64  
 1   attention_span_category                    1300 non-null   int64  
 2   retention_strength                         1300 non-null   float64
 3   problem_solving_speed_sec                  1300 non-null   int64  
 4   growth_slope                               1300 non-null   float64
 5   response_to_feedback                       1300 non-null   int64  
 6   revisions_per_week                         1300 non-null   int64  
 7   days_between_revisions                     1300 non-null   int64  
 8   method_of_revision                         1300 non-null   object 
 9   question_asking_nature                     1300 non-null   int64  
 10  self_assessment_accuracy

In [9]:
multi_label_cols = ['method_of_revision', 'preferred_edtech_apps','content_type_preference','knowledge_graph_nodes_covered']  # add other columns here

# 4. Convert and binarize each multi-label column
for col in multi_label_cols:
    # Parse string representation of lists into actual Python lists
    X[col] = X[col].apply(ast.literal_eval)
    
    # Binarize
    mlb = MultiLabelBinarizer()
    multi_dummies = mlb.fit_transform(X[col])

    # Create DataFrame of dummy columns with a prefix
    dummy_X = pd.DataFrame(
        multi_dummies,
        columns=[f"{col}_{c}" for c in mlb.classes_],
        index=X.index
    )
    
    # Join and drop original
    X = X.join(dummy_X).drop(columns=[col])

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 80 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   conceptual_clarity_level                               1300 non-null   int64  
 1   attention_span_category                                1300 non-null   int64  
 2   retention_strength                                     1300 non-null   float64
 3   problem_solving_speed_sec                              1300 non-null   int64  
 4   growth_slope                                           1300 non-null   float64
 5   response_to_feedback                                   1300 non-null   int64  
 6   revisions_per_week                                     1300 non-null   int64  
 7   days_between_revisions                                 1300 non-null   int64  
 8   question_asking_nature                          

# train-test-split(stratified)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Removing redundant columns to perform class balancing later

In [12]:
# for D in (X_train, X_test):
#     bools = D.select_dtypes(include=['bool']).columns
#     D[bools] = D[bools].astype(int)


In [13]:
# def correlation_filter(X, threshold=0.9):
#     corr_matrix = X.corr().abs()
#     upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     to_drop = [col for col in upper_tri.columns if any(upper_tri[col] > threshold)]
#     return to_drop

In [14]:
# drop_corr = correlation_filter(X_train, threshold=0.9)
# X_train_reduced = X_train.drop(columns=drop_corr)
# X_test_reduced = X_test.drop(columns=drop_corr)

In [15]:
# def calculate_vif(X):
#     vif_data = pd.DataFrame()
#     vif_data['feature'] = X.columns
#     vif_data['VIF'] = [
#         variance_inflation_factor(X.values, i) 
#         for i in range(X.shape[1])
#     ]
#     return vif_data

In [16]:
# max_vif = 10
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# while True:
#     vif_df = calculate_vif(X_train_reduced)
#     highest_vif = vif_df['VIF'].max()
#     if highest_vif > max_vif:
#         # Drop the feature with highest VIF
#         drop_feature = vif_df.sort_values('VIF', ascending=False)['feature'].iloc[0]
#         X_train_reduced = X_train_reduced.drop(columns=[drop_feature])
#         X_test_reduced = X_test_reduced.drop(columns=[drop_feature])
#     else:
#         break


In [17]:
X_train_reduced = X_train.copy()
X_test_reduced = X_test.copy()

In [18]:
y_train

1013    Explanation (with analogies)
506     Explanation (with analogies)
710            Real-Life Application
406               Concept Definition
1286    Explanation (with analogies)
                    ...             
33                Concept Definition
1169                       Intuition
163                             MCQs
1195    Explanation (with analogies)
831            Real-Life Application
Name: next_section, Length: 1040, dtype: object

# Applying the models

In [19]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier
)
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import (
    RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
)
# from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
# from imblearn.combine import SMOTEENN, SMOTETomek
# from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# === Part 1: Oversampled classifiers on X_train_reduced/X_test_reduced ===

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

samplers = {
    'NoSampling': None,
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(k_neighbors=3,random_state=42),
    'BorderlineSMOTE': BorderlineSMOTE(k_neighbors = 3,random_state=42),
    # 'SVMSMOTE': SVMSMOTE(random_state=42),
    'ADASYN': ADASYN(n_neighbors=3, random_state=42)
}

models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'Bagging': BaggingClassifier(n_estimators=10, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'SVM': SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB(),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

results = []
trained_models = {} 

scaler = StandardScaler()

X_train_reduced = scaler.fit_transform(X_train_reduced)
X_test_reduced = scaler.transform(X_test_reduced)

for samp_name, sampler in samplers.items():
    if sampler:
        X_res, y_res = sampler.fit_resample(X_train_reduced, y_train)
    else:
        X_res, y_res = X_train_reduced, y_train

    for model_name, model in models.items():
        model.fit(X_res, y_res)
        trained_models[(samp_name, model_name)] = model

        y_pred = model.predict(X_test_reduced)
        f1 = f1_score(y_test, y_pred, average='macro')  # or 'weighted'

        results.append({
            'Sampler': samp_name,
            'Model Name':   model_name,
            'Model': model,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Macro F1': f1
        })
        print(f"Trained {model_name} with {samp_name}: macro F₁ = {f1:.4f}, accuracy = {accuracy_score(y_test, y_pred):.4f}")

# Final results table
results_df = pd.DataFrame(results).sort_values(by='Macro F1', ascending=False).reset_index(drop=True)
print("\n=== Combined Results ===")
print(results_df[['Sampler','Model Name','Macro F1','Accuracy']].to_string(index=False))

print("Accuracy based results:")
print(results_df[['Sampler','Model Name','Macro F1','Accuracy']].sort_values(by='Accuracy', ascending=False).to_string(index=False))

Trained DecisionTree with NoSampling: macro F₁ = 0.1849, accuracy = 0.4385
Trained RandomForest with NoSampling: macro F₁ = 0.2237, accuracy = 0.4923
Trained ExtraTrees with NoSampling: macro F₁ = 0.2194, accuracy = 0.4885
Trained GradientBoosting with NoSampling: macro F₁ = 0.2496, accuracy = 0.5692
Trained HistGradientBoosting with NoSampling: macro F₁ = 0.2725, accuracy = 0.5346
Trained AdaBoost with NoSampling: macro F₁ = 0.1723, accuracy = 0.5538
Trained Bagging with NoSampling: macro F₁ = 0.2212, accuracy = 0.4808
Trained XGBoost with NoSampling: macro F₁ = 0.2492, accuracy = 0.5269
Trained LogisticRegression with NoSampling: macro F₁ = 0.3105, accuracy = 0.4269
Trained SVM with NoSampling: macro F₁ = 0.2280, accuracy = 0.5000
Trained KNN with NoSampling: macro F₁ = 0.2786, accuracy = 0.5538
Trained GaussianNB with NoSampling: macro F₁ = 0.0492, accuracy = 0.0731
Trained MLP with NoSampling: macro F₁ = 0.2514, accuracy = 0.6192
Trained DecisionTree with RandomOverSampler: macro F

In [20]:

# === Part 2: CatBoost on original X/y ===
df = pd.read_csv('/mnt/data/langchain_student_params_1300.csv').dropna()

# Binarize multi-label columns
ml_cols = ['method_of_revision','preferred_edtech_apps','knowledge_graph_nodes_covered']
for col in ml_cols:
    df[col] = df[col].apply(ast.literal_eval)
    mlb = MultiLabelBinarizer()
    dummies = mlb.fit_transform(df[col])
    df = df.join(pd.DataFrame(dummies,
                              columns=[f"{col}_{lbl}" for lbl in mlb.classes_],
                              index=df.index)).drop(columns=[col])

# Convert booleans to ints
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)

# Split for CatBoost
X2 = df.drop(columns=['next_section'])
y2 = df['next_section'].astype('category').cat.codes
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, stratify=y2, test_size=0.2, random_state=42
)

# Train CatBoost
cat_features = X2_train.select_dtypes(include=['object','category']).columns.tolist()
cb = CatBoostClassifier(iterations=500, learning_rate=0.05, depth=6,
                        eval_metric='Accuracy', random_seed=42, verbose=False)
cb.fit(X2_train, y2_train, cat_features=cat_features, eval_set=(X2_test, y2_test))
trained_models[('NoSampling', 'CatBoost')] = cb
y2_pred = cb.predict(X2_test)
cb_acc = accuracy_score(y2_test, y2_pred)
results.append({'Sampler': 'NoSampling', 'Model': 'CatBoost', 'Accuracy': cb_acc})
print(f"Trained CatBoost: accuracy = {cb_acc:.4f}")

# Final results table
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
print("\n=== Combined Results ===")
print(results_df.to_string(index=False))


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/langchain_student_params_1300.csv'