In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import json

In [4]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data = [item for item in data]
    return data

def create_df(data):
    rows = []
    for conv in data:
        conv_id = conv["conversation_id"]
        history = conv["conversation_history"]
        
        for model_name, response_data in conv["tutor_responses"].items():
            row = {
                "conversation_id": conv_id,
                "conversation_history": history,
                "model": model_name,
                "response": response_data["response"],
                "mistake_identification": response_data["annotation"]["Mistake_Identification"],
                "mistake_location": response_data["annotation"]["Mistake_Location"],
                "providing_guidance": response_data["annotation"]["Providing_Guidance"],
                "actionability": response_data["annotation"]["Actionability"]
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    return df

In [None]:
def encode_labels(df, column_name):
    mapping = {"No": 0, "To some extent": 1, "Yes": 2}
    return df[column_name].map(mapping)

def tfidf_vectorize(X_train, X_val):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    return X_train_tfidf, X_val_tfidf, vectorizer

def get_class_weights(y_train):
    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
    return class_weight_dict

def train_and_eval(X, y, k, class_weight=None):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    fold_reports = []
    for _, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
        if class_weight is not None:
            class_weight_dict = get_class_weights(y)
        model = SVC(kernel='linear', class_weight=class_weight_dict)
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_val_tfidf)
        report = classification_report(y_val, y_pred, output_dict=True)
        fold_reports.append(report)
    return model, vectorizer, fold_reports

In [7]:
data = load_data('../data/mrbench_v3_devset.json')
df = create_df(data)
metrics = ['mistake_identification', 'mistake_location', 'providing_guidance', 'actionability']

df["last_student_utterance"] = df["conversation_history"].apply(lambda x: x.split("\n")[-1].replace("Student: ", ""))
df["input_text_1"] = df["response"] # macro accuracy 0.84 macro f1 0.64 with vectorizer ngram_range=(1, 2) (before i had 0.80, 0.60) - on 5 folds
df["input_text_2"] = df["last_student_utterance"] + " [SEP] " + df["response"] # macro accuracy 0.79 macro f1 0.62 (test train 0.8/0.2)
df["input_text_3"] = df["conversation_history"] + " [SEP] " + df["response"] # macro accuracy 0.55 macro f1 0.41 (test train 0.8/0.2)

In [8]:
def get_avg_report(fold_reports):
    avg_report = {}
    for fold_report in fold_reports:  # Iterate through folds
        for key in fold_report.keys(): # Iterate through keys (0, 1, 2, accuracy, macro avg, weighted avg)
            if isinstance(fold_report[key], dict):  # Handle per-class metrics
                avg_report[key] = {}
                for subkey in fold_report[key]: # Iterate through subkeys (precision, recall, f1-score, support)
                    avg_report[key][subkey] = np.mean([report[key][subkey] for report in fold_reports])
            else: # accuracy 
                avg_report[key] = np.mean([report[key] for report in fold_reports])

    return avg_report

In [15]:
# SVC + TF-IDF + class weights
def get_results(input_data, class_weight=True):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        model, vectorizer, fold_reports = train_and_eval(input_data, df["labels"], k=5, class_weight=class_weight)
        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

import joblib
def save_trained_models_and_vectorizers_for_inference(results, folder):
    for metric, data in results.items():
        model = data["model"]
        vectorizer = data["vectorizer"]
        joblib.dump(model, f"../models/{folder}/{metric}_model.pkl")
        joblib.dump(vectorizer, f"../models/{folder}/{metric}_vectorizer.pkl")

In [16]:
results1 = get_results(df["input_text_1"])
# results2 = get_results(df["input_text_2"])
# results3 = get_results(df["input_text_3"])
# save_trained_models_and_vectorizers_for_inference(results1, 'SVC_tfidf_weighted')


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.7878098033842089), 'recall': np.float64(0.6972972972972973), 'f1-score': np.float64(0.7390455375065341), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.3378968253968254), 'recall': np.float64(0.19008403361344536), 'f1-score': np.float64(0.2406764288186364), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.8917925422990566), 'recall': np.float64(0.9461742378599831), 'f1-score': np.float64(0.9181145771368096), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8558170413815575), 'macro avg': {'precision': np.float64(0.6724997236933636), 'recall': np.float64(0.6111851895902419), 'f1-score': np.float64(0.6326121811539933), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8373369444820881), 'recall': np.float64(0.8558170413815575), 'f1-score': np.float64(0.843728768303594), 'support': np.float64(495.2)}}
Macro accuracy: 0.85581

In [17]:
# Logistic Regression + TF-IDF + SMOTE 
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)

def get_results_smote(input_data):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_reports = []
        X = input_data
        y = df["labels"]

        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
            model = SVC(kernel='linear', class_weight='balanced')
            model.fit(X_train_resampled, y_train_resampled)
            y_pred = model.predict(X_val_tfidf)
            report = classification_report(y_val, y_pred, output_dict=True)
            fold_reports.append(report)

        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

results_smote1 = get_results_smote(df["input_text_1"])


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.8361011453120373), 'recall': np.float64(0.6702702702702703), 'f1-score': np.float64(0.7434916011667243), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.3478894634776987), 'recall': np.float64(0.1557983193277311), 'f1-score': np.float64(0.21040578742549187), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.8839440480964493), 'recall': np.float64(0.9611814007042347), 'f1-score': np.float64(0.9208919272548748), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8610671228413164), 'macro avg': {'precision': np.float64(0.689311552295395), 'recall': np.float64(0.595749996767412), 'f1-score': np.float64(0.6249297719490304), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.839115798181884), 'recall': np.float64(0.8610671228413164), 'f1-score': np.float64(0.8444150816834475), 'support': np.float64(495.2)}}
Macro accuracy: 0.8610671

In [18]:
# SVC + TF-IDF + sentence piece tokenization
import sentencepiece as spm
from huggingface_hub import hf_hub_download

sp_model_path = hf_hub_download(repo_id="google/t5-v1_1-base", filename="spiece.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_model_path)

df["tokenized_response"] = df["response"].apply(lambda x: " ".join(sp.EncodeAsPieces(x)))

results_sp1 = get_results(df["tokenized_response"], class_weight=False)

  from .autonotebook import tqdm as notebook_tqdm



Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.7950011622509429), 'recall': np.float64(0.6891891891891891), 'f1-score': np.float64(0.7376041492697952), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.3248323013415893), 'recall': np.float64(0.18436974789915964), 'f1-score': np.float64(0.2329899144749675), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.8907320709428277), 'recall': np.float64(0.9482427601719083), 'f1-score': np.float64(0.9185443311092689), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.855818670576735), 'macro avg': {'precision': np.float64(0.6701885115117866), 'recall': np.float64(0.6072672324200857), 'f1-score': np.float64(0.6297127982846773), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8366574083669278), 'recall': np.float64(0.855818670576735), 'f1-score': np.float64(0.8433057075434434), 'support': np.float64(495.2)}}
Macro accuracy: 0.855818

In [154]:
df["tokenized_response"].head()

0    ▁Great , ▁you ' ve ▁correctly ▁identified ▁the...
1    ▁Now ▁that ▁we ▁know ▁the ▁cost ▁of ▁1 ▁ pound...
2    ▁You ' re ▁close , ▁but ▁I ▁notice ▁that ▁you ...
3    ▁That ' s ▁correct . ▁So , ▁ if ▁1 ▁ pound ▁of...
4    ▁It ▁seems ▁like ▁you ' ve ▁calculated ▁the ▁c...
Name: tokenized_response, dtype: object