In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import json

In [3]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data = [item for item in data]
    return data

def create_df(data):
    rows = []
    for conv in data:
        conv_id = conv["conversation_id"]
        history = conv["conversation_history"]
        
        for model_name, response_data in conv["tutor_responses"].items():
            row = {
                "conversation_id": conv_id,
                "conversation_history": history,
                "model": model_name,
                "response": response_data["response"],
                "mistake_identification": response_data["annotation"]["Mistake_Identification"],
                "mistake_location": response_data["annotation"]["Mistake_Location"],
                "providing_guidance": response_data["annotation"]["Providing_Guidance"],
                "actionability": response_data["annotation"]["Actionability"]
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    return df

In [None]:
def encode_labels(df, column_name):
    mapping = {"No": 0, "To some extent": 1, "Yes": 2}
    return df[column_name].map(mapping)

def tfidf_vectorize(X_train, X_val):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    return X_train_tfidf, X_val_tfidf, vectorizer

def get_class_weights(y_train):
    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
    return class_weight_dict

def train_and_eval(X, y, k, class_weight=None):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    fold_reports = []
    for _, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
        if class_weight is not None:
            class_weight_dict = get_class_weights(y)
        model = LogisticRegression(class_weight=class_weight_dict, max_iter=1000)
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_val_tfidf)
        report = classification_report(y_val, y_pred, output_dict=True)
        fold_reports.append(report)
    return model, vectorizer, fold_reports

In [None]:
data = load_data('../data/mrbench_v3_devset.json')
df = create_df(data)
metrics = ['mistake_identification', 'mistake_location', 'providing_guidance', 'actionability']

df["last_student_utterance"] = df["conversation_history"].apply(lambda x: x.split("\n")[-1].replace("Student: ", ""))
df["input_text_1"] = df["response"] # macro accuracy 0.84 macro f1 0.64 with vectorizer bigrams (before i had 0.80, 0.60) - on 5 folds
df["input_text_2"] = df["last_student_utterance"] + " [SEP] " + df["response"] # macro accuracy 0.79 macro f1 0.62 (test train 0.8/0.2)
df["input_text_3"] = df["conversation_history"] + " [SEP] " + df["response"] # macro accuracy 0.55 macro f1 0.41 (test train 0.8/0.2)

In [6]:
def get_avg_report(fold_reports):
    avg_report = {}
    for fold_report in fold_reports:  # Iterate through folds
        for key in fold_report.keys(): # Iterate through keys (0, 1, 2, accuracy, macro avg, weighted avg)
            if isinstance(fold_report[key], dict):  # Handle per-class metrics
                avg_report[key] = {}
                for subkey in fold_report[key]: # Iterate through subkeys (precision, recall, f1-score, support)
                    avg_report[key][subkey] = np.mean([report[key][subkey] for report in fold_reports])
            else: # accuracy 
                avg_report[key] = np.mean([report[key] for report in fold_reports])

    return avg_report

In [8]:
# Logistic Regression + TF-IDF + class weights
def get_results(input_data, class_weight=True):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        model, vectorizer, fold_reports = train_and_eval(input_data, df["labels"], k=5, class_weight=class_weight)
        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

import joblib
def save_trained_models_and_vectorizers_for_inference(results, folder):
    for metric, data in results.items():
        model = data["model"]
        vectorizer = data["vectorizer"]
        joblib.dump(model, f"../models/{folder}/{metric}_model.pkl")
        joblib.dump(vectorizer, f"../models/{folder}/{metric}_vectorizer.pkl")

# bigrams results
# Training model for: mistake_identification
# Macro accuracy: 0.8485434995112415 Macro F1: 0.6438575934024939
# Training model for: mistake_location
# Macro accuracy: 0.6890029325513196 Macro F1: 0.5596451814761345
# Training model for: providing_guidance
# Macro accuracy: 0.6385272075594657 Macro F1: 0.5787090520169101
# Metric: actionability
# Macro accuracy: 0.6478258390355165 Macro F1: 0.5748426701105898

In [9]:
results1 = get_results(df["input_text_1"])
# results2 = get_results(df["input_text_2"])
# results3 = get_results(df["input_text_3"])
# save_trained_models_and_vectorizers_for_inference(results1, 'logreg_tfidf_weighted')


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': 0.8370572493215352, 'recall': 0.6702702702702703, 'f1-score': 0.7426457459251125, 'support': 74.0}, '1': {'precision': 0.35665041576219114, 'recall': 0.2073949579831933, 'f1-score': 0.2593421951912518, 'support': 34.8}, '2': {'precision': 0.8869579483969614, 'recall': 0.9534160742258104, 'f1-score': 0.9189436707782095, 'support': 386.4}, 'accuracy': 0.858646953405018, 'macro avg': {'precision': 0.6935552044935627, 'recall': 0.6103604341597579, 'f1-score': 0.6403105372981912, 'support': 495.2}, 'weighted avg': {'precision': 0.8422397398037942, 'recall': 0.858646953405018, 'f1-score': 0.8462253367735446, 'support': 495.2}}
Macro accuracy: 0.858646953405018 Macro F1: 0.6403105372981912
----------------------------------------------------

Training model for: mistake_location
Metric: mistake_location
{'0': {'precision': 0.6870107957673521, 'recall': 0.5960504284447946, 'f1-score': 0.6377282005201

In [140]:
# Logistic Regression + TF-IDF + SMOTE 
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)

def get_results_smote(input_data):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_reports = []
        X = input_data
        y = df["labels"]

        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
            model = LogisticRegression(max_iter=1000)
            model.fit(X_train_resampled, y_train_resampled)
            y_pred = model.predict(X_val_tfidf)
            report = classification_report(y_val, y_pred, output_dict=True)
            fold_reports.append(report)

        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

results_smote1 = get_results_smote(df["input_text_1"])


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.7233545794541258), 'recall': np.float64(0.6864864864864864), 'f1-score': np.float64(0.7030764075132526), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.24362669494248443), 'recall': np.float64(0.2989915966386555), 'f1-score': np.float64(0.2665014115557017), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.9027660099980569), 'recall': np.float64(0.8907873773279243), 'f1-score': np.float64(0.896546647011939), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8186567285760834), 'macro avg': {'precision': np.float64(0.6232490947982224), 'recall': np.float64(0.6254218201510221), 'f1-score': np.float64(0.6220414886936311), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8296538405247411), 'recall': np.float64(0.8186567285760834), 'f1-score': np.float64(0.8233668102962064), 'support': np.float64(495.2)}}
Macro accuracy: 0.81865

In [151]:
# Logistic Regression + TF-IDF + sentence piece tokenization
import sentencepiece as spm
from huggingface_hub import hf_hub_download

sp_model_path = hf_hub_download(repo_id="google/t5-v1_1-base", filename="spiece.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_model_path)

df["tokenized_response"] = df["response"].apply(lambda x: " ".join(sp.EncodeAsPieces(x)))

results_sp1 = get_results(df["tokenized_response"], class_weight=False)


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.7939232409381662), 'recall': np.float64(0.6756756756756757), 'f1-score': np.float64(0.7285221366878525), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.3292307692307692), 'recall': np.float64(0.27630252100840336), 'f1-score': np.float64(0.2972179646065013), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.896370965510871), 'recall': np.float64(0.9342598171131729), 'f1-score': np.float64(0.9148493877750459), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8493515803193222), 'macro avg': {'precision': np.float64(0.6731749918932689), 'recall': np.float64(0.628746004599084), 'f1-score': np.float64(0.6468631630231332), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8412165577861371), 'recall': np.float64(0.8493515803193222), 'f1-score': np.float64(0.8435916550821816), 'support': np.float64(495.2)}}
Macro accuracy: 0.849351

In [154]:
df["tokenized_response"].head()

0    ▁Great , ▁you ' ve ▁correctly ▁identified ▁the...
1    ▁Now ▁that ▁we ▁know ▁the ▁cost ▁of ▁1 ▁ pound...
2    ▁You ' re ▁close , ▁but ▁I ▁notice ▁that ▁you ...
3    ▁That ' s ▁correct . ▁So , ▁ if ▁1 ▁ pound ▁of...
4    ▁It ▁seems ▁like ▁you ' ve ▁calculated ▁the ▁c...
Name: tokenized_response, dtype: object