In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import json

In [2]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data = [item for item in data]
    return data

def create_df(data):
    rows = []
    for conv in data:
        conv_id = conv["conversation_id"]
        history = conv["conversation_history"]
        
        for model_name, response_data in conv["tutor_responses"].items():
            row = {
                "conversation_id": conv_id,
                "conversation_history": history,
                "model": model_name,
                "response": response_data["response"],
                "mistake_identification": response_data["annotation"]["Mistake_Identification"],
                "mistake_location": response_data["annotation"]["Mistake_Location"],
                "providing_guidance": response_data["annotation"]["Providing_Guidance"],
                "actionability": response_data["annotation"]["Actionability"]
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    return df

In [3]:
def encode_labels(df, column_name):
    mapping = {"No": 0, "To some extent": 1, "Yes": 2}
    return df[column_name].map(mapping)

def tfidf_vectorize(X_train, X_val):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    return X_train_tfidf, X_val_tfidf, vectorizer

def get_class_weights(y_train):
    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
    return class_weight_dict

def train_and_eval(X, y, k, class_weight=None):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    fold_reports = []
    for _, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
        if class_weight is not None:
            class_weight_dict = get_class_weights(y)
        model = RandomForestClassifier(class_weight='balanced', random_state=42)
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_val_tfidf)
        report = classification_report(y_val, y_pred, output_dict=True)
        fold_reports.append(report)
    return model, vectorizer, fold_reports

In [4]:
data = load_data('../data/mrbench_v3_devset.json')
df = create_df(data)
metrics = ['mistake_identification', 'mistake_location', 'providing_guidance', 'actionability']

df["last_student_utterance"] = df["conversation_history"].apply(lambda x: x.split("\n")[-1].replace("Student: ", ""))
df["input_text_1"] = df["response"] # macro accuracy 0.84 macro f1 0.64 with vectorizer ngram_range=(1, 2) (before i had 0.80, 0.60) - on 5 folds
df["input_text_2"] = df["last_student_utterance"] + " [SEP] " + df["response"] # macro accuracy 0.79 macro f1 0.62 (test train 0.8/0.2)
df["input_text_3"] = df["conversation_history"] + " [SEP] " + df["response"] # macro accuracy 0.55 macro f1 0.41 (test train 0.8/0.2)

In [5]:
def get_avg_report(fold_reports):
    avg_report = {}
    for fold_report in fold_reports:  # Iterate through folds
        for key in fold_report.keys(): # Iterate through keys (0, 1, 2, accuracy, macro avg, weighted avg)
            if isinstance(fold_report[key], dict):  # Handle per-class metrics
                avg_report[key] = {}
                for subkey in fold_report[key]: # Iterate through subkeys (precision, recall, f1-score, support)
                    avg_report[key][subkey] = np.mean([report[key][subkey] for report in fold_reports])
            else: # accuracy 
                avg_report[key] = np.mean([report[key] for report in fold_reports])

    return avg_report

In [7]:
# RF + TF-IDF + class weights
def get_results(input_data, class_weight=True):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        model, vectorizer, fold_reports = train_and_eval(input_data, df["labels"], k=5, class_weight=class_weight)
        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

import joblib
def save_trained_models_and_vectorizers_for_inference(results, folder):
    for metric, data in results.items():
        model = data["model"]
        vectorizer = data["vectorizer"]
        joblib.dump(model, f"../models/{folder}/{metric}_model.pkl")
        joblib.dump(vectorizer, f"../models/{folder}/{metric}_vectorizer.pkl")

In [8]:
results1 = get_results(df["input_text_1"])
# results2 = get_results(df["input_text_2"])
# results3 = get_results(df["input_text_3"])
# save_trained_models_and_vectorizers_for_inference(results1, 'RF_tfidf_weighted')


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.9403551514445028), 'recall': np.float64(0.49729729729729727), 'f1-score': np.float64(0.6489694136887877), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.5732600732600732), 'recall': np.float64(0.12067226890756304), 'f1-score': np.float64(0.19638792102206737), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.852469672117374), 'recall': np.float64(0.9891272040808129), 'f1-score': np.float64(0.9156930616233272), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8546081785597914), 'macro avg': {'precision': np.float64(0.7886949656073166), 'recall': np.float64(0.5356989234285577), 'f1-score': np.float64(0.5870167987780607), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8459427876764802), 'recall': np.float64(0.8546081785597914), 'f1-score': np.float64(0.8252884340480519), 'support': np.float64(495.2)}}
Macro accuracy: 0.854

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metric: mistake_location
{'0': {'precision': np.float64(0.8668770198875908), 'recall': np.float64(0.40390032502708556), 'f1-score': np.float64(0.5493905479125067), 'support': np.float64(142.6)}, '1': {'precision': np.float64(0.13333333333333333), 'recall': np.float64(0.00909090909090909), 'f1-score': np.float64(0.01702127659574468), 'support': np.float64(44.0)}, '2': {'precision': np.float64(0.7037940335591595), 'recall': np.float64(0.9753772117849788), 'f1-score': np.float64(0.8175536500969128), 'support': np.float64(308.6)}, 'accuracy': np.float64(0.7249519387422614), 'macro avg': {'precision': np.float64(0.5680014622600279), 'recall': np.float64(0.4627894819676578), 'f1-score': np.float64(0.46132182486838813), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.7000676349888894), 'recall': np.float64(0.7249519387422614), 'f1-score': np.float64(0.6692000091879396), 'support': np.float64(495.2)}}
Macro accuracy: 0.7249519387422614 Macro F1: 0.46132182486838813
--

In [9]:
# RF + TF-IDF + SMOTE 
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)

def get_results_smote(input_data):
    results = {}
    for metric in metrics:
        print(f"\nTraining model for: {metric}")
        df["labels"] = encode_labels(df, metric)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_reports = []
        X = input_data
        y = df["labels"]

        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            X_train_tfidf, X_val_tfidf, vectorizer = tfidf_vectorize(X_train, X_val)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
            model = RandomForestClassifier(class_weight='balanced', random_state=42)
            model.fit(X_train_resampled, y_train_resampled)
            y_pred = model.predict(X_val_tfidf)
            report = classification_report(y_val, y_pred, output_dict=True)
            fold_reports.append(report)

        avg_report = get_avg_report(fold_reports)
        results[metric] = {
            "model": model,
            "vectorizer": vectorizer
        }
        print(f"Metric: {metric}")
        print(avg_report)
        print(f'Macro accuracy: {avg_report["accuracy"]} Macro F1: {avg_report["macro avg"]["f1-score"]}')
        print("----------------------------------------------------")
    return results

results_smote1 = get_results_smote(df["input_text_1"])


Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.8980382051810623), 'recall': np.float64(0.5648648648648649), 'f1-score': np.float64(0.6921269089773013), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.5282051282051283), 'recall': np.float64(0.12100840336134455), 'f1-score': np.float64(0.1938685780092473), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.8642219709761733), 'recall': np.float64(0.9844693470431511), 'f1-score': np.float64(0.920403397248052), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.8610703812316716), 'macro avg': {'precision': np.float64(0.7634884347874545), 'recall': np.float64(0.5567808717564536), 'f1-score': np.float64(0.6021329614115336), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8456355288633102), 'recall': np.float64(0.8610703812316716), 'f1-score': np.float64(0.835207123786067), 'support': np.float64(495.2)}}
Macro accuracy: 0.861070

In [None]:
# RF + TF-IDF + sentence piece tokenization
import sentencepiece as spm
from huggingface_hub import hf_hub_download

sp_model_path = hf_hub_download(repo_id="google/t5-v1_1-base", filename="spiece.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_model_path)

df["tokenized_response"] = df["response"].apply(lambda x: " ".join(sp.EncodeAsPieces(x)))

results_sp1 = get_results(df["tokenized_response"], class_weight=False)

  from .autonotebook import tqdm as notebook_tqdm



Training model for: mistake_identification
Metric: mistake_identification
{'0': {'precision': np.float64(0.7950011622509429), 'recall': np.float64(0.6891891891891891), 'f1-score': np.float64(0.7376041492697952), 'support': np.float64(74.0)}, '1': {'precision': np.float64(0.3248323013415893), 'recall': np.float64(0.18436974789915964), 'f1-score': np.float64(0.2329899144749675), 'support': np.float64(34.8)}, '2': {'precision': np.float64(0.8907320709428277), 'recall': np.float64(0.9482427601719083), 'f1-score': np.float64(0.9185443311092689), 'support': np.float64(386.4)}, 'accuracy': np.float64(0.855818670576735), 'macro avg': {'precision': np.float64(0.6701885115117866), 'recall': np.float64(0.6072672324200857), 'f1-score': np.float64(0.6297127982846773), 'support': np.float64(495.2)}, 'weighted avg': {'precision': np.float64(0.8366574083669278), 'recall': np.float64(0.855818670576735), 'f1-score': np.float64(0.8433057075434434), 'support': np.float64(495.2)}}
Macro accuracy: 0.855818

In [154]:
df["tokenized_response"].head()

0    ▁Great , ▁you ' ve ▁correctly ▁identified ▁the...
1    ▁Now ▁that ▁we ▁know ▁the ▁cost ▁of ▁1 ▁ pound...
2    ▁You ' re ▁close , ▁but ▁I ▁notice ▁that ▁you ...
3    ▁That ' s ▁correct . ▁So , ▁ if ▁1 ▁ pound ▁of...
4    ▁It ▁seems ▁like ▁you ' ve ▁calculated ▁the ▁c...
Name: tokenized_response, dtype: object