In [2]:
import json
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import re

class TutorDataLoader:
    """Load annotated tutor responses from JSON and flatten them into a feature table.

    Each conversation record is expected to provide ``conversation_id``,
    ``conversation_history`` (one string with ``Tutor:`` / ``Student:``
    markers) and ``tutor_responses`` (a mapping of tutor name to a dict with a
    ``response`` and an optional ``annotation``). ``process_data`` emits one
    row per (conversation, tutor) pair.
    """

    # Ordinal encoding of the annotation labels; any other value is treated as missing.
    _ANNOTATION_SCORES = {'No': 0, 'To some extent': 1, 'Yes': 2}
    # Tokens that flag a response as containing a question word.
    _QUESTION_WORDS = frozenset(('what', 'why', 'how', 'when', 'where'))
    # The capturing group keeps the speaker markers in the ``split`` output.
    _SPEAKER_MARKERS = re.compile(r'(Tutor:|Student:)')

    def __init__(self, data_path: str, model_name='all-MiniLM-L6-v2', nlp='en_core_web_sm') -> None:
        """Store the data path and load the embedding and spaCy models.

        Args:
            data_path: Path to the JSON file with the conversations.
            model_name: Name passed to ``SentenceTransformer``.
            nlp: Name of the spaCy pipeline passed to ``spacy.load``.
        """
        self.data_path = data_path
        self.raw_data = None
        self.data_df = None
        self.model = SentenceTransformer(model_name)
        self.nlp = spacy.load(nlp)

    def load_data(self) -> None:
        """Read the JSON file at ``data_path`` into ``raw_data``."""
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(self.data_path, encoding='utf-8') as f:
            self.raw_data = json.load(f)

    def process_data(self) -> pd.DataFrame:
        """Build one feature row per tutor response and return the table.

        Loads the data first if ``load_data`` has not been called yet. The
        result is also stored in ``data_df``.

        Returns:
            DataFrame with identifiers, text statistics, spaCy counts, the
            similarity to the last student turn, and one lower-cased column
            per annotation dimension present in the data.
        """
        if self.raw_data is None:
            self.load_data()

        rows = []
        for conv in tqdm(self.raw_data):
            conv_id = conv['conversation_id']
            history = conv['conversation_history']
            conversation_turns = self._parse_conversation_history(history)
            # Most recent student utterance, or None if the student never spoke.
            last_student_turn = next(
                (turn['text'] for turn in reversed(conversation_turns) if turn['speaker'] == 'Student'),
                None,
            )

            for tutor_name, tutor_data in conv['tutor_responses'].items():
                response_text = tutor_data['response']
                features = self._get_linguistic_features(response_text)

                row = {
                    'conversation_id': conv_id,
                    'response_id': f"{conv_id}-{tutor_name}",
                    'tutor': tutor_name,
                    'conversation_history': history,
                    'num_turns': len(conversation_turns),
                    'last_student_turn': last_student_turn,
                    'response': response_text,
                    'sentence_similarity': self._calculate_sentence_similarity(last_student_turn, response_text),
                    'response_length': len(response_text),
                    'contains_question': '?' in response_text,
                    'num_tokens': features['num_tokens'],
                    'num_sentences': features['num_sentences'],
                    'num_questions': features['num_questions'],
                    'contains_question_words': features['contains_question_words'],
                }

                # Records without an annotation simply get no score columns.
                for dimension, value in tutor_data.get('annotation', {}).items():
                    row[dimension.lower()] = self._encode_annotation(value)

                rows.append(row)

        self.data_df = pd.DataFrame(rows)
        return self.data_df

    @staticmethod
    def _encode_annotation(value):
        """Map an annotation label to its ordinal score.

        Returns 0 for 'No', 1 for 'To some extent', 2 for 'Yes', and ``None``
        for anything else, so an unexpected label surfaces as a missing value
        instead of being silently counted as the top score.
        """
        if not isinstance(value, str):
            return None
        return TutorDataLoader._ANNOTATION_SCORES.get(value)

    def _parse_conversation_history(self, history: str) -> list:
        """Split a history string into ``{'speaker', 'text'}`` turns.

        Text that appears before the first ``Tutor:`` / ``Student:`` marker is
        dropped, as are empty segments.
        """
        turns = []
        current_speaker = None

        for part in self._SPEAKER_MARKERS.split(history):
            part = part.strip()
            if part == 'Tutor:':
                current_speaker = 'Tutor'
            elif part == 'Student:':
                current_speaker = 'Student'
            elif part and current_speaker:
                turns.append({'speaker': current_speaker, 'text': part})

        return turns

    def _calculate_sentence_similarity(self, sentence1: str, sentence2: str) -> float:
        """Compute cosine similarity between two sentences using SBERT embeddings.

        Returns 0.0 when either sentence is empty or ``None``.
        """
        if not sentence1 or not sentence2:
            return 0.0

        embeddings = self.model.encode([sentence1, sentence2], convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])

        return similarity.item()

    def _get_linguistic_features(self, text: str) -> dict:
        """Return spaCy-based counts for ``text``.

        Keys: ``num_tokens``, ``num_sentences``, ``num_questions`` (number of
        '?' tokens) and ``contains_question_words`` (any token among
        what/why/how/when/where, case-insensitive).
        """
        doc = self.nlp(text)

        return {
            'num_tokens': len(doc),
            'num_sentences': len(list(doc.sents)),
            'num_questions': sum(1 for token in doc if token.text == '?'),
            'contains_question_words': any(token.text.lower() in self._QUESTION_WORDS for token in doc),
        }

    # Backward-compatible alias for the original (misspelled) method name.
    _get_linguisitc_features = _get_linguistic_features

# Build the per-response feature table from the dev-set JSON.
loader = TutorDataLoader(data_path="../data/mrbench_v3_devset.json")
loader.load_data()
final_df = loader.process_data()

# Preview the first rows (rendered as the cell output in a notebook).
final_df.head()

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 300/300 [00:38<00:00,  7.74it/s]


Unnamed: 0,conversation_id,response_id,tutor,conversation_history,num_turns,last_student_turn,response,sentence_similarity,response_length,contains_question,num_tokens,num_sentences,num_questions,contains_question_words,mistake_identification,mistake_location,providing_guidance,actionability
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e-Sonnet,Sonnet,"Tutor: Hi, could you please provide a step-by-...",6,The cost of 1 pound of meat is $7.00.,"Great, you've correctly identified the cost of...",0.505259,141,False,29,1,0,False,2,2,2,2
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e-Llama...,Llama318B,"Tutor: Hi, could you please provide a step-by-...",6,The cost of 1 pound of meat is $7.00.,Now that we know the cost of 1 pound of meat i...,0.718742,175,False,37,1,0,False,2,1,1,1
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e-Llama...,Llama31405B,"Tutor: Hi, could you please provide a step-by-...",6,The cost of 1 pound of meat is $7.00.,"You're close, but I notice that you calculated...",0.620214,225,True,48,1,1,True,2,2,2,2
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e-GPT4,GPT4,"Tutor: Hi, could you please provide a step-by-...",6,The cost of 1 pound of meat is $7.00.,"That's correct. So, if 1 pound of meat costs $...",0.690453,169,True,44,3,1,False,2,2,2,2
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e-Mistral,Mistral,"Tutor: Hi, could you please provide a step-by-...",6,The cost of 1 pound of meat is $7.00.,It seems like you've calculated the cost as if...,0.70709,148,False,32,1,0,False,2,2,2,2


In [3]:
# Persist the feature table without the positional index column.
final_df.to_csv(path_or_buf="../data/df_eda.csv", index=False)

In [17]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class TutorAnalysis:
    def __init__(self, data: pd.DataFrame, output_dir='eda_results') -> None:
        """Keep the feature table and output location, and apply the seaborn theme.

        Args:
            data: Feature table with one row per tutor response.
            output_dir: Directory that the analysis steps write their results to.
        """
        # Global plotting style shared by every figure produced by this class.
        sns.set(style="whitegrid")

        # Annotation dimensions; the matching DataFrame columns are their lower-cased names.
        self.dimensions = [
            "Mistake_Identification",
            "Mistake_Location",
            "Providing_Guidance",
            "Actionability",
        ]
        self.output_dir = output_dir
        self.data = data

    def run_complete_analysis(self) -> None:
        print("Starting comprehensive EDA...")
        # self._basic_dataset_overview()
        # self._annotation_distributions()
        # self._annotation_correlations()
        # self._plot_tutor_radar()
        # self._linguistic_features_by_tutor()
        # self._feature_breakdown_by_annotation_and_tutor()
        # self._ngram_content_analysis()
        # self._plot_ngram_bars_by_annotation()
        # self._log_odds_ratio_ngrams()
        # self._top_ngrams_by_tutor()
        # self._identify_boilerplate_ngrams()
        # self._feature_importance_by_annotation()
        # self._per_model_feature_annotation_correlations()
        self._cross_tutor_feature_correlations()

        print(f"Analysis complete. Results saved to {self.output_dir}/")

    def _basic_dataset_overview(self) -> None:
        """Save a 2x2 overview figure to ``<output_dir>/dataset_overview.png``.

        Panels: responses per tutor, distribution of conversation turn counts,
        distribution of response lengths, and response length by tutor.
        """
        print("Basic dataset overview...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        (ax_counts, ax_turns), (ax_lengths, ax_by_tutor) = axes

        # Number of responses per tutor
        self.data['tutor'].value_counts().plot(kind='bar', ax=ax_counts, color='steelblue')
        ax_counts.set_title('Number of Responses per Tutor')
        ax_counts.set_ylabel('Count')

        # Distribution of conversation turn counts
        self.data['num_turns'].plot(kind='hist', bins=20, ax=ax_turns, color='steelblue')
        ax_turns.set_title('Distribution of Conversation Turn Counts')
        ax_turns.set_xlabel('Number of Turns')

        # Response length distribution
        self.data['response_length'].plot(kind='hist', bins=50, ax=ax_lengths, color='steelblue')
        ax_lengths.set_title('Distribution of Response Lengths')
        ax_lengths.set_xlabel('Characters')

        # Response length by tutor
        sns.boxplot(x='tutor', y='response_length', data=self.data, ax=ax_by_tutor)
        ax_by_tutor.set_title('Response Length by Tutor')
        ax_by_tutor.set_ylabel('Characters')
        # plt.xticks acts on the current axes, i.e. the last subplot created (bottom right).
        plt.xticks(rotation=45)

        fig.tight_layout()
        fig.savefig(f"{self.output_dir}/dataset_overview.png", dpi=300)
        plt.close(fig)

    def _annotation_distributions(self) -> None:
        """Save one bar chart per annotation dimension showing the score distribution.

        Bars always appear in the order Yes / To some extent / No with a fixed
        colour per score, even when a score does not occur in the data. Each
        bar is labelled with its share of the annotated responses. The figure
        is written to ``<output_dir>/annotation_distributions.png``.
        """
        print("Annotation distributions...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        # Fixed order (high -> low) keeps each colour tied to the same score.
        score_order = [2, 1, 0]
        score_names = ['Yes', 'To some extent', 'No']
        score_colors = ['#2ecc71', '#f39c12', '#e74c3c']

        fig = plt.figure(figsize=(14, 10))

        for position, dim in enumerate(self.dimensions, 1):
            ax = fig.add_subplot(2, 2, position)

            # Series.value_counts gives scalar index entries (DataFrame.value_counts
            # yields 1-tuples, which show up as "(2,)" tick labels).
            counts = self.data[dim.lower()].value_counts().reindex(score_order, fill_value=0)
            counts.index = score_names

            counts.plot(kind='bar', ax=ax, color=score_colors)
            ax.set_title(f'Distribution of {dim}')
            ax.set_ylabel('Count')

            total = counts.sum()
            if total == 0:
                # Nothing annotated for this dimension: skip the percentage labels.
                continue

            # Add percentage labels
            for bar in ax.patches:
                share = f'{100 * bar.get_height() / total:.1f}%'
                ax.annotate(share, (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                            ha='center', va='bottom')

        fig.tight_layout()
        fig.savefig(f"{self.output_dir}/annotation_distributions.png", dpi=300)
        plt.close(fig)
    
    def _annotation_correlations(self) -> None:
        """Save a heatmap of the pairwise correlations between annotation scores.

        Only the strict lower triangle is drawn; the diagonal and the mirrored
        upper half are masked. The figure is written to
        ``<output_dir>/annotation_correlations.png``.
        """
        print("Analyzing annotation correlations...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        score_columns = [dim.lower() for dim in self.dimensions]
        corr = self.data[score_columns].corr()

        fig = plt.figure(figsize=(10, 8))
        # True marks a hidden cell: np.triu keeps the diagonal and everything above it.
        hidden_cells = np.triu(np.ones_like(corr, dtype=bool))
        palette = sns.diverging_palette(230, 20, as_cmap=True)

        sns.heatmap(corr, mask=hidden_cells, cmap=palette, vmax=1, vmin=-1, center=0,
                    annot=True, fmt='.2f', square=True, linewidths=.5)

        plt.title('Correlation between Annotation Dimensions')
        fig.tight_layout()
        fig.savefig(f"{self.output_dir}/annotation_correlations.png", dpi=300)
        plt.close(fig)

    def _plot_tutor_radar(self) -> None:
        """Save a combined radar chart of each tutor's mean annotation scores.

        Mean scores per tutor are divided by 2 so they lie in [0, 1]; the
        radial tick labels show the original 0-2 scale. The figure is written
        to ``<output_dir>/tutor_radar_plot/combined_radar_plot.png``.
        """
        print("Generating radar plots for tutor effectiveness...")
        outdir = os.path.join(self.output_dir, "tutor_radar_plot")
        os.makedirs(outdir, exist_ok=True)

        # Derive the score columns from self.dimensions, as the other steps do,
        # so the radar stays in sync if the dimension list changes.
        score_columns = [dim.lower() for dim in self.dimensions]
        tutor_scores = self.data.groupby('tutor')[score_columns].mean()
        # Normalize scores to [0, 1]
        tutor_scores_norm = tutor_scores / 2.0

        labels = tutor_scores_norm.columns.tolist()

        # One spoke per dimension; repeat the first angle to close the polygon.
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
        closed_angles = angles + angles[:1]

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

        for tutor_name, scores in tutor_scores_norm.iterrows():
            closed_scores = scores.tolist()
            closed_scores += closed_scores[:1]  # Close the loop

            ax.plot(closed_angles, closed_scores, linewidth=2, label=tutor_name)
            ax.fill(closed_angles, closed_scores, alpha=0.25)

        ax.set_yticks([0, 0.5, 1])
        ax.set_yticklabels(['0', '1', '2'], fontsize=9)
        ax.set_xticks(angles)
        ax.set_xticklabels(labels, fontsize=10)
        ax.set_title("Tutor Effectiveness Radar", fontsize=14)
        ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.1))
        ax.grid(True)

        fig.tight_layout()
        fig.savefig(os.path.join(outdir, "combined_radar_plot.png"), dpi=300)
        plt.close(fig)

    def _linguistic_features_by_tutor(self) -> None:
        """Save a 2x2 figure comparing linguistic features across tutors.

        Panels: box plots of token, sentence and question counts per tutor,
        plus a count plot of question-word usage per tutor. The figure is
        written once, to ``<output_dir>/linguistic_features_by_tutor.png``
        (the earlier extra copy in the current working directory is gone).
        """
        print("Analyzing linguistic features by tutor...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        boxplot_panels = [
            ('num_tokens', 'Number of Tokens by Tutor'),
            ('num_sentences', 'Number of Sentences by Tutor'),
            ('num_questions', 'Number of Questions by Tutor'),
        ]
        # axes.flat runs row by row, so the three box plots fill the first three panels.
        for ax, (column, title) in zip(axes.flat, boxplot_panels):
            sns.boxplot(x='tutor', y=column, data=self.data, ax=ax)
            ax.set_title(title)

        words_ax = axes[1, 1]
        sns.countplot(x='tutor', hue='contains_question_words', data=self.data, ax=words_ax)
        words_ax.set_title('Usage of Question Words by Tutor')

        for ax in axes.flat:
            ax.tick_params(axis='x', labelrotation=45)

        fig.tight_layout()
        fig.savefig(f"{self.output_dir}/linguistic_features_by_tutor.png", dpi=300)
        plt.close(fig)

    def _feature_breakdown_by_annotation_and_tutor(self) -> None:
        """Save, per annotation dimension, a 2x2 figure of features by score and tutor.

        Panels: box plots of response length, number of questions and sentence
        similarity, plus a bar plot of the share of responses containing a
        question; each is grouped by annotation score and coloured by tutor.
        One file per dimension is written to
        ``<output_dir>/feature_breakdown_<dimension>_by_tutor.png``.
        """
        print("Analyzing features across annotation levels and tutors...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        binary_var = 'contains_question'
        # Passing an explicit category order pins score 0/1/2 to x position 0/1/2,
        # so the tick labels below stay correct even if a score is absent.
        label_order = [0, 1, 2]
        label_names = ['No', 'To some extent', 'Yes']
        box_panels = [
            ('response_length', 'Response Length', 'Characters'),
            ('num_questions', 'Number of Questions', 'Count'),
            ('sentence_similarity', 'Sentence Similarity with Last Student Turn', 'Cosine Similarity'),
        ]

        for dim in self.dimensions:
            label = dim.lower()
            print(f" → Processing: {dim}")

            fig, axes = plt.subplots(2, 2, figsize=(18, 12))
            fig.suptitle(f'{dim}: Feature Distribution by Label and Tutor', fontsize=16)

            # Plots 1-3: box plots (axes.flat runs row by row).
            for ax, (feature, title, ylabel) in zip(axes.flat, box_panels):
                sns.boxplot(x=label, y=feature, hue='tutor', data=self.data,
                            order=label_order, ax=ax)
                ax.set_title(title)
                ax.set_xlabel(dim)
                ax.set_ylabel(ylabel)

            # Plot 4: share of responses containing a question.
            share_ax = axes[1, 1]
            share_df = self.data.groupby([label, 'tutor'])[binary_var].mean().reset_index()
            sns.barplot(x=label, y=binary_var, hue='tutor', data=share_df,
                        order=label_order, ax=share_ax)
            share_ax.set_title('Contains Question (%)')
            share_ax.set_xlabel(dim)
            share_ax.set_ylabel("Proportion")

            for ax in axes.flat:
                ax.set_xticks(label_order)
                ax.set_xticklabels(label_names)

            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            fig.savefig(f"{self.output_dir}/feature_breakdown_{label}_by_tutor.png", dpi=300)
            plt.close(fig)

    def _plot_ngram_bars_by_annotation(self, ngram_range=(2, 3), top_k=15) -> None:
        """Save side-by-side bar charts of the most frequent n-grams per score group.

        For every annotation dimension, the responses scored 2 and those
        scored 0 are vectorised separately and the ``top_k`` most frequent
        n-grams of each group are plotted. A group with no responses, or with
        no n-grams left after stop-word removal, gets a placeholder panel
        instead of raising.

        Args:
            ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
            top_k: Number of n-grams shown per group.
        """
        print("Generating bar charts for top n-grams...")
        from sklearn.feature_extraction.text import CountVectorizer
        from collections import Counter

        outdir = os.path.join(self.output_dir, "ngram_barplots")
        os.makedirs(outdir, exist_ok=True)

        def top_ngrams(texts):
            """Return the ``top_k`` (ngram, count) pairs for ``texts``; [] if none."""
            if not texts:
                return []
            vect = CountVectorizer(ngram_range=ngram_range, stop_words='english', lowercase=True)
            try:
                matrix = vect.fit_transform(texts)
            except ValueError:
                # CountVectorizer raises when the vocabulary ends up empty.
                return []
            totals = np.asarray(matrix.sum(axis=0)).ravel()
            return Counter(dict(zip(vect.get_feature_names_out(), totals))).most_common(top_k)

        for dim in self.dimensions:
            dim_col = dim.lower()
            scores = self.data[dim_col]
            high_top = top_ngrams(self.data.loc[scores == 2, 'response'].dropna().tolist())
            low_top = top_ngrams(self.data.loc[scores == 0, 'response'].dropna().tolist())

            fig, axes = plt.subplots(1, 2, figsize=(18, 6))
            fig.suptitle(f"Top N-grams for {dim}", fontsize=16)

            panels = (
                (axes[0], high_top, 'green', "High Score (2)"),
                (axes[1], low_top, 'red', "Low Score (0)"),
            )
            for ax, ranked, color, title in panels:
                if ranked:
                    phrases, counts = zip(*ranked)
                    # Reverse so the most frequent n-gram ends up at the top of the chart.
                    ax.barh(phrases[::-1], counts[::-1], color=color)
                else:
                    ax.text(0.5, 0.5, "No n-grams available", ha='center', va='center',
                            transform=ax.transAxes)
                ax.set_title(title)
                ax.set_xlabel("Frequency")

            fig.tight_layout()
            fig.savefig(os.path.join(outdir, f"{dim_col}_ngram_barplot.png"), dpi=300)
            plt.close(fig)

    def _log_odds_ratio_ngrams(self, ngram_range=(1, 3), top_k=20) -> None:
        """Write the n-grams most characteristic of high-scored responses.

        For every annotation dimension, each n-gram's relative frequency among
        responses scored 2 is compared with its relative frequency among
        responses scored 0 via a smoothed log-odds difference. The ``top_k``
        n-grams with the largest score are written to
        ``<output_dir>/log_odds_ngrams/<dimension>_log_odds_top.txt``.
        Dimensions where either group has no countable n-grams are skipped.

        Args:
            ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
            top_k: Number of n-grams written per dimension.
        """
        print("Computing log-odds ratio for discriminative n-grams...")
        from sklearn.feature_extraction.text import CountVectorizer

        outdir = os.path.join(self.output_dir, "log_odds_ngrams")
        os.makedirs(outdir, exist_ok=True)

        def compute_log_odds(p1, p2):
            # eps keeps the logarithms finite for probabilities of exactly 0 or 1.
            eps = 1e-5
            return np.log((p1 + eps) / (1 - p1 + eps)) - np.log((p2 + eps) / (1 - p2 + eps))

        # Missing responses cannot be vectorised; drop them like the sibling n-gram steps do.
        responses = self.data['response']
        has_text = responses.notna()

        # The vocabulary is the same for every dimension, so fit it once.
        vect = CountVectorizer(ngram_range=ngram_range, stop_words='english', lowercase=True)
        vect.fit(responses[has_text])
        terms = vect.get_feature_names_out()

        for dim in self.dimensions:
            dim_col = dim.lower()
            scores = self.data[dim_col]
            high_texts = responses[has_text & (scores == 2)]
            low_texts = responses[has_text & (scores == 0)]
            if high_texts.empty or low_texts.empty:
                print(f" → Skipping {dim}: no responses in the high or low group")
                continue

            high_counts = np.asarray(vect.transform(high_texts).sum(axis=0)).ravel()
            low_counts = np.asarray(vect.transform(low_texts).sum(axis=0)).ravel()
            if high_counts.sum() == 0 or low_counts.sum() == 0:
                # Avoid dividing by zero when a group contains no countable n-grams.
                print(f" → Skipping {dim}: no n-grams in the high or low group")
                continue

            p_high = high_counts / high_counts.sum()
            p_low = low_counts / low_counts.sum()

            log_odds_scores = compute_log_odds(p_high, p_low)
            sorted_idx = np.argsort(log_odds_scores)[::-1]

            # Explicit UTF-8: n-grams may contain non-ASCII characters.
            out_path = os.path.join(outdir, f"{dim_col}_log_odds_top.txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(f"Top {top_k} phrases for HIGH vs LOW in {dim}\n")
                for i in sorted_idx[:top_k]:
                    f.write(f"{terms[i]}: {log_odds_scores[i]:.4f}\n")

    def _top_ngrams_by_tutor(self, ngram_range=(2, 3), top_k=10) -> None:
        """Write each tutor's most frequent n-grams to a text file.

        One file per tutor is created under ``<output_dir>/ngrams_by_tutor``,
        containing ``top_k`` lines of the form ``<ngram>: <count>``.

        Args:
            ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
            top_k: Number of n-grams written per tutor.
        """
        print("Extracting top n-grams per tutor model...")
        from sklearn.feature_extraction.text import CountVectorizer
        from collections import Counter
        import os

        target_dir = os.path.join(self.output_dir, "ngrams_by_tutor")
        os.makedirs(target_dir, exist_ok=True)

        tutor_column = self.data['tutor']
        for tutor in tutor_column.unique():
            # Non-missing responses written by this tutor.
            texts = self.data.loc[tutor_column == tutor, 'response'].dropna().tolist()

            vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
            count_matrix = vectorizer.fit_transform(texts)
            totals = count_matrix.sum(axis=0).A1
            frequencies = Counter(dict(zip(vectorizer.get_feature_names_out(), totals)))

            lines = [f"{phrase}: {count}\n" for phrase, count in frequencies.most_common(top_k)]
            with open(os.path.join(target_dir, f"{tutor}_ngrams.txt"), "w") as f:
                f.writelines(lines)
    
    def _identify_boilerplate_ngrams(self, ngram_range=(2, 3), top_k=30) -> None:
        """
        Lists the most frequent n-grams over all responses, regardless of tutor
        or annotation score; such phrases are likely boilerplate.

        The ``top_k`` n-grams and their counts are written to
        ``<output_dir>/boilerplate/common_ngrams_top<top_k>.txt``.
        """
        print("Identifying common boilerplate phrases across the dataset...")
        from sklearn.feature_extraction.text import CountVectorizer
        from collections import Counter
        import os

        target_dir = os.path.join(self.output_dir, "boilerplate")
        os.makedirs(target_dir, exist_ok=True)

        # Count every n-gram over the whole corpus of non-missing responses.
        corpus = self.data['response'].dropna().tolist()
        counter = CountVectorizer(ngram_range=ngram_range, stop_words='english', lowercase=True)
        count_matrix = counter.fit_transform(corpus)
        totals = count_matrix.sum(axis=0).A1
        frequencies = Counter(dict(zip(counter.get_feature_names_out(), totals)))

        # Header line followed by one "<ngram>: <count>" line per phrase.
        report = ["Most Common N-grams (likely boilerplate)\n"]
        report.extend(f"{phrase}: {count}\n" for phrase, count in frequencies.most_common(top_k))

        report_path = os.path.join(target_dir, f"common_ngrams_top{top_k}.txt")
        with open(report_path, "w") as f:
            f.writelines(report)

    def _feature_importance_by_annotation(self) -> None:
        """
        Computes feature importance using mutual information between features and annotation labels.
        Saves one horizontal bar plot per annotation dimension to
        ``<output_dir>/feature_importance/feature_importance_<dimension>.png``.

        Binary flags and small counts are declared as discrete features; with
        ``discrete_features='auto'`` a dense input would have every feature
        treated as continuous. Rows with a missing label or feature value are
        left out of the estimate.
        """
        print("Running feature importance analysis using Mutual Information...")
        from sklearn.feature_selection import mutual_info_classif

        outdir = os.path.join(self.output_dir, "feature_importance")
        os.makedirs(outdir, exist_ok=True)

        feature_cols = ['response_length', 'num_sentences', 'num_questions',
                        'sentence_similarity', 'contains_question',
                        'num_tokens', 'contains_question_words']
        # Features that take only a handful of distinct values.
        discrete_cols = {'num_sentences', 'num_questions',
                         'contains_question', 'contains_question_words'}
        discrete_mask = [col in discrete_cols for col in feature_cols]

        for dim in self.dimensions:
            label_col = dim.lower()
            labelled = self.data[feature_cols + [label_col]].dropna()
            if labelled.empty:
                print(f" → Skipping {dim}: no annotated rows")
                continue

            # Booleans become 0.0/1.0 so the estimator receives a purely numeric matrix.
            X = labelled[feature_cols].astype(float)
            y = labelled[label_col].astype(int)

            # Compute mutual information
            mi_scores = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
            feature_importance = pd.Series(mi_scores, index=feature_cols).sort_values(ascending=True)

            plt.figure(figsize=(8, 5))
            feature_importance.plot(kind='barh', color='teal')
            plt.title(f"Mutual Information: Features vs. {dim}")
            plt.xlabel("Mutual Information Score")
            plt.tight_layout()
            plt.savefig(os.path.join(outdir, f"feature_importance_{label_col}.png"), dpi=300)
            plt.close()

    def _per_model_feature_annotation_correlations(self) -> None:
        """
        Correlates the linguistic features with the annotation scores separately for each tutor.
        Writes one heatmap per tutor (features as rows, annotation dimensions as columns)
        to ``<output_dir>/tutor_feature_correlations``.
        """
        print("Analyzing per-model feature-to-annotation correlations...")
        import os

        heatmap_dir = os.path.join(self.output_dir, "tutor_feature_correlations")
        os.makedirs(heatmap_dir, exist_ok=True)

        predictors = ['response_length', 'num_sentences', 'num_questions',
                      'sentence_similarity', 'contains_question',
                      'num_tokens', 'contains_question_words']
        targets = [dim.lower() for dim in self.dimensions]
        selected = predictors + targets

        tutor_column = self.data['tutor']
        for tutor in tutor_column.unique():
            rows = self.data.loc[tutor_column == tutor, selected]
            # Keep only the feature-by-annotation block of the full correlation matrix.
            block = rows.corr().loc[predictors, targets]

            plt.figure(figsize=(10, 6))
            sns.heatmap(block, annot=True, cmap='vlag', center=0, linewidths=0.5)
            plt.title(f"Feature ↔ Annotation Correlation: {tutor}")
            plt.tight_layout()
            plt.savefig(os.path.join(heatmap_dir, f"{tutor}_feature_annotation_corr.png"), dpi=300)
            plt.close()

    def _cross_tutor_feature_correlations(self) -> None:
        """
        Relates "best within the conversation" flags to the annotation scores.

        Within each conversation, a response is flagged when it has the
        maximum response length, sentence similarity or number of questions
        among that conversation's responses (ties flag every tied response).
        For each flag, the mean annotation score of the flagged responses is
        computed per dimension. Despite the method name, no correlation
        coefficients are computed.

        Files written to ``output_dir``:
            cross_tutor_flags.csv           per-response flags
            cross_tutor_feature_heatmap.png mean score per flag and dimension
            cross_tutor_feature_table.csv   the same numbers in long format
        """
        print("Analyzing cross-tutor aggregate features...")
        # Make sure the target folder exists so the step also works on a fresh checkout.
        os.makedirs(self.output_dir, exist_ok=True)

        df = self.data.copy()

        # Flag name -> column whose per-conversation maximum defines the flag.
        flag_sources = {
            'is_longest': 'response_length',
            'is_most_similar': 'sentence_similarity',
            'is_most_questioning': 'num_questions',
        }
        for flag, source in flag_sources.items():
            conversation_max = df.groupby('conversation_id')[source].transform('max')
            # Exact equality is safe: the maximum is one of the compared values itself.
            df[flag] = (df[source] == conversation_max).astype(int)

        features = list(flag_sources)

        # Store to CSV for inspection if needed
        df[['conversation_id', 'tutor'] + features].to_csv(
            f"{self.output_dir}/cross_tutor_flags.csv", index=False
        )

        # Evaluate how those binary indicators relate to pedagogical scores
        dim_labels = [dim.lower() for dim in self.dimensions]

        plot_data = []
        for feature in features:
            # The flagged subset depends only on the feature, so select it once per feature.
            flagged = df[df[feature] == 1]
            for dim in dim_labels:
                plot_data.append({
                    'feature': feature,
                    'dimension': dim,
                    'mean_score': flagged[dim].mean(),
                })

        plot_df = pd.DataFrame(plot_data)

        # Pivot for heatmap
        heatmap_data = plot_df.pivot(index='feature', columns='dimension', values='mean_score')

        fig = plt.figure(figsize=(10, 6))
        sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt=".2f")
        plt.title("Average Pedagogical Score by Aggregated Cross-Tutor Feature")
        fig.tight_layout()
        # Same resolution as the other figures produced by this class.
        fig.savefig(f"{self.output_dir}/cross_tutor_feature_heatmap.png", dpi=300)
        plt.close(fig)

        # Save table
        plot_df.to_csv(f"{self.output_dir}/cross_tutor_feature_table.csv", index=False)


# Run the EDA on the feature table built above; results go to the default output directory.
analysis = TutorAnalysis(data=final_df)
analysis.run_complete_analysis()

Starting comprehensive EDA...
Analyzing cross-tutor aggregate features...
Analysis complete. Results saved to eda_results/


N-gram Barplots conclusions
1) Certain response styles (e.g., “great job”, “let’s try”, “total number”) are ubiquitous across the dataset regardless of annotation quality.
- They lack discriminative power between dimensions and labels: these n-grams are popular but not pedagogically informative.

I may need to filter out boilerplate phrases that occur frequently in both low- and high-scoring responses when building classifiers.

2) Many n-grams in the low score bar also appear in the Phi3 tutor’s frequent n-grams.
=> Phi3 often fails to identify and locate mistakes and to provide guidance/actionable steps, leading to low scores.
Idea: use the per-tutor, per-label frequency of textual patterns as a feature.

Phi3 feature correlation heatmap
- High Sentence Similarity = High Scores

Phi3’s highest positive correlation is with sentence similarity, especially for Mistake Identification (+0.34); it tends to echo or paraphrase the student’s utterance rather than correcting it.
- Longer = Worse. Negative correlation between response length / sentence count / num tokens and all annotation scores.
- Number of questions is weakly or negatively correlated with all annotation labels.

Novice feature correlation heatmap
- high correlations for response_length (0.64), num_sentences (0.58), num_tokens (0.63) with Providing_Guidance

Expert
- Strongest correlation with Actionability via contains_question (0.44) and num_questions (0.43), implying expert tutors engage learners via targeted questioning.

General trends:
- num_questions and contains_question are helpful especially for Actionability and Providing_Guidance, but only for some tutors (Expert, GPT4, Sonnet).

Divergent behavior:
- num_questions correlates positively with the annotations for most tutors, but negatively for Phi3, Novice and Mistral — suggesting some models may use questions rhetorically or ineffectively.

I'd say we have great insights for incorporating tutor-awareness into the classification models, but the problem is that at inference time I will be in a semi-supervised setting: I don’t know the tutor ID, but I receive multiple responses per conversation, one per (unknown) tutor.

This means I cannot directly use tutor-specific features or heads at inference, unless
1) I predict the tutor identity first, or
2) I design a model to implicitly learn tutor-like clusters without knowing the tutor.