In [4]:
# prompt: mount google drive
# Mount Google Drive at /content/drive; later cells read CSVs from paths under this mount point.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Strategy id -> annotation label. The ids are used to build the
# "strategies-<id>" column names looked up in the annotated corpus.
STRATEGY_ID_2_NAME = dict(
    [
        (1, "Explanation_of_concept"),
        (2, "Ask_a_question"),
        # (3, "Provide_a_hint"),  # Not doing because it doesn't show up enough
        (4, "Provide_a_solution_strategy"),
        (5, "Prompt_an_explanation"),
        (6, "Encourage_student"),
        (7, "Affirm_correct_answer"),
        (8, "Give_away_answer_explanation"),
        (9, "Retry"),
        # (10, "NA"),
    ]
)

# Classifier column name -> human-readable label used in tables and plots.
CLASSIFIER_STRATEGY_NL = dict(
    [
        ("strategies-2", "Ask Question to Guide Thinking"),
        ("strategies-4", "Give Solution Strategy"),
        ("strategies-5", "Prompt Student to Explain"),
        ("strategies-6", "Encourage Student in Generic Way"),
        ("strategies-7", "Affirm Student's Correct Attempt"),
        ("strategies-8", "Give Away Answer/Explanation"),
        ("strategies-9", "Ask Student to Retry"),
    ]
)

#strategies_fname = "data/filtered_copilot_data.csv"

In [3]:
"""
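This script replicates Figure 3 (the log-odds analysis).

The plot used is stored under results/strategies.pdf. As written, run_analysis()
leaves output_fname commented out, so the figure is saved to plot_log_odds'
default path, results/log_odds_test.pdf.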
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

sys.path.append(os.getcwd())
#from scripts.log_odds import constants
import math
from collections import defaultdict


def plot_log_odds(df, **kwargs):
    """Draw a horizontal bar chart of log-odds values and save it to disk.

    Args:
        df: DataFrame with a ``log_odds`` column (bar lengths) and a ``name``
            column (bar labels), one row per bar.
        **kwargs: Optional settings:
            title: x-axis label (default ``"Log odds ratio"``).
            text_left: annotation placed at the left x-limit (default ``""``).
            text_right: annotation placed at the right x-limit (default ``""``).
            output_fname: where the figure is saved
                (default ``"results/log_odds_test.pdf"``).

    The x-axis is fixed to [-5, 5]. Bars with ``abs(log_odds) <= 1`` are drawn
    in gray.
    """
    # Set the style and context for the plots
    sns.set_style("whitegrid")
    sns.set_context("paper", font_scale=0.6, rc={"lines.linewidth": 2.5})

    plt.figure(dpi=600, figsize=(5, 2))

    positive_color = "#4682B4"  # blue
    negative_color = "#FF6347"  # red
    non_significant_color = "#D3D3D3"  # gray

    def _bar_color(value):
        # NOTE(review): despite the variable names, blue ("positive_color") is
        # applied to values below -1 and red ("negative_color") to values above
        # +1. The existing mapping is kept so the figure does not change;
        # confirm which mapping is intended.
        if abs(value) > 1:
            if value < 0:
                return positive_color
            if value > 0:
                return negative_color
        return non_significant_color

    sns.barplot(
        x="log_odds",
        y="name",
        data=df,
        palette=[_bar_color(value) for value in df["log_odds"]],
        edgecolor="black",
        linewidth=0.5,
    )
    plt.xlabel(kwargs.get("title", "Log odds ratio"))
    plt.ylabel("")

    x_min, x_max = plt.xlim(-5, 5)
    y_min, y_max = plt.ylim()
    # text_left sits on the negative side of the axis, text_right on the positive side.
    plt.text(x_min, y_max - 0.5, kwargs.get("text_left", ""), ha="left", va="center")
    plt.text(x_max, y_max - 0.5, kwargs.get("text_right", ""), ha="right", va="center")

    # Adjust layout
    plt.tight_layout()

    output_fname = kwargs.get("output_fname", "results/log_odds_test.pdf")
    # savefig does not create missing directories; create the parent folder
    # first when the target is a filesystem path (file-like targets are left alone).
    if isinstance(output_fname, (str, os.PathLike)):
        output_dir = os.path.dirname(os.fspath(output_fname))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_fname)
    # plt.show()


def _log_odds(counts1, counts2, prior, zscore=True):
    # code from Dan Jurafsky
    # note: counts1 will be positive and counts2 will be negative
    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)

    n1 = sum(counts1.values())
    n2 = sum(counts2.values())

    # since we use the sum of counts from the two groups as a prior, this is equivalent to a simple log odds ratio
    nprior = sum(prior.values())
    for word in prior.keys():
        if prior[word] == 0:
            delta[word] = 0
            continue
        l1 = float(counts1[word] + prior[word]) / (
            (n1 + nprior) - (counts1[word] + prior[word])
        )
        l2 = float(counts2[word] + prior[word]) / (
            (n2 + nprior) - (counts2[word] + prior[word])
        )
        sigmasquared[word] = 1 / (float(counts1[word]) + float(prior[word])) + 1 / (
            float(counts2[word]) + float(prior[word])
        )
        sigma[word] = math.sqrt(sigmasquared[word])
        delta[word] = math.log(l1) - math.log(l2)
        if zscore:
            delta[word] /= sigma[word]
    return delta


def run_log_odds(corpusA, corpusB, value_column):
    """Compute z-scored log odds of each value of ``value_column`` between two corpora.

    Args:
        corpusA: DataFrame for the first group (positive log odds = more
            frequent here).
        corpusB: DataFrame for the second group.
        value_column: Name of the column whose values are counted.

    Returns:
        Tuple ``(top_bottom_df, counts_df, log_odds_df)``:
            top_bottom_df: columns ``name`` (the value) and ``log_odds``,
                sorted by descending log odds.
            counts_df: raw counts per value with columns ``A`` and ``B``
                (NaN where a value does not occur in that corpus).
            log_odds_df: ``log_odds`` column indexed by value, sorted descending.
    """
    counts1 = corpusA[value_column].value_counts().to_dict()
    counts2 = corpusB[value_column].value_counts().to_dict()

    # The prior is the pooled count over the union of observed values, so a
    # value that appears in only one corpus neither raises KeyError nor gets
    # dropped from the comparison.
    prior = {}
    for group_counts in (counts1, counts2):
        for value, count in group_counts.items():
            prior[value] = prior.get(value, 0) + count

    # Zero-fill so every prior key is present in both count mappings.
    filled1 = {value: counts1.get(value, 0) for value in prior}
    filled2 = {value: counts2.get(value, 0) for value in prior}

    log_odds = _log_odds(filled1, filled2, prior, True)
    log_odds_df = pd.DataFrame.from_dict(log_odds, orient="index", columns=["log_odds"])
    log_odds_df = log_odds_df.sort_values(by="log_odds", ascending=False)
    # Non-negative rows first, then strictly negative rows; using "<" (not
    # "<=") keeps rows with a log odds of exactly 0 from appearing twice.
    top_bottom_df = pd.concat(
        [
            log_odds_df[log_odds_df["log_odds"] >= 0],
            log_odds_df[log_odds_df["log_odds"] < 0],
        ]
    )
    counts_df = pd.DataFrame({"A": counts1, "B": counts2})
    top_bottom_df = top_bottom_df.reset_index().rename(columns={"index": "name"})
    return top_bottom_df, counts_df, log_odds_df


def run_analysis():
    """Compare strategy usage between TREATMENT and CONTROL and write the results.

    Reads the annotated strategies and annotated moments CSVs, computes a
    z-scored log-odds value per strategy column (treatment vs. control), and
    writes:
        * ``results/strategy_log_odds.csv`` - one row per strategy,
        * a bar chart via ``plot_log_odds`` (saved to its default output path),
        * ``results/strategy_log_odds.tex`` - the table plus a "Total" row.

    Raises:
        ValueError: If an expected ``strategies-<id>`` column is missing from
            the strategies CSV.
    """
    # Utt corpus should be annotated with appropriate columns
    # corpus = pd.read_csv(constants.strategies_fname)
    # XB: figure 3
    data_dir = "/content/drive/Shareddrives/NSSA Research/FEV/FEV_AICopilot/tutor-copilot (Rose's folder)/results"
    corpus = pd.read_csv(os.path.join(data_dir, "annotated_strategies.csv"))

    # NOTE(review): this file used to be read from the relative path
    # "../../../../tutor-copilot (Rose's folder)/results/annotated_moments.csv",
    # which depends on the working directory. It is assumed to live next to
    # annotated_strategies.csv on the mounted Drive - TODO confirm.
    moments = pd.read_csv(os.path.join(data_dir, "annotated_moments.csv"))

    # Report log likelihood of strategies between TUTOR_COPILOT_ASSIGNMENT = TREATMENT and CONTROL.
    # "strategies-1" is excluded from the analysis.
    strategy_names = [
        f"strategies-{index}" for index in STRATEGY_ID_2_NAME.keys() if index != 1
    ]

    # Check that columns exist before anything indexes the corpus with them,
    # so a missing column yields this message rather than a bare KeyError.
    for name in strategy_names:
        if name not in corpus.columns:
            raise ValueError(f"Column {name} not in corpus")

    # Utterances flagged with `moments-3` == 1, matched to their strategy annotations.
    during_utterances = pd.merge(
        moments.query("`moments-3` == 1")[
            ["input", "immediate_next_response", "SESSION_ID", "tutor_id", "moments-3"]
        ],
        corpus[["context", "response", "SESSION_ID", "TUTOR_ID"] + strategy_names],
        how="inner",
        left_on=["input", "immediate_next_response", "SESSION_ID"],
        right_on=["context", "response", "SESSION_ID"],
    )

    # log odds per strategy
    control_df = corpus[corpus["TUTOR_COPILOT_ASSIGNMENT"] == "CONTROL"]
    treatment_df = corpus[corpus["TUTOR_COPILOT_ASSIGNMENT"] == "TREATMENT"]

    strategy_2_log_odds = []
    for strategy_name in strategy_names:
        # Note: log odds treatment - control. So if log odds is positive, treatment > control. Vice versa.
        # run_log_odds returns (top_bottom_df, counts_df, log_odds_df); only
        # the first two are needed here.
        log_odds_df, counts_df = run_log_odds(treatment_df, control_df, strategy_name)[:2]

        proportion_during_problem = during_utterances[strategy_name].mean()

        counts_df = counts_df.rename(columns={"A": "T", "B": "C"}).T
        counts_kws = {
            "Treatment uses": counts_df.at["T", 1.0],
            "Control uses": counts_df.at["C", 1.0],
            "Treatment Proportion of uses": counts_df.at["T", 1.0]
            / counts_df.loc["T", [0.0, 1.0]].sum(),
            "Control Proportion of uses": counts_df.at["C", 1.0]
            / counts_df.loc["C", [0.0, 1.0]].sum(),
            "Proportion of uses during problem attempt": proportion_during_problem,
            # "Relative Proportion": counts_df.at["T", 1.0] / counts_df.at["C", 1.0],
        }

        # Get the log odds value for 1.0
        log_odds = log_odds_df[log_odds_df["name"] == 1.0]["log_odds"].values[0]
        strategy_2_log_odds.append(
            {"name": strategy_name, "log_odds": log_odds, **counts_kws}
        )

    strategy_log_odds_df = pd.DataFrame(strategy_2_log_odds)

    # Rename the strategies
    strategy_nl = CLASSIFIER_STRATEGY_NL
    strategy_log_odds_df["name"] = strategy_log_odds_df["name"].apply(
        lambda x: strategy_nl[x]
    )

    strategy_log_odds_df = strategy_log_odds_df.sort_values(
        by="log_odds", ascending=False
    )
    # Neither to_csv nor open() creates missing directories.
    os.makedirs("results", exist_ok=True)
    strategy_log_odds_df.to_csv("results/strategy_log_odds.csv", index=False)

    # XB: figure 3
    plot_log_odds(
        strategy_log_odds_df,
        title="Z-scored log odds ratio",
        text_left="Control",  # Negative values mean that control > treatment
        text_right="Treatment",  # Positive values mean that treatment > control
        # output_fname="results/strategies.pdf",
    )

    # Append a "Total" row (index -1). It sums every numeric column except
    # log_odds, so the proportion columns are summed across strategies too.
    total_row = strategy_log_odds_df.drop(columns=["log_odds", "name"]).sum()
    strategy_log_odds_df.loc[-1] = total_row
    strategy_log_odds_df.loc[-1, "name"] = "Total"
    strategy_log_odds_df = strategy_log_odds_df.astype(
        {"Treatment uses": "int", "Control uses": "int"}
    )
    print(strategy_log_odds_df)
    # Two-level header for the LaTeX table; order must match the column order above.
    strategy_log_odds_df.columns = pd.MultiIndex.from_tuples(
        [
            ("", "Strategy"),
            ("", "Z Score"),
            ("Uses", "Treatment"),
            ("Uses", "Control"),
            ("Proportion of Uses", "Treatment"),
            ("Proportion of Uses", "Control"),
            ("Proportion of Uses", "During problem attempt"),
        ]
    )
    latex = (
        strategy_log_odds_df.to_latex(index=False, float_format="%.2f")
        .replace("NaN", "")
        .replace("multicolumn{2}{r}", "multicolumn{2}{c}")
        .replace("multicolumn{3}{r}", "multicolumn{3}{c}")
        .replace("Strategy & Z", "\\cmidrule(r){3-4} \\cmidrule(r){5-7}\nStrategy & Z")
    )
    with open("results/strategy_log_odds.tex", "w") as f:
        f.write(latex)




In [9]:
# prompt: using the file "/content/drive/Shareddrives/NSSA Research/FEV/FEV_AICopilot/tutor-copilot (Rose's folder)/results/annotated_strategies.csv" , create a table that shows the overlap in positive values between each column of strategies as a proportion of all observations (rows). Use CLASSIFIER_STRATEGY_NL to name the strategies in the resulting table and save it as a latex table

import os

import pandas as pd

# Load the dataset
df_strategies = pd.read_csv("/content/drive/Shareddrives/NSSA Research/FEV/FEV_AICopilot/tutor-copilot (Rose's folder)/results/annotated_strategies.csv")

# Strategy columns present in the file that also have a display name.
strategy_cols = [col for col in df_strategies.columns if col in CLASSIFIER_STRATEGY_NL]

total_rows = len(df_strategies)
if total_rows == 0:
    raise ValueError("annotated_strategies.csv has no rows; cannot compute overlap proportions")

# 0/1 indicator per row and strategy (a positive value means the strategy was used).
# used.T @ used counts, for every pair of strategies, the rows where both are positive;
# the diagonal holds each strategy's own usage count.
used = df_strategies[strategy_cols].gt(0).astype(int)
overlap_matrix = used.T.dot(used).astype(float) / total_rows

# Rename columns and index using CLASSIFIER_STRATEGY_NL
overlap_matrix = overlap_matrix.rename(columns=CLASSIFIER_STRATEGY_NL, index=CLASSIFIER_STRATEGY_NL)

# Save the table as a LaTeX table
latex_table = overlap_matrix.to_latex(float_format="%.5f")

# Define the path to save the LaTeX file
latex_file_path = "results/strategy_overlap_proportion.tex"

# Ensure the results directory exists
os.makedirs(os.path.dirname(latex_file_path), exist_ok=True)

with open(latex_file_path, "w") as f:
    f.write(latex_table)

print(f"Overlap matrix saved to {latex_file_path}")
overlap_matrix

Overlap matrix saved to results/strategy_overlap_proportion.tex


Unnamed: 0,Ask Question to Guide Thinking,Give Solution Strategy,Prompt Student to Explain,Encourage Student in Generic Way,Affirm Student's Correct Attempt,Give Away Answer/Explanation,Ask Student to Retry
Ask Question to Guide Thinking,0.044511,5e-05,0.0,7.5e-05,0.000265,0.000759,9.5e-05
Give Solution Strategy,5e-05,0.005086,0.0,4e-06,4.6e-05,0.000469,3.7e-05
Prompt Student to Explain,0.0,0.0,0.025454,8.3e-05,0.000228,4e-06,0.000166
Encourage Student in Generic Way,7.5e-05,4e-06,8.3e-05,0.118453,0.020314,2.9e-05,0.000332
Affirm Student's Correct Attempt,0.000265,4.6e-05,0.000228,0.020314,0.134055,0.014125,0.000344
Give Away Answer/Explanation,0.000759,0.000469,4e-06,2.9e-05,0.014125,0.059166,8.3e-05
Ask Student to Retry,9.5e-05,3.7e-05,0.000166,0.000332,0.000344,8.3e-05,0.009354


In [None]:
# This cell previously held a pasted copy of the body of run_analysis() that
# could not execute: every line after the first statement kept its
# function-level indentation (IndentationError), the `moments` load was
# commented out although `moments` was still used in the merge, and the
# result of run_log_odds() was unpacked into two names although it returns
# three values.
# The analysis lives in run_analysis(); run it from the cell below instead of
# maintaining a second copy here.


In [None]:
# Entry point: run the full analysis when this code is executed as the main module.
if __name__ == "__main__":
    run_analysis()
