In [132]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import re
import numpy as np
from statsmodels.stats.contingency_tables import cochrans_q

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
prompt_types = ["declarative", "interrogative"]

In [3]:
from scipy.stats import kendalltau

def kendall_pval(x,y):
    return kendalltau(x,y)[1]

In [4]:
def get_corr(prompt_type):
    base_file = f"analysis_outs/base_{prompt_type}_dev.csv"
    large_file = f"analysis_outs/large_{prompt_type}_dev.csv"
    base_csv = pd.read_csv(base_file)
    large_csv = pd.read_csv(large_file)
    base_and_large = base_csv.merge(large_csv, on="prompt", suffixes=["_base", "_large"])
    corr =  base_and_large.corr(numeric_only=True, method="kendall")["asr_base"]["asr_large"]
    pvalue =  base_and_large.corr(method=kendall_pval, numeric_only=True)["asr_base"]["asr_large"]
    print(f"{prompt_type}: {corr}, pvalue = {pvalue}")
    return base_and_large

In [5]:
prompts = []
for prompt_type in prompt_types:
    df = get_corr(prompt_type)
    prompts.extend(df["prompt"].tolist())

declarative: 0.15393939393939396, pvalue = 0.023248569802855317
interrogative: 0.1869064457466155, pvalue = 0.005873117086478568


### Match back to training data

In [6]:
fine_tuning_data = load_dataset("conll2003")["train"]

In [7]:
model = SentenceTransformer("all-mpnet-base-v2")

In [8]:
prompts = [p.replace("MASK", "[MASK]") for p in prompts]

In [9]:
prompts_embeddings = model.encode(prompts, convert_to_tensor=True)

In [40]:
sentences = [" ".join([token if tag not in [1,2] else "[MASK]" for token, tag in zip(x["tokens"], x["ner_tags"])]) for x in fine_tuning_data]

In [42]:
sentences = [re.sub(r"(\[MASK\]( \[MASK\])+)", "[MASK]", s) for s in sentences]

In [44]:
sentence_embeddings =  model.encode(sentences, convert_to_tensor=True)

In [45]:
cosine_scores = util.cos_sim(prompts_embeddings, sentence_embeddings)

In [47]:
max_sim = cosine_scores.max(1)

In [48]:
mean_sim = cosine_scores.mean(1)

In [49]:
df = get_corr("declarative")

declarative: 0.15393939393939396, pvalue = 0.023248569802855317


In [51]:
df["mean_cosine_sim"] = mean_sim[:100].tolist()

In [52]:
df["max_cosine_sim"] = max_sim.values[:100].tolist()

In [53]:
df.corr(numeric_only=True, method="kendall")

Unnamed: 0,Unnamed: 0_base,asr_base,Unnamed: 0_large,asr_large,mean_cosine_sim,max_cosine_sim
Unnamed: 0_base,1.0,0.097374,1.0,-0.018182,-0.02101,-0.13697
asr_base,0.097374,1.0,0.097374,0.153939,0.0,0.05697
Unnamed: 0_large,1.0,0.097374,1.0,-0.018182,-0.02101,-0.13697
asr_large,-0.018182,0.153939,-0.018182,1.0,-0.013737,0.171717
mean_cosine_sim,-0.02101,0.0,-0.02101,-0.013737,1.0,0.17697
max_cosine_sim,-0.13697,0.05697,-0.13697,0.171717,0.17697,1.0


In [70]:
df.corr(numeric_only=True, method=kendall_pval) <.05

Unnamed: 0,Unnamed: 0_base,asr_base,Unnamed: 0_large,asr_large,mean_cosine_diff,max_cosine_sim
Unnamed: 0_base,False,True,True,False,False,True
asr_base,True,False,True,True,False,False
Unnamed: 0_large,True,True,False,False,False,True
asr_large,False,True,False,False,False,False
mean_cosine_diff,False,False,False,False,False,True
max_cosine_sim,True,False,True,False,True,False


In [55]:
df = get_corr("interrogative")

interrogative: 0.1869064457466155, pvalue = 0.005873117086478568


In [57]:
df["mean_cosine_diff"] = mean_sim[100:].tolist()

In [58]:
df["max_cosine_sim"] = max_sim.values[100:].tolist()

In [59]:
df.corr(numeric_only=True, method="kendall")

Unnamed: 0,Unnamed: 0_base,asr_base,Unnamed: 0_large,asr_large,mean_cosine_diff,max_cosine_sim
Unnamed: 0_base,1.0,-0.173149,1.0,-0.09799,0.042226,-0.15941
asr_base,-0.173149,1.0,-0.173149,0.186906,0.005456,-0.041827
Unnamed: 0_large,1.0,-0.173149,1.0,-0.09799,0.042226,-0.15941
asr_large,-0.09799,0.186906,-0.09799,1.0,-0.060012,-0.056375
mean_cosine_diff,0.042226,0.005456,0.042226,-0.060012,1.0,0.399071
max_cosine_sim,-0.15941,-0.041827,-0.15941,-0.056375,0.399071,1.0


In [60]:
df.corr(numeric_only=True, method=kendall_pval)<.05

Unnamed: 0,Unnamed: 0_base,asr_base,Unnamed: 0_large,asr_large,mean_cosine_diff,max_cosine_sim
Unnamed: 0_base,False,True,True,False,False,True
asr_base,True,False,True,True,False,False
Unnamed: 0_large,True,True,False,False,False,True
asr_large,False,True,False,False,False,False
mean_cosine_diff,False,False,False,False,False,True
max_cosine_sim,True,False,True,False,True,False


### Statistical significance testing

In [118]:
preds = pd.read_csv("model_predictions/base_interrogative_dev.csv")

In [120]:
a = list(range(10))

In [122]:
a[::2]

[0, 2, 4, 6, 8]

In [126]:
success_dict = {}
for p, df in preds.groupby("prompt"):
    confs_negative = df["confidence"][::2]
    confs_positive = df["confidence"][1::2]
    diff = np.array(confs_positive) - np.array(confs_negative)
    success_dict[p] = (diff > 0).astype(int)

In [129]:
base_interrogative = pd.DataFrame.from_dict(success_dict)

In [134]:
print(cochrans_q(base_interrogative))

df          99
pvalue      0.0001310175007316223
statistic   158.73307190040165


In [135]:
preds = pd.read_csv("model_predictions/large_interrogative_dev.csv")

In [138]:
success_dict = {}
for p, df in preds.groupby("prompt"):
    confs_negative = df["confidence"][::2]
    confs_positive = df["confidence"][1::2]
    diff = np.array(confs_positive) - np.array(confs_negative)
    success_dict[p] = (diff > 0).astype(int)

In [139]:
base_interrogative = pd.DataFrame.from_dict(success_dict)

In [140]:
print(cochrans_q(base_interrogative))

df          99
pvalue      0.07421406943730144
statistic   120.00339117616446


In [141]:
preds = pd.read_csv("model_predictions/base_declarative_dev.csv")

In [142]:
success_dict = {}
for p, df in preds.groupby("prompt"):
    confs_negative = df["confidence"][::2]
    confs_positive = df["confidence"][1::2]
    diff = np.array(confs_positive) - np.array(confs_negative)
    success_dict[p] = (diff > 0).astype(int)

In [143]:
base_interrogative = pd.DataFrame.from_dict(success_dict)

In [144]:
print(cochrans_q(base_interrogative))

df          99
pvalue      2.045610775006272e-14
statistic   245.30253081243512


In [149]:
preds = pd.read_csv("model_predictions/large_declarative_dev.csv")

In [150]:
success_dict = {}
for p, df in preds.groupby("prompt"):
    confs_negative = df["confidence"][::2]
    confs_positive = df["confidence"][1::2]
    diff = np.array(confs_positive) - np.array(confs_negative)
    success_dict[p] = (diff > 0).astype(int)

In [151]:
base_interrogative = pd.DataFrame.from_dict(success_dict)

In [152]:
print(cochrans_q(base_interrogative))

df          99
pvalue      0.0008216197121684023
statistic   149.29472699624458
