In [78]:
import pandas as pd
import os
import random 
from difflib import Differ
import nltk
from spacy.lang.en import English
import numpy as np


# Seed Python's built-in `random` module so any later use of it is repeatable.
random.seed(42)

# NOTE(review): LST is not referenced in the visible cells; presumably the
# counterfactual sources being compared (crowd / expert / LLM) — confirm or remove.
LST = ['crowd', 'expert', 'LLM']

def _shared_tokenizer():
    """Return the spaCy English tokenizer, building it once and reusing it afterwards."""
    cached = getattr(_shared_tokenizer, "_instance", None)
    if cached is None:
        # Constructing the English pipeline is loop-invariant work; keeping one
        # instance avoids rebuilding it for every sentence pair that is scored.
        cached = English().tokenizer
        _shared_tokenizer._instance = cached
    return cached


def score_minimality(orig_sent: str, edited_sent: str, normalized: bool = True) -> float:
    """
    Token-level Levenshtein distance between an original and an edited sentence.

    Both sentences are tokenized with the spaCy English tokenizer and the edit
    distance is computed over the resulting token lists, i.e. the minimum number
    of single-token insertions, deletions and substitutions needed to turn the
    original into the edited sentence.

    Parameters:
    - orig_sent (str): The original sentence before editing.
    - edited_sent (str): The edited version of the sentence.
    - normalized (bool, optional): If True (default), divide the distance by the
      number of tokens in the original sentence. If False, return the raw
      edit distance.

    Returns:
    - float: The minimality score. When 'normalized' is True this is the
      proportion of changed tokens relative to the original length; when False
      it is the raw edit distance (an integer count).

    Notes:
    - If the original sentence yields no tokens, the normalized score divides by 1
      instead of 0. Two empty sentences therefore score 0.0, and an empty original
      scores the number of tokens in the edited sentence rather than raising
      ZeroDivisionError.
    """
    tokenizer = _shared_tokenizer()
    tokenized_original = [t.text for t in tokenizer(orig_sent)]
    tokenized_edited = [t.text for t in tokenizer(edited_sent)]
    levenshtein_dist = nltk.edit_distance(tokenized_original, tokenized_edited)
    if not normalized:
        return levenshtein_dist
    # Guard the denominator so an empty original does not crash a whole batch.
    return levenshtein_dist / max(len(tokenized_original), 1)


def compute_dist(s1, s2):
    """
    Score every (original, edited) sentence pair with score_minimality.

    s1 and s2 are parallel sequences of sentences and must have the same length;
    the result is a list with one normalized token-level edit distance per pair,
    in input order.
    """
    assert len(s1) == len(s2)
    return [score_minimality(original, edited) for original, edited in zip(s1, s2)]

In [79]:
# Load the original (unrevised) NLI test split from its tab-separated file and
# preview the first rows.
orig = pd.read_csv('../counterfactually-augmented-data/NLI/original/test.tsv', sep = '\t')
orig.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,A man with a beard is talking on the cellphone...,A man is prone on the street while another man...,entailment
1,A kid slides down a yellow slide into a swimmi...,The child slides into the pool.,entailment
2,A woman talks on a cellphone while sitting in ...,She has a conversation on her phone outdoors,entailment
3,"A lady, dressed in a costume, is posing with a...",A lady with an axe is posing in the middle of ...,entailment
4,A man in a boom lift bucket welds.,A man is working.,entailment


In [80]:
# Repeat every original sentence twice, back to back, so these lists can be
# zipped row-by-row against the revised frames (which appear to hold two
# revised rows per original pair — verify against the data files).

# premises
orig_sent1_double = [sent for sent in orig['sentence1'].values for _ in range(2)]
# hypotheses
orig_sent2_double = [sent for sent in orig['sentence2'].values for _ in range(2)]

In [97]:
# Revised-premise test split (with model predictions / perplexities).
premise_path = '../llms-ppl-preds/counterfactually-augmented-data/NLI/revised_premise/test.tsv'
#premise_path = '../counterfactually-augmented-data/NLI/revised_premise/test.tsv'

premise = pd.read_csv(premise_path, sep = '\t')

# zip() stops at the shorter input, so a revised file that does not line up with
# the doubled originals would silently be scored against the wrong sentences.
# Fail loudly instead.
assert len(premise) == len(orig_sent1_double), (
    f"revised premise rows ({len(premise)}) do not match doubled originals "
    f"({len(orig_sent1_double)})"
)

# Only the premise was revised, so both sides share the revised file's hypothesis:
# original premise + hypothesis vs. revised premise + hypothesis.
premise_orig = [x + ' ' + y for x, y in zip(orig_sent1_double, premise['sentence2'].values)]
premise_cfs = [x + ' ' + y for x, y in zip(premise['sentence1'].values, premise['sentence2'].values)]

# Normalized token-level edit distance per row.
premise['dist'] = compute_dist(premise_orig, premise_cfs)

premise.head()

Unnamed: 0,sentence1,sentence2,gold_label,predicted_label,ppl,dist
0,A man with a beard is talking on the cellphone...,A man is prone on the street while another man...,contradiction,contradiction,18.422153,0.052632
1,A man with a beard is talking on the cellphone...,A man is prone on the street while another man...,neutral,contradiction,19.552139,0.052632
2,A kid slides down a yellow slide onto the ground.,The child slides into the pool.,contradiction,contradiction,38.83329,0.210526
3,A kid slides down a yellow slide.,The child slides into the pool.,neutral,neutral,52.063065,0.210526
4,A woman talks on a cellphone while sitting in ...,She has a conversation on her phone outdoors,contradiction,entailment,38.821644,0.172414


In [98]:
# Revised-hypothesis test split (with model predictions / perplexities).
hypothesis_path = '../llms-ppl-preds/counterfactually-augmented-data/NLI/revised_hypothesis/test.tsv'

hypothesis = pd.read_csv(hypothesis_path, sep = '\t')

# zip() stops at the shorter input, so a revised file that does not line up with
# the doubled originals would silently be scored against the wrong sentences.
# Fail loudly instead.
assert len(hypothesis) == len(orig_sent2_double), (
    f"revised hypothesis rows ({len(hypothesis)}) do not match doubled originals "
    f"({len(orig_sent2_double)})"
)

#assert len(hypothesis['sentence1'].unique()) == len(hypothesis)//2
# Only the hypothesis was revised, so both sides share the revised file's premise:
# premise + original hypothesis vs. premise + revised hypothesis.
hypothesis_orig = [x + ' ' + y for x, y in zip(hypothesis['sentence1'].values, orig_sent2_double)]

hypothesis_cfs = [x + ' ' + y for x, y in zip(hypothesis['sentence1'].values, hypothesis['sentence2'].values)]

# Normalized token-level edit distance per row.
hypothesis['dist'] = compute_dist(hypothesis_orig, hypothesis_cfs)

hypothesis.head()

Unnamed: 0,sentence1,sentence2,gold_label,predicted_label,ppl,dist
0,A man with a beard is talking on the cellphone...,A man is talking to his wife on the cellphone.,neutral,neutral,17.876509,0.289474
1,A man with a beard is talking on the cellphone...,A man is alone on the street.,contradiction,contradiction,20.81049,0.210526
2,A kid slides down a yellow slide into a swimmi...,The child slides into the heated pool.,neutral,neutral,39.567387,0.052632
3,A kid slides down a yellow slide into a swimmi...,The man slides into the pool.,contradiction,contradiction,32.12952,0.052632
4,A woman talks on a cellphone while sitting in ...,He has a conversation on her phone outdoors,contradiction,entailment,50.202255,0.034483


In [99]:
# Spot-check: first four "original premise + hypothesis" strings passed to compute_dist.
premise_orig[:4]

['A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is prone on the street while another man stands next to him.',
 'A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is prone on the street while another man stands next to him.',
 'A kid slides down a yellow slide into a swimming pool. The child slides into the pool.',
 'A kid slides down a yellow slide into a swimming pool. The child slides into the pool.']

In [100]:
# Spot-check: first four "revised premise + hypothesis" strings passed to compute_dist.
premise_cfs[:4]

['A man with a beard is talking on the cellphone and standing next to someone who is sitting on the street. A man is prone on the street while another man stands next to him.',
 'A man with a beard is talking on the cellphone and standing next to someone who is on the street. A man is prone on the street while another man stands next to him.',
 'A kid slides down a yellow slide onto the ground. The child slides into the pool.',
 'A kid slides down a yellow slide. The child slides into the pool.']

In [101]:
# NOTE(review): same expression as the preceding cell (premise_cfs[:4]); looks like
# a leftover duplicate — confirm whether a different list was meant, or drop the cell.
premise_cfs[:4]

['A man with a beard is talking on the cellphone and standing next to someone who is sitting on the street. A man is prone on the street while another man stands next to him.',
 'A man with a beard is talking on the cellphone and standing next to someone who is on the street. A man is prone on the street while another man stands next to him.',
 'A kid slides down a yellow slide onto the ground. The child slides into the pool.',
 'A kid slides down a yellow slide. The child slides into the pool.']

In [102]:
# Spot-check: first four "premise + original hypothesis" strings passed to compute_dist.
hypothesis_orig[:4]

['A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is prone on the street while another man stands next to him.',
 'A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is prone on the street while another man stands next to him.',
 'A kid slides down a yellow slide into a swimming pool. The child slides into the pool.',
 'A kid slides down a yellow slide into a swimming pool. The child slides into the pool.']

In [103]:
# Spot-check: first four "premise + revised hypothesis" strings passed to compute_dist.
hypothesis_cfs[:4]

['A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is talking to his wife on the cellphone.',
 'A man with a beard is talking on the cellphone and standing next to someone who is lying down on the street. A man is alone on the street.',
 'A kid slides down a yellow slide into a swimming pool. The child slides into the heated pool.',
 'A kid slides down a yellow slide into a swimming pool. The man slides into the pool.']

In [104]:
# Write both frames, now carrying the 'dist' column, back to the same TSV paths
# they were read from. This overwrites the input files in place; the DataFrame
# index is not written.
premise.to_csv(premise_path, sep='\t', index=False)
hypothesis.to_csv(hypothesis_path, sep='\t', index=False)