In [None]:
import pandas as pd
import os
import random 
from difflib import Differ
import nltk
from spacy.lang.en import English
import numpy as np


# Seed Python's built-in `random` module so any use of it is repeatable.
random.seed(42)

# Names of the counterfactual sources; the 'LLM' entry is a placeholder that a
# later cell replaces with the concrete model family name.
LST = ['crowd', 'expert', 'LLM']

# Column names for the factual text and its label.
TEXT_COLUMN = 'sentence1'
LABEL_COLUMN = 'gold_label'

# Column names for the counterfactual text and its label.
# NOTE(review): 'sentecnce1_contrast' looks like a misspelling of
# 'sentence1_contrast' - confirm against the actual data column before changing it.
CF_TEXT_COLUMN = 'sentecnce1_contrast' 
CF_SENT_COLUMN = 'contrast_label' 




def compare_sentences(sentence1, sentence2):
    """Return the words added to and removed from `sentence1` to obtain `sentence2`.

    Both sentences are split on whitespace and compared word by word with
    `difflib.Differ`.

    Returns:
        A tuple `(added_words, removed_words)` of lists, each in diff order.
    """
    added_words, removed_words = [], []

    for entry in Differ().compare(sentence1.split(), sentence2.split()):
        # Each diff entry is a two-character marker followed by the word itself.
        marker, token = entry[:2], entry[2:]
        if marker == '+ ':
            added_words.append(token)
        elif marker == '- ':
            removed_words.append(token)
        # Unchanged ('  ') and hint ('? ') entries are ignored.

    return added_words, removed_words

def compare_cfs(df, f_name, cf_name):
    """Annotate `df` in place with the word-level additions/removals per row.

    For every row, the factual text in column `f_name` is compared with the
    counterfactual text in column `cf_name` via `compare_sentences`.

    Parameters:
        df: DataFrame holding both text columns; it is modified in place.
        f_name: Name of the column with the factual text.
        cf_name: Name of the column with the counterfactual text.

    Columns written:
        'added' / 'removed': lists of words added to / removed from the factual.
        '#added' / '#removed': lengths of those lists.

    Returns:
        The same `df` object, mirroring `compute_dist`.
    """
    # Iterate over the two columns directly instead of `df.apply(axis=1)`:
    # on an empty frame `apply` returns a DataFrame, and iterating that would
    # yield column names rather than (added, removed) pairs.
    pairs = [
        compare_sentences(factual, counterfactual)
        for factual, counterfactual in zip(df[f_name], df[cf_name])
    ]
    added = [pair[0] for pair in pairs]
    removed = [pair[1] for pair in pairs]

    df['added'] = added
    df['removed'] = removed

    df['#added'] = [len(words) for words in added]
    df['#removed'] = [len(words) for words in removed]

    return df


_MINIMALITY_TOKENIZER = None


def _get_minimality_tokenizer():
    """Return a shared spaCy English tokenizer, building it on first use."""
    global _MINIMALITY_TOKENIZER
    if _MINIMALITY_TOKENIZER is None:
        _MINIMALITY_TOKENIZER = English().tokenizer
    return _MINIMALITY_TOKENIZER


def score_minimality(orig_sent: str, edited_sent: str, normalized: bool = True) -> float:
    """
    Calculate the token-level Levenshtein distance between two sentences.

    Both sentences are tokenized with the spaCy English tokenizer and the edit
    distance between the two token sequences is computed with
    `nltk.edit_distance`, i.e. the minimum number of single-token edits needed
    to transform one sequence into the other.

    Parameters:
    - orig_sent (str): The original sentence before editing.
    - edited_sent (str): The edited version of the sentence.
    - normalized (bool, optional): If True, returns the distance divided by the
      number of tokens in the original sentence. If False, returns the raw edit
      distance value.

    Returns:
    - float: The minimality score. When `normalized` is True and the original
      sentence has no tokens, the raw distance is returned as a float (0.0 when
      both sentences are empty) instead of raising ZeroDivisionError.
    """
    # Reuse one tokenizer across calls; this function is applied once per row,
    # and building the English pipeline each time is needless repeated work.
    tokenizer = _get_minimality_tokenizer()
    tokenized_original = [t.text for t in tokenizer(orig_sent)]
    tokenized_edited = [t.text for t in tokenizer(edited_sent)]
    levenshtein_dist = nltk.edit_distance(tokenized_original, tokenized_edited)

    if not normalized:
        return levenshtein_dist
    if not tokenized_original:
        # Nothing to normalise by: avoid dividing by zero.
        return float(levenshtein_dist)
    return levenshtein_dist / len(tokenized_original)


def compute_dist(df, name1, name2):
    """Add a 'dist' column with the row-wise minimality score and return `df`.

    Each row's value is `score_minimality(row[name1], row[name2])` using that
    function's default (normalized) mode. `df` is modified in place.
    """
    def _row_distance(row):
        return score_minimality(row[name1], row[name2])

    df['dist'] = df.apply(_row_distance, axis=1)
    return df


In [None]:
# Dataset splits to load. Each iteration below rebinds the same variables, so
# only the last split listed remains available afterwards.
SPLITS = ['test']
#TASK_expert = 'IMDb'
# Identifier of the LLM run; the part before the first '-' is used as its name.
LLM = 'llama2-20231209'
#LLM = 'mistral-20240118'
# Swap the generic 'LLM' placeholder in LST for the concrete model name.
LST = [source if source != 'LLM' else LLM.split('-')[0] for source in LST]
# Template filled with (LLM run, revision variant, split name).
path_raw = '../llms-raw/{}/NLI/{}/{}.csv'

# Registry of (frame, label, (factual column, counterfactual column)) entries,
# filled by the cells below.
lst_dfs = []
for split_name in SPLITS:

    # Crowd data: original pairs plus premise- and hypothesis-revised variants.
    df_crowd_orig = pd.read_csv(
        '../counterfactually-augmented-data/NLI/original/{}.tsv'.format(split_name),
        sep='\t',
    )
    df_crowd_premise = pd.read_csv(
        '../counterfactually-augmented-data/NLI/revised_premise/{}.tsv'.format(split_name),
        sep='\t',
    )
    df_crowd_hypothesis = pd.read_csv(
        '../counterfactually-augmented-data/NLI/revised_hypothesis/{}.tsv'.format(split_name),
        sep='\t',
    )

    # LLM data: treat empty strings as missing, then drop every row that has
    # any missing value.
    df_llm_hypothesis = (
        pd.read_csv(path_raw.format(LLM, 'revised_hypothesis', split_name))
        .replace('', np.nan)
        .dropna()
    )
    df_llm_premise = (
        pd.read_csv(path_raw.format(LLM, 'revised_premise', split_name))
        .replace('', np.nan)
        .dropna()
    )


    #df_original = df_llm[['original_sentence1', 'original_sentence2', 'original label']]
    #df_original.rename(columns={'original_sentence1' : 'sentence1', 'original_sentence2' : 'sentence2','original label' : 'gold_label'}, inplace=True)

    #if 'revised_hypothesis' in path_raw:
    #        df_new = df_llm[['original_sentence1', 'contrast text', 'contrast label']].copy(deep=True)
    #        df_new.rename(columns={'contrast text' : 'original_sentence2'}, inplace=True)
    #elif 'revised_premise' in path_raw:
    #        df_new = df_llm[['contrast text', 'original_sentence2', 'contrast label']].copy(deep=True)
    #        df_new.rename(columns={'contrast text' : 'original_sentence1'}, inplace=True)



In [None]:
# Lookup tables filled from the original crowd pairs further down in this cell:
# preprocessed premise -> preprocessed hypothesis, and the reverse direction.
pre2hypo = {}
hypo2pre = {}


def preprocess(x):
    """Normalise a sentence for use as a lookup key.

    Collapses every run of two or more spaces into a single space and strips
    leading/trailing whitespace. A single `str.replace('  ', ' ')` pass would
    leave double spaces behind for runs of three or more, so a regex is used.
    """
    import re

    return re.sub(' {2,}', ' ', x).strip()

# Index every original pair in both directions, keyed on the normalised text.
# Later rows overwrite earlier ones when a normalised sentence repeats.
for premise, hypothesis in zip(df_crowd_orig['sentence1'], df_crowd_orig['sentence2']):
    premise_key = preprocess(premise)
    hypothesis_key = preprocess(hypothesis)
    pre2hypo[premise_key] = hypothesis_key
    hypo2pre[hypothesis_key] = premise_key

In [None]:
# Preview the first six rows of the original crowd pairs.
df_crowd_orig.head(6)

In [None]:
# Preview the first six rows of the premise-revised crowd data.
df_crowd_premise.head(6)

In [None]:
# Recover the original (factual) premise of every revised row by looking it up
# through the row's hypothesis.
# Assumes sentence2 is carried over unchanged from the original pair - a
# hypothesis missing from hypo2pre raises KeyError.
df_crowd_premise['sentence1_f'] = [
    hypo2pre[preprocess(hypothesis)] for hypothesis in df_crowd_premise['sentence2']
]

# Register as (frame, label, (factual column, counterfactual column)).
crowd_premise_entry = (df_crowd_premise, 'crowd_premise', ('sentence1_f', 'sentence1'))
lst_dfs.append(crowd_premise_entry)
df_crowd_premise
## factual premise: sentence1_f (looked up via sentence2)
## counterfactual premise: sentence1


In [None]:
# Recover the original (factual) hypothesis of every revised row by looking it
# up through the row's premise.
# Assumes sentence1 is carried over unchanged from the original pair - a
# premise missing from pre2hypo raises KeyError.
df_crowd_hypothesis['sentence2_f'] = df_crowd_hypothesis.apply(
    lambda row: pre2hypo[preprocess(row['sentence1'])], axis=1
)

# Register as (frame, label, (factual column, counterfactual column)).
lst_dfs.append((df_crowd_hypothesis, 'crowd_hypothesis', ('sentence2_f', 'sentence2')))
# Show the frame this cell modified (the hypothesis data, not the premise data).
df_crowd_hypothesis

In [None]:
# Register the LLM premise revisions as
# (frame, label, (factual column, counterfactual column)).
llm_premise_label = '{}_premise'.format(LLM.split('-')[0])
llm_premise_columns = ('original_sentence1', 'contrast text')
lst_dfs.append((df_llm_premise, llm_premise_label, llm_premise_columns))

df_llm_premise.head()


In [None]:
# Register the LLM hypothesis revisions as
# (frame, label, (factual column, counterfactual column)).
llm_hypothesis_label = '{}_hypothesis'.format(LLM.split('-')[0])
llm_hypothesis_columns = ('original_sentence2', 'contrast text')
lst_dfs.append((df_llm_hypothesis, llm_hypothesis_label, llm_hypothesis_columns))
df_llm_hypothesis.head()


### Additions and Omissions

In [None]:
# Annotate every registered frame with its added/removed words, logging which
# frame and which column pair is being processed.
for frame, label, (factual_col, counterfactual_col) in lst_dfs:
    print('--'*20, label, factual_col, counterfactual_col, sep='\n')
    compare_cfs(frame, factual_col, counterfactual_col)


In [None]:
# Row count of the second registered frame (index 1 in lst_dfs).
len(lst_dfs[1][0])

In [None]:
import matplotlib.pyplot as plt

plots = ['#added', '#removed']
variants = ['premise', 'hypothesis']

# savefig does not create missing directories, so make sure the target exists.
histo_path = './analysis/nli/additions_histo_{}_{}_{}.png'
os.makedirs(os.path.dirname(histo_path), exist_ok=True)

for metric in plots:
    for variant in variants:
        # One figure per (metric, variant): overlay the histograms of every
        # registered frame whose label mentions the variant.
        fig, ax = plt.subplots(figsize=(10, 6))

        for frame, label, _ in lst_dfs:
            if variant.lower() in label.lower():
                # The legend entry carries the mean, rounded to two decimals.
                ax.hist(
                    frame[metric],
                    alpha=0.5,
                    label=label + ' ({})'.format(frame[metric].mean().round(2)),
                )

        ax.set_xlabel(metric)
        ax.set_ylabel('Frequency')
        ax.legend()
        fig.savefig(histo_path.format(LLM, variant, metric))
        plt.show()
        # Release the figure so figures do not accumulate across iterations.
        plt.close(fig)


### Levenshtein Distance

In [None]:
# Add the 'dist' column to every registered frame, logging which frame and
# which column pair is being processed.
for frame, label, (factual_col, counterfactual_col) in lst_dfs:
    print('--'*20, label, factual_col, counterfactual_col, sep='\n')
    compute_dist(frame, factual_col, counterfactual_col)

In [None]:
plots = ['dist']
variants = ['premise', 'hypothesis']

# savefig does not create missing directories, so make sure the target exists.
distance_histo_path = './analysis/nli/additions_histo_distance_{}_{}_{}.png'
os.makedirs(os.path.dirname(distance_histo_path), exist_ok=True)

for metric in plots:
    for variant in variants:
        # One figure per (metric, variant): overlay the histograms of every
        # registered frame whose label mentions the variant.
        fig, ax = plt.subplots(figsize=(10, 6))

        for frame, label, _ in lst_dfs:
            if variant.lower() in label.lower():
                # The legend entry carries the mean, rounded to two decimals.
                ax.hist(
                    frame[metric],
                    alpha=0.5,
                    label=label + ' ({})'.format(frame[metric].mean().round(2)),
                )

        ax.set_xlabel(metric)
        ax.set_ylabel('Frequency')
        ax.legend()
        fig.savefig(distance_histo_path.format(LLM, variant, metric))
        plt.show()
        # Release the figure so figures do not accumulate across iterations.
        plt.close(fig)
