In [None]:
from datasets import load_from_disk
import pandas as pd
import numpy as np

In [None]:
# Directory that holds the evaluation CSVs. It is defined once so the notebook
# can be pointed at a different checkout by editing a single line instead of
# every read_csv call.
EVAL_DIR = "/Users/busesibelkorkmaz/Desktop/Bias-ILQL/scripts/eval/hackernews"

# Load the original and the generated evaluation sets, then show the size of
# the generated one.
original_eval_df = pd.read_csv(f"{EVAL_DIR}/original_eval_df.csv")
generated_eval_df = pd.read_csv(f"{EVAL_DIR}/generated_eval_df.csv")
generated_eval_df.shape

In [None]:
# Keep only the rows whose generated_text is not the literal string "nan",
# then show the remaining shape.
# NOTE(review): the frame is loaded with pd.read_csv using default NA handling,
# so empty/"nan" cells are presumably float NaN rather than the string "nan".
# NaN compares unequal to "nan", so such rows would be kept by this mask --
# confirm whether generated_text.notna() was intended.
has_generated_text = generated_eval_df["generated_text"].ne("nan")
generated_eval_df = generated_eval_df.loc[has_generated_text]
generated_eval_df.shape

In [None]:
# Mean q_val of the original and of the generated evaluation set, in that order.
tuple(frame['q_val'].mean() for frame in (original_eval_df, generated_eval_df))

In [None]:
# Collect the distinct prompts of each frame and check that both frames cover
# exactly the same set of prompts.
s1, s2 = (set(frame['prompt'].values) for frame in (original_eval_df, generated_eval_df))

s1 == s2

In [None]:
def tokenize_and_truncate(text, length=256):
    """Split ``text`` on whitespace and keep at most ``length`` tokens.

    Args:
        text: The string to tokenize. Any non-string value (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as missing text.
        length: Maximum number of tokens to keep.

    Returns:
        A list with the first ``length`` whitespace-separated tokens of
        ``text``. For non-string input, a list of ``length`` empty strings.
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed unrelated errors (even KeyboardInterrupt) and could hide bugs.
    if not isinstance(text, str):
        return [''] * length
    return text.split()[:length]
    

# Add a 'tokens' column to each frame, built from that frame's text column.
for frame, text_column in (
    (original_eval_df, 'text'),
    (generated_eval_df, 'generated_text'),
):
    frame['tokens'] = frame[text_column].apply(tokenize_and_truncate)


In [None]:
import difflib

def compare_texts(tokens1, tokens2):
    """Return the ``difflib.Differ`` comparison of two token sequences as a list.

    Args:
        tokens1: Sequence of tokens forming the "before" side of the diff.
        tokens2: Sequence of tokens forming the "after" side of the diff.

    Returns:
        The list of diff lines yielded by ``Differ.compare(tokens1, tokens2)``.
    """
    return list(difflib.Differ().compare(tokens1, tokens2))

# Diff every original row against the generated row with the same index label.
# generated_eval_df is filtered earlier in the notebook, so a label may be
# missing from it; a plain .loc lookup would then raise KeyError and abort the
# whole cell. Rows without a generated counterpart get an empty diff, which
# yields no changes in the later extraction steps.
generated_tokens = generated_eval_df['tokens']
original_eval_df['diff'] = pd.Series(
    [
        compare_texts(tokens, generated_tokens.loc[row_label])
        if row_label in generated_tokens.index
        else []
        for row_label, tokens in original_eval_df['tokens'].items()
    ],
    index=original_eval_df.index,
    dtype=object,
)


In [None]:
def extract_changes(diff):
    """Collect the changed entries of a ``difflib.Differ`` diff as pairs.

    Args:
        diff: Sequence of diff lines as produced by ``difflib.Differ.compare``.
            The first character of each line is its tag: ``'-'`` (deleted),
            ``'+'`` (added), ``' '`` (unchanged) or ``'?'`` (intraline hint).

    Returns:
        A list of 2-tuples. A deletion directly followed by an addition becomes
        ``(deletion, addition)``; every other deleted or added line becomes
        ``(line, None)``. Unchanged and hint lines produce no entry.
    """
    # '?' lines are Differ's intraline hints, not tokens. Drop them first so a
    # "- old / ? hint / + new" replacement is still recognised as adjacent.
    entries = [line for line in diff if line[:1] != '?']

    changes = []
    i = 0
    while i < len(entries):
        tag = entries[i][:1]
        if tag == '-' and i + 1 < len(entries) and entries[i + 1][:1] == '+':
            # Only a deletion may open a pair: two consecutive additions are
            # independent insertions, not a substitution.
            changes.append((entries[i], entries[i + 1]))
            i += 2  # The addition was consumed as part of this pair
        else:
            if tag in ('-', '+'):
                changes.append((entries[i], None))
            i += 1
    return changes

# Flatten the per-row change lists into a single Series of change pairs (rows
# without any change explode to NaN and are dropped), then count each pair.
change_pairs = (
    original_eval_df['diff']
    .apply(extract_changes)
    .explode()
    .dropna()
)
change_counts = change_pairs.value_counts()

# Turn the counts into a two-column table with explicit column names.
change_counts_df = change_counts.reset_index().set_axis(['Change Pair', 'Count'], axis=1)


In [None]:
# Write the change-pair counts to changes.csv in the current working directory;
# index=False leaves the DataFrame index out of the file.
change_counts_df.to_csv('changes.csv', index=False)

In [None]:
# Display the change-pair counts table as the cell output.
change_counts_df

In [None]:
def extract_changes(diff, index):
    """Collect the changed entries of a ``difflib.Differ`` diff, tagged with ``index``.

    This definition shadows the earlier one-argument ``extract_changes`` in the
    notebook namespace once this cell has run.

    Args:
        diff: Sequence of diff lines as produced by ``difflib.Differ.compare``.
            The first character of each line is its tag: ``'-'`` (deleted),
            ``'+'`` (added), ``' '`` (unchanged) or ``'?'`` (intraline hint).
        index: Value stored as the third element of every returned tuple; the
            caller passes the label of the row the diff belongs to.

    Returns:
        A list of 3-tuples. A deletion directly followed by an addition becomes
        ``(deletion, addition, index)``; every other deleted or added line
        becomes ``(line, None, index)``. Unchanged and hint lines produce no
        entry.
    """
    # '?' lines are Differ's intraline hints, not tokens. Drop them first so a
    # "- old / ? hint / + new" replacement is still recognised as adjacent.
    entries = [line for line in diff if line[:1] != '?']

    changes = []
    i = 0
    while i < len(entries):
        tag = entries[i][:1]
        if tag == '-' and i + 1 < len(entries) and entries[i + 1][:1] == '+':
            # Only a deletion may open a pair: two consecutive additions are
            # independent insertions, not a substitution.
            changes.append((entries[i], entries[i + 1], index))
            i += 2  # The addition was consumed as part of this pair
        else:
            if tag in ('-', '+'):
                changes.append((entries[i], None, index))
            i += 1
    return changes

# Gather every change of every row, tagged with the row's index label. Rows
# without any change explode to NaN and are dropped.
change_pairs = original_eval_df.apply(
    lambda record: extract_changes(record['diff'], record.name),
    axis=1,
).explode().dropna()

# One table row per individual change.
change_pairs_df = pd.DataFrame(change_pairs.tolist(), columns=['Deletion', 'Addition', 'Row Index'])

# Group identical (deletion, addition) pairs, tracking how often each pair
# occurs and in which rows. setdefault creates the entry on first sight.
change_pair_counts = {}
for deletion, addition, row_index in change_pairs_df.itertuples(index=False, name=None):
    stats = change_pair_counts.setdefault((deletion, addition), {'Count': 0, 'Row Indices': []})
    stats['Count'] += 1
    stats['Row Indices'].append(row_index)

# Flatten the grouping into a table with one row per distinct pair.
paired_records = [
    (deletion, addition, stats['Count'], stats['Row Indices'])
    for (deletion, addition), stats in change_pair_counts.items()
]
paired_changes_df = pd.DataFrame.from_records(paired_records, columns=['Deletion', 'Addition', 'Count', 'Row Indices'])

# Display the paired changes DataFrame
paired_changes_df


In [None]:
# Write the paired changes table to paired_changes_df.csv in the current
# working directory; index=False leaves the DataFrame index out of the file.
paired_changes_df.to_csv("paired_changes_df.csv", index=False)

In [None]:
# Keep only genuine substitutions: a deleted token (tag '-') paired with an
# added token (tag '+'). The tag is the first character of a diff line, so it
# is checked with startswith; a substring test would also accept an added token
# that merely contains a hyphen (e.g. '+ foo-bar') in the Deletion column.
# na=False makes missing values fail the test, so no separate notnull filter
# is needed.
is_deletion = paired_changes_df['Deletion'].str.startswith('-', na=False)
is_addition = paired_changes_df['Addition'].str.startswith('+', na=False)

# Build the result in one step with a fresh 0..n-1 index instead of mutating
# the filtered frame in place.
filtered_paired_changes_df = paired_changes_df[is_deletion & is_addition].reset_index(drop=True)

# Display the filtered paired changes DataFrame
# NOTE(review): ascending=True lists the least frequent pairs first -- confirm
# that this, and not the most frequent pairs, is what should be inspected.
filtered_paired_changes_df.sort_values(by="Count", ascending=True).head(50)

In [None]:
# Write the filtered substitutions table to paired_changes_df.csv.
# NOTE(review): this is the same file name an earlier cell uses for the
# unfiltered paired_changes_df, so that earlier export is overwritten --
# confirm this is intended.
filtered_paired_changes_df.to_csv("paired_changes_df.csv", index=False)

In [None]:
# Display the filtered substitutions table as the cell output.
filtered_paired_changes_df

In [None]:
def prepend_single_quote(text):
    """Return ``text`` with a leading single quote if it is a string.

    Args:
        text: Any value.

    Returns:
        ``"'" + text`` when ``text`` is a ``str``; otherwise ``text`` unchanged.
    """
    return f"'{text}" if isinstance(text, str) else text

# Prefix every string cell with a single quote; non-string cells are left
# untouched by prepend_single_quote.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; both call the function once per cell.
# NOTE: the result is stored back under the same name, so re-running this cell
# prepends another quote to every string.
filtered_paired_changes_df = filtered_paired_changes_df.apply(
    lambda column: column.map(prepend_single_quote)
)

# Write the modified DataFrame to a CSV file
filtered_paired_changes_df.to_csv('output.csv', index=False)
