In [2]:
from datasets import load_from_disk
import pandas as pd
import numpy as np

In [3]:
original_eval_df =  pd.read_csv("./input_files/hackernews_original_eval_df.csv")
generated_eval_df =  pd.read_csv("./input_files/hackernews_generated_eval_df.csv")
generated_eval_df.shape

(400, 40)

In [4]:
generated_eval_df = generated_eval_df[generated_eval_df.generated_text != "nan"]
generated_eval_df.shape

(400, 40)

In [5]:
original_eval_df['q_val'].mean(), generated_eval_df['q_val'].mean()

(-23.482405801005143, -21.103635362349305)

In [6]:
s1 = set(original_eval_df['prompt'].values)
s2 = set(generated_eval_df['prompt'].values)

s1 == s2

True

In [7]:
def tokenize_and_truncate(text, length=256):
    try:
        tokens = text.split()
        return tokens[:length]
    except:
        return [''] * length
        # print(text)
    

original_eval_df['tokens'] = original_eval_df['text'].apply(tokenize_and_truncate)
generated_eval_df['tokens'] = generated_eval_df['generated_text'].apply(tokenize_and_truncate)


In [8]:
import difflib

def compare_texts(tokens1, tokens2):
    d = difflib.Differ()
    diff = d.compare(tokens1, tokens2)
    return list(diff)

original_eval_df['diff'] = original_eval_df.apply(lambda row: compare_texts(row['tokens'], generated_eval_df.loc[row.name, 'tokens']), axis=1)


In [9]:
def extract_changes(diff):
    changes = []
    i = 0
    while i < len(diff):
        if diff[i][0] in ('-', '+'):
            if i + 1 < len(diff) and diff[i + 1][0] == '+':
                change_pair = (diff[i], diff[i + 1])
                i += 2  # Skip the next item as it's part of the current change pair
            else:
                change_pair = (diff[i], None)
                i += 1  # Only increment by 1 as there's no matching '+' change
            changes.append(change_pair)
        else:
            i += 1
    return changes

# Now re-run the line that was causing the error
change_pairs = original_eval_df['diff'].apply(extract_changes).explode().dropna()
change_counts = change_pairs.value_counts()

# Convert to DataFrame
change_counts_df = change_counts.reset_index()
change_counts_df.columns = ['Change Pair', 'Count']


In [10]:
# change_counts_df.to_csv('changes.csv', index=False)

In [11]:
change_counts_df

Unnamed: 0,Change Pair,Count
0,"(- and, None)",440
1,"(- <a, None)",382
2,"(- to, None)",322
3,"(+ the, + original)",320
4,"(+ description,, + the)",291
...,...,...
8457,"(- speak, None)",1
8458,"(- requirement, None)",1
8459,"(- questions., None)",1
8460,"(- (me), None)",1


In [12]:
def extract_changes(diff, index):
    changes = []
    i = 0
    while i < len(diff):
        if diff[i][0] in ('-', '+'):
            if i + 1 < len(diff) and diff[i + 1][0] == '+':
                change_pair = (diff[i], diff[i + 1], index)
                i += 2  # Skip the next item as it's part of the current change pair
            else:
                change_pair = (diff[i], None, index)
                i += 1  # Only increment by 1 as there's no matching '+' change
            changes.append(change_pair)
        else:
            i += 1
    return changes

# Now re-run the line that was causing the error
change_pairs = original_eval_df.apply(lambda row: extract_changes(row['diff'], row.name), axis=1).explode().dropna()

# Convert to DataFrame
change_pairs_df = pd.DataFrame(change_pairs.tolist(), columns=['Deletion', 'Addition', 'Row Index'])

# Initialize an empty dictionary to hold the counts and row indices
change_pair_counts = {}

# Iterate through the rows of change_pairs_df
for _, row in change_pairs_df.iterrows():
    # Create a key for the change pair
    change_pair_key = (row['Deletion'], row['Addition'])
    
    # If this change pair has been seen before, increment the count and append the row index
    if change_pair_key in change_pair_counts:
        change_pair_counts[change_pair_key]['Count'] += 1
        change_pair_counts[change_pair_key]['Row Indices'].append(row['Row Index'])
    # If this change pair has not been seen before, initialize the count and row index list
    else:
        change_pair_counts[change_pair_key] = {'Count': 1, 'Row Indices': [row['Row Index']]}

# Convert the dictionary to a DataFrame
paired_changes_df = pd.DataFrame.from_records([(key[0], key[1], value['Count'], value['Row Indices']) for key, value in change_pair_counts.items()], columns=['Deletion', 'Addition', 'Count', 'Row Indices'])

# Display the paired changes DataFrame
paired_changes_df


Unnamed: 0,Deletion,Addition,Count,Row Indices
0,- MORE,,2,"[0, 6]"
1,- INFO,+ More,2,"[0, 6]"
2,+ info,,2,"[0, 6]"
3,+ http://jobs.REDACTEDlabs.com,,1,[0]
4,- <a,,382,"[0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 12, 14, 1..."
...,...,...,...,...
8457,- accommodations,+ arrangements,1,[399]
8458,- reflected,+ written,1,[399]
8459,- eclectic,+ elegant,1,[399]
8460,- formatting,,1,[399]


In [None]:
# paired_changes_df.to_csv("paired_changes_df.csv", index=False)

In [13]:
filtered_paired_changes_df = paired_changes_df[paired_changes_df['Deletion'].notnull() & paired_changes_df['Addition'].notnull()]
filtered_paired_changes_df = filtered_paired_changes_df[
    filtered_paired_changes_df.Deletion.str.contains("-", na=False, regex=False) & 
    filtered_paired_changes_df.Addition.str.contains("+", na=False, regex=False)
]

# Reset the index of the resulting DataFrame
filtered_paired_changes_df.reset_index(drop=True, inplace=True)

# Display the filtered paired changes DataFrame
filtered_paired_changes_df.sort_values(by="Count", ascending=True).head(50)

Unnamed: 0,Deletion,Addition,Count,Row Indices
350,- nurturing,+ rewarding,1,[197]
442,- >,+ >*,1,[245]
443,+ -,+ Competitive,1,[245]
445,"- commanding,","+ management,",1,[249]
447,- orchestration,+ orchestrification,1,[251]
448,- SI,+ UI,1,[251]
449,"- versa,",+ then,1,[251]
450,- traits,+ ones,1,[251]
441,- inferior,+ superior,1,[244]
454,+ -,"+ Edinburgh,",1,[253]


In [None]:
# filtered_paired_changes_df.to_csv("paired_changes_df.csv", index=False)

In [14]:
filtered_paired_changes_df

Unnamed: 0,Deletion,Addition,Count,Row Indices
0,- INFO,+ More,2,"[0, 6]"
1,- questions,+ of,1,[1]
2,- >,+ >',14,"[1, 23, 32, 79, 124, 133, 151, 189, 204, 216, ..."
3,- AppHero,+ App,1,[2]
4,- >http://apphero.com/careers</a>,+ based,1,[2]
...,...,...,...,...
697,- utopia.,+ ture.,1,[398]
698,- REDACTED_EMAIL,+ E,1,[398]
699,- accommodations,+ arrangements,1,[399]
700,- reflected,+ written,1,[399]


In [16]:
# Function to prepend a single quote to each text field to avoid silly behavior of Excel (#NAME? error)
def prepend_single_quote(text):
    if isinstance(text, str):
        return "'" + text
    return text

# Apply the function to each text field in the DataFrame
filtered_paired_changes_df = filtered_paired_changes_df.applymap(prepend_single_quote)

# Write the modified DataFrame to a CSV file
# 
filtered_paired_changes_df.to_csv('most_frequent_changes.csv', index=False)
