In [22]:
import pandas as pd
from collections import defaultdict
import numpy as np

def breakup_attacks(df):
    """Split a concatenated attack log into one DataFrame per attack sequence.

    A row whose ``mutation_num`` equals -1 marks the start of a new sequence.
    Rows that appear before the first such marker (if any) are returned as
    their own leading sequence.

    Args:
        df: DataFrame containing a ``mutation_num`` column.

    Returns:
        A list of DataFrames in original row order, each with a fresh
        0..n-1 index. Column dtypes of ``df`` are preserved. Returns an
        empty list when ``df`` has no rows.

    Raises:
        KeyError: if ``df`` has rows but no ``mutation_num`` column.
    """
    if len(df) == 0:
        return []

    # True on every row that opens a new sequence; missing values never do.
    is_reset = (df['mutation_num'] == -1).to_numpy(dtype=bool, na_value=False)

    # The running count of reset markers labels each row with its sequence,
    # so the frame is split in one pass instead of being rebuilt row by row.
    sequence_ids = is_reset.cumsum()

    return [
        chunk.reset_index(drop=True)
        for _, chunk in df.groupby(sequence_ids, sort=False)
    ]


def get_column_value_by_prompt(df: pd.DataFrame, prompt_value: str, target_column: str):
    """Look up ``target_column`` for the first row whose ``prompt`` matches.

    Args:
        df: DataFrame with a ``prompt`` column.
        prompt_value: Value compared for equality against the ``prompt`` column.
        target_column: Name of the column to read from the matching row.

    Returns:
        The ``target_column`` value of the first matching row, or ``None``
        when no row matches.
    """
    is_match = df['prompt'] == prompt_value
    hits = df[is_match]

    # Nothing matched: report that with None rather than raising.
    if hits.empty:
        return None

    return hits[target_column].values[0]

In [6]:
# Input locations: the WQE dev set and the two annotated attack logs.
MAIN_CSV = '/data2/borito1907/impossibility-watermark/data/WQE/dev.csv'
ADAPTIVE_CSV = '/data2/borito1907/DiffOracle_adaptive_WordMutator_n-steps=1000_attack_results_newest_annotatedfinal.csv'
SEMSTAMP_CSV = '/data2/borito1907/DiffOracle_semstamp_WordMutator_n-steps=1000_attack_results_newest_annotatedfinal.csv'

# Read each CSV into its own DataFrame.
main_df = pd.read_csv(MAIN_CSV)
adaptive_df = pd.read_csv(ADAPTIVE_CSV)
semstamp_df = pd.read_csv(SEMSTAMP_CSV)

In [7]:
# Split each concatenated attack log into a list of per-sequence DataFrames
# (breakup_attacks starts a new DataFrame at every row with mutation_num == -1).
adaptive_dfs = breakup_attacks(adaptive_df)
semstamp_dfs = breakup_attacks(semstamp_df)

In [30]:
# Split the adaptive attack sequences into 10 roughly equal groups and write
# each group to its own numbered CSV.
n_splits = 10
split_size, n_larger = divmod(len(adaptive_dfs), n_splits)

# Same sizing rule as np.array_split: the first `n_larger` groups hold one
# extra sequence. Slicing the list directly avoids coercing a list of
# DataFrames into a NumPy array.
group_sizes = [split_size + 1] * n_larger + [split_size] * (n_splits - n_larger)
df_splits = []
start = 0
for group_size in group_sizes:
    df_splits.append(adaptive_dfs[start:start + group_size])
    start += group_size

concatenated_dfs = [
    pd.concat(split, axis=0, ignore_index=True).drop(columns=['Unnamed: 0'], errors='ignore')
    for split in df_splits
    if len(split) > 0  # pd.concat raises on an empty list (fewer sequences than groups)
]
for df_num, df in enumerate(concatenated_dfs, start=1):
    output_path = f"/data2/borito1907/DiffOracle_adaptive_WordMutator_n-steps=1000_attack_results_newest_annotatedfinal{df_num}.csv"
    df.to_csv(output_path, index=False)

In [31]:
# Split the semstamp attack sequences into 10 roughly equal groups and write
# each group to its own numbered CSV.
n_splits = 10
split_size, n_larger = divmod(len(semstamp_dfs), n_splits)

# Same sizing rule as np.array_split: the first `n_larger` groups hold one
# extra sequence. Slicing the list directly avoids coercing a list of
# DataFrames into a NumPy array.
group_sizes = [split_size + 1] * n_larger + [split_size] * (n_splits - n_larger)
df_splits = []
start = 0
for group_size in group_sizes:
    # Slice semstamp_dfs here: these files are the semstamp export, so they
    # must not be built from the adaptive sequences.
    df_splits.append(semstamp_dfs[start:start + group_size])
    start += group_size

concatenated_dfs = [
    pd.concat(split, axis=0, ignore_index=True).drop(columns=['Unnamed: 0'], errors='ignore')
    for split in df_splits
    if len(split) > 0  # pd.concat raises on an empty list (fewer sequences than groups)
]
for df_num, df in enumerate(concatenated_dfs, start=1):
    output_path = f"/data2/borito1907/DiffOracle_semstamp_WordMutator_n-steps=1000_attack_results_newest_annotatedfinal{df_num}.csv"
    df.to_csv(output_path, index=False)