In [None]:
import os
import json
import pandas as pd
import numpy as np
from scipy.stats import pearsonr


In [None]:
def load_ling_csvs(root_path):
    """Collect the linguistic-feature CSVs found anywhere under ``root_path``.

    A file stem must have the form ``<split>_<mark>``. A ``mark`` starting with
    ``original`` or ``rephrased`` decides which frame the file is added to, and
    every loaded frame gets a ``split_name`` column holding ``<split>``.
    CSV files whose stem does not fit this pattern are reported and skipped.

    Returns:
        ``(original_df, rephrased_df)``; each one is an empty DataFrame when
        no matching file was found.
    """
    frames_by_mark = {"original": [], "rephrased": []}

    for root, _dirs, files in os.walk(root_path):
        for file in files:
            if not file.endswith(".csv"):
                continue

            file_name = file.split(".")[0]
            parts = file_name.split("_")

            mark_key = None
            if len(parts) == 2:
                for candidate in frames_by_mark:
                    if parts[1].startswith(candidate):
                        mark_key = candidate
                        break

            if mark_key is None:
                print(f"Error: file name: {file_name} is not correct!")
                continue

            # os.path.join inserts the path separator that `root + file` omits.
            df = pd.read_csv(os.path.join(root, file))
            df["split_name"] = parts[0]
            print(f"Read {file_name} with shape {df.shape}")
            frames_by_mark[mark_key].append(df)

    # Concatenate once per group instead of re-copying the growing frame
    # for every file; pd.concat rejects an empty list, hence the fallback.
    originals = frames_by_mark["original"]
    rephrased = frames_by_mark["rephrased"]
    return (
        pd.concat(originals) if originals else pd.DataFrame(),
        pd.concat(rephrased) if rephrased else pd.DataFrame(),
    )


# Read every linguistic-feature CSV below root_path.
root_path = ""
ling_origr_df, ling_reph_df = load_ling_csvs(root_path)

In [None]:
# Keep only the rows whose re_idx is 0
# (presumably the first rephrasing of each problem - confirm).
ling_reph_df = ling_reph_df[ling_reph_df["re_idx"] == 0]

# Feature columns are all columns of the original frame except the ones
# listed here. A list in DataFrame column order is used rather than a set so
# that the order is identical on every kernel run (set order of strings
# changes with hash randomisation).
non_feature_cols = {"p_idx", "question_NL", "question_example", "difficulty", "split_name"}
feat_cols = [col for col in ling_origr_df.columns if col not in non_feature_cols]

# Pair each original row with its rephrased row on (p_idx, split_name);
# columns present in both frames get the _original / _rephrased suffix.
ling_merged_df = pd.merge(
    ling_origr_df,
    ling_reph_df,
    on=["p_idx", "split_name"],
    suffixes=("_original", "_rephrased"),
)

# Category values used to build the one-hot column names.
instruction_cols = ["long", "short", "formal", "fluent", "technical", "logical"]
role_cols = ["student", "programmer", "competitor"]
scenario_cols = ["clearer", "improve", "specify"]

In [None]:
def load_code_csvs(root_path):
    """Collect the code-metric CSVs found anywhere under ``root_path``.

    A file stem must have four ``_``-separated parts, ``<split>_<mark>_<x>_<y>``;
    the last two parts are ignored. A ``mark`` starting with ``original`` or
    ``rephrased`` decides which frame the file is added to, and every loaded
    frame gets a ``split_name`` column holding ``<split>``. CSV files whose
    stem does not fit this pattern are reported and skipped.

    Returns:
        ``(original_df, rephrased_df)``; each one is an empty DataFrame when
        no matching file was found.
    """
    frames_by_mark = {"original": [], "rephrased": []}

    for root, _dirs, files in os.walk(root_path):
        for file in files:
            if not file.endswith(".csv"):
                continue

            file_name = file.split(".")[0]
            parts = file_name.split("_")

            mark_key = None
            if len(parts) == 4:
                for candidate in frames_by_mark:
                    if parts[1].startswith(candidate):
                        mark_key = candidate
                        break

            if mark_key is None:
                print(f"Error: file name: {file_name} is not correct!")
                continue

            df = pd.read_csv(os.path.join(root, file))
            df["split_name"] = parts[0]
            print(f"Read {file_name} with shape {df.shape}")
            frames_by_mark[mark_key].append(df)

    # Concatenate once per group instead of re-copying the growing frame
    # for every file; pd.concat rejects an empty list, hence the fallback.
    originals = frames_by_mark["original"]
    rephrased = frames_by_mark["rephrased"]
    return (
        pd.concat(originals) if originals else pd.DataFrame(),
        pd.concat(rephrased) if rephrased else pd.DataFrame(),
    )


model_name = "neo"
# os.path.join() needs at least one argument; calling it with none raises
# TypeError. TODO(review): set code_root_dir to the directory holding the
# code-metric CSVs. Joining it with model_name is an assumption (model_name
# is not used anywhere else in this cell) - confirm the real layout.
code_root_dir = ""
root_path = os.path.join(code_root_dir, model_name)
code_origr_df, code_reph_df = load_code_csvs(root_path)

In [None]:
# Code-metric columns whose original/rephrased values are compared later.
cm_cols = [
    "semgrep",
    "black",
    "syntaxError_rate",
    "sta_codeBleu",
    "sta_Bleu",
    "sim_codeBleu",
    "sim_Bleu",
    "pass_rate",
    "error_rate",
    "timeout_rate",
]

# Remove these columns from the original-side frame before merging, so that
# any same-named columns on the rephrased side stay unsuffixed.
# errors="ignore" keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
code_origr_df = code_origr_df.drop(
    columns=["Instruction", "Role", "Scenario", "Base"], errors="ignore"
)

# Pair original and rephrased rows on (p_idx, split_name), then discard every
# row that has a missing value in any column.
code_merged_df = pd.merge(
    code_origr_df,
    code_reph_df,
    on=["p_idx", "split_name"],
    suffixes=("_original", "_rephrased"),
).dropna()

In [None]:
# Inner-join the linguistic and code-metric tables on the shared key columns,
# then discard rows that contain any missing value.
fin_join_keys = ["p_idx", "Instruction", "Role", "Scenario", "split_name"]
fin_df = ling_merged_df.merge(code_merged_df, on=fin_join_keys, how="inner").dropna()

In [None]:
# Rows whose Instruction, Role and Scenario are all the literal string "None".
all_none_mask = fin_df[["Instruction", "Role", "Scenario"]].eq("None").all(axis=1)
test_df = fin_df.loc[all_none_mask]


In [None]:
# Bare expression: renders test_df as this cell's output.
test_df

In [None]:
# Rephrased-minus-original difference for every linguistic feature and every
# code metric, plus the original code-metric values. Each frame is built in a
# single constructor call; inserting columns one at a time into an empty frame
# fragments it and is slow when there are many features.
ling_dif = pd.DataFrame(
    {fc: fin_df[fc + "_rephrased"] - fin_df[fc + "_original"] for fc in feat_cols}
)
cm_dif = pd.DataFrame(
    {cm: fin_df[cm + "_rephrased"] - fin_df[cm + "_original"] for cm in cm_cols}
)
cm_orig = pd.DataFrame({cm + "_orig": fin_df[cm + "_original"] for cm in cm_cols})

# One-hot encode the three categorical columns and drop the "None" level of
# each. The guard makes the cell re-runnable: after the first execution the
# source columns no longer exist in fin_df and get_dummies would raise.
onehot_source_cols = ["Instruction", "Role", "Scenario"]
if all(col in fin_df.columns for col in onehot_source_cols):
    fin_df = pd.get_dummies(fin_df, columns=onehot_source_cols, prefix=["Inst", "Role", "Scen"])
    fin_df = fin_df.drop(columns=["Inst_None", "Role_None", "Scen_None"])

# Names of the one-hot columns, in instruction / role / scenario order.
meta_cols = (
    ["Inst_" + c for c in instruction_cols]
    + ["Role_" + c for c in role_cols]
    + ["Scen_" + c for c in scenario_cols]
)

# One-hot indicators first, then feature differences, then metric differences.
causal_df = pd.concat([fin_df[meta_cols], ling_dif, cm_dif], axis=1)
# NOTE(review): to_csv() without a path writes no file; it returns the CSV
# text, which becomes this cell's output. Pass a path if a file is intended.
causal_df.to_csv()

In [None]:
# Place the feature differences, the metric differences and the original
# metric values side by side in one frame.
all_dif_df = pd.concat((ling_dif, cm_dif, cm_orig), axis="columns")
