In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.metrics import recall_score

# Root folder that holds one sub-directory per Sentiment experiment.
all_experiments_dir = "/home/amirf/GoogleDrive/AmirNadav/CausaLM/Experiments/Sentiment/"
# Keep only the experiment folders whose name contains "adj".
experiments = list(filter(lambda name: "adj" in name, os.listdir(all_experiments_dir)))
print(experiments)

['adj_bias_aggressive_ratio_adj_1', 'adj_bias_gentle_ratio_adj_1', 'adj']


In [2]:
def get_cace_results(all_experiments_dir, cur_experiment,
                     test_data_dir="/home/amirf/GoogleDrive/AmirNadav/CausaLM/Data/Sentiment/Raw/unified/"):
    """Compute ATE, TreATE, accuracies and the absolute recall gap for one experiment.

    Prediction CSVs are read from the two highest-numbered ``version_*`` runs under
    ``<all_experiments_dir>/<cur_experiment>/unified/COMPARE/lightning_logs/``:
    the second-highest run supplies the ``ima_control_treated`` files (``df_f``, ``df_cf``)
    and the highest run supplies the ``ima_treated`` files (``df_a_f``, ``df_a_cf``).

    Args:
        all_experiments_dir: Folder containing one sub-directory per experiment.
        cur_experiment: Name of the experiment sub-directory; also the prefix of the
            ``<cur_experiment>_test.csv`` file read from ``test_data_dir``.
        test_data_dir: Folder holding the raw test CSVs. Defaults to the location
            that was previously hard-coded in the function body.

    Returns:
        A tuple ``(ATE, TreATE, accs, tpr_gap)`` where
        * ``ATE`` is the mean over rows of the summed absolute difference between the
          "class" columns of ``df_f`` and ``df_cf``;
        * ``TreATE`` is whichever of the two analogous quantities (``df_a_f`` vs ``df_f``,
          ``df_a_cf`` vs ``df_f``) lies closer to ``ATE``;
        * ``accs`` lists the mean of the ``correct`` column for ``df_f``, ``df_cf``,
          ``df_a_f`` and ``df_a_cf`` in that order;
        * ``tpr_gap`` is the absolute difference, computed on ``df_cf``, between the
          weighted recall on samples whose ``ratio_adj`` is below the test-set mean and
          the weighted recall on the remaining samples.

    Raises:
        FileNotFoundError: If a predictions or test-data CSV is missing (previously a
            missing predictions file surfaced as an ``IndexError`` from ``glob(...)[0]``).
        IndexError: If fewer than two ``version_*`` runs exist.
    """
    task = "ima"
    logs_root = os.path.join(all_experiments_dir, cur_experiment) + "/unified/COMPARE/lightning_logs/"

    # Only consider "version_<number>" entries so that stray files in the logs folder
    # (hidden files, notes, ...) no longer break the integer parsing.
    version_numbers = sorted(
        int(entry.split('_')[1])
        for entry in os.listdir(logs_root)
        if entry.startswith("version_") and entry.split('_')[1].isdigit()
    )
    cur_experiment_dir = logs_root + "version_" + str(version_numbers[-2])
    cur_treated_experiment_dir = logs_root + "version_" + str(version_numbers[-1])

    # The paths contain no wildcards, so read them directly; a missing file then raises
    # FileNotFoundError naming the path.
    df_f = pd.read_csv(cur_experiment_dir + "/Sentiment_F_trained_F_ima_control_treated-test-predictions.csv")
    df_cf = pd.read_csv(cur_experiment_dir + "/Sentiment_CF_trained_F_ima_control_treated-test-predictions.csv")
    df_a_f = pd.read_csv(cur_treated_experiment_dir + "/Sentiment_F_trained_F_" + task + "_treated-test-predictions.csv")
    df_a_cf = pd.read_csv(cur_treated_experiment_dir + "/Sentiment_CF_trained_F_" + task + "_treated-test-predictions.csv")

    accs = [frame.correct.sum() / len(frame) for frame in (df_f, df_cf, df_a_f, df_a_cf)]

    class_cols = [col for col in df_f.columns if "class" in col]

    # NOTE(review): subtract() aligns on the index, which at this point is the default
    # positional index from read_csv -- this assumes every predictions file lists the
    # samples in the same row order; confirm, or align on "sample_index" instead.
    TreATE_f = df_a_f[class_cols].subtract(df_f[class_cols]).abs().sum(axis=1).sum() / len(df_a_f)
    # Normalise by the frame actually being compared (was len(df_a_f), a copy-paste slip
    # that only matters when the two treated files differ in length).
    TreATE_cf = df_a_cf[class_cols].subtract(df_f[class_cols]).abs().sum(axis=1).sum() / len(df_a_cf)
    ATE = df_f[class_cols].subtract(df_cf[class_cols]).abs().sum(axis=1).sum() / len(df_f)
    # Report the treated estimate that is closest to ATE.
    TreATE = TreATE_f if abs(TreATE_f - ATE) < abs(TreATE_cf - ATE) else TreATE_cf

    # Split the test samples by whether their "ratio_adj" is below the test-set mean.
    df_test_data = pd.read_csv(os.path.join(test_data_dir, cur_experiment + "_test.csv")).set_index(["id"])
    mean_ratio_adj = df_test_data["ratio_adj"].mean()
    neg_concept_indices = df_test_data[df_test_data["ratio_adj"] < mean_ratio_adj].index.tolist()
    pos_concept_indices = df_test_data[df_test_data["ratio_adj"] >= mean_ratio_adj].index.tolist()

    # Only the gap on df_cf is returned, so the never-used gap on df_f is no longer computed.
    df_cf = df_cf.set_index(["sample_index"])
    neg_rows = df_cf.loc[neg_concept_indices]
    pos_rows = df_cf.loc[pos_concept_indices]
    tpr_gap_cf = (
        recall_score(neg_rows['true'], neg_rows['prediction'], average='weighted')
        - recall_score(pos_rows['true'], pos_rows['prediction'], average='weighted')
    )

    return ATE, TreATE, accs, np.absolute(tpr_gap_cf)


In [3]:
def _parse_correct_flags(raw):
    """Turn a stringified list such as "[1, 0, 1]" into a list of ints."""
    stripped = raw.replace(' ', '').replace('[', '').replace(']', '')
    # "[]" strips down to "", and "".split(',') yields [''], which int() rejects;
    # skipping empty tokens makes an empty list parse to [] instead of crashing.
    return [int(token) for token in stripped.split(',') if token]


def get_sequence_accs(cur_df):
    """Return the pooled accuracy over all per-element flags in ``cur_df['correct']``.

    Each entry of the ``correct`` column is a string holding a bracketed,
    comma-separated list of integers (e.g. ``"[1, 0, 1]"``). The result is the sum of
    all flags divided by the total number of flags across every row.

    Side effect (kept for backward compatibility): two columns are written to
    ``cur_df`` -- ``sum_correct`` (sum of the flags in the row) and ``example_len``
    (number of flags in the row).

    Args:
        cur_df: DataFrame with a string column named ``correct``.

    Returns:
        The pooled accuracy, or ``nan`` when there are no flags at all.
    """
    # Parse every row once and reuse the result for both derived columns.
    parsed = [_parse_correct_flags(raw) for raw in cur_df['correct']]
    cur_df['sum_correct'] = [sum(flags) for flags in parsed]
    cur_df['example_len'] = [len(flags) for flags in parsed]

    total_len = cur_df['example_len'].sum()
    if total_len == 0:
        # Nothing to score: report "undefined" rather than dividing by zero.
        return float('nan')
    return cur_df['sum_correct'].sum() / total_len

In [4]:
def get_treated_results(all_experiments_dir, cur_experiment):
    """Score the four ``CONTROL_IMA_*`` prediction files of one experiment.

    The files are read from the ``version_*`` run that has the fourth-highest version
    number under ``<all_experiments_dir>/<cur_experiment>/unified/COMPARE/lightning_logs/``.

    Returns:
        A list with the ``get_sequence_accs`` value of, in order, the ``MLM_F``,
        ``MLM_CF``, ``ima_control_treated_F`` and ``ima_control_treated_CF`` files.
    """
    task = "ima"
    Task = task.upper()

    logs_root = os.path.join(all_experiments_dir, cur_experiment) + "/unified/COMPARE/lightning_logs/"
    version_numbers = sorted(int(entry.split('_')[1]) for entry in os.listdir(logs_root))
    # Fourth run counting down from the highest version number.
    run_dir = logs_root + "version_" + str(version_numbers[-4])
    # Listing is kept from the original code: it raises if run_dir cannot be listed.
    os.listdir(run_dir)

    prediction_paths = [
        run_dir + "/CONTROL_" + Task + "_MLM_F_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + Task + "_MLM_CF_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + Task + '_' + task + "_control_treated_F_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + Task + '_' + task + "_control_treated_CF_trained_F-test-predictions.csv",
    ]
    # Load every file first, then score them, so failures surface in the same order as before.
    frames = [pd.read_csv(glob.glob(path)[0]) for path in prediction_paths]
    treated_accs = [get_sequence_accs(frame) for frame in frames]

    return treated_accs

In [5]:
def get_control_results(all_experiments_dir, cur_experiment):
    """Score the four ``CONTROL_POS_Tagging_*`` prediction files of one experiment.

    The files are read from the ``version_*`` run that has the fourth-highest version
    number under ``<all_experiments_dir>/<cur_experiment>/unified/COMPARE/lightning_logs/``.

    Returns:
        A list with the ``get_sequence_accs`` value of, in order, the ``F``, ``CF``,
        ``ima_control_treated_F`` and ``ima_control_treated_CF`` files.
    """
    task = "ima"
    control = "POS_Tagging"

    logs_root = os.path.join(all_experiments_dir, cur_experiment) + "/unified/COMPARE/lightning_logs/"
    version_numbers = sorted(int(entry.split('_')[1]) for entry in os.listdir(logs_root))
    # Fourth run counting down from the highest version number.
    run_dir = logs_root + "version_" + str(version_numbers[-4])
    # Listing is kept from the original code: it raises if run_dir cannot be listed.
    os.listdir(run_dir)

    prediction_paths = [
        run_dir + "/CONTROL_" + control + "_F_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + control + "_CF_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + control + '_' + task + "_control_treated_F_trained_F-test-predictions.csv",
        run_dir + "/CONTROL_" + control + '_' + task + "_control_treated_CF_trained_F-test-predictions.csv",
    ]
    # Load every file first, then score them, so failures surface in the same order as before.
    frames = [pd.read_csv(glob.glob(path)[0]) for path in prediction_paths]
    control_accs = [get_sequence_accs(frame) for frame in frames]

    return control_accs

In [6]:
# Print the ATE and TreATE values returned by get_cace_results for every selected experiment.
for experiment in experiments:
    ATE, TreATE, accs, tpr_gap = get_cace_results(all_experiments_dir, experiment)
#     print(experiment)
    # NOTE(review): "%3f" sets a minimum field width of 3, not a precision, so six
    # decimals are printed; "%.3f" may have been intended -- confirm before changing.
    print("ATE: %3f, TreATE: %3f " %(ATE, TreATE))
#     print("TPR-GAP: " + str(tpr_gap))
#     print("Accuracies: " + str(accs))

# Disabled: accuracies from get_treated_results / get_control_results.
#     treated_accs = get_treated_results(all_experiments_dir, experiment)
#     control_accs = get_control_results(all_experiments_dir, experiment)

#     print("Accuracies on Treated Concepts: " + str(treated_accs))
#     print("Accuracies on Control Concepts: " + str(control_accs))

  _warn_prf(average, modifier, msg_start, len(result))


ATE: 0.728890, TreATE: 0.637894 
ATE: 0.381265, TreATE: 0.331363 
ATE: 0.411683, TreATE: 0.391688 


# Sanity Checks

In [17]:
# Choose exactly one experiment to inspect (uncomment the one you want).
experiment_name = "adj_bias_aggressive_ratio_adj_1"
# experiment_name = "adj_bias_gentle_ratio_adj_1"
# experiment_name = "adj"

all_experiments_dir = "/home/amirf/GoogleDrive/AmirNadav/CausaLM/Experiments/Sentiment/"
experiment_dirs = os.listdir(os.path.join(all_experiments_dir, experiment_name) + "/unified/COMPARE/lightning_logs/")
sorted_versions = sorted(int(entry.split('_')[1]) for entry in experiment_dirs)
# Second-highest version -> control-treated run, highest version -> treated run.
experiment_dir, treated_experiment_dir = (
    os.path.join(all_experiments_dir, experiment_name) + "/unified/COMPARE/lightning_logs/version_" + str(sorted_versions[position])
    for position in (-2, -1)
)
data_dir = "/home/amirf/GoogleDrive/AmirNadav/CausaLM/Data/Sentiment/Raw/unified/"

# Prediction files of the control-treated run.
df_f, df_cf = (
    pd.read_csv(experiment_dir + file_name)
    for file_name in (
        "/Sentiment_F_trained_F_ima_control_treated-test-predictions.csv",
        "/Sentiment_CF_trained_F_ima_control_treated-test-predictions.csv",
    )
)
# Prediction files of the treated run.
df_a_f, df_a_cf = (
    pd.read_csv(treated_experiment_dir + file_name)
    for file_name in (
        "/Sentiment_F_trained_F_ima_treated-test-predictions.csv",
        "/Sentiment_CF_trained_F_ima_treated-test-predictions.csv",
    )
)

df_test_data = pd.read_csv(data_dir + experiment_name + "_test.csv").set_index(["id"])

# Keep the test samples whose ratio_adj is below 1.9 times the median.
ratio_adj_med = df_test_data['ratio_adj'].median()
print(ratio_adj_med)
is_small_ratio = df_test_data["ratio_adj"] < (ratio_adj_med*1.9)
small_ratio_adj_ids = df_test_data.loc[is_small_ratio].index.tolist()
print(len(small_ratio_adj_ids))

# Index every predictions frame by sample_index and restrict it to the selected ids.
df_f, df_cf, df_a_f, df_a_cf = (
    predictions.set_index(["sample_index"]).loc[small_ratio_adj_ids]
    for predictions in (df_f, df_cf, df_a_f, df_a_cf)
)

0.1379492600422833
983


In [18]:
# Columns holding the per-class outputs (any column whose name contains "class").
class_cols = [col for col in df_f.columns if "class" in col]

# Mean over rows of the summed absolute difference between class columns.
TreATE_f = (df_a_f[class_cols] - df_f[class_cols]).abs().sum(axis=1).sum() / len(df_a_f)
TreATE_cf = (df_a_cf[class_cols] - df_f[class_cols]).abs().sum(axis=1).sum() / len(df_a_f)
ATE = (df_f[class_cols] - df_cf[class_cols]).abs().sum(axis=1).sum() / len(df_f)

# Keep whichever treated estimate lies closer to ATE.
if abs(TreATE_f - ATE) < abs(TreATE_cf - ATE):
    TreATE = TreATE_f
else:
    TreATE = TreATE_cf

ATE, TreATE

(0.7181130472703026, 0.6370708554357875)

In [103]:
# Pooled accuracy of the CONTROL_IMA predictions stored in version_4 of the
# adj_bias_aggressive_ratio_adj_1 experiment.
df_f_control = pd.read_csv("/home/amirf/GoogleDrive/AmirNadav/CausaLM/Experiments/Sentiment/adj_bias_aggressive_ratio_adj_1/unified/COMPARE/lightning_logs/version_4/CONTROL_IMA_F_trained_F-test-predictions.csv")
# print(df_f_control.head())

# Reuse the notebook's get_sequence_accs helper instead of repeating its parsing inline:
# it writes the same 'sum_correct' / 'example_len' columns to df_f_control and returns
# the same sum_correct / example_len ratio that was previously computed here by hand.
print(get_sequence_accs(df_f_control))

0.938328499172999
