### CoHARE Dataset Mapping: hate --> im/explicit hate separately 


In [39]:
train_dir='/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_co_hare.json'
val_dir='/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_val.json'
test_dir='/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_test.json'
file_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/'

In [3]:
# CoHARE Json
import pandas as pd 
train_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_co_hare.json', orient='index').reset_index().drop('index', axis=1)
validataion_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_val.json', orient='index').reset_index().drop('index', axis=1)
test_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_test.json', orient='index').reset_index().drop('index', axis=1)

In [None]:
train_df['label'].value_counts()

In [None]:
import json

# Read the existing JSON file
with open(train_dir, 'r') as file:
    data = json.load(file)

# Filter data based on the 'label' key
filtered_data = {key: value for key, value in data.items() if value['label'] in ['implicit_hate', 'not_hate']}
print(len(filtered_data))

# Save the filtered data to a new JSON file
with open(train_dir.replace('IH_exp_co_hare','IH_exp_co_hare-im'), 'w') as file:
    json.dump(filtered_data, file, indent=2)

### Hate Target Mapping

In [None]:
import pandas as pd 
cti_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_cti.json'
cti_df = pd.read_json(cti_dir, orient='index')

regex_pattern = r'\(A\) Hate.\nThe post targets (.*?)[.]'
cti_df['hate_target'] = cti_df['ft_target'].str.extract(r'The post targets ([^,]+)')
cti_df[['ft_target', 'hate_target']]

# Print rows where 'target_mentions' is not NaN
filtered_df = cti_df[cti_df['hate_target'].notna()]
filtered_df = filtered_df[['id','post','ft_query','ft_target', 'hate_target']].reset_index()
del filtered_df['index']
filtered_df

## Save the CSV file
# filtered_df.to_csv('/home/yumin/hare-hate-speech/data/implicit-hate/IH_hate_target.csv', index=False)

### Training Datatsets Prep

In [None]:
"""CoHARE Datasets Prep code: for T5 Training"""
train_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_co_hare.json', orient='index').reset_index().drop('index', axis=1)
validataion_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_val.json', orient='index').reset_index().drop('index', axis=1)
test_df = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_test.json', orient='index').reset_index().drop('index', axis=1)

def map_label_for_classification(label):
    if label == "implicit_hate":
        return 1
    elif label == "explicit_hate":
        return 2
    elif label == "not_hate":
        return 0
    else:
        return None  # or handle the case when the label is different


train_df['label_num'] = train_df['label'].apply(map_label_for_classification)
validataion_df['label_num'] = validataion_df['label'].apply(map_label_for_classification)
test_df['label_num'] = test_df['label'].apply(map_label_for_classification)
train_df = train_df.reset_index()
test_df = test_df.reset_index()
validataion_df = validataion_df.reset_index()


def filter_dataframe_by_label(df):
    filtered_df = df.loc[(df['label_num'] == 0) | (df['label_num'] == 2)]
    filtered_df.reset_index(drop=True, inplace=True)
    return filtered_df

# def filter_dataframe_by_label(df):
#     filtered_df = df.loc[(df['ft_target'] == "(A) Not hate.") | (df['ft_target'] == "(B) Explicit Hate.")]
#     filtered_df.reset_index(drop=True, inplace=True)
#     return filtered_df

train_df = filter_dataframe_by_label(train_df)
validataion_df = filter_dataframe_by_label(validataion_df)
test_df = filter_dataframe_by_label(test_df)

# import pandas as pd

concatenated_train_text = pd.concat([train_df['ft_query'], train_df['ft_target']], axis=1)
concatenated_val_text = pd.concat([validataion_df['ft_query'], validataion_df['ft_target']], axis=1)
concatenated_test_text = pd.concat([test_df['ft_query'], test_df['ft_target']], axis=1)

train_df['concatenated'] = concatenated_train_text.apply(' '.join, axis=1)
validataion_df['concatenated'] = validataion_df['ft_query']
test_df['concatenated'] =  test_df['ft_query']
test_df['concatenated'][0]

full_train_dataset = Dataset.from_pandas(train_df[['concatenated', 'label_num']])
full_validataion_dataset = Dataset.from_pandas(validataion_df[['concatenated','label_num']])
full_test_dataset = Dataset.from_pandas(test_df[['concatenated','label_num']])
# datasets = datasets.DatasetDict({"train":full_train_dataset,
                                # "validation" : full_validataion_dataset,
                                # "test" : full_test_dataset})

datasets.update({"train": full_train_dataset})
datasets.update({"validation": full_validataion_dataset})
datasets.update({"test": full_test_dataset})
datasets


### Detection Results Comparison: Sample Mapping

In [5]:
# CoHARE outputs
cohare_im_dir = '/home/yumin/hare-hate-speech/outputs/CoHARE-im/batch4_grad8/results.json'
cohare_ex_dir = '/home/yumin/hare-hate-speech/outputs/CoHARE-ex/batch4_grad8/results.json'
cohare_base_dir = '/home/yumin/hare-hate-speech/outputs/CoHARE-base/batch4_grad8/results.json' 

# CTI outputs
cti_im_dir = '/home/yumin/hare-hate-speech/outputs/CTI-im/batch4_grad8/results.json'
cti_ex_dir = '/home/yumin/hare-hate-speech/outputs/CTI-ex/batch4_grad8/results.json'
cti_base_dir = '/home/yumin/hare-hate-speech/outputs/CTI-base/batch4_grad8/results.json' 

# C outputs
c_im_dir = '/home/yumin/hare-hate-speech/outputs/C-im/batch4_grad8/results.json'
c_ex_dir = '/home/yumin/hare-hate-speech/outputs/C-ex/batch4_grad8/results.json'
c_base_dir = '/home/yumin/hare-hate-speech/outputs/C-base/batch4_grad8/results.json'

In [6]:
# Test Dataset 
im_test_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_test-im.csv'
ex_test_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_test-ex.csv'
base_test_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_test.csv'
im_cohare_test_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_test-im.json'
ex_cohare_test_dir = '/home/yumin/hare-hate-speech/data/implicit-hate/IH_exp_test-ex.json'

im_test_df = pd.read_csv(im_test_dir)
print(im_test_df['class'].value_counts())
pd.read_csv(base_test_dir)

class
not_hate         2639
implicit_hate    1423
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,post,class
0,13832,yes time for people on the right to strap up ...,explicit_hate
1,9034,steve king : obamas #immigration move would tr...,implicit_hate
2,11811,the economist : an insurgent in the white hous...,not_hate
3,462,i lived there it was fine you are just a scare...,implicit_hate
4,4537,wait . i thought white people invented harvard...,not_hate
...,...,...,...
4291,2833,zogbot-silly roper vs brad griffin alt-right y...,not_hate
4292,4783,. why ? muslims can't be white ? what race is ...,implicit_hate
4293,1853,why then even post something that assumes whit...,not_hate
4294,11966,why can't wallace also support trump ?,not_hate


In [7]:
pd.read_json(im_cohare_test_dir, orient='index')

Unnamed: 0,post,label,ft_query,ft_target,id
1,steve king : obamas #immigration move would tr...,implicit_hate,Determine whether the following post is hate s...,(A) Hate.,1
2,the economist : an insurgent in the white hous...,not_hate,Determine whether the following post is hate s...,(B) Not hate.,2
3,i lived there it was fine you are just a scare...,implicit_hate,Determine whether the following post is hate s...,(A) Hate.,3
4,wait . i thought white people invented harvard...,not_hate,Determine whether the following post is hate s...,(B) Not hate.,4
5,Why do white people have such a pleasant scent...,implicit_hate,Determine whether the following post is hate s...,(A) Hate.,5
...,...,...,...,...,...
4291,zogbot-silly roper vs brad griffin alt-right y...,not_hate,Determine whether the following post is hate s...,(B) Not hate.,4291
4292,. why ? muslims can't be white ? what race is ...,implicit_hate,Determine whether the following post is hate s...,(A) Hate.,4292
4293,why then even post something that assumes whit...,not_hate,Determine whether the following post is hate s...,(B) Not hate.,4293
4294,why can't wallace also support trump ?,not_hate,Determine whether the following post is hate s...,(B) Not hate.,4294


In [10]:
cohare_im_df = pd.read_json(cohare_im_dir, orient='index')
def is_correctly_detected(pred_column, true_column):
    if pred_column.split(' 1')[0] == true_column:
        return 'True'
    else: 
        return 'False'

cohare_im_df['correct'] = cohare_im_df[['pred', 'true']].apply(lambda row: is_correctly_detected(*row), axis=1)
cohare_im_df['true']

0           (A) Hate.
1       (B) Not hate.
2           (A) Hate.
3       (B) Not hate.
4           (A) Hate.
            ...      
4057    (B) Not hate.
4058        (A) Hate.
4059    (B) Not hate.
4060    (B) Not hate.
4061        (A) Hate.
Name: true, Length: 4062, dtype: object

In [11]:
def save_result(dir):
    file_name = '-'.join(dir.split('/')[5:7])

    df = pd.read_json(dir, orient='index')
    df['correct'] = df[['pred', 'true']].apply(lambda row: is_correctly_detected(*row), axis=1)
    
    if 'im' in file_name:
        df['detected_post'] = pd.read_csv(im_test_dir)['post'] 
    elif 'ex' in file_name:
        df['detected_post'] = pd.read_csv(ex_test_dir)['post'] 
    elif 'base' in file_name:
        df['detected_post'] = pd.read_csv(base_test_dir)['post']
    df.to_csv(f'/home/yumin/hare-hate-speech/outputs/sample_check/{file_name}.csv', index=False)

In [41]:
# my_list = ['CoHARE-im', 'batch4_grad8']
file_name = '-'.join(cohare_im_dir.split('/')[5:7])
print(file_name)

CoHARE-im-batch4_grad8


In [50]:
save_result(cti_im_dir)

In [52]:
save_result(cti_ex_dir)

In [57]:
save_result(cti_base_dir)

In [58]:
save_result(c_im_dir)

In [59]:
save_result(c_ex_dir)

In [62]:
save_result(c_base_dir)

In [61]:
save_result(cohare_im_dir)

In [60]:
save_result(cohare_ex_dir)

In [12]:
save_result(cohare_base_dir)

### Mispredicted Sample Check DataFrame

In [195]:
import os
import pandas as pd
file_path = '/home/yumin/hare-hate-speech/outputs/sample_check/'
file_names = sorted(os.listdir(file_path))
file_names

['CTI_base_batch4_grad8.csv',
 'CTI_ex_batch4_grad8.csv',
 'CTI_im_batch4_grad8.csv',
 'C_base_batch4_grad8.csv',
 'C_ex_batch4_grad8.csv',
 'C_im_batch4_grad8.csv',
 'CoHARE_base_batch4_grad8.csv',
 'CoHARE_ex_batch4_grad8.csv',
 'CoHARE_im_batch4_grad8.csv']

In [178]:
def mispredicted_sample(csv_name, return_df_or_id):
    """
    - filename: value in dir list 
    - input: filename (string) / output: misclassified rows in dataframe    
    - read CSV file --> save as df 
    - df['correct'] == False <-- misclassified 
    """
    
    file_name = '/home/yumin/hare-hate-speech/outputs/sample_check/' + csv_name 
    df = pd.read_csv(file_name)
    id = df['id']
    mis_id = df[df['correct']==False]['id'].tolist()  # True/False: bool type
    mis_id_list = f"{csv_name.replace('-','_').split('.')[0]}_miss = {mis_id}"
    
    if return_df_or_id == 'df':
        return df[df['correct']==False].reset_index(drop=True)   # drop: w/o making 'index' column from reset_index() 
    elif return_df_or_id == 'mis_id':
        return mis_id_list 
    else: 
        return id                                      

In [None]:
for i in range(len(file_names)):
    print(mispredicted_sample(file_names[i], 'mis_id'))

In [170]:
# C_base_batch4_grad8_miss = [0, 1, 3, 11, 17, 18, 20, 25, 27, 29, 31, 32, 34, 36, 38, 39, 42, 44, 47, 55, 62, 63, 66, 69, 79, 81, 85, 89, 91, 94, 100, 101, 103, 107, 108, 112, 128, 129, 136, 138, 146, 148, 149, 150, 159, 162, 177, 179, 180, 181, 185, 194, 195, 207, 210, 231, 232, 233, 238, 240, 242, 245, 247, 248, 249, 250, 251, 256, 258, 274, 275, 277, 286, 288, 289, 294, 297, 298, 300, 307, 313, 317, 324, 332, 340, 341, 342, 353, 361, 366, 375, 380, 384, 389, 393, 401, 402, 405, 408, 409, 410, 414, 424, 425, 431, 433, 434, 435, 439, 444, 449, 456, 458, 461, 462, 463, 469, 479, 483, 486, 488, 490, 495, 517, 518, 522, 527, 530, 536, 537, 538, 540, 544, 546, 547, 548, 549, 550, 556, 559, 560, 561, 562, 563, 565, 571, 576, 580, 582, 583, 584, 592, 593, 594, 596, 606, 616, 626, 628, 630, 635, 636, 638, 642, 643, 651, 656, 661, 670, 671, 673, 688, 690, 693, 696, 697, 699, 700, 702, 703, 716, 721, 726, 732, 738, 739, 746, 751, 754, 755, 757, 761, 766, 770, 772, 778, 782, 784, 787, 793, 795, 802, 804, 806, 807, 810, 811, 812, 813, 819, 825, 831, 833, 840, 852, 855, 856, 858, 859, 862, 865, 867, 868, 869, 870, 873, 878, 879, 892, 899, 900, 906, 909, 915, 919, 922, 929, 930, 933, 935, 939, 940, 944, 947, 948, 952, 956, 957, 958, 960, 962, 966, 968, 975, 976, 979, 981, 984, 985, 986, 990, 992, 993, 994, 998, 999, 1003, 1005, 1012, 1015, 1016, 1018, 1023, 1024, 1030, 1032, 1033, 1034, 1036, 1038, 1040, 1042, 1047, 1053, 1056, 1059, 1064, 1067, 1071, 1073, 1074, 1075, 1076, 1083, 1090, 1095, 1097, 1099, 1104, 1116, 1118, 1121, 1125, 1126, 1128, 1134, 1136, 1138, 1140, 1141, 1142, 1145, 1146, 1155, 1162, 1171, 1174, 1176, 1177, 1182, 1183, 1185, 1193, 1198, 1200, 1204, 1205, 1209, 1210, 1213, 1215, 1216, 1219, 1220, 1221, 1229, 1230, 1235, 1237, 1240, 1242, 1252, 1253, 1256, 1258, 1262, 1266, 1267, 1268, 1273, 1277, 1278, 1281, 1286, 1292, 1300, 1305, 1306, 1307, 1309, 1310, 1320, 1327, 1328, 1332, 1337, 1339, 1343, 1344, 1356, 1362, 1363, 1364, 1368, 1374, 1376, 1377, 1379, 1387, 1388, 1390, 1391, 1392, 1395, 1396, 1397, 1400, 1403, 1406, 1408, 1411, 1412, 1416, 1420, 1421, 1425, 1428, 1430, 1431, 1436, 1445, 1448, 1450, 1451, 1452, 1455, 1466, 1469, 1474, 1475, 1479, 1485, 1486, 1488, 1492, 1496, 1506, 1518, 1526, 1536, 1538, 1544, 1552, 1553, 1554, 1558, 1560, 1563, 1564, 1571, 1576, 1581, 1583, 1589, 1592, 1597, 1605, 1606, 1607, 1611, 1613, 1614, 1620, 1621, 1624, 1625, 1634, 1637, 1644, 1645, 1649, 1650, 1652, 1656, 1657, 1659, 1660, 1672, 1675, 1680, 1686, 1687, 1691, 1696, 1697, 1698, 1712, 1715, 1719, 1721, 1724, 1725, 1727, 1730, 1732, 1735, 1745, 1751, 1752, 1753, 1757, 1761, 1767, 1771, 1773, 1774, 1776, 1777, 1778, 1779, 1780, 1785, 1787, 1789, 1790, 1792, 1797, 1805, 1807, 1808, 1813, 1818, 1819, 1822, 1823, 1826, 1827, 1836, 1844, 1846, 1863, 1865, 1869, 1870, 1877, 1884, 1886, 1889, 1891, 1893, 1896, 1897, 1898, 1908, 1910, 1913, 1915, 1919, 1925, 1931, 1938, 1940, 1942, 1945, 1949, 1962, 1966, 1970, 1980, 1985, 1987, 1989, 1991, 2003, 2004, 2005, 2006, 2009, 2011, 2013, 2019, 2021, 2022, 2023, 2028, 2029, 2034, 2040, 2043, 2047, 2052, 2063, 2067, 2071, 2072, 2077, 2079, 2080, 2081, 2083, 2086, 2087, 2102, 2109, 2110, 2111, 2113, 2118, 2127, 2129, 2130, 2132, 2137, 2138, 2142, 2145, 2151, 2152, 2153, 2156, 2158, 2161, 2162, 2164, 2169, 2174, 2176, 2182, 2183, 2185, 2186, 2191, 2193, 2194, 2198, 2208, 2216, 2220, 2222, 2227, 2231, 2233, 2234, 2245, 2253, 2255, 2259, 2264, 2269, 2278, 2279, 2287, 2290, 2295, 2303, 2305, 2312, 2316, 2317, 2322, 2325, 2326, 2327, 2329, 2335, 2337, 2344, 2350, 2358, 2359, 2362, 2364, 2376, 2377, 2384, 2385, 2386, 2396, 2401, 2403, 2406, 2407, 2408, 2413, 2414, 2417, 2420, 2422, 2426, 2430, 2431, 2436, 2448, 2459, 2460, 2465, 2468, 2473, 2484, 2485, 2489, 2492, 2494, 2497, 2501, 2506, 2513, 2517, 2529, 2531, 2536, 2542, 2546, 2547, 2548, 2550, 2551, 2552, 2553, 2554, 2555, 2563, 2567, 2569, 2576, 2577, 2578, 2586, 2590, 2591, 2597, 2600, 2601, 2602, 2605, 2606, 2609, 2611, 2615, 2617, 2622, 2623, 2625, 2631, 2633, 2640, 2641, 2643, 2645, 2649, 2650, 2652, 2659, 2661, 2662, 2663, 2669, 2670, 2671, 2677, 2682, 2684, 2692, 2695, 2699, 2702, 2703, 2705, 2708, 2709, 2710, 2718, 2719, 2720, 2725, 2729, 2730, 2741, 2742, 2748, 2749, 2753, 2756, 2757, 2759, 2761, 2762, 2763, 2773, 2774, 2775, 2777, 2779, 2784, 2785, 2786, 2788, 2792, 2793, 2804, 2808, 2809, 2813, 2815, 2816, 2819, 2832, 2837, 2842, 2843, 2857, 2869, 2874, 2876, 2877, 2879, 2899, 2903, 2912, 2921, 2925, 2928, 2930, 2931, 2935, 2945, 2947, 2949, 2960, 2968, 2971, 2975, 2982, 2984, 2986, 3005, 3007, 3014, 3015, 3023, 3024, 3040, 3048, 3051, 3055, 3056, 3057, 3058, 3060, 3064, 3067, 3082, 3088, 3090, 3096, 3100, 3101, 3104, 3106, 3108, 3111, 3117, 3118, 3121, 3125, 3127, 3128, 3132, 3133, 3138, 3146, 3147, 3149, 3150, 3153, 3158, 3159, 3160, 3163, 3164, 3165, 3171, 3181, 3182, 3185, 3190, 3194, 3196, 3197, 3198, 3202, 3205, 3211, 3216, 3217, 3219, 3220, 3226, 3227, 3234, 3238, 3240, 3241, 3246, 3247, 3248, 3250, 3260, 3262, 3269, 3270, 3271, 3272, 3275, 3278, 3279, 3283, 3286, 3289, 3291, 3292, 3296, 3301, 3302, 3306, 3311, 3312, 3313, 3318, 3320, 3325, 3333, 3338, 3342, 3343, 3344, 3345, 3346, 3347, 3356, 3365, 3366, 3368, 3369, 3371, 3372, 3374, 3375, 3377, 3378, 3382, 3383, 3385, 3386, 3387, 3390, 3393, 3395, 3399, 3405, 3411, 3413, 3415, 3417, 3420, 3430, 3433, 3441, 3442, 3447, 3448, 3449, 3455, 3457, 3474, 3477, 3483, 3494, 3495, 3496, 3504, 3505, 3510, 3516, 3518, 3522, 3524, 3525, 3526, 3527, 3528, 3540, 3543, 3546, 3547, 3550, 3551, 3552, 3558, 3559, 3570, 3571, 3577, 3578, 3586, 3590, 3594, 3597, 3602, 3603, 3604, 3605, 3611, 3613, 3617, 3623, 3625, 3629, 3631, 3634, 3636, 3644, 3647, 3649, 3650, 3651, 3652, 3654, 3658, 3662, 3667, 3671, 3673, 3680, 3681, 3688, 3690, 3693, 3699, 3701, 3706, 3707, 3709, 3711, 3714, 3715, 3716, 3718, 3724, 3728, 3729, 3731, 3737, 3739, 3744, 3745, 3754, 3757, 3758, 3759, 3764, 3767, 3769, 3780, 3790, 3794, 3795, 3798, 3807, 3808, 3812, 3813, 3816, 3817, 3823, 3832, 3833, 3834, 3843, 3844, 3848, 3855, 3858, 3867, 3869, 3871, 3872, 3873, 3874, 3879, 3885, 3893, 3900, 3905, 3916, 3917, 3923, 3933, 3939, 3940, 3941, 3946, 3947, 3953, 3955, 3967, 3983, 3985, 3988, 3994, 4000, 4002, 4004, 4011, 4012, 4015, 4016, 4021, 4024, 4030, 4032, 4033, 4037, 4041, 4047, 4049, 4050, 4053, 4054, 4060, 4062, 4064, 4065, 4068, 4069, 4079, 4083, 4086, 4088, 4099, 4101, 4106, 4109, 4115, 4118, 4121, 4124, 4125, 4130, 4133, 4135, 4139, 4142, 4146, 4147, 4148, 4152, 4158, 4159, 4162, 4165, 4169, 4170, 4172, 4175, 4176, 4177, 4181, 4186, 4189, 4200, 4201, 4203, 4206, 4207, 4211, 4213, 4215, 4218, 4219, 4221, 4225, 4227, 4231, 4235, 4237, 4238, 4241, 4243, 4249, 4252, 4253, 4254, 4258, 4259, 4261, 4263, 4267, 4271, 4276, 4277, 4278, 4279, 4281, 4287]
# C_ex_batch4_grad8_miss = [0, 40, 44, 64, 65, 67, 72, 76, 79, 80, 81, 94, 104, 147, 172, 175, 191, 204, 211, 226, 232, 238, 261, 263, 294, 295, 300, 323, 387, 389, 390, 399, 402, 443, 449, 468, 520, 526, 537, 541, 587, 614, 622, 641, 661, 670, 675, 680, 706, 726, 732, 752, 764, 768, 783, 790, 798, 810, 819, 830, 838, 847, 851, 870, 877, 879, 902, 918, 937, 938, 968, 991, 999, 1032, 1047, 1088, 1099, 1100, 1125, 1170, 1178, 1196, 1200, 1201, 1209, 1215, 1231, 1249, 1280, 1301, 1340, 1348, 1381, 1388, 1400, 1422, 1423, 1432, 1447, 1505, 1535, 1552, 1553, 1577, 1598, 1600, 1601, 1607, 1641, 1645, 1647, 1722, 1727, 1747, 1810, 1833, 1854, 1855, 1863, 1864, 1871, 1890, 1906, 1917, 1924, 1929, 1934, 1955, 1958, 1979, 1989, 1991, 2001, 2020, 2031, 2048, 2086, 2089, 2097, 2105, 2109, 2120, 2121, 2155, 2177, 2184, 2208, 2239, 2245, 2305, 2306, 2315, 2322, 2363, 2371, 2385, 2400, 2402, 2412, 2441, 2451, 2461, 2486, 2489, 2511, 2518, 2525, 2533, 2546, 2561, 2580, 2600, 2624, 2633, 2676, 2679, 2690, 2699, 2751, 2758, 2768, 2773, 2781, 2783, 2785, 2805, 2824, 2845, 2862]
# C_im_batch4_grad8_miss = [0, 2, 24, 28, 30, 31, 35, 37, 40, 45, 58, 65, 66, 72, 73, 74, 76, 77, 81, 87, 100, 118, 125, 126, 132, 134, 148, 162, 165, 169, 177, 181, 191, 198, 211, 212, 214, 215, 216, 221, 222, 224, 225, 233, 234, 239, 255, 257, 260, 263, 267, 269, 274, 278, 284, 287, 292, 293, 296, 302, 303, 316, 317, 318, 326, 327, 330, 333, 346, 349, 356, 374, 375, 376, 380, 392, 395, 398, 399, 407, 419, 422, 424, 427, 432, 434, 444, 450, 454, 458, 466, 468, 480, 482, 486, 489, 494, 501, 509, 511, 512, 517, 520, 521, 522, 524, 528, 529, 530, 533, 538, 548, 556, 560, 569, 570, 572, 574, 576, 579, 588, 596, 599, 604, 611, 620, 633, 644, 648, 651, 669, 674, 676, 686, 687, 693, 694, 697, 699, 701, 702, 720, 733, 734, 738, 743, 749, 751, 753, 762, 767, 777, 790, 794, 796, 798, 805, 807, 809, 813, 817, 818, 838, 839, 842, 845, 851, 855, 856, 857, 865, 869, 873, 875, 879, 883, 887, 889, 891, 896, 900, 904, 905, 908, 909, 914, 917, 924, 925, 930, 934, 942, 945, 948, 955, 960, 962, 963, 966, 981, 982, 983, 986, 993, 1000, 1005, 1012, 1015, 1025, 1026, 1027, 1030, 1040, 1042, 1047, 1053, 1054, 1057, 1062, 1063, 1074, 1078, 1084, 1085, 1088, 1092, 1096, 1099, 1106, 1108, 1126, 1132, 1134, 1136, 1137, 1139, 1146, 1153, 1156, 1159, 1162, 1164, 1168, 1170, 1179, 1180, 1183, 1184, 1185, 1198, 1203, 1208, 1214, 1216, 1221, 1241, 1242, 1245, 1249, 1255, 1264, 1269, 1275, 1282, 1284, 1286, 1289, 1290, 1295, 1298, 1300, 1301, 1305, 1308, 1311, 1314, 1315, 1316, 1319, 1320, 1323, 1332, 1334, 1343, 1350, 1351, 1354, 1355, 1356, 1360, 1370, 1371, 1373, 1385, 1387, 1390, 1409, 1414, 1415, 1416, 1419, 1421, 1427, 1429, 1445, 1453, 1454, 1456, 1461, 1464, 1477, 1482, 1485, 1489, 1491, 1502, 1505, 1507, 1509, 1510, 1511, 1512, 1518, 1521, 1522, 1528, 1536, 1539, 1541, 1543, 1544, 1547, 1550, 1551, 1555, 1567, 1568, 1570, 1575, 1580, 1581, 1591, 1594, 1605, 1616, 1625, 1628, 1633, 1634, 1638, 1644, 1645, 1650, 1660, 1664, 1670, 1671, 1672, 1681, 1682, 1684, 1693, 1695, 1698, 1704, 1706, 1709, 1710, 1714, 1716, 1720, 1725, 1734, 1735, 1736, 1740, 1755, 1764, 1765, 1768, 1774, 1781, 1783, 1784, 1786, 1787, 1788, 1800, 1804, 1808, 1810, 1811, 1819, 1821, 1822, 1828, 1833, 1837, 1838, 1849, 1852, 1857, 1866, 1869, 1871, 1875, 1879, 1887, 1890, 1891, 1893, 1894, 1897, 1901, 1903, 1904, 1905, 1907, 1912, 1915, 1916, 1917, 1923, 1926, 1930, 1933, 1934, 1942, 1946, 1949, 1960, 1961, 1962, 1964, 1968, 1970, 1975, 1980, 1987, 1988, 1991, 2011, 2014, 2015, 2019, 2022, 2034, 2038, 2040, 2045, 2049, 2051, 2054, 2057, 2058, 2068, 2069, 2072, 2082, 2083, 2087, 2090, 2108, 2111, 2118, 2127, 2134, 2141, 2148, 2149, 2152, 2165, 2173, 2175, 2176, 2177, 2182, 2185, 2189, 2196, 2198, 2202, 2206, 2211, 2220, 2221, 2224, 2228, 2249, 2250, 2251, 2260, 2264, 2265, 2273, 2281, 2283, 2293, 2297, 2299, 2302, 2309, 2320, 2342, 2343, 2347, 2351, 2352, 2364, 2372, 2375, 2384, 2388, 2393, 2402, 2403, 2404, 2406, 2407, 2409, 2421, 2423, 2439, 2441, 2443, 2448, 2450, 2461, 2463, 2474, 2476, 2483, 2485, 2492, 2493, 2495, 2497, 2500, 2502, 2510, 2519, 2527, 2534, 2536, 2542, 2557, 2558, 2574, 2577, 2589, 2590, 2600, 2603, 2604, 2606, 2607, 2608, 2610, 2618, 2622, 2629, 2630, 2635, 2638, 2643, 2645, 2651, 2655, 2657, 2670, 2673, 2676, 2677, 2678, 2693, 2700, 2705, 2707, 2709, 2718, 2724, 2729, 2733, 2734, 2737, 2742, 2746, 2748, 2753, 2767, 2771, 2772, 2790, 2792, 2799, 2807, 2814, 2820, 2848, 2849, 2854, 2864, 2868, 2869, 2875, 2883, 2884, 2894, 2906, 2908, 2914, 2918, 2921, 2926, 2935, 2936, 2942, 2943, 2944, 2946, 2951, 2953, 2957, 2972, 2975, 2981, 2986, 2989, 2993, 3003, 3007, 3012, 3016, 3020, 3021, 3022, 3024, 3048, 3055, 3059, 3066, 3071, 3075, 3087, 3089, 3091, 3093, 3095, 3097, 3100, 3102, 3109, 3116, 3121, 3122, 3129, 3134, 3139, 3145, 3153, 3158, 3161, 3162, 3164, 3173, 3184, 3186, 3187, 3189, 3190, 3194, 3195, 3200, 3202, 3203, 3204, 3206, 3207, 3212, 3221, 3231, 3238, 3239, 3248, 3255, 3269, 3273, 3280, 3291, 3297, 3305, 3313, 3314, 3315, 3322, 3325, 3327, 3332, 3334, 3337, 3339, 3345, 3349, 3352, 3355, 3356, 3367, 3390, 3394, 3403, 3408, 3409, 3410, 3426, 3430, 3432, 3443, 3445, 3446, 3448, 3452, 3454, 3455, 3456, 3462, 3470, 3471, 3484, 3489, 3499, 3502, 3503, 3511, 3514, 3522, 3523, 3525, 3530, 3555, 3556, 3559, 3570, 3574, 3580, 3581, 3584, 3589, 3591, 3597, 3601, 3611, 3613, 3619, 3620, 3625, 3629, 3630, 3633, 3644, 3646, 3654, 3655, 3662, 3667, 3675, 3698, 3706, 3719, 3721, 3730, 3734, 3747, 3749, 3763, 3764, 3774, 3781, 3793, 3798, 3806, 3808, 3815, 3823, 3824, 3827, 3834, 3839, 3843, 3844, 3849, 3857, 3858, 3860, 3873, 3875, 3882, 3884, 3889, 3891, 3894, 3897, 3905, 3911, 3913, 3916, 3922, 3927, 3933, 3937, 3941, 3946, 3950, 3951, 3959, 3962, 3970, 3972, 3974, 3977, 3978, 3982, 3988, 3991, 3992, 3994, 3997, 3998, 4001, 4004, 4010, 4014, 4016, 4019, 4020, 4030, 4034, 4042, 4043, 4045, 4047, 4061]
# CTI_base_batch4_grad8_miss = [0, 1, 3, 11, 17, 18, 20, 25, 27, 29, 31, 32, 34, 36, 38, 39, 42, 44, 47, 55, 62, 63, 66, 69, 79, 81, 85, 89, 91, 94, 100, 101, 103, 107, 108, 112, 128, 129, 136, 138, 146, 148, 149, 150, 159, 162, 177, 179, 180, 181, 185, 194, 195, 207, 210, 231, 232, 233, 238, 240, 242, 245, 247, 248, 249, 250, 251, 256, 258, 274, 275, 277, 286, 288, 289, 294, 297, 298, 300, 307, 313, 317, 324, 332, 340, 341, 342, 353, 361, 366, 375, 380, 384, 389, 393, 401, 402, 405, 408, 409, 410, 414, 424, 425, 431, 433, 434, 435, 439, 444, 449, 456, 458, 461, 462, 463, 469, 479, 483, 486, 488, 490, 495, 517, 518, 522, 527, 530, 536, 537, 538, 540, 544, 546, 547, 548, 549, 550, 556, 559, 560, 561, 562, 563, 565, 571, 576, 580, 582, 583, 584, 592, 593, 594, 596, 606, 616, 626, 628, 630, 635, 636, 638, 642, 643, 651, 656, 661, 670, 671, 673, 688, 690, 693, 696, 697, 699, 700, 702, 703, 716, 721, 726, 732, 738, 739, 746, 751, 754, 755, 757, 761, 766, 770, 772, 778, 782, 784, 787, 793, 795, 802, 804, 806, 807, 810, 811, 812, 813, 819, 825, 831, 833, 840, 852, 855, 856, 858, 859, 862, 865, 867, 868, 869, 870, 873, 878, 879, 892, 899, 900, 906, 909, 915, 919, 922, 929, 930, 933, 935, 939, 940, 944, 947, 948, 952, 956, 957, 958, 960, 962, 966, 968, 975, 976, 979, 981, 984, 985, 986, 990, 992, 993, 994, 998, 999, 1003, 1005, 1012, 1015, 1016, 1018, 1023, 1024, 1030, 1032, 1033, 1034, 1036, 1038, 1040, 1042, 1047, 1053, 1056, 1059, 1064, 1067, 1071, 1073, 1074, 1075, 1076, 1083, 1090, 1095, 1097, 1099, 1104, 1116, 1118, 1121, 1125, 1126, 1128, 1134, 1136, 1138, 1140, 1141, 1142, 1145, 1146, 1155, 1162, 1171, 1174, 1176, 1177, 1182, 1183, 1185, 1193, 1198, 1200, 1204, 1205, 1209, 1210, 1213, 1215, 1216, 1219, 1220, 1221, 1229, 1230, 1235, 1237, 1240, 1242, 1252, 1253, 1256, 1258, 1262, 1266, 1267, 1268, 1273, 1277, 1278, 1281, 1286, 1292, 1300, 1305, 1306, 1307, 1309, 1310, 1320, 1327, 1328, 1332, 1337, 1339, 1343, 1344, 1356, 1362, 1363, 1364, 1368, 1374, 1376, 1377, 1379, 1387, 1388, 1390, 1391, 1392, 1395, 1396, 1397, 1400, 1403, 1406, 1408, 1411, 1412, 1416, 1420, 1421, 1425, 1428, 1430, 1431, 1436, 1445, 1448, 1450, 1451, 1452, 1455, 1466, 1469, 1474, 1475, 1479, 1485, 1486, 1488, 1492, 1496, 1506, 1518, 1526, 1536, 1538, 1544, 1552, 1553, 1554, 1558, 1560, 1563, 1564, 1571, 1576, 1581, 1583, 1589, 1592, 1597, 1605, 1606, 1607, 1611, 1613, 1614, 1620, 1621, 1624, 1625, 1634, 1637, 1644, 1645, 1649, 1650, 1652, 1656, 1657, 1659, 1660, 1672, 1675, 1680, 1686, 1687, 1691, 1696, 1697, 1698, 1712, 1715, 1719, 1721, 1724, 1725, 1727, 1730, 1732, 1735, 1745, 1751, 1752, 1753, 1757, 1761, 1767, 1771, 1773, 1774, 1776, 1777, 1778, 1779, 1780, 1785, 1787, 1789, 1790, 1792, 1797, 1805, 1807, 1808, 1813, 1818, 1819, 1822, 1823, 1826, 1827, 1836, 1844, 1846, 1863, 1865, 1869, 1870, 1877, 1884, 1886, 1889, 1891, 1893, 1896, 1897, 1898, 1908, 1910, 1913, 1915, 1919, 1925, 1931, 1938, 1940, 1942, 1945, 1949, 1962, 1966, 1970, 1980, 1985, 1987, 1989, 1991, 2003, 2004, 2005, 2006, 2009, 2011, 2013, 2019, 2021, 2022, 2023, 2028, 2029, 2034, 2040, 2043, 2047, 2052, 2063, 2067, 2071, 2072, 2077, 2079, 2080, 2081, 2083, 2086, 2087, 2102, 2109, 2110, 2111, 2113, 2118, 2127, 2129, 2130, 2132, 2137, 2138, 2142, 2145, 2151, 2152, 2153, 2156, 2158, 2161, 2162, 2164, 2169, 2174, 2176, 2182, 2183, 2185, 2186, 2191, 2193, 2194, 2198, 2208, 2216, 2220, 2222, 2227, 2231, 2233, 2234, 2245, 2253, 2255, 2259, 2264, 2269, 2278, 2279, 2287, 2290, 2295, 2303, 2305, 2312, 2316, 2317, 2322, 2325, 2326, 2327, 2329, 2335, 2337, 2344, 2350, 2358, 2359, 2362, 2364, 2376, 2377, 2384, 2385, 2386, 2396, 2401, 2403, 2406, 2407, 2408, 2413, 2414, 2417, 2420, 2422, 2426, 2430, 2431, 2436, 2448, 2459, 2460, 2465, 2468, 2473, 2484, 2485, 2489, 2492, 2494, 2497, 2501, 2506, 2513, 2517, 2529, 2531, 2536, 2542, 2546, 2547, 2548, 2550, 2551, 2552, 2553, 2554, 2555, 2563, 2567, 2569, 2576, 2577, 2578, 2586, 2590, 2591, 2597, 2600, 2601, 2602, 2605, 2606, 2609, 2611, 2615, 2617, 2622, 2623, 2625, 2631, 2633, 2640, 2641, 2643, 2645, 2649, 2650, 2652, 2659, 2661, 2662, 2663, 2669, 2670, 2671, 2677, 2682, 2684, 2692, 2695, 2699, 2702, 2703, 2705, 2708, 2709, 2710, 2718, 2719, 2720, 2725, 2729, 2730, 2741, 2742, 2748, 2749, 2753, 2756, 2757, 2759, 2761, 2762, 2763, 2773, 2774, 2775, 2777, 2779, 2784, 2785, 2786, 2788, 2792, 2793, 2804, 2808, 2809, 2813, 2815, 2816, 2819, 2832, 2837, 2842, 2843, 2857, 2869, 2874, 2876, 2877, 2879, 2899, 2903, 2912, 2921, 2925, 2928, 2930, 2931, 2935, 2945, 2947, 2949, 2960, 2968, 2971, 2975, 2982, 2984, 2986, 3005, 3007, 3014, 3015, 3023, 3024, 3040, 3048, 3051, 3055, 3056, 3057, 3058, 3060, 3064, 3067, 3082, 3088, 3090, 3096, 3100, 3101, 3104, 3106, 3108, 3111, 3117, 3118, 3121, 3125, 3127, 3128, 3132, 3133, 3138, 3146, 3147, 3149, 3150, 3153, 3158, 3159, 3160, 3163, 3164, 3165, 3171, 3181, 3182, 3185, 3190, 3194, 3196, 3197, 3198, 3202, 3205, 3211, 3216, 3217, 3219, 3220, 3226, 3227, 3234, 3238, 3240, 3241, 3246, 3247, 3248, 3250, 3260, 3262, 3269, 3270, 3271, 3272, 3275, 3278, 3279, 3283, 3286, 3289, 3291, 3292, 3296, 3301, 3302, 3306, 3311, 3312, 3313, 3318, 3320, 3325, 3333, 3338, 3342, 3343, 3344, 3345, 3346, 3347, 3356, 3365, 3366, 3368, 3369, 3371, 3372, 3374, 3375, 3377, 3378, 3382, 3383, 3385, 3386, 3387, 3390, 3393, 3395, 3399, 3405, 3411, 3413, 3415, 3417, 3420, 3430, 3433, 3441, 3442, 3447, 3448, 3449, 3455, 3457, 3474, 3477, 3483, 3494, 3495, 3496, 3504, 3505, 3510, 3516, 3518, 3522, 3524, 3525, 3526, 3527, 3528, 3540, 3543, 3546, 3547, 3550, 3551, 3552, 3558, 3559, 3570, 3571, 3577, 3578, 3586, 3590, 3594, 3597, 3602, 3603, 3604, 3605, 3611, 3613, 3617, 3623, 3625, 3629, 3631, 3634, 3636, 3644, 3647, 3649, 3650, 3651, 3652, 3654, 3658, 3662, 3667, 3671, 3673, 3680, 3681, 3688, 3690, 3693, 3699, 3701, 3706, 3707, 3709, 3711, 3714, 3715, 3716, 3718, 3724, 3728, 3729, 3731, 3737, 3739, 3744, 3745, 3754, 3757, 3758, 3759, 3764, 3767, 3769, 3780, 3790, 3794, 3795, 3798, 3807, 3808, 3812, 3813, 3816, 3817, 3823, 3832, 3833, 3834, 3843, 3844, 3848, 3855, 3858, 3867, 3869, 3871, 3872, 3873, 3874, 3879, 3885, 3893, 3900, 3905, 3916, 3917, 3923, 3933, 3939, 3940, 3941, 3946, 3947, 3953, 3955, 3967, 3983, 3985, 3988, 3994, 4000, 4002, 4004, 4011, 4012, 4015, 4016, 4021, 4024, 4030, 4032, 4033, 4037, 4041, 4047, 4049, 4050, 4053, 4054, 4060, 4062, 4064, 4065, 4068, 4069, 4079, 4083, 4086, 4088, 4099, 4101, 4106, 4109, 4115, 4118, 4121, 4124, 4125, 4130, 4133, 4135, 4139, 4142, 4146, 4147, 4148, 4152, 4158, 4159, 4162, 4165, 4169, 4170, 4172, 4175, 4176, 4177, 4181, 4186, 4189, 4200, 4201, 4203, 4206, 4207, 4211, 4213, 4215, 4218, 4219, 4221, 4225, 4227, 4231, 4235, 4237, 4238, 4241, 4243, 4249, 4252, 4253, 4254, 4258, 4259, 4261, 4263, 4267, 4271, 4276, 4277, 4278, 4279, 4281, 4287]
# CTI_ex_batch4_grad8_miss = [0, 40, 44, 64, 65, 67, 72, 76, 79, 80, 81, 94, 104, 147, 172, 175, 191, 204, 211, 226, 232, 238, 261, 263, 294, 295, 300, 323, 387, 389, 390, 399, 402, 443, 449, 468, 520, 526, 537, 541, 587, 614, 622, 641, 661, 670, 675, 680, 706, 726, 732, 752, 764, 768, 783, 790, 798, 810, 819, 830, 838, 847, 851, 870, 877, 879, 902, 918, 937, 938, 968, 991, 999, 1032, 1047, 1088, 1099, 1100, 1125, 1170, 1178, 1196, 1200, 1201, 1209, 1215, 1231, 1249, 1280, 1301, 1340, 1348, 1381, 1388, 1400, 1422, 1423, 1432, 1447, 1505, 1535, 1552, 1553, 1577, 1598, 1600, 1601, 1607, 1641, 1645, 1647, 1722, 1727, 1747, 1810, 1833, 1854, 1855, 1863, 1864, 1871, 1890, 1906, 1917, 1924, 1929, 1934, 1955, 1958, 1979, 1989, 1991, 2001, 2020, 2031, 2048, 2086, 2089, 2097, 2105, 2109, 2120, 2121, 2155, 2177, 2184, 2208, 2239, 2245, 2305, 2306, 2315, 2322, 2363, 2371, 2385, 2400, 2402, 2412, 2441, 2451, 2461, 2486, 2489, 2511, 2518, 2525, 2533, 2546, 2561, 2580, 2600, 2624, 2633, 2676, 2679, 2690, 2699, 2751, 2758, 2768, 2773, 2781, 2783, 2785, 2805, 2824, 2845, 2862]
# CTI_im_batch4_grad8_miss = [0, 2, 24, 28, 30, 31, 35, 37, 40, 45, 58, 65, 66, 72, 73, 74, 76, 77, 81, 87, 100, 118, 125, 126, 132, 134, 148, 162, 165, 169, 177, 181, 191, 198, 211, 212, 214, 215, 216, 221, 222, 224, 225, 233, 234, 239, 255, 257, 260, 263, 267, 269, 274, 278, 284, 287, 292, 293, 296, 302, 303, 316, 317, 318, 326, 327, 330, 333, 346, 349, 356, 374, 375, 376, 380, 392, 395, 398, 399, 407, 419, 422, 424, 427, 432, 434, 444, 450, 454, 458, 466, 468, 480, 482, 486, 489, 494, 501, 509, 511, 512, 517, 520, 521, 522, 524, 528, 529, 530, 533, 538, 548, 556, 560, 569, 570, 572, 574, 576, 579, 588, 596, 599, 604, 611, 620, 633, 644, 648, 651, 669, 674, 676, 686, 687, 693, 694, 697, 699, 701, 702, 720, 733, 734, 738, 743, 749, 751, 753, 762, 767, 777, 790, 794, 796, 798, 805, 807, 809, 813, 817, 818, 838, 839, 842, 845, 851, 855, 856, 857, 865, 869, 873, 875, 879, 883, 887, 889, 891, 896, 900, 904, 905, 908, 909, 914, 917, 924, 925, 930, 934, 942, 945, 948, 955, 960, 962, 963, 966, 981, 982, 983, 986, 993, 1000, 1005, 1012, 1015, 1025, 1026, 1027, 1030, 1040, 1042, 1047, 1053, 1054, 1057, 1062, 1063, 1074, 1078, 1084, 1085, 1088, 1092, 1096, 1099, 1106, 1108, 1126, 1132, 1134, 1136, 1137, 1139, 1146, 1153, 1156, 1159, 1162, 1164, 1168, 1170, 1179, 1180, 1183, 1184, 1185, 1198, 1203, 1208, 1214, 1216, 1221, 1241, 1242, 1245, 1249, 1255, 1264, 1269, 1275, 1282, 1284, 1286, 1289, 1290, 1295, 1298, 1300, 1301, 1305, 1308, 1311, 1314, 1315, 1316, 1319, 1320, 1323, 1332, 1334, 1343, 1350, 1351, 1354, 1355, 1356, 1360, 1370, 1371, 1373, 1385, 1387, 1390, 1409, 1414, 1415, 1416, 1419, 1421, 1427, 1429, 1445, 1453, 1454, 1456, 1461, 1464, 1477, 1482, 1485, 1489, 1491, 1502, 1505, 1507, 1509, 1510, 1511, 1512, 1518, 1521, 1522, 1528, 1536, 1539, 1541, 1543, 1544, 1547, 1550, 1551, 1555, 1567, 1568, 1570, 1575, 1580, 1581, 1591, 1594, 1605, 1616, 1625, 1628, 1633, 1634, 1638, 1644, 1645, 1650, 1660, 1664, 1670, 1671, 1672, 1681, 1682, 1684, 1693, 1695, 1698, 1704, 1706, 1709, 1710, 1714, 1716, 1720, 1725, 1734, 1735, 1736, 1740, 1755, 1764, 1765, 1768, 1774, 1781, 1783, 1784, 1786, 1787, 1788, 1800, 1804, 1808, 1810, 1811, 1819, 1821, 1822, 1828, 1833, 1837, 1838, 1849, 1852, 1857, 1866, 1869, 1871, 1875, 1879, 1887, 1890, 1891, 1893, 1894, 1897, 1901, 1903, 1904, 1905, 1907, 1912, 1915, 1916, 1917, 1923, 1926, 1930, 1933, 1934, 1942, 1946, 1949, 1960, 1961, 1962, 1964, 1968, 1970, 1975, 1980, 1987, 1988, 1991, 2011, 2014, 2015, 2019, 2022, 2034, 2038, 2040, 2045, 2049, 2051, 2054, 2057, 2058, 2068, 2069, 2072, 2082, 2083, 2087, 2090, 2108, 2111, 2118, 2127, 2134, 2141, 2148, 2149, 2152, 2165, 2173, 2175, 2176, 2177, 2182, 2185, 2189, 2196, 2198, 2202, 2206, 2211, 2220, 2221, 2224, 2228, 2249, 2250, 2251, 2260, 2264, 2265, 2273, 2281, 2283, 2293, 2297, 2299, 2302, 2309, 2320, 2342, 2343, 2347, 2351, 2352, 2364, 2372, 2375, 2384, 2388, 2393, 2402, 2403, 2404, 2406, 2407, 2409, 2421, 2423, 2439, 2441, 2443, 2448, 2450, 2461, 2463, 2474, 2476, 2483, 2485, 2492, 2493, 2495, 2497, 2500, 2502, 2510, 2519, 2527, 2534, 2536, 2542, 2557, 2558, 2574, 2577, 2589, 2590, 2600, 2603, 2604, 2606, 2607, 2608, 2610, 2618, 2622, 2629, 2630, 2635, 2638, 2643, 2645, 2651, 2655, 2657, 2670, 2673, 2676, 2677, 2678, 2693, 2700, 2705, 2707, 2709, 2718, 2724, 2729, 2733, 2734, 2737, 2742, 2746, 2748, 2753, 2767, 2771, 2772, 2790, 2792, 2799, 2807, 2814, 2820, 2848, 2849, 2854, 2864, 2868, 2869, 2875, 2883, 2884, 2894, 2906, 2908, 2914, 2918, 2921, 2926, 2935, 2936, 2942, 2943, 2944, 2946, 2951, 2953, 2957, 2972, 2975, 2981, 2986, 2989, 2993, 3003, 3007, 3012, 3016, 3020, 3021, 3022, 3024, 3048, 3055, 3059, 3066, 3071, 3075, 3087, 3089, 3091, 3093, 3095, 3097, 3100, 3102, 3109, 3116, 3121, 3122, 3129, 3134, 3139, 3145, 3153, 3158, 3161, 3162, 3164, 3173, 3184, 3186, 3187, 3189, 3190, 3194, 3195, 3200, 3202, 3203, 3204, 3206, 3207, 3212, 3221, 3231, 3238, 3239, 3248, 3255, 3269, 3273, 3280, 3291, 3297, 3305, 3313, 3314, 3315, 3322, 3325, 3327, 3332, 3334, 3337, 3339, 3345, 3349, 3352, 3355, 3356, 3367, 3390, 3394, 3403, 3408, 3409, 3410, 3426, 3430, 3432, 3443, 3445, 3446, 3448, 3452, 3454, 3455, 3456, 3462, 3470, 3471, 3484, 3489, 3499, 3502, 3503, 3511, 3514, 3522, 3523, 3525, 3530, 3555, 3556, 3559, 3570, 3574, 3580, 3581, 3584, 3589, 3591, 3597, 3601, 3611, 3613, 3619, 3620, 3625, 3629, 3630, 3633, 3644, 3646, 3654, 3655, 3662, 3667, 3675, 3698, 3706, 3719, 3721, 3730, 3734, 3747, 3749, 3763, 3764, 3774, 3781, 3793, 3798, 3806, 3808, 3815, 3823, 3824, 3827, 3834, 3839, 3843, 3844, 3849, 3857, 3858, 3860, 3873, 3875, 3882, 3884, 3889, 3891, 3894, 3897, 3905, 3911, 3913, 3916, 3922, 3927, 3933, 3937, 3941, 3946, 3950, 3951, 3959, 3962, 3970, 3972, 3974, 3977, 3978, 3982, 3988, 3991, 3992, 3994, 3997, 3998, 4001, 4004, 4010, 4014, 4016, 4019, 4020, 4030, 4034, 4042, 4043, 4045, 4047, 4061]
# CoHARE_base_batch4_grad8_miss = [1, 3, 20, 29, 31, 32, 36, 44, 60, 62, 63, 69, 76, 81, 84, 90, 94, 108, 128, 132, 143, 146, 147, 148, 151, 159, 179, 180, 183, 191, 195, 214, 230, 231, 232, 233, 238, 240, 247, 250, 255, 256, 264, 267, 275, 282, 294, 298, 307, 309, 314, 317, 325, 326, 334, 342, 344, 352, 356, 366, 369, 372, 375, 382, 386, 398, 405, 419, 421, 427, 428, 429, 430, 433, 441, 444, 450, 453, 456, 461, 465, 474, 480, 486, 490, 494, 495, 497, 502, 504, 505, 512, 518, 519, 522, 527, 536, 540, 543, 547, 548, 549, 550, 555, 556, 559, 560, 563, 565, 566, 569, 570, 571, 576, 587, 593, 594, 606, 615, 616, 617, 619, 623, 630, 636, 643, 644, 647, 652, 661, 666, 673, 684, 687, 695, 705, 706, 723, 726, 728, 732, 737, 745, 746, 749, 753, 756, 760, 772, 787, 802, 805, 807, 811, 820, 825, 827, 829, 836, 849, 852, 856, 865, 867, 868, 870, 879, 886, 887, 899, 918, 919, 924, 928, 932, 933, 936, 941, 948, 952, 957, 958, 962, 966, 972, 973, 976, 981, 985, 992, 997, 998, 999, 1003, 1005, 1012, 1013, 1015, 1018, 1023, 1026, 1027, 1028, 1030, 1032, 1033, 1038, 1051, 1057, 1061, 1071, 1077, 1082, 1087, 1099, 1101, 1104, 1121, 1122, 1124, 1126, 1128, 1137, 1140, 1141, 1145, 1161, 1166, 1171, 1173, 1174, 1177, 1182, 1184, 1185, 1188, 1193, 1195, 1200, 1204, 1205, 1209, 1213, 1215, 1216, 1219, 1221, 1230, 1231, 1235, 1237, 1245, 1248, 1252, 1262, 1263, 1265, 1267, 1268, 1271, 1277, 1281, 1292, 1298, 1300, 1305, 1308, 1337, 1342, 1343, 1344, 1363, 1371, 1374, 1376, 1377, 1379, 1391, 1392, 1395, 1407, 1408, 1412, 1415, 1426, 1431, 1433, 1436, 1461, 1463, 1468, 1479, 1486, 1488, 1494, 1503, 1504, 1506, 1509, 1516, 1521, 1524, 1526, 1544, 1551, 1552, 1554, 1560, 1581, 1584, 1603, 1604, 1613, 1620, 1621, 1624, 1625, 1628, 1630, 1631, 1634, 1638, 1639, 1644, 1645, 1648, 1652, 1656, 1657, 1663, 1672, 1683, 1696, 1697, 1709, 1724, 1732, 1735, 1740, 1741, 1745, 1750, 1751, 1752, 1757, 1767, 1771, 1777, 1778, 1783, 1789, 1790, 1792, 1801, 1806, 1808, 1813, 1818, 1819, 1820, 1822, 1823, 1835, 1844, 1848, 1850, 1857, 1865, 1875, 1884, 1886, 1889, 1891, 1893, 1898, 1908, 1911, 1915, 1919, 1921, 1922, 1924, 1926, 1931, 1933, 1934, 1942, 1948, 1949, 1961, 1966, 1971, 1980, 1985, 1987, 1989, 1992, 1995, 2002, 2006, 2009, 2010, 2016, 2017, 2019, 2020, 2023, 2024, 2028, 2030, 2033, 2040, 2047, 2064, 2075, 2076, 2079, 2083, 2087, 2089, 2094, 2097, 2102, 2110, 2113, 2115, 2117, 2118, 2133, 2137, 2153, 2155, 2158, 2162, 2164, 2169, 2174, 2179, 2182, 2183, 2184, 2191, 2193, 2194, 2199, 2216, 2234, 2255, 2264, 2277, 2278, 2279, 2282, 2292, 2295, 2303, 2305, 2306, 2307, 2312, 2316, 2324, 2326, 2329, 2335, 2338, 2339, 2344, 2349, 2354, 2358, 2362, 2383, 2385, 2387, 2410, 2430, 2436, 2440, 2448, 2455, 2458, 2460, 2472, 2477, 2484, 2492, 2506, 2517, 2518, 2520, 2531, 2533, 2536, 2546, 2548, 2550, 2551, 2554, 2559, 2568, 2572, 2574, 2576, 2577, 2578, 2588, 2590, 2597, 2631, 2645, 2648, 2652, 2659, 2662, 2663, 2668, 2672, 2675, 2684, 2705, 2729, 2737, 2741, 2742, 2744, 2746, 2749, 2753, 2760, 2761, 2764, 2765, 2777, 2785, 2786, 2787, 2788, 2789, 2793, 2796, 2803, 2809, 2816, 2819, 2829, 2832, 2837, 2838, 2839, 2857, 2860, 2866, 2874, 2879, 2884, 2890, 2903, 2906, 2908, 2912, 2922, 2925, 2931, 2935, 2936, 2939, 2940, 2945, 2948, 2949, 2960, 2971, 2974, 2975, 2977, 2979, 2993, 2997, 3009, 3016, 3020, 3021, 3031, 3042, 3048, 3054, 3055, 3056, 3058, 3062, 3065, 3088, 3095, 3106, 3117, 3119, 3126, 3128, 3132, 3134, 3138, 3142, 3146, 3147, 3153, 3163, 3166, 3181, 3185, 3186, 3199, 3202, 3250, 3254, 3255, 3258, 3260, 3270, 3272, 3276, 3279, 3281, 3296, 3301, 3302, 3319, 3325, 3327, 3338, 3342, 3345, 3366, 3368, 3369, 3374, 3383, 3384, 3385, 3386, 3387, 3389, 3393, 3395, 3402, 3409, 3415, 3424, 3437, 3448, 3469, 3474, 3477, 3481, 3486, 3487, 3496, 3504, 3505, 3506, 3509, 3513, 3516, 3518, 3523, 3525, 3528, 3530, 3533, 3536, 3540, 3543, 3557, 3578, 3581, 3583, 3584, 3588, 3604, 3623, 3625, 3629, 3631, 3644, 3646, 3647, 3649, 3650, 3656, 3658, 3665, 3667, 3673, 3678, 3690, 3693, 3701, 3703, 3707, 3712, 3714, 3715, 3728, 3729, 3731, 3737, 3758, 3763, 3764, 3767, 3779, 3781, 3791, 3792, 3796, 3797, 3800, 3802, 3812, 3819, 3832, 3843, 3848, 3860, 3872, 3881, 3892, 3893, 3908, 3920, 3923, 3935, 3937, 3939, 3965, 3968, 3974, 3986, 3988, 3996, 4015, 4016, 4049, 4050, 4053, 4055, 4060, 4069, 4075, 4083, 4106, 4109, 4115, 4118, 4124, 4125, 4139, 4142, 4147, 4148, 4153, 4155, 4158, 4163, 4168, 4169, 4182, 4191, 4195, 4203, 4205, 4207, 4210, 4214, 4215, 4216, 4219, 4221, 4225, 4227, 4228, 4230, 4234, 4247, 4249, 4252, 4257, 4261, 4267, 4279, 4280, 4281, 4282]
# CoHARE_ex_batch4_grad8_miss = [23, 38, 44, 64, 65, 67, 72, 79, 80, 81, 87, 94, 151, 154, 172, 175, 191, 204, 226, 228, 232, 294, 295, 314, 340, 384, 386, 387, 390, 399, 401, 402, 443, 453, 471, 491, 520, 541, 554, 571, 609, 651, 660, 670, 675, 706, 707, 732, 752, 756, 757, 768, 783, 786, 819, 830, 838, 851, 870, 877, 879, 902, 912, 913, 918, 932, 938, 968, 999, 1022, 1026, 1081, 1088, 1099, 1103, 1127, 1170, 1192, 1200, 1209, 1215, 1227, 1249, 1280, 1300, 1301, 1337, 1340, 1348, 1381, 1400, 1422, 1423, 1477, 1505, 1535, 1552, 1553, 1576, 1600, 1607, 1641, 1645, 1647, 1649, 1711, 1727, 1776, 1794, 1810, 1864, 1871, 1924, 1934, 2048, 2086, 2120, 2155, 2184, 2217, 2296, 2305, 2306, 2322, 2351, 2374, 2387, 2400, 2402, 2441, 2451, 2461, 2486, 2489, 2511, 2512, 2518, 2546, 2547, 2561, 2572, 2580, 2584, 2589, 2624, 2633, 2675, 2676, 2679, 2699, 2740, 2758, 2768, 2807, 2824, 2833, 2845]
# CoHARE_im_batch4_grad8_miss = [0, 2, 22, 24, 28, 35, 65, 66, 72, 74, 76, 77, 87, 100, 115, 118, 124, 129, 132, 133, 134, 136, 145, 154, 165, 169, 177, 181, 185, 192, 193, 195, 215, 216, 217, 222, 224, 225, 233, 234, 238, 239, 257, 263, 267, 269, 274, 278, 287, 302, 303, 306, 318, 326, 330, 333, 335, 346, 349, 357, 364, 376, 379, 383, 390, 392, 395, 398, 402, 419, 422, 424, 427, 430, 432, 434, 438, 444, 450, 454, 457, 458, 461, 466, 482, 486, 501, 503, 505, 508, 509, 512, 517, 521, 522, 524, 528, 529, 530, 533, 537, 539, 541, 548, 550, 554, 560, 569, 572, 576, 582, 588, 595, 596, 599, 604, 607, 611, 622, 634, 642, 669, 674, 676, 680, 686, 693, 694, 697, 701, 702, 714, 720, 733, 734, 743, 748, 749, 762, 767, 777, 790, 793, 796, 798, 805, 807, 813, 818, 825, 826, 829, 838, 851, 855, 856, 859, 861, 876, 883, 887, 891, 894, 896, 900, 905, 906, 909, 914, 918, 924, 925, 929, 930, 931, 934, 936, 948, 956, 957, 958, 960, 962, 963, 966, 967, 972, 974, 981, 986, 993, 1000, 1011, 1015, 1021, 1026, 1027, 1030, 1047, 1053, 1062, 1063, 1065, 1076, 1084, 1088, 1095, 1099, 1106, 1109, 1114, 1125, 1130, 1132, 1134, 1136, 1137, 1142, 1144, 1150, 1151, 1156, 1161, 1164, 1170, 1171, 1179, 1180, 1184, 1188, 1192, 1194, 1198, 1203, 1207, 1208, 1214, 1216, 1221, 1231, 1249, 1260, 1264, 1282, 1286, 1287, 1289, 1300, 1301, 1302, 1304, 1305, 1308, 1315, 1316, 1319, 1320, 1322, 1324, 1340, 1343, 1356, 1373, 1391, 1394, 1397, 1406, 1409, 1412, 1414, 1416, 1419, 1427, 1428, 1429, 1433, 1452, 1453, 1455, 1461, 1480, 1482, 1485, 1503, 1507, 1511, 1512, 1518, 1521, 1522, 1525, 1528, 1535, 1536, 1539, 1541, 1543, 1551, 1553, 1555, 1567, 1570, 1578, 1580, 1585, 1591, 1594, 1598, 1603, 1617, 1625, 1628, 1633, 1638, 1644, 1645, 1654, 1660, 1664, 1670, 1674, 1681, 1682, 1693, 1695, 1698, 1704, 1709, 1711, 1714, 1725, 1734, 1735, 1736, 1740, 1747, 1755, 1765, 1770, 1774, 1781, 1782, 1783, 1787, 1788, 1792, 1797, 1800, 1804, 1808, 1810, 1811, 1819, 1821, 1822, 1830, 1837, 1838, 1852, 1869, 1871, 1873, 1875, 1878, 1879, 1881, 1887, 1891, 1894, 1900, 1901, 1903, 1906, 1907, 1911, 1914, 1926, 1930, 1942, 1946, 1949, 1960, 1964, 1968, 1983, 1986, 1988, 1991, 2011, 2014, 2015, 2019, 2029, 2031, 2045, 2049, 2054, 2057, 2058, 2066, 2068, 2069, 2072, 2073, 2090, 2108, 2121, 2127, 2139, 2145, 2149, 2152, 2173, 2175, 2176, 2182, 2185, 2195, 2196, 2202, 2206, 2211, 2221, 2224, 2227, 2228, 2233, 2250, 2251, 2273, 2281, 2293, 2299, 2302, 2308, 2309, 2310, 2316, 2320, 2333, 2335, 2337, 2344, 2347, 2350, 2351, 2372, 2378, 2387, 2388, 2393, 2398, 2402, 2407, 2409, 2414, 2421, 2423, 2424, 2429, 2438, 2439, 2441, 2443, 2456, 2467, 2476, 2483, 2485, 2500, 2502, 2504, 2514, 2522, 2532, 2536, 2574, 2577, 2589, 2596, 2600, 2607, 2620, 2628, 2629, 2630, 2631, 2635, 2638, 2643, 2645, 2650, 2651, 2660, 2670, 2672, 2676, 2677, 2690, 2693, 2700, 2705, 2709, 2718, 2724, 2737, 2740, 2746, 2763, 2771, 2772, 2790, 2792, 2803, 2805, 2807, 2814, 2825, 2830, 2836, 2837, 2844, 2848, 2854, 2875, 2877, 2882, 2883, 2884, 2891, 2904, 2918, 2921, 2926, 2944, 2950, 2953, 2957, 2963, 2971, 2972, 2981, 2986, 2989, 2993, 3003, 3007, 3008, 3020, 3021, 3024, 3071, 3075, 3076, 3087, 3091, 3092, 3093, 3097, 3100, 3102, 3108, 3116, 3117, 3121, 3122, 3129, 3137, 3139, 3145, 3147, 3161, 3163, 3164, 3173, 3186, 3187, 3202, 3203, 3204, 3206, 3210, 3212, 3226, 3238, 3239, 3269, 3277, 3278, 3279, 3280, 3287, 3297, 3305, 3313, 3314, 3315, 3318, 3320, 3322, 3325, 3327, 3334, 3337, 3339, 3349, 3352, 3355, 3384, 3387, 3389, 3390, 3394, 3396, 3410, 3414, 3426, 3432, 3435, 3443, 3445, 3446, 3448, 3454, 3455, 3470, 3489, 3499, 3502, 3503, 3508, 3510, 3522, 3523, 3525, 3530, 3551, 3559, 3570, 3572, 3581, 3582, 3589, 3591, 3600, 3606, 3619, 3625, 3629, 3633, 3644, 3652, 3654, 3655, 3667, 3685, 3686, 3690, 3698, 3706, 3721, 3729, 3730, 3747, 3749, 3750, 3756, 3767, 3769, 3774, 3778, 3789, 3793, 3805, 3806, 3808, 3812, 3823, 3824, 3827, 3835, 3842, 3843, 3857, 3864, 3873, 3882, 3884, 3888, 3889, 3891, 3894, 3905, 3911, 3913, 3916, 3922, 3927, 3937, 3938, 3941, 3958, 3962, 3970, 3972, 3974, 3978, 3980, 3982, 3986, 3991, 3992, 3994, 3997, 3998, 4001, 4014, 4016, 4019, 4024, 4027, 4028, 4034, 4045, 4047]


In [218]:
"""Define"""
method1, method1_miss = 'CoHARE_im_batch4_grad8', CoHARE_im_batch4_grad8_miss
method2, method2_miss = 'CTI_im_batch4_grad8', CTI_im_batch4_grad8_miss 
method3, method3_miss = 'C_im_batch4_grad8', C_im_batch4_grad8_miss


file_path = '/home/yumin/hare-hate-speech/outputs/sample_check/'
id_list = pd.read_csv(file_path+f'{method1}.csv')['id']
post_list = pd.read_csv(file_path+f'{method1}.csv')['detected_post']
df = pd.DataFrame({'post_id': id_list, 'post':post_list})

method1_pred, method2_pred, method3_pred = f'{method1}_pred', f'{method2}_pred', f'{method3}_pred'
df[method1] = df['post_id'].apply(lambda x: 'Miss' if x in method1_miss else 'O')
df[method1_pred] = pd.read_csv(file_path+f'{method1}.csv')['pred']
df[method2] = df['post_id'].apply(lambda x: 'Miss' if x in method2_miss else 'O')
df[method2_pred] = pd.read_csv(file_path+f'{method2}.csv')['pred']
df[method3] = df['post_id'].apply(lambda x: 'Miss' if x in method3_miss else 'O')
df[method3_pred] = pd.read_csv(file_path+f'{method3}.csv')['pred']
df.to_csv(file_path+'im_batch4_grad8_comparison.csv', index=False) 

In [214]:
check_column = 'CTI_im_batch4_grad8' 

print(df[check_column].value_counts())
pd.read_csv(file_path+f'{check_column}.csv')['correct'].value_counts()

CTI_im_batch4_grad8
O       3221
Miss     841
Name: count, dtype: int64


correct
True     3221
False     841
Name: count, dtype: int64

### Misprediction Comparison

In [11]:
import pandas as pd 
file_path = '/home/yumin/hare-hate-speech/outputs/sample_check/'
csv_name = 'im_batch4_grad8_comparison.csv'

df = pd.read_csv(file_path + csv_name)
df[df['CTI_im_batch4_grad8_pred'] == df['C_im_batch4_grad8_pred']]   # C, CTI prediction is same...? 

Unnamed: 0,post_id,post,CoHARE_im_batch4_grad8,CoHARE_im_batch4_grad8_pred,CTI_im_batch4_grad8,CTI_im_batch4_grad8_pred,C_im_batch4_grad8,C_im_batch4_grad8_pred
0,0,steve king : obamas #immigration move would tr...,Miss,(B) Not hate. 1.The post mentions Steve King a...,Miss,(B) Not hate.,Miss,(B) Not hate.
1,1,the economist : an insurgent in the white hous...,O,(B) Not hate. 1.Identify the content: The post...,O,(B) Not hate.,O,(B) Not hate.
2,2,i lived there it was fine you are just a scare...,Miss,(B) Not hate.,Miss,(B) Not hate.,Miss,(B) Not hate.
3,3,wait . i thought white people invented harvard...,O,"(B) Not hate. 1.The post starts by stating ""wa...",O,(B) Not hate.,O,(B) Not hate.
4,4,Why do white people have such a pleasant scent...,O,(A) Hate. 1.The post starts by asking a questi...,O,(A) Hate.,O,(A) Hate.
...,...,...,...,...,...,...,...,...
4057,4057,zogbot-silly roper vs brad griffin alt-right y...,O,"(B) Not hate. 1.The post mentions ""zogbot-sill...",O,(B) Not hate.,O,(B) Not hate.
4058,4058,. why ? muslims can't be white ? what race is ...,O,(A) Hate. 1.The post starts by questioning why...,O,(A) Hate.,O,(A) Hate.
4059,4059,why then even post something that assumes whit...,O,(B) Not hate. 1.The post starts by questioning...,O,(B) Not hate.,O,(B) Not hate.
4060,4060,why can't wallace also support trump ?,O,(B) Not hate. 1.The post is asking a question ...,O,(B) Not hate.,O,(B) Not hate.


In [12]:
cohare_o_cti_x = df[(df['CoHARE_im_batch4_grad8']=='O') & (df['CTI_im_batch4_grad8']=='Miss')].reset_index()
filtered_df = cohare_o_cti_x[['post_id','post','CoHARE_im_batch4_grad8_pred','CTI_im_batch4_grad8_pred']]
file_name = 'cohare_o_cti_x_batch4_grad8_121823'
filtered_df.to_csv(f'/home/yumin/hare-hate-speech/outputs/sample_check/{file_name}.csv', index=False)

# for i in range(len(cohare_o_cti_x)):
#     print(cohare_o_cti_x['post_id'][i], '\n',cohare_o_cti_x['post'][i], '\n', cohare_o_cti_x['CoHARE_im_batch4_grad8_pred'][i],'\n')

In [13]:
cohare_x_cti_o = df[(df['CoHARE_im_batch4_grad8']=='Miss') & (df['CTI_im_batch4_grad8']=='O')].reset_index()
filtered_df = cohare_x_cti_o[['post_id','post','CoHARE_im_batch4_grad8_pred','CTI_im_batch4_grad8_pred']]
file_name = 'cohare_x_cti_o_batch4_grad8_121823'
filtered_df.to_csv(f'/home/yumin/hare-hate-speech/outputs/sample_check/{file_name}.csv', index=False)

# for i in range(len(cohare_x_cti_o)):
    # print(cohare_x_cti_o['post_id'][i], '\n',cohare_x_cti_o['post'][i], '\n', cohare_x_cti_o['CoHARE_im_batch4_grad8_pred'][i],'\n')

In [77]:
cohare_x_cti_o['CTI_im_batch4_grad8_pred'].value_counts()
cohare_x_cti_o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   index                        234 non-null    int64 
 1   post_id                      234 non-null    int64 
 2   post                         234 non-null    object
 3   CoHARE_im_batch4_grad8       234 non-null    object
 4   CoHARE_im_batch4_grad8_pred  234 non-null    object
 5   CTI_im_batch4_grad8          234 non-null    object
 6   CTI_im_batch4_grad8_pred     234 non-null    object
 7   C_im_batch4_grad8            234 non-null    object
 8   C_im_batch4_grad8_pred       234 non-null    object
dtypes: int64(2), object(7)
memory usage: 16.6+ KB


In [90]:
df_base = cohare_o_cti_x  # 350 rows 

ih_merged = pd.merge(df_base, ih_ti[['post', 'target','implied_statement']], on='post', how='left')

# Rename the 'target' column if needed
ih_merged = ih_merged.rename(columns={'target': 'target', 
                                      'implied_statement':'implied_statement'})
ih_merged.replace({pd.NA: None}, inplace=True)
print(ih_merged['target'].value_counts())
# print(ih_merged['target'].info())  # 115 non-null 
# print(ih_merged['implied_statement'].value_counts())
# print(ih_merged['implied_statement'].info())  # 115 non-null 
ih_merged.to_csv(f'/home/yumin/hare-hate-speech/outputs/sample_check/cohare_o_cti_x_merged_121823.csv', index=False)


target
minorities             15
black people           13
muslims                13
immigrants             12
white people           11
blacks                  9
jews                    5
germans                 3
non-white people        3
whites                  2
liberals                2
black folks             2
people of color         1
black lives matter      1
democrats               1
meat eaters             1
islanders               1
illegals                1
not specified           1
black people.           1
black panthers          1
illegal people          1
immigrant               1
boy scout leaders       1
white nationalists      1
mexicans                1
leftists                1
black                   1
arabian people          1
progressives            1
jews, iraqis            1
white conservatives     1
white men               1
gay people              1
islamic followers       1
illegal immigrants      1
conservatives           1
Name: count, dtype: int64


In [91]:
df_base = cohare_x_cti_o  # 234 rows 

ih_merged = pd.merge(df_base, ih_ti[['post', 'target','implied_statement']], on='post', how='left')

# Rename the 'target' column if needed
ih_merged = ih_merged.rename(columns={'target': 'target', 
                                      'implied_statement':'implied_statement'})
ih_merged.replace({pd.NA: None}, inplace=True)
print(ih_merged['target'].value_counts())
# print(ih_merged['target'].info())  # 132 non-null 
# print(ih_merged['implied_statement'].value_counts())
# print(ih_merged['implied_statement'].info())  # 132 non-null 
ih_merged.to_csv(f'/home/yumin/hare-hate-speech/outputs/sample_check/cohare_x_cti_o_merged_121823.csv', index=False)


target
minorities                      27
white people                    25
jews                            16
muslims                          9
non-white people                 6
black people                     4
white folks                      4
blacks                           3
progressives                     3
immigrants                       3
liberals                         2
democrats                        2
whites                           2
arabs                            2
conservatives                    2
refugees                         1
white men                        1
progressive people               1
politicians                      1
black and white people           1
neocons                          1
blacks, northerners and jews     1
catholics                        1
southern whites                  1
europeans                        1
jewish people                    1
white antifa members             1
arabians                         1
liberals and 

### Implicit Hate Dataset Loading

In [30]:
from datasets import load_dataset
dataset = load_dataset("SALT-NLP/ImplicitHate", split="train")
dataset



Dataset({
    features: ['post', 'implicit_class', 'extra_implicit_class'],
    num_rows: 6346
})

In [35]:
import pandas as pd
dataset_df = pd.DataFrame(dataset)
# dataset_df['extra_implicit_class'].value_counts()
dataset_df['id'] = range(0, len(dataset_df) )
dataset_df

Unnamed: 0,post,implicit_class,extra_implicit_class,id
0,""" : jewish harvard professor noel ignatiev w...",white_grievance,,0
1,rt : tune in today's jmt for my interview w /...,irony,,1
2,: : hate posters ? you want hate buddy ? read ...,white_grievance,,2
3,: hunt for white women : black racist serial k...,stereotypical,,3
4,: looks like he was a fan of the islamic jew-h...,stereotypical,,4
...,...,...,...,...
6341,"Seeing prostitutes is morally wrong, but being...",irony,,6341
6342,I wonder how many females I raped today I spen...,irony,,6342
6343,Having a criminal record is more attractive to...,stereotypical,,6343
6344,Another advantage of being a female: getting b...,inferiority,,6344


In [36]:
import pandas as pd
import json

# Given DataFrame
df = dataset_df

# Convert DataFrame to dictionary
data_dict = df.to_dict(orient='records')

# Organize the data into the desired structure
json_data = {str(item['id']): {'post': item['post'], 
                               'implicit_class': item['implicit_class'], 
                               'extra_implicit_class': item['extra_implicit_class']}
             for item in data_dict}


filename_to_save = '/home/yumin/hare-hate-speech/data/implicit-hate/implicit_hate.json'
with open(filename_to_save, 'w') as json_file:
    json.dump(json_data, json_file, indent=2)


### Mispredicted Samples Categorizing 
1. `implicit_class` = `Stereotypical` --> CoHARE is hard to predict? 

In [41]:
ih = pd.read_json('/home/yumin/hare-hate-speech/data/implicit-hate/implicit_hate.json', orient='index')
ih

Unnamed: 0,post,implicit_class,extra_implicit_class
0,""" : jewish harvard professor noel ignatiev w...",white_grievance,
1,rt : tune in today's jmt for my interview w /...,irony,
2,: : hate posters ? you want hate buddy ? read ...,white_grievance,
3,: hunt for white women : black racist serial k...,stereotypical,
4,: looks like he was a fan of the islamic jew-h...,stereotypical,
...,...,...,...
6341,"Seeing prostitutes is morally wrong, but being...",irony,
6342,I wonder how many females I raped today I spen...,irony,
6343,Having a criminal record is more attractive to...,stereotypical,
6344,Another advantage of being a female: getting b...,inferiority,


In [20]:
ih['implicit_class'].value_counts()

implicit_class
white_grievance    1538
incitement         1269
stereotypical      1133
inferiority         863
irony               797
threatening         666
other                80
Name: count, dtype: int64

In [21]:
ih['extra_implicit_class'].value_counts()

extra_implicit_class
inferiority        55
white_grievance    34
irony              12
stereotypical      12
incitement         11
threatening         6
Name: count, dtype: int64

In [85]:
ih_ti = pd.read_csv('/home/yumin/hare-hate-speech/data/implicit-hate/ih_original_data/implicit_hate_v1_stg3_merged.csv')
ih_ti['target'] = ih_ti['target'].str.lower()
ih_ti

Unnamed: 0,id,post,target,implied_statement
0,524408721473306625,"rt "" white privilege "" "" is when the gun jam...",minority groups,people in minority groups are all in gangs
1,394530795580174337,if you are white in that state watch yourself !,non-white people,non-white people are dangerous
2,794539075164401666,so the 500 % increase in h1b visas ? yeah i'm ...,immigrants,Immigrants are undesirable
3,901162469992808448,rt antiwhites expose their hatred of white ki...,anti-whites,Anti-whites hate white people
4,911282139706007553,capitalism like communism is an enemy of the...,capitalists,Capitalists don't like white people.
...,...,...,...,...
6354,884073715490861056,your people came to a friendly and highly popu...,asian folks,Asian folks have destroyed the country
6355,603471575833137153,it's as if sexual abuse follows perkins around .,immigrants,Immigrants are sexual offenders
6356,669074062862368768,every india lover must boycott these pigs movies,indian folks,Indian folks are pigs
6357,466220911675179008,ask more rss intellectuals to come on powerful...,indian folks,Indian folks are uneducated


In [27]:
pd.DataFrame(ih_ti['post'].isin(ih['post'])).value_counts()

post 
True     5486
False     873
Name: count, dtype: int64

In [59]:
ih_merged = pd.merge(ih, ih_ti[['post', 'target','implied_statement']], on='post', how='left')

# Rename the 'target' column if needed
ih_merged = ih_merged.rename(columns={'target': 'target', 
                                      'implied_statement':'implied_statement'})
ih_merged

Unnamed: 0,post,implicit_class,extra_implicit_class,target,implied_statement
0,""" : jewish harvard professor noel ignatiev w...",white_grievance,,,
1,rt : tune in today's jmt for my interview w /...,irony,,Iranians,Iranians consider Jews infidels
2,: : hate posters ? you want hate buddy ? read ...,white_grievance,,Muslims,Muslims are hateful people
3,: hunt for white women : black racist serial k...,stereotypical,,Black people,Black people are violent
4,: looks like he was a fan of the islamic jew-h...,stereotypical,,Muslims,Muslims are hateful
...,...,...,...,...,...
6341,"Seeing prostitutes is morally wrong, but being...",irony,,,
6342,I wonder how many females I raped today I spen...,irony,,,
6343,Having a criminal record is more attractive to...,stereotypical,,,
6344,Another advantage of being a female: getting b...,inferiority,,,


In [60]:
ih_merged['target'].value_counts()

target
Minorities                             656
Immigrants                             620
Muslims                                542
white people                           499
Jews                                   395
                                      ... 
Female politicians                       1
Terrorists.                              1
People of color/everyone else            1
Muslims (people who worship Allah).      1
Millennials                              1
Name: count, Length: 575, dtype: int64

In [61]:
df = ih_merged

df['id'] = range(0, len(df) )
df.replace({pd.NA: None}, inplace=True)

df

Unnamed: 0,post,implicit_class,extra_implicit_class,target,implied_statement,id
0,""" : jewish harvard professor noel ignatiev w...",white_grievance,,,,0
1,rt : tune in today's jmt for my interview w /...,irony,,Iranians,Iranians consider Jews infidels,1
2,: : hate posters ? you want hate buddy ? read ...,white_grievance,,Muslims,Muslims are hateful people,2
3,: hunt for white women : black racist serial k...,stereotypical,,Black people,Black people are violent,3
4,: looks like he was a fan of the islamic jew-h...,stereotypical,,Muslims,Muslims are hateful,4
...,...,...,...,...,...,...
6341,"Seeing prostitutes is morally wrong, but being...",irony,,,,6341
6342,I wonder how many females I raped today I spen...,irony,,,,6342
6343,Having a criminal record is more attractive to...,stereotypical,,,,6343
6344,Another advantage of being a female: getting b...,inferiority,,,,6344


In [62]:
import json
# df = ih_merged

# df['id'] = range(0, len(df) )

data_dict = df.to_dict(orient='records')

# Organize the data into the desired structure
json_data = {str(item['id']): {'post': item['post'], 
                               'implicit_class': item['implicit_class'], 
                               'extra_implicit_class': item['extra_implicit_class'],
                               'target': item['target'],
                               'implied_statement': item['implied_statement']}
             for item in data_dict}


filename_to_save = '/home/yumin/hare-hate-speech/data/implicit-hate/implicit_hate_ti.json'
with open(filename_to_save, 'w') as json_file:
    json.dump(json_data, json_file, indent=2)



### SBIC Dataset

In [28]:
import os 
file_path_sbic = '/home/yumin/hare-hate-speech/data/sbic/'
os.listdir(file_path_sbic) 


['sbic_train.jsonl',
 'sbic_train_co_hare.jsonl',
 'sbic_train_fr_hare.json',
 'sbic_test.json',
 'sbic_train.json',
 'sbic_train_co_hare.json',
 'sbic_test.jsonl',
 'sbic_valid.jsonl',
 'sbic_train_fr_hare.jsonl',
 'sbic_valid.json']

In [36]:
pd.read_json(file_path_sbic+'sbic_test.json', orient='index')['implied_statement'].value_counts() 

implied_statement
[]                                                                                                                                                                                                                              2767
["trivializes harm to victims"]                                                                                                                                                                                                   36
["trivializes harm to victims."]                                                                                                                                                                                                   7
["are marginalized for a joke"]                                                                                                                                                                                                    3
["women are bitches", "women are stupid"]                         