In [8]:
import pandas as pd
import numpy as np
import os
import json
from unidecode import unidecode
from Levenshtein import ratio as levenshtein_ratio
from typing import List

In [9]:
base_ratings_across_batches_df = pd.read_csv("raw/main_study/ratings_main.csv")
frontier_ratings_across_batches_df = pd.read_csv("raw/frontier_study/ratings_frontier.csv")
all_ratings_across_batches_df = pd.concat([base_ratings_across_batches_df, 
                                           frontier_ratings_across_batches_df], axis=0)

base_highlights_across_batches_df = pd.read_csv("raw/main_study/highlights_main.csv")
frontier_highlights_across_batches_df = pd.read_csv("raw/frontier_study/highlights_frontier.csv")
all_highlights_across_batches_df = pd.concat([base_highlights_across_batches_df, 
                                              frontier_highlights_across_batches_df], axis=0)

base_expr_data_df = pd.read_csv("raw/main_study/expr_data_main.csv")
frontier_expr_data_df = pd.read_csv("raw/frontier_study/expr_data_frontier.csv")
all_expr_data_df = pd.concat([base_expr_data_df, frontier_expr_data_df], axis=0)

base_hlt_scores_df = pd.read_csv("raw/main_study/hlt_scores_main.csv")
base_hlt_scores_df['highlight_annot'] = 1
base_hlt_scores_df['meaningful'] = 1
base_hlt_scores_df['pragmatic'] = 1

frontier_hlt_scores_df = pd.read_csv("raw/frontier_study/hlt_scores_frontier.csv")
frontier_hlt_scores_df['highlight_annot'] = 1
frontier_hlt_scores_df['meaningful'] = 1
frontier_hlt_scores_df['pragmatic'] = 1

all_hlt_scores_df = pd.concat([base_hlt_scores_df, frontier_hlt_scores_df], axis=0)

In [10]:
total_nov_annots = base_ratings_across_batches_df[base_ratings_across_batches_df['novel'] == 1]['novel'].sum()
nov_non_sense_annots = base_ratings_across_batches_df[base_ratings_across_batches_df['novel'] == 1]['meaningful'].sum()
nov_non_prag_anntos = base_ratings_across_batches_df[base_ratings_across_batches_df['novel'] == 1]['pragmatic'].sum()
print(total_nov_annots, nov_non_sense_annots, nov_non_prag_anntos)
print(1-nov_non_sense_annots/total_nov_annots, 1-nov_non_prag_anntos/total_nov_annots)

545 527 517
0.03302752293577982 0.05137614678899083


In [11]:
print("# of ratings: ", base_ratings_across_batches_df.shape[0])
print("# unique expressions: ", base_ratings_across_batches_df.shape)
print("# unique non-pragmatic: ", base_ratings_across_batches_df[base_ratings_across_batches_df['pragmatic'] == 0].shape)
print("# unique novel: ", base_ratings_across_batches_df[base_ratings_across_batches_df['novel'] == 1].shape)
print("# unique non-sensical: ", base_ratings_across_batches_df[base_ratings_across_batches_df['meaningful'] == 0].shape)
print("# of highlights: ", base_highlights_across_batches_df.shape[0])

# of ratings:  7542
# unique expressions:  (7542, 12)
# unique non-pragmatic:  (667, 12)
# unique novel:  (545, 12)
# unique non-sensical:  (252, 12)
# of highlights:  226


In [12]:
print("# of ratings: ", frontier_ratings_across_batches_df.shape[0])
print("# unique expressions: ", frontier_ratings_across_batches_df.shape)
print("# unique non-pragmatic: ", frontier_ratings_across_batches_df[frontier_ratings_across_batches_df['pragmatic'] == 0].shape)
print("# unique novel: ", frontier_ratings_across_batches_df[frontier_ratings_across_batches_df['novel'] == 1].shape)
print("# unique non-sensical: ", frontier_ratings_across_batches_df[frontier_ratings_across_batches_df['meaningful'] == 0].shape)
print("# of highlights: ", frontier_highlights_across_batches_df.shape[0])

# of ratings:  1076
# unique expressions:  (1076, 12)
# unique non-pragmatic:  (55, 12)
# unique novel:  (44, 12)
# unique non-sensical:  (22, 12)
# of highlights:  15


## reg data prep

### functions

In [13]:
def merge_ratings_to_nov_scores(ratings_across_batches_df, expr_data_df):
    """
    Merge expression data with ratings data for regression analysis.
    """
    regr_df = ratings_across_batches_df.merge(expr_data_df,
                                            on=['gen_passage_id', 'expression', 
                                            'batch', 'part'],
                                            how='left'
                                            ,suffixes=('', '_y'))
    print(regr_df.shape)
    assert regr_df['annotator_id'].isna().sum() == 0
    assert regr_df['ppl'].isna().sum() == 0

    cols_to_keep = ['batch', 'gen_passage_id', 'seed_passage_id', 'expression', 
                    'novel', 'meaningful', 'pragmatic', 
                    'annotator_id', 'gen_source',
                    "ppl", "median_smallest", "percent_smallest", 
                    "median_dclm_smallest", "percent_dclm_smallest"]
    regr_df = regr_df[cols_to_keep]
    return regr_df

def merge_nov_scores_to_ratings(ratings_across_batches_df, expr_data_df):
    """
    Merge expression data with ratings data for regression analysis.
    """
    regr_df = expr_data_df.merge(ratings_across_batches_df,
                                            on=['gen_passage_id', 'expression', 
                                            'batch', 'part'],
                                            how='left'
                                            ,suffixes=('', '_y'))
    print(regr_df.shape)
    assert regr_df['ppl'].isna().sum() == 0

    cols_to_keep = ['batch', 'gen_passage_id', 'seed_passage_id', 'expression', 
                    'novel', 'meaningful', 'pragmatic', 
                    'annotator_id', 'gen_source',
                    "ppl", "median_smallest", "percent_smallest", 
                    "median_dclm_smallest", "percent_dclm_smallest"]
    regr_df = regr_df[cols_to_keep]
    return regr_df

def compare_expr_to_hlts(expr: str, hlts: List[str], thresh = 0.9, debug=False):

    if len(hlts) == 0:
        return 0

    if debug: print(f"Expression: {expr}")
    for hl in hlts:
        expr, hl = unidecode(expr), unidecode(hl)
        if expr in hl or hl in expr:
            if debug: print(f"Expression '{expr}' is a subset/superset of the highlight '{hl}'")
            return 1
        elif levenshtein_ratio(expr, hl) >= thresh:
            print(f"Expression '{expr}' is very similar to the highlight '{hl}'")
            return 1
    
    return 0

In [14]:
id_to_annots = {}
id_to_annots.update(
    all_ratings_across_batches_df.groupby('gen_passage_id')['annotator_id'].unique().to_dict()
)
id_to_gen_source = {}
id_to_gen_source.update(
    all_ratings_across_batches_df.groupby('gen_passage_id')['gen_source'].first().to_dict()
)
print(f"Unique ids: {len(id_to_annots)}")
print(f"Unique gen_sources: {len(set(id_to_gen_source.values()))}")

wqrm_df = pd.read_csv("raw/main_study/wqrm_ail_scores.csv")
wqrm_df.head(1)

Unique ids: 110
Unique gen_sources: 5


Unnamed: 0,gen_passage_id,WQRM_score,ai_likelihood
0,18037_olmo1,6.650258,1.0


### raw annotations (pragmatics, meaningful data)

In [42]:
base_front_regr_df = merge_ratings_to_nov_scores(all_ratings_across_batches_df,
                                            all_expr_data_df)
base_front_regr_df = base_front_regr_df.merge(wqrm_df,
                                        on=['gen_passage_id'],
                                        how='left')
assert base_front_regr_df['WQRM_score'].isna().sum() == base_front_regr_df[base_front_regr_df['batch'] == 11].shape[0]  # batch 11 passages are not in the wqrm file
assert base_front_regr_df['ai_likelihood'].isna().sum() == base_front_regr_df[base_front_regr_df['batch'] == 11].shape[0]  # batch 11 passages are not in the wqrm file

cols_to_keep = ['batch', 'gen_passage_id', 'seed_passage_id',  
                'novel', 'meaningful', 'pragmatic', 
                'annotator_id', 'gen_source',
                'ppl', 'WQRM_score', 'ai_likelihood']
print(base_front_regr_df.shape)
base_front_regr_df[cols_to_keep].to_csv("for_linear_models/prehlt_only.csv", index=False)
base_front_regr_df[cols_to_keep + ['expression']].to_csv("for_linear_models/with_exprs/prehlt_only_w_exprs.csv", index=False)

(8618, 33)
(8618, 16)


### incorporate highlights

In [None]:
regr_df = merge_nov_scores_to_ratings(all_ratings_across_batches_df, all_expr_data_df)
print(regr_df.shape)
# regr_df.head()

na_mask = regr_df['annotator_id'].isna()
print("Number of expressions containing annotation: ", regr_df[~na_mask].shape[0])
missing_annotator_df = regr_df[na_mask].copy()
print("# expressions missing annotation: ", missing_annotator_df.shape[0])
print("Expected number of expression in the filled DF: ", missing_annotator_df.shape[0]*3 
      + regr_df[~na_mask].shape[0])

# fill in the missing annotations in the following way:
# for each expression that was not annotated as novel
# we assume it was annotated as non-novel (non-creative) by all annotators 
# (since if it were creative, they would have highlighted it)
# safe to assume non-novel but not non-pragmatic or non-sensical, so we leave those as NaN
expanded_rows = []
for _, row in missing_annotator_df.iterrows():
    annots = id_to_annots[row['gen_passage_id']]
    assert len(annots) != 0
    for a in annots:
        r = row.copy()
        r['annotator_id'] = a
        r['gen_source'] = id_to_gen_source[row['gen_passage_id']]
        r['novel'] = 0
        expanded_rows.append(r)
expanded_df = pd.DataFrame(expanded_rows)
# confirm each expression occurs exactly 3 times in the dataframe
assert expanded_df[expanded_df['batch'] < 11].groupby('gen_passage_id')['expression'].value_counts().value_counts().index == 3
# 4 annotators in batch 11
assert expanded_df[expanded_df['batch'] == 11].groupby('gen_passage_id')['expression'].value_counts().value_counts().index == 4
print("Expanded rows shape: ", expanded_df.shape)

# concatenate to the regression df
filled_regr_df = pd.concat([regr_df[~na_mask], expanded_df], ignore_index=True)
assert filled_regr_df['novel'].isna().sum() == 0    
filled_regr_df = filled_regr_df.sort_values(by=['batch', 'gen_passage_id', 'expression']).reset_index(drop=True)
print("Filled DF shape: ", filled_regr_df.shape)
# filled_regr_df.head(7)

# now, we want to drop any expressions that are sub/supersets or very similar to any highlighted expressions
filled_regr_df['novel_highlight'] = filled_regr_df.apply(lambda row:
        compare_expr_to_hlts(
            row['expression'], 
            # novel higlights from that annotator for that passage
            all_highlights_across_batches_df[
                (all_highlights_across_batches_df['annotator_id'] == row['annotator_id'])
                & (all_highlights_across_batches_df['gen_passage_id'] == row['gen_passage_id'])
            ]['novel_expr'].tolist(),
            thresh=0.9, debug=False
        ), axis=1
    )
print("Number of expressions highlighted: ", all_highlights_across_batches_df.shape[0])
print("Number of non-rated expressions that were part of highlights: ", filled_regr_df['novel_highlight'].sum())
# filled_regr_df[filled_regr_df['novel_highlight'] == 1].head(7)

# now, we will drop all expressions that are parts of novel highlights 
# instead, we will add the novel highlights with their novelty scores directly
# note that we keep any pre-highlighted expression annotations, only drop the assumed annotations
# this risks overcounting some novel expressions, 
# but strictly speaking there is no requirement that subpart of a novel expression is novel
filled_regr_NoNovHlts_df = filled_regr_df[(filled_regr_df['novel_highlight'] == 0)
                                          |
                                          ( (filled_regr_df['novel_highlight'] == 1)
                                           # this is to ensure we do not drop pragmatic annotations 
                                           # even for subparts of creative expressions
                                          & (~filled_regr_df['pragmatic'].isna()) 
                                          )
                                          ].copy()
# drop the novel_highlight column
filled_regr_NoNovHlts_df.drop(columns=['novel_highlight'], inplace=True)

print("Scored highlights shape: ", all_hlt_scores_df.shape)
print("Dims before adding highlights: ", filled_regr_NoNovHlts_df.shape)
filled_regr_replNovHlts_df = pd.concat([filled_regr_NoNovHlts_df, all_hlt_scores_df], ignore_index=True)
assert filled_regr_replNovHlts_df['novel'].isna().sum() == 0
assert (filled_regr_replNovHlts_df[~filled_regr_replNovHlts_df['pragmatic'].isna()].shape[0] 
        == all_ratings_across_batches_df.shape[0] + all_highlights_across_batches_df.shape[0])
print("Dims after adding highlights: ", filled_regr_replNovHlts_df.shape)

assert (regr_df[regr_df['annotator_id'].notna()].shape[0] # number of annotated expressions
 + regr_df[regr_df['batch'] < 11]['annotator_id'].isna().sum() * 3 # 3x missing annotations bc assume 3 annotators rated as non-novel
 + regr_df[regr_df['batch'] == 11]['annotator_id'].isna().sum() * 4 # 4x missing annotations for batch 11
 - filled_regr_df[filled_regr_df['pragmatic'].isna()]['novel_highlight'].sum() # minus the number of expressions that were part of highlights
+ all_hlt_scores_df.shape[0] # plus the number of novel highlights with scores
 ) == filled_regr_replNovHlts_df.shape[0]
# print(filled_regr_replNovHlts_df.drop_duplicates(['gen_passage_id', 'expression']).shape)

filled_regr_replNovHlts_df = filled_regr_replNovHlts_df.merge(wqrm_df,
                                        on=['gen_passage_id'],
                                        how='left')
assert (filled_regr_replNovHlts_df['WQRM_score'].isna().sum() == filled_regr_replNovHlts_df[filled_regr_replNovHlts_df['batch'] == 11].shape[0])  # batch 11 passages are not in the wqrm file
assert (filled_regr_replNovHlts_df['ai_likelihood'].isna().sum() == filled_regr_replNovHlts_df[filled_regr_replNovHlts_df['batch'] == 11].shape[0])  # batch 11 passages are not in the wqrm file

cols_to_keep = ['batch', 'gen_passage_id', 'seed_passage_id', 
                'novel', 'meaningful', 'pragmatic', 
                'annotator_id', 'gen_source',
                'ppl', 'WQRM_score', 'ai_likelihood']
print("Final filled DF shape: ", filled_regr_replNovHlts_df.shape)
filled_regr_replNovHlts_df.head()
filled_regr_replNovHlts_df[cols_to_keep].to_csv("for_linear_models/prehlt_and_hlt.csv", index=False)
filled_regr_replNovHlts_df[cols_to_keep + ['expression']].to_csv("for_linear_models/with_exprs/prehlt_and_hlt_w_exprs.csv", index=False)

(11758, 33)
(11758, 14)
Number of expressions containing annotation:  8618
# expressions missing annotation:  3140
Expected number of expression in the filled DF:  18038
Expanded rows shape:  (9725, 14)
Filled DF shape:  (18343, 14)
Expression 'that she'd have to wait for the snowflakes to come at their own pace' is very similar to the highlight 'she'd have to wait for the snowflakes to come at their own pace.'
Expression 'that she'd hooked up with a sort of invalid' is very similar to the highlight 'she'd hooked up with a sort of invalid.'
Expression 'an aluminum container as big around as a baby's head' is very similar to the highlight ' an aluminum container as big around as a baby's hea'
Expression 'and she found fairly steady work doing sadomasochistic accessories for a private club in the Village' is very similar to the highlight 'fairly steady work doing sadomasochistic accessories for a private club in the Village.'
Expression '" for his love of the game and the smoky haze of t