In [1]:
import pandas as pd

def remove_ref_tokens(df):
    """Strip ``[REF]`` citation masks from ``sent_no_cit``, keeping a masked copy.

    The sentence as it looked *with* its ``[REF]`` masks is stored in the
    ``sent_cit_masked`` column, and every ``[REF]`` occurrence is then removed
    from ``sent_no_cit``.

    The function is safe to call more than once on the same data: if
    ``sent_cit_masked`` already exists it is only refreshed for rows whose
    ``sent_no_cit`` still contains a ``[REF]`` token, so a previously saved
    masked sentence is never overwritten by its already-stripped version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``sent_no_cit``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-and-reassign convenience.
    """
    ref_token = r"\[REF\]"

    # Rows that still carry at least one mask; missing values count as "no mask".
    still_masked = df["sent_no_cit"].str.contains(ref_token, regex=True, na=False)

    if "sent_cit_masked" not in df.columns:
        # First pass: keep the sentence with its [REF] citation masks.
        df["sent_cit_masked"] = df["sent_no_cit"]
    else:
        # Later pass: only rows that still hold masks have anything new to keep;
        # rows cleaned earlier retain their original masked sentence.
        df.loc[still_masked, "sent_cit_masked"] = df.loc[still_masked, "sent_no_cit"]

    df["sent_no_cit"] = df["sent_no_cit"].str.replace(ref_token, "", regex=True)
    return df

In [3]:
import glob
import os

# Recursively collect every .jsonl file below data/dataset (sub-folders included).
dataset_pattern = os.path.join("data/dataset", "**/*.jsonl")
datasets = glob.glob(dataset_pattern, recursive=True)
print(datasets)

['data/dataset/nontrivial_100.jsonl', 'data/dataset/nontrivial_llm.jsonl', 'data/dataset/nontrivial_10.jsonl', 'data/dataset/nontrivial_checked.jsonl', 'data/dataset/trivial_llm.jsonl', 'data/dataset/nontrivial_filtered.jsonl', 'data/dataset/split/small_train.jsonl', 'data/dataset/split/nontrivial_100.jsonl']


In [6]:
# Preview on a single dataset: load it, strip the [REF] tokens,
# and display the first ten cleaned sentences.
test_df = remove_ref_tokens(
    pd.read_json('data/dataset/nontrivial_100.jsonl', lines=True)
)
test_df["sent_no_cit"].head(10)

0    (), do show unusual, faint features in their s...
1     studied the dust emission at 450 μm in the ga...
2    The momentum per unit mass of stars formed del...
3    An example of the latter is the dormant blue s...
4     (see also  for further developments) compute ...
5    Overall, these results are consistent with the...
6    This mechanism has found favor in the radio-lo...
7    However, the complementary application of usin...
8    A recent survey of transmission spectra (  ) s...
9    It has been posited (  ) that the transition b...
Name: sent_no_cit, dtype: object

In [7]:
# Rewrite every dataset file in place with the [REF] tokens stripped from
# 'sent_no_cit' and the masked sentence kept in 'sent_cit_masked'.
for dataset in datasets:
    df = pd.read_json(dataset, lines=True)

    # Files are overwritten in place, so re-running this cell would otherwise
    # re-process already-cleaned files. Skip any file that already has the
    # masked column and no remaining [REF] token, leaving it untouched on disk.
    already_processed = (
        "sent_cit_masked" in df.columns
        and not df["sent_no_cit"].str.contains(r"\[REF\]", regex=True, na=False).any()
    )
    if already_processed:
        continue

    df = remove_ref_tokens(df)
    df.to_json(dataset, lines=True, orient='records')

In [8]:
# Sanity check: in every dataset file, no row's 'sent_no_cit' may still contain '[REF]'.
for dataset in datasets:
    df = pd.read_json(dataset, lines=True)
    has_ref_token = df['sent_no_cit'].str.contains(r"\[REF\]")
    assert not has_ref_token.any(), f"[REF] found in {dataset}"