In [1]:
from pathlib import Path
import pandas as pd
import torch
from filtering import filter
from filtering import ContrieverScoring, RewardModelScoring


# Both scoring models run on CUDA device 0 and share one Hugging Face cache directory.
hf_cache_dir = '../hf_model_cache'
device = torch.device('cuda', 0)

# Checkpoint directory handed to RewardModelScoring.
reward_checkpoint_dir = './reward_model/checkpoints/rnd_b48_last_do25_checkpoint_01_0006772/'

cs = ContrieverScoring(
    device=device,
    hf_cache_dir=hf_cache_dir,
)
rs = RewardModelScoring(
    chkpt_dir=reward_checkpoint_dir,
    device=device,
    hf_cache_dir=hf_cache_dir,
)

In [2]:
# Collect the source parquet files found one directory level below ./data.
# Files whose name contains '_scored.' are outputs written by augment_data_file
# on an earlier run; they are skipped so a re-run does not score its own results.
# Sorting makes data_files[0] and the processing order reproducible.

source_data_path = Path('./data')
data_files = sorted(
    fn for fn in source_data_path.glob('*/*.parquet')
    if '_scored.' not in fn.name
)

def load_file(fn):
    """Read the text, summary and provenance columns of a parquet file.

    Args:
        fn: Location of the parquet file; converted with ``str`` before reading.

    Returns:
        The data frame produced by ``pd.read_parquet`` using the pyarrow engine.
    """
    wanted_columns = ['text', 'summary', 'provenance']
    return pd.read_parquet(path=str(fn), engine='pyarrow', columns=wanted_columns)


# Smoke test: score the first ten text/summary pairs of the first data file.
df = load_file(data_files[0])
texts = df['text'].head(10).tolist()
summaries = df['summary'].head(10).tolist()

# Contriever returns two score collections (cos, dot); the reward model returns one.
cos_scores, dot_scores = cs.score_multiple(texts, summaries)
reward_scores = rs.score_multiple(texts, summaries)


In [3]:
def token_count(text_list):
    """Return the number of tokens ``rs.tokenizer`` produces for each text.

    The texts are tokenized without padding and without truncation, so each
    count is the length of that text's own ``input_ids`` entry.

    Args:
        text_list: The texts to tokenize, passed to ``rs.tokenizer`` in one call.

    Returns:
        A list with one integer length per entry of ``input_ids``.
    """
    encoded = rs.tokenizer(text_list, padding=False, truncation=False)
    return list(map(len, encoded.input_ids))

# Token counts for the first ten texts of df (the bare last expression is the cell output).
texts = df['text'].head(10).tolist()
summaries = df['summary'].head(10).tolist()

token_count(texts)

Token indices sequence length is longer than the specified maximum sequence length for this model (787 > 512). Running this sequence through the model will result in indexing errors


[787, 579, 971, 267, 488, 670, 1094, 529, 444, 159]

In [4]:
from fragments import Fragments

# Sanity check of Fragments on a tiny hand-written text/summary pair.
f = Fragments(text='test 123', summary='test')
f.density()



1.0

In [8]:
# Adds score columns (not rows) to a data frame of text/summary pairs.
@torch.no_grad()
def augmente_data_frame(df, batch_size=32):
    """Add token-count, Contriever, reward and Fragments columns to ``df``.

    The frame is processed in consecutive slices of ``batch_size`` rows. For
    every slice the ``text`` and ``summary`` columns are passed to
    ``token_count``, ``cs.score_multiple`` and ``rs.score_multiple``; density,
    coverage and compression are computed per row with ``Fragments``.
    Gradient tracking is disabled for the whole call by ``torch.no_grad``.

    Note: the function name keeps its original spelling because other code
    in this notebook calls it under that name.

    Args:
        df: Data frame with ``text`` and ``summary`` columns. It is modified
            in place: eight columns are added (or overwritten).
        batch_size: Number of rows scored per model call. Defaults to 32,
            the value that was previously hard-coded.

    Returns:
        The same ``df`` object, now carrying the columns
        ``t5_text_token_count``, ``t5_summary_token_count``,
        ``contriever_cos``, ``contriever_dot``, ``reward``, ``density``,
        ``compression`` and ``coverage``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        AssertionError: If any score list does not end up with one entry
            per row of ``df``.
    """
    # A zero step makes range() fail and a negative step silently yields no
    # batches, so reject both up front with a clear message.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    num_text_tokens = []
    num_summary_tokens = []
    contriever_score_cos = []
    contriever_score_dot = []
    reward = []
    density = []
    compression = []
    coverage = []

    num_rows = len(df)
    for start in range(0, num_rows, batch_size):
        page = df.iloc[start:start + batch_size]
        texts = page.text.to_list()
        summaries = page.summary.to_list()

        num_text_tokens.extend(token_count(texts))
        num_summary_tokens.extend(token_count(summaries))

        # score_multiple returns the cosine scores first, the dot-product scores second.
        c, d = cs.score_multiple(texts, summaries)
        contriever_score_cos.extend(c.tolist())
        contriever_score_dot.extend(d.tolist())

        r = rs.score_multiple(texts, summaries)
        reward.extend(r.tolist())

        # Fragments statistics are computed one text/summary pair at a time.
        for t, s in zip(texts, summaries):
            f = Fragments(summary=s, text=t)
            density.append(f.density())
            coverage.append(f.coverage())
            compression.append(f.compression())

    # Every list must hold exactly one value per row before it becomes a column.
    score_lists = (num_text_tokens, num_summary_tokens, contriever_score_cos,
                   contriever_score_dot, reward, density, compression, coverage)
    assert all(len(x) == num_rows for x in score_lists), \
        'each score list must contain exactly one entry per data frame row'

    df['t5_text_token_count'] = pd.array(num_text_tokens, dtype="int")
    df['t5_summary_token_count'] = pd.array(num_summary_tokens, dtype="int")
    df['contriever_cos'] = pd.array(contriever_score_cos, dtype="float")
    df['contriever_dot'] = pd.array(contriever_score_dot, dtype="float")
    df['reward'] = pd.array(reward, dtype="float")
    df['density'] = pd.array(density, dtype="float")
    df['compression'] = pd.array(compression, dtype="float")
    df['coverage'] = pd.array(coverage, dtype="float")
    return df

def augment_data_file(fn: Path, compression = 'snappy'):
    """Score one parquet file and write the result next to it.

    The file is read with ``load_file``, extended by ``augmente_data_frame``
    and written to the same directory under a name ending in
    ``_scored.<compression>.parquet``.

    Args:
        fn: Path of the input parquet file (anything ``Path`` accepts).
        compression: Compression passed to ``DataFrame.to_parquet``; it is
            also embedded in the output file name.

    Returns:
        The path of the file that was written.
    """
    fn = Path(fn)

    print(f'reading: {fn}')
    df = load_file(fn)

    df = augmente_data_frame(df)

    old_postfix = '.snappy.parquet'
    new_postfix = f'_scored.{compression}.parquet'
    # Strip only a trailing extension and always append the new postfix.
    # A plain str.replace leaves names without '.snappy.parquet' unchanged,
    # which made the output path equal the input path and overwrote the
    # source file with the reduced column set read by load_file.
    base_name = fn.name
    for extension in (old_postfix, '.parquet'):
        if base_name.endswith(extension):
            base_name = base_name[:-len(extension)]
            break
    out_fn = fn.parent / (base_name + new_postfix)

    print(f'writing part: {out_fn} ({len(df)} rows)')
    df.to_parquet(out_fn, compression=compression, engine="pyarrow", row_group_size=1000)
    return out_fn


# Score every discovered data file; each call writes a '*_scored.*.parquet'
# file into the same directory as its input.
for fn in data_files:
    augment_data_file(fn)


reading: data/cnn_dailymail/cnn_dailymail-3.0.0_test.snappy.parquet


