In [None]:
import pandas as pd

In [None]:
# Input file: read further down with pd.read_json(..., lines=True), i.e. one JSON record per line.
data_path = '../data/train.jsonl'
# Output file: destination of the to_csv() call at the end of the notebook.
single_source_path = '../data/single-source-all.csv'

In [None]:
# Read the JSON Lines file into a single dataframe, one row per line.
df = pd.read_json(data_path, lines=True)

# Partition the rows on the value of the 'verifiable' column.
df_verifiable = df.loc[df['verifiable'].eq('VERIFIABLE')]
df_unverifiable = df.loc[df['verifiable'].eq('NOT VERIFIABLE')]

# Sizes of the two partitions before any further filtering.
n_verifiable, n_unverifiable = len(df_verifiable), len(df_unverifiable)

In [None]:
def get_evidence_article_set(evidence):
    '''
    Given an evidence value in the format used in the FEVER dataset, return the set of evidence article names.

    The evidence value is iterated as a sequence of alternatives, each of which is a sequence of evidence parts.
    The article name is read from index 2 of every part. An evidence value without any parts yields an empty set.
    '''
    return {
        evidence_part[2]
        for evidence_alternative in evidence
        for evidence_part in evidence_alternative
    }

def get_single_evidence_article(evidence):
    '''
    Given an evidence value in the format used in the FEVER dataset, return the evidence article name if there is only one article. Else, return None.

    "Only one article" means that all evidence parts across all alternatives name the same article.
    None is returned both when several different articles are named and when the evidence has no parts at all.
    '''
    evidence_article_set = get_evidence_article_set(evidence)
    if len(evidence_article_set) != 1:
        return None
    # The set holds exactly one element here, so the unpacking cannot fail.
    (single_article,) = evidence_article_set
    return single_article

In [None]:
# Add a column for the single evidence article name if there is only one, else None.
# df_verifiable is a boolean-filtered selection of df; writing a column into it directly
# triggers pandas' SettingWithCopyWarning. assign() returns a new dataframe instead,
# and the name is rebound so later cells still see the added column.
df_verifiable = df_verifiable.assign(
    evidence_article=df_verifiable['evidence'].apply(get_single_evidence_article)
)

# Filter out the rows where there is no single evidence article.
df_verifiable_single = df_verifiable[df_verifiable['evidence_article'].notna()]

# Keep the ratio of verifiable to unverifiable the same:
# Compute the fraction of verifiable rows that survived the filter and keep the same
# fraction of the unverifiable rows (fixed seed so the sample is reproducible).
n_verifiable_single = len(df_verifiable_single)
reduction_factor = n_verifiable_single / n_verifiable
reduced_unverifiable = df_unverifiable.sample(frac=reduction_factor, random_state=1)

# Combine the verifiable and unverifiable dataframes.
# sample(frac=1) returns every row in shuffled order, so the two groups are interleaved.
df_single_source = pd.concat([df_verifiable_single, reduced_unverifiable]).sample(frac=1, random_state=1)


In [None]:
def compress_evidence(evidence):
    '''
    Compress the evidence: Preserve structure but discard annotation ids and article names.

    Every evidence part is replaced by the value at its index 3; the grouping of parts into
    alternatives is kept, so the result is a list of lists with the same nesting as the input.
    '''
    return [
        [evidence_part[3] for evidence_part in evidence_alternative]
        for evidence_alternative in evidence
    ]


In [None]:
# Add a column for the compressed evidence.
# compress_evidence is applied to each row's 'evidence' value; the original
# 'evidence' column is left in place next to the new one.
df_single_source['compressed_evidence'] = df_single_source['evidence'].apply(compress_evidence)

In [None]:
# Write the single source data to a csv file.
# index=False leaves the dataframe index out of the file.
# NOTE(review): 'compressed_evidence' holds Python lists (and 'evidence' presumably does too),
# which CSV stores as their string representation; readers presumably have to parse them back - confirm.
df_single_source.to_csv(single_source_path, index=False)

In [35]:
# Write the single source data to a csv file.
# Same statement as the preceding cell; running it again overwrites the same output path.
df_single_source.to_csv(single_source_path, index=False)