In [4]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm.auto import tqdm
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# Model id handed to `pipeline(...)` below as the checkpoint to load.
card = "alex-miller/iati-drr-classifier"

# Text-classification pipeline built from `card`. The `device=0` argument is
# intentionally left commented out, so the pipeline's default device is used.
classifier = pipeline(
    task="text-classification",
    model=card,
    # device=0
)

# Smoke test: four near-identical titles that differ only in the fund name,
# plus one explicit climate-adaptation phrase. The bare call on the final line
# makes the notebook display one {'label', 'score'} dict per input text.
texts = [
    "Fiduciary Risk Assessment of the Monetary Fund",
    "Fiduciary Risk Assessment of the ICF Fund",
    "Fiduciary Risk Assessment of the Green Fund",
    "Fiduciary Risk Assessment of the Adaptation Fund",
    "Climate adaptation as a principal objective",
]
classifier(texts)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'label': '0', 'score': 0.5523359775543213},
 {'label': '1', 'score': 0.5190401077270508},
 {'label': '1', 'score': 0.5831199288368225},
 {'label': '1', 'score': 0.7872835993766785},
 {'label': '1', 'score': 0.8095257878303528}]

In [5]:

def split_into_substrings(text, max_words=200):
    """Break ``text`` into consecutive chunks of at most ``max_words`` words.

    The text is tokenised with ``str.split()`` (any run of whitespace is a
    separator), and each chunk is re-joined with single spaces, so the
    original whitespace is not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk (an integer). Values
            below 1 behave like 1, i.e. every word becomes its own chunk.

    Returns:
        A list of chunk strings in their original order; an empty list when
        ``text`` contains no words.
    """
    tokens = text.split()
    # A chunk always holds at least one word, whatever max_words says.
    chunk_len = max(max_words, 1)
    return [
        ' '.join(tokens[start:start + chunk_len])
        for start in range(0, len(tokens), chunk_len)
    ]


# Score a random sample of IATI activity texts with `classifier` and write the
# predictions (without the text itself) to a CSV file.
#
# Steps: load -> filter -> de-duplicate on `text` -> keep three columns ->
# shuffle and sample -> run the classifier -> attach predictions -> save.
#
# NOTE(review): the ValueError recorded in this cell's saved output mentions a
# dataset column `label`, which nothing in this cell looks up. It is presumably
# stale output from an earlier version of the cell; re-run to confirm.

SAMPLE_SIZE = 10000                         # maximum number of rows to score
SHUFFLE_SEED = 1337                         # fixed seed so the sample is reproducible
KEEP_COLUMNS = ["text", "reporting_org_ref", "iati_identifier"]
OUTPUT_CSV = 'output/drr_for_contrast.csv'

iati = load_dataset("alex-miller/iati-policy-markers", split="train")


def _keep_for_contrast(example):
    """Return True for rows with truthy `drr`, non-empty `text` and `drr_sig == 0`."""
    text = example["text"]
    return (
        bool(example["drr"])
        and text is not None
        and text != ""
        and example["drr_sig"] == 0
    )


# One pass over the dataset instead of three chained filters; the predicate is
# the conjunction of the three original conditions, so the same rows survive.
drr_relevant_iati = iati.filter(_keep_for_contrast)

# De-duplicate
# `to_pandas()` converts the Arrow table directly rather than iterating the
# dataset row by row as `pd.DataFrame(dataset)` does.
df = drr_relevant_iati.to_pandas()
print(df.shape)
df = df.drop_duplicates(subset=['text'])
print(df.shape)
drr_relevant_iati = Dataset.from_pandas(df, preserve_index=False)

# Drop every column except KEEP_COLUMNS. Fail loudly if an expected column is
# absent, as the original `list.remove` calls did.
missing_cols = [col for col in KEEP_COLUMNS if col not in drr_relevant_iati.column_names]
if missing_cols:
    raise ValueError(f"Expected columns missing from dataset: {missing_cols}")
cols_to_remove = [col for col in drr_relevant_iati.column_names if col not in KEEP_COLUMNS]
drr_relevant_iati = drr_relevant_iati.remove_columns(cols_to_remove)

# Reproducible random sample. The size is clamped so that `select` does not
# fail when fewer than SAMPLE_SIZE rows are left after filtering/de-duplication.
drr_relevant_iati = drr_relevant_iati.shuffle(seed=SHUFFLE_SEED)
sample_size = min(SAMPLE_SIZE, len(drr_relevant_iati))
drr_relevant_iati = drr_relevant_iati.select(range(sample_size))

# Extra keyword arguments forwarded through the pipeline call; intended to cap
# long inputs via truncation (assumes the pipeline passes them to its tokenizer
# -- TODO confirm against the installed transformers version).
tokenizer_kwargs = {'truncation':True, 'max_length':512}

# The pipeline yields one result per input when given a dataset; collect them
# in order while tqdm reports progress.
pred_out = list(
    tqdm(
        classifier(KeyDataset(drr_relevant_iati, "text"), batch_size=8, **tokenizer_kwargs),
        total=len(drr_relevant_iati),
    )
)

# Attach predictions row-for-row, then drop the raw text from the export.
drr_relevant_iati = drr_relevant_iati.add_column("predicted_label", [out["label"] for out in pred_out])
drr_relevant_iati = drr_relevant_iati.add_column("predicted_score", [out["score"] for out in pred_out])
drr_relevant_iati = drr_relevant_iati.remove_columns(["text"])

df = drr_relevant_iati.to_pandas()
# Create the output directory up front so a missing folder cannot discard the
# results of the inference run above.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)


(262711, 28)
(202177, 28)


ValueError: Column (label) not in table columns (['iati_identifier', 'reporting_org_ref', 'text']).