In [62]:
from datasets import load_dataset
from transformers import pipeline
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers.pipelines.pt_utils import KeyDataset

card = "alex-miller/iati-climate-classifier"
classifier = pipeline("text-classification", model=card,
    device=0
)

texts = [
    "Fiduciary Risk Assessment of the Monetary Fund",
    "Fiduciary Risk Assessment of the ICF Fund",
    "Fiduciary Risk Assessment of the Green Fund",
    "Fiduciary Risk Assessment of the Adaptation Fund",
    "Climate adaptation as a principal objective"
]
classifier(texts)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'label': '0.0', 'score': 0.6474107503890991},
 {'label': '1.0', 'score': 0.7570636868476868},
 {'label': '1.0', 'score': 0.9502613544464111},
 {'label': '1.0', 'score': 0.9701371192932129},
 {'label': '1.0', 'score': 0.9586134552955627}]

In [63]:

def split_into_substrings(text, max_words=200):
    words = text.split()
    substrings = []
    current_substring = []
    for word in words:
        current_substring.append(word)
        if len(current_substring) >= max_words:
            substrings.append(' '.join(current_substring))
            current_substring = []
    if current_substring:
        substrings.append(' '.join(current_substring))
    return substrings


def process_iati(examples):
  examples["label"] = np.maximum(examples["climate_adaptation_sig"], examples["climate_mitigation_sig"])
  examples["label"] = np.minimum(examples["label"], [1] * len(examples["label"])) # Set all 2/3/4s to 1
  examples["text"] = [split_into_substrings(text) for text in examples["text"]]
  return examples


iati = load_dataset("alex-miller/iati-policy-markers", split="train")
climate_relevant_iati = iati.filter(lambda example: example["text"] != "" and example["text"] is not None)
climate_relevant_iati = climate_relevant_iati.map(
    process_iati,
    batched=True,
    num_proc=8
  )
cols_to_remove = climate_relevant_iati.column_names
cols_to_remove.remove("text")
cols_to_remove.remove("label")
cols_to_remove.remove("reporting_org_ref")
cols_to_remove.remove("iati_identifier")
climate_relevant_iati = climate_relevant_iati.remove_columns(cols_to_remove).class_encode_column('label')

flat_climate_relevant_iati = climate_relevant_iati.with_format(
    "pandas"
).map(
    lambda df: df.explode("text"), batched=True
)
flat_climate_relevant_iati.reset_format()

tokenizer_kwargs = {'truncation':True, 'max_length':512}
pred_out = []
for out in tqdm(classifier(KeyDataset(flat_climate_relevant_iati, "text"), batch_size=8, **tokenizer_kwargs), total=len(climate_relevant_iati)):
    pred_out += [out]

flat_climate_relevant_iati = flat_climate_relevant_iati.add_column("predicted_label", [out["label"] for out in pred_out])
flat_climate_relevant_iati = flat_climate_relevant_iati.add_column("predicted_score", [out["score"] for out in pred_out])
flat_climate_relevant_iati = flat_climate_relevant_iati.remove_columns(["text"])

df = pd.DataFrame(flat_climate_relevant_iati)
df.to_csv('predicted_climate_labels.csv', index=False)


  0%|          | 0/862314 [00:00<?, ?it/s]