<a href="https://colab.research.google.com/github/aaalexlit/omdena_climate_change_challenge_notebooks/blob/main/Triplets_from_Climate_FEVER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install datasets

In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
# Fetch the 'test' split of the Climate-FEVER dataset.
ds_orig = load_dataset(path="climate_fever", split="test")
# Work with the data as a pandas DataFrame from here on.
df = ds_orig.to_pandas()



In [4]:
# Give every entry of the 'evidences' column its own row. Resetting to a fresh
# positional index lets the columns built below line up row-by-row with `exploded`.
exploded = df.explode('evidences').reset_index(drop=True)
# Expand each evidence record into separate columns, place them in front of the
# remaining columns, and drop the nested 'evidences' column that is now redundant.
df_final = pd.concat(
    [pd.json_normalize(exploded['evidences']), exploded],
    axis=1,
).drop(columns='evidences')

In [9]:
# Keep only disputed claims (claim_label == 3), i.e. claims that have at least one
# definite SUPPORTS and one definite REFUTES evidence, and drop the
# not-enough-info evidences (evidence_label == 2).
disputed_claims = df_final.query("claim_label == 3 and evidence_label != 2")[['evidence_id', 'evidence_label', 'evidence', 'claim_id', 'claim']]
# Self-merge on claim_id to pair every evidence of a claim with every evidence of
# the same claim. Keeping only pairs with evidence_label_x < evidence_label_y puts
# the supporting evidence on the left (_x) and the refuting one on the right (_y).
# The chain builds the final frame in one expression instead of renaming in place.
triplets_df = (
    disputed_claims
    .merge(disputed_claims, on="claim_id")
    .query("evidence_label_x < evidence_label_y")
    .loc[:, ["claim_id", "claim_x", "evidence_x", "evidence_y"]]
    .rename(columns={"claim_x": "claim", "evidence_x": "supports", "evidence_y": "refutes"})
)
triplets_df.head(10)

Unnamed: 0,claim_id,claim,supports,refutes
4,55,"[T]he raw data, the actual thermometer data[.....",He said there had probably been no global warm...,The reconstruction found significant variabili...
6,55,"[T]he raw data, the actual thermometer data[.....",He said there had probably been no global warm...,"In fact, if one ignores the unusual El Nino ye..."
7,55,"[T]he raw data, the actual thermometer data[.....",He said there had probably been no global warm...,That's the data we've had for the past 150 yea...
17,60,"So that means that probably about half, maybe ...","In 2018, Michaels asserted on Fox News, ""proba...",Most of the climatic warming over the last 50 ...
18,60,"So that means that probably about half, maybe ...","In 2018, Michaels asserted on Fox News, ""proba...",Human-caused increases in greenhouse gases are...
19,60,"So that means that probably about half, maybe ...","In 2018, Michaels asserted on Fox News, ""proba...",The Intergovernmental Panel on Climate Change ...
38,65,A windmill could spin until it falls apart and...,The spread of tower mills came with a growing ...,The energy consumed to manufacture and transpo...
39,65,A windmill could spin until it falls apart and...,The spread of tower mills came with a growing ...,The energy harvested from the turbine will off...
47,85,Sea-level rise does not seem to depend on ocea...,Because different climate models have slightly...,This depth depends on (among other things) tem...
48,85,Sea-level rise does not seem to depend on ocea...,Because different climate models have slightly...,This acceleration is due mostly to human-cause...


In [10]:
def to_nli_format(record: dict) -> tuple:
  """Convert one triplet record into a ``(claim, evidences)`` pair.

  Args:
    record: mapping with the keys "claim", "supports" and "refutes".

  Returns:
    A 2-tuple whose first item is the claim and whose second item is a dict
    mapping "entailment" to the supporting evidence and "contradiction" to
    the refuting evidence.

  Raises:
    KeyError: if any of the three keys is missing from ``record``.
  """
  claim_text = record['claim']
  labelled_evidence = dict(entailment=record["supports"], contradiction=record["refutes"])
  return claim_text, labelled_evidence

In [11]:
# Convert every row of triplets_df (as a plain dict) into an NLI-style tuple.
nli_formatted_climate_fever = [
    to_nli_format(row) for row in triplets_df.to_dict("records")
]

In [12]:
# Preview only the first few examples; displaying the whole list floods the
# cell output and bloats the saved notebook.
nli_formatted_climate_fever[:3]

[('[T]he raw data, the actual thermometer data[...] shows that the US has been cooling for 80 to 90 years.',
  {'entailment': 'He said there had probably been no global warming since the 1940s, and "Satellite data show no appreciable warming of the global atmosphere since 1979.',
   'contradiction': 'The reconstruction found significant variability around a long-term cooling trend of −0.02\xa0°C per century, as expected from orbital forcing, interrupted in the 20th century by rapid warming which stood out from the whole period, with the 1990s "the warmest decade, and 1998 the warmest year, at moderately high levels of confidence".'}),
 ('[T]he raw data, the actual thermometer data[...] shows that the US has been cooling for 80 to 90 years.',
  {'entailment': 'He said there had probably been no global warming since the 1940s, and "Satellite data show no appreciable warming of the global atmosphere since 1979.',
   'contradiction': 'In fact, if one ignores the unusual El Nino year of 199