In [5]:
import json
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing import preprocess_string
# preprocess the data
import re
from nltk.stem import WordNetLemmatizer
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

In [6]:
# Load the data
with open('data/train-claims.json') as f:
    train_data = json.load(f)

with open('data/dev-claims.json') as f:
    dev_data = json.load(f)

with open('data/test-claims-unlabelled.json') as f:
    test_data = json.load(f)

with open('data/evidence.json') as f:
    evidence_data = json.load(f)

evidences = list(evidence_data.values())

In [6]:
# function to get the evidence for a claim
def get_evidence(evidence_id):
    return evidence_data[evidence_id]

# function to get the claim text
def get_claim_text(claim_id):
    return train_data[claim_id]['claim_text']

# function to get the label for a claim
def get_label(claim_id):
    return train_data[claim_id]['claim_label']

# function to create a dataframe with the claim, evidence and label
def create_dataframe(data):
    rows = []
    for claim_id in data:
        for evidence_id in train_data[claim_id]['evidences']:
            row = {
                'claim_text': get_claim_text(claim_id),
                'evidence_text': get_evidence(evidence_id=evidence_id),
                'label': get_label(claim_id)
            }
            rows.append(row)
    df = pd.concat([pd.DataFrame([row]) for row in rows], ignore_index=True)
    return df


# create the dataframes
train_df = create_dataframe(train_data)
#dev_df = create_dataframe(dev_data)

#print entire evidence for a claim of the train_df
train_df


Unnamed: 0,claim_text,evidence_text,label
0,Not only is there no scientific evidence that ...,At very high concentrations (100 times atmosph...,DISPUTED
1,Not only is there no scientific evidence that ...,Plants can grow as much as 50 percent faster i...,DISPUTED
2,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,DISPUTED
3,El Niño drove record highs in global temperatu...,While ‘climate change’ can be due to natural f...,REFUTES
4,El Niño drove record highs in global temperatu...,This acceleration is due mostly to human-cause...,REFUTES
...,...,...,...
4117,But abnormal temperature spikes in February an...,The coastline sees significantly mild temperat...,NOT_ENOUGH_INFO
4118,Sending oscillating microwaves from an antenna...,"Dielectric heating, also known as electronic h...",SUPPORTS
4119,Sending oscillating microwaves from an antenna...,An example is absorption or emission of radio ...,SUPPORTS
4120,Sending oscillating microwaves from an antenna...,"Water, fat, and other substances in the food a...",SUPPORTS


**TFIDF**

In [7]:
lemma = WordNetLemmatizer()

# function to preprocess the text
def preprocess_text(text):
    text = text.lower()
    # remove stopwords
    text = ' '.join([lemma.lemmatize(word) for word in text.split() if word not in stopwords.words('english')])
    return text

In [8]:
#use tfidf to find the most important evidence for a claim

claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

vectorizer = TfidfVectorizer()


stop_words = set(stopwords.words('english'))
preprocessed_corpus = [' '.join([lemma.lemmatize(word.lower()) for word in sentence.split() if word not in stop_words]) for key, sentence in evidence_data.items()]
#preprocessed_corpus = preprocess_documents(evidences)

# get the most important evidence for a claim
#get_most_important_evidence(claim, evidence_data)

print(len(preprocessed_corpus))



1208827


In [9]:
vectors = vectorizer.fit_transform(preprocessed_corpus)

In [10]:
def get_relevant_evidence_tfidf(claim, vectors, evidences, top_n=7):
    # Process the claim
    processed_claim = preprocess_text(claim)

    # Get the tfidf vector for the claim
    claim_vector = vectorizer.transform([processed_claim])

    # Get the cosine similarity between the claim and the evidence

    cosine_similarity(claim_vector, vectors)

    # Get the top 5 most similar evidence
    top_n_evidences = cosine_similarity(claim_vector, vectors).argsort()[0][-top_n:]
    # Get the top 5 most similar evidence unprocessed
 
    filtered_items = [item for i, item in enumerate(evidence_data.items()) if i in top_n_evidences]
    most_similar_evidence_keys, most_similar_evidence = zip(*filtered_items)

    most_similar_evidence_keys = list(most_similar_evidence_keys)
    most_similar_evidence = list(most_similar_evidence)
    return most_similar_evidence, most_similar_evidence_keys

get_relevant_evidence_tfidf(claim, vectors, evidence_data)


(['Higher atmospheric CO2 concentrations have led to an increase in dissolved CO2, which causes ocean acidification.',
  'There is also a close correlation between CO2 and temperature, where CO2 has a strong control over global temperatures in Earth history.',
  'The concentration of CO2 in the flue gas is an important key to determine the efficiency of CO2 capture technology.',
  'It is expected that most ecosystems will be affected by higher atmospheric CO2 levels and higher global temperatures.',
  'CO2).',
  "The increased radiative forcing due to increased CO2 in the Earth's atmosphere is based on the physical properties of CO2 and the non-saturated absorption windows where CO2 absorbs outgoing long-wave energy.",
  'Coal, being mostly carbon, emits a lot of CO2 when burnt: it has a high CO2 emission intensity.'],
 ['evidence-66273',
  'evidence-98914',
  'evidence-256886',
  'evidence-364767',
  'evidence-668884',
  'evidence-985452',
  'evidence-1162945'])

**DOC2VEC**

In [None]:


#processed_corpus = [doc.split(" ") for doc in preprocessed_corpus]
#tokenised_corpus = preprocess_documents(evidences)
# Train Doc2Vec model on preprocessed corpus
#documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenised_corpus)]



In [None]:
model = Doc2Vec(documents, vector_size=45, window=2, min_count=1, workers=8)
model.save("models/doc2vec.model")


In [None]:

#  Get the most similar evidence document to a claim
def get_most_similar_evidence(claim, evidence_data, model, top_n=10):
    # Preprocess claim
    processed_claim = preprocess_string(claim)

    # Get claim vector
    claim_vector = model.infer_vector(processed_claim)
    
    # similar docs
    similarity = model.dv.most_similar(claim_vector, topn=top_n)
    print(similarity)
    #top 5 indices
    top_n_indices = [index for index, score in similarity]

    # top 5 evidences
    top_n_evidences = [evidence_data[index] for index in top_n_indices]
    return  top_n_indices, top_n_evidences

model = Doc2Vec.load("models/doc2vec.model")
claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
top_n_indices, top_n_evidences = get_most_similar_evidence(claim, evidences, model, top_n=10)

print("Claim: ", claim)
print("Top 10 results:")
for i, evidence in enumerate(top_n_evidences):
    print(f'{i+1}. {evidence}')


**BM25**

**Preprocess data and get BM25 vectors for evidence corpus**

In [11]:
from rank_bm25 import BM25Okapi

tokenised_corpus = preprocess_documents(evidences)
bm25 = BM25Okapi(tokenised_corpus)

In [12]:
def get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n=7):
    """ """
    processed_claim = preprocess_string(claim)
  
    # Get BM25 scores for each evidence sentence
    bm25_scores = bm25.get_scores(processed_claim)
    # Get top 5 most similar evidence sentences
    top_n_indices = bm25_scores.argsort()[::-1][:top_n]
    filtered_items = [item for i, item in enumerate(evidence_data.items()) if i in top_n_indices]
    most_similar_evidence_keys, most_similar_evidence = zip(*filtered_items)

    most_similar_evidence_keys = list(most_similar_evidence_keys)
    most_similar_evidence = list(most_similar_evidence)
    #best_evidence = bm25.get_top_n(processed_claim, evidences, n=top_n)
    # Print results
    #print(best_evidence)
    # print("------BM25 Evidence-----")
    # for i, evidence in enumerate(most_similar_evidence):
    #     print(f'{i+1}. {evidence}')
    return most_similar_evidence, most_similar_evidence_keys

get_relevant_evidence_bm25(claim=claim, evidence_data=evidence_data, bm25=bm25)

(['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.',
  'This scientific evidence comes from many sources but is presented in detail in the Millennium Ecosystem Assessment and the planetary boundaries framework.',
  'The concentration of pollutants in the flue gas is higher, making separation easier.',
  'The whitebark pine ecosystem in these high elevations plays many essential roles, providing support to plant and animal life.',
  'Diverse forest plant life exists in the park and the area supports several wild animal species.',
  'They study the various animals and plants that live within an ecosystem and the relationship between the two.',
  'He helped to organize cultural life in Theresienstadt concentration camp.'],
 ['evidence-442946',
  'evidence-528208',
  '

**BERT ATTEMPT**

In [None]:
query = "What is the capital of France?"
passages = [
    "Paris is the capital of France.",
    "The Eiffel Tower is located in Paris.",
    "France is famous for its wine and cheese.",
    "The Louvre is a famous museum in Paris.",
    "The French Revolution started in 1789.",
]




In [129]:
from sentence_transformers import SentenceTransformer, util
import torch
from sentence_transformers.util import semantic_search
#query = "How many people live in London?"
#docs = ["Around 9 Million people live in London", "London is known for its financial district"]
mps_device = torch.device("mps")
#Load the model
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b',device=mps_device)
#model = SentenceTransformer('bounedjarr/sgpt-finetuned-natcat',device=mps_device)
# model = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-f")

#Encode query and documents
query_emb = model.encode(claim)
doc_emb = model.encode(most_similar_evidence)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(most_similar_evidence, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)

# results = semantic_search(query_emb, doc_emb, top_k=5) 
# for result in results[0]:
#     print(result)
#     evidence_id = result['corpus_id']
#     sim_score = result['score']
#     print(most_similar_evidence[evidence_id])

96.55245208740234 CO2).
95.90694427490234 It is expected that most ecosystems will be affected by higher atmospheric CO2 levels and higher global temperatures.
93.92660522460938 Higher atmospheric CO2 concentrations have led to an increase in dissolved CO2, which causes ocean acidification.
93.88444519042969 There is also a close correlation between CO2 and temperature, where CO2 has a strong control over global temperatures in Earth history.
92.76532745361328 Coal, being mostly carbon, emits a lot of CO2 when burnt: it has a high CO2 emission intensity.
91.64056396484375 The concentration of CO2 in the flue gas is an important key to determine the efficiency of CO2 capture technology.
91.06651306152344 The increased radiative forcing due to increased CO2 in the Earth's atmosphere is based on the physical properties of CO2 and the non-saturated absorption windows where CO2 absorbs outgoing long-wave energy.


In [None]:
{'corpus_id': 12, 'score': 0.9166639447212219}
Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
{'corpus_id': 5, 'score': 0.9083583354949951}
At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
{'corpus_id': 1, 'score': 0.8610204458236694}
The concentration of pollutants in the flue gas is higher, making separation easier.
{'corpus_id': 16, 'score': 0.7930693030357361}
This, along with higher temperatures, would mean a higher equilibrium concentration of in the air.
{'corpus_id': 15, 'score': 0.7820011973381042}
Sustainable agriculture is the cultivation of plant and animal materials in a manner that preserves plant and animal ecosystems and that can improve soil health and soil fertility over the long term.

**Saved embedding of evidence document**

In [None]:
# # Define batch size
# batch_size = 1000

# # Encode documents in batches
# doc_emb = []
# for i in range(0, len(evidences), batch_size):
#   start_time = time.time()
#   batch = evidences[i:i+batch_size]
#   batch_emb = model.encode(batch)
#   doc_emb.append(batch_emb)
#   print(f"Batch {i} processed time taken {time.time()- start_time}")
# doc_emb = np.concatenate(doc_emb, axis=0)
# # Save document embeddings as a NumPy array
# np.save('doc_emb.npy', doc_emb)

In [None]:
query_emb = model.encode(claim)
#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(evidences, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

threshold = 9
#Output passages & scores
for i, (doc, score) in enumerate(doc_score_pairs):
  if i > 10:
    break
  print(score, doc)

In [8]:
train_df.iloc[0]['evidence_text']
temp_evidences= ['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 
'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.', 
'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.']
print(temp_evidences)

['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.', 'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.']


In [136]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
mps_device = torch.device("mps")
model = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-f")
#model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
#model = AutoModelForSequenceClassification.from_pretrained("bounedjarr/sgpt-finetuned-natcat")

#model.to(mps_device)
tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")
#tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
#tokenizer = AutoTokenizer.from_pretrained("bounedjarr/sgpt-finetuned-natcat")

claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
evidence = "Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients."

inputs = tokenizer(claim, evidence, padding='max_length', truncation=True, return_tensors="pt", max_length=512)
#inputs.to(mps_device)
model.eval()
with torch.no_grad():
    scores = model(**inputs).logits
    print(scores)
    label_mapping = ['supports','refutes','not enough info','disputed']
    labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
    print(labels)


Some weights of the model checkpoint at climatebert/distilroberta-base-climate-f were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out

tensor([[ 0.1947, -0.2067]])
['supports']


**Prediction Process**

In [33]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def majority_vote_prediction(claim, relevant_evidences,model_name="amandakonet/climatebert-fact-checking"):
    #mps_device = torch.device("mps")
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    #model.to(mps_device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    label_mapping = ['supports','refutes','not_enough_info','disputed']
    predictions = []

    model.eval()
    with torch.no_grad():
        for evidence in relevant_evidences:
            inputs = tokenizer(claim, evidence, padding='max_length', truncation=True, return_tensors="pt", max_length=512)
            #inputs.to(mps_device)
            scores = model(**inputs).logits
            label = label_mapping[scores.argmax(dim=1).item()]
            predictions.append(label.upper())

    majority_vote = max(set(predictions), key=predictions.count)
    #print(claim, majority_vote)
    return majority_vote

# Example usage:
claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
evidences = ['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 
'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.', 
'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.']
result = majority_vote_prediction(claim, evidences)
print(result)


REFUTES


In [34]:
def get_evidence_make_prediction(claim_data, evidence_data, vectors, bm25, top_n, mode):
    results = {}

    for claim_id in claim_data:
        claim = claim_data[claim_id]['claim_text']
        if mode == "tfidf":
            relevant_evidences, relevant_evidence_keys = get_relevant_evidence_tfidf(claim, vectors, evidence_data, top_n)
        elif mode == "bm25":
            relevant_evidences, relevant_evidence_keys = get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n)

        predicted_label = majority_vote_prediction(claim, relevant_evidences)
        
        results[claim_id] = {
            "claim_text": claim,
            "claim_label": predicted_label,
            "evidences": relevant_evidence_keys
        }
        
    return results

   


In [35]:
predicted_results_bm25 = get_evidence_make_prediction(dev_data, evidence_data, vectors, bm25, top_n=7, mode="bm25")

In [39]:
predicted_results_tfidf = get_evidence_make_prediction(dev_data, evidence_data, vectors, bm25, top_n=7, mode="tfidf")

In [36]:
with open('predicted_results_bm25.json', 'w') as fp:
    json.dump(predicted_results_bm25, fp)

In [40]:
with open('predicted_results_tfidf.json', 'w') as fp:
    json.dump(predicted_results_tfidf, fp)

In [42]:
!python data/eval.py --predictions predicted_results_tfidf.json --groundtruth data/dev-claims.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evidence Retrieval F-score (F)    = 0.07668240850059031
Claim Classification Accuracy (A) = 0.4935064935064935
Harmonic Mean of F and A          = 0.13273940057250957
