In [1]:
import json
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing import preprocess_string
# preprocess the data
import re
from nltk.stem import WordNetLemmatizer
import time
import torch

In [2]:
# Load the data.
# The files are opened as UTF-8 explicitly: the claims contain non-ASCII
# characters (e.g. "El Niño") and the platform default encoding used by
# open() is not guaranteed to be UTF-8.
with open('data/train-claims.json', encoding='utf-8') as f:
    train_data = json.load(f)

with open('data/dev-claims.json', encoding='utf-8') as f:
    dev_data = json.load(f)

with open('data/test-claims-unlabelled.json', encoding='utf-8') as f:
    test_data = json.load(f)

with open('data/evidence.json', encoding='utf-8') as f:
    evidence_data = json.load(f)

# Evidence passages in file order: evidences[i] is the value of the i-th key
# of evidence_data, so list positions can be used as document indices.
evidences = list(evidence_data.values())

In [3]:
# function to get the evidence for a claim
def get_evidence(evidence_id):
    """Return the entry stored under ``evidence_id`` in the module-level ``evidence_data``.

    Raises:
        KeyError: if ``evidence_id`` is not a key of ``evidence_data``.
    """
    passage = evidence_data[evidence_id]
    return passage

# function to get the claim text
def get_claim_text(claim_id):
    """Return the ``'claim_text'`` field of the ``train_data`` record for ``claim_id``.

    Only the training set (``train_data``) is consulted.

    Raises:
        KeyError: if ``claim_id`` is not in ``train_data`` or the record has no ``'claim_text'``.
    """
    record = train_data[claim_id]
    return record['claim_text']

# function to get the label for a claim
def get_label(claim_id):
    """Return the ``'claim_label'`` field of the ``train_data`` record for ``claim_id``.

    Only the training set (``train_data``) is consulted.

    Raises:
        KeyError: if ``claim_id`` is not in ``train_data`` or the record has no ``'claim_label'``.
    """
    record = train_data[claim_id]
    return record['claim_label']

# function to create a dataframe with the claim, evidence and label
def create_dataframe(data, evidence_lookup=None):
    """Flatten a claims mapping into one row per (claim, evidence) pair.

    The records are read from ``data`` itself, so the function works for any
    labelled split (train, dev, ...) and not only for ``train_data``.

    Args:
        data: Mapping of claim id -> record. Each record must provide
            ``'claim_text'``, ``'claim_label'`` and ``'evidences'`` (an
            iterable of evidence ids).
        evidence_lookup: Optional mapping of evidence id -> evidence text.
            When omitted, the module-level ``evidence_data`` is used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``claim_text``,
        ``evidence_text`` and ``label`` and a 0..n-1 integer index. When
        ``data`` yields no pairs the frame is empty but still has those
        columns.

    Raises:
        KeyError: if a record lacks one of the required fields or an evidence
            id is missing from the lookup.
    """
    lookup = evidence_data if evidence_lookup is None else evidence_lookup

    rows = [
        {
            'claim_text': record['claim_text'],
            'evidence_text': lookup[evidence_id],
            'label': record['claim_label'],
        }
        for record in data.values()
        for evidence_id in record['evidences']
    ]

    # Build the frame once from the list of records instead of concatenating
    # one single-row DataFrame per row.
    return pd.DataFrame(rows, columns=['claim_text', 'evidence_text', 'label'])


# Build the training frame: one row per (claim, evidence) pair.
train_df = create_dataframe(train_data)
#dev_df = create_dataframe(dev_data)

# Display the frame (a bare last expression is rendered as the cell output).
train_df


Unnamed: 0,claim_text,evidence_text,label
0,Not only is there no scientific evidence that ...,At very high concentrations (100 times atmosph...,DISPUTED
1,Not only is there no scientific evidence that ...,Plants can grow as much as 50 percent faster i...,DISPUTED
2,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,DISPUTED
3,El Niño drove record highs in global temperatu...,While ‘climate change’ can be due to natural f...,REFUTES
4,El Niño drove record highs in global temperatu...,This acceleration is due mostly to human-cause...,REFUTES
...,...,...,...
4117,But abnormal temperature spikes in February an...,The coastline sees significantly mild temperat...,NOT_ENOUGH_INFO
4118,Sending oscillating microwaves from an antenna...,"Dielectric heating, also known as electronic h...",SUPPORTS
4119,Sending oscillating microwaves from an antenna...,An example is absorption or emission of radio ...,SUPPORTS
4120,Sending oscillating microwaves from an antenna...,"Water, fat, and other substances in the food a...",SUPPORTS


In [None]:
lemma = WordNetLemmatizer()

# Built once at definition time. Previously stopwords.words('english') was
# evaluated again for every token and membership was tested against a list.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase ``text``, drop English stop words and lemmatize the rest.

    Args:
        text: Raw input string. It is split on whitespace only; punctuation
            stays attached to the tokens.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    tokens = text.lower().split()
    return ' '.join(
        lemma.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS
    )

In [None]:
# Use TF-IDF to find the most relevant evidence for a claim.

claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

vectorizer = TfidfVectorizer()

stop_words = set(stopwords.words('english'))

# Lowercase BEFORE the stop-word test. The stop-word list is lowercase, so
# testing the original-case token let capitalised stop words ("The", "At", ...)
# through, and the corpus was preprocessed differently from preprocess_text(),
# which is what the query goes through.
preprocessed_corpus = [
    ' '.join(
        lemma.lemmatize(word)
        for word in sentence.lower().split()
        if word not in stop_words
    )
    for sentence in evidence_data.values()
]

print(len(preprocessed_corpus))



In [None]:
# Fit the TF-IDF vectorizer on the preprocessed evidence corpus and vectorise it in the same call.
vectors = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
# Sanity check: shape of the fitted TF-IDF matrix.
print(vectors.shape)

In [None]:
# Process the claim
processed_claim = preprocess_text(claim)

# Get the tfidf vector for the claim
claim_vector = vectorizer.transform([processed_claim])

# Cosine similarity between the claim and every evidence vector, computed
# once and reused below (row 0 because there is a single query).
similarities = cosine_similarity(claim_vector, vectors)[0]

# Indices of the 5 most similar evidence passages, best match first.
top_5_ = similarities.argsort()[::-1][:5]
most_similar_evidence = [preprocessed_corpus[i] + "\n" for i in top_5_]
# The same passages, unprocessed. `evidences` is list(evidence_data.values()),
# so its positions line up with preprocessed_corpus and it does not need to be
# rebuilt for every index.
most_similar_evidence_unprocessed = [evidences[i] + "\n" for i in top_5_]

print(top_5_)
print("-----Processed claims-------")
print(most_similar_evidence)
print("------Original Evidence-----")
print(most_similar_evidence_unprocessed)



In [None]:
# Gold evidence passages for `claim`. train_df holds one row per
# (claim, evidence) pair with columns claim_text / evidence_text / label;
# there is no 'evidences' column, so indexing it raised KeyError.
train_df.loc[train_df['claim_text'] == claim, 'evidence_text'].tolist()

**DOC2VEC**

In [None]:


#processed_corpus = [doc.split(" ") for doc in preprocessed_corpus]
#tokenised_corpus = preprocess_documents(evidences)
# Train Doc2Vec model on preprocessed corpus
#documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenised_corpus)]



In [None]:
# Build the tagged training corpus in this cell so it runs on a fresh kernel:
# `documents` was otherwise only defined in commented-out code.
tokenised_corpus = preprocess_documents(evidences)
# Each document is tagged with its position in `evidences`, so the tags
# returned by model.dv can be used directly as indices into that list.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenised_corpus)]

model = Doc2Vec(documents, vector_size=45, window=2, min_count=1, workers=8)
model.save("models/doc2vec.model")


In [None]:

#  Get the most similar evidence document to a claim
def get_most_similar_evidence(claim, evidence_data, model, top_n=10):
    """Retrieve the ``top_n`` evidence entries nearest to ``claim`` according to ``model``.

    Args:
        claim: Raw claim string.
        evidence_data: Container indexed by the document tags the model
            returns (the caller in this notebook passes the ``evidences`` list).
        model: Object exposing ``infer_vector`` and ``dv.most_similar``
            (a trained ``Doc2Vec`` model in this notebook).
        top_n: Number of neighbours to request.

    Returns:
        ``(top_n_indices, top_n_evidences)``: the tags of the nearest
        documents and the corresponding entries of ``evidence_data``, in the
        order returned by ``most_similar``.
    """
    # Tokenise the claim, then infer its vector with the model.
    claim_tokens = preprocess_string(claim)
    inferred_vector = model.infer_vector(claim_tokens)

    # (tag, score) pairs of the nearest document vectors; printed as-is.
    neighbours = model.dv.most_similar(inferred_vector, topn=top_n)
    print(neighbours)

    top_n_indices = []
    for doc_tag, _score in neighbours:
        top_n_indices.append(doc_tag)

    top_n_evidences = []
    for doc_tag in top_n_indices:
        top_n_evidences.append(evidence_data[doc_tag])

    return top_n_indices, top_n_evidences

# Reload the saved Doc2Vec model and retrieve the nearest evidence for one claim.
model = Doc2Vec.load("models/doc2vec.model")
claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
top_n_indices, top_n_evidences = get_most_similar_evidence(claim, evidences, model, top_n=10)

print("Claim: ", claim)
print("Top 10 results:")
# Number the results from 1 in the order they were returned.
for rank, evidence in enumerate(top_n_evidences, start=1):
    print(f'{rank}. {evidence}')


**BM25 attempt**

In [None]:
from rank_bm25 import BM25Okapi

# Tokenise every evidence passage with gensim's preprocess_documents and
# build the BM25 index over the resulting token lists.
tokenised_corpus = preprocess_documents(evidences)
bm25 = BM25Okapi(tokenised_corpus)

In [None]:

#tokenised_corpus = [doc.split(" ") for doc in preprocessed_corpus]

# Tokenise the claim with gensim's preprocess_string and print the tokens.

claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
processed_claim = preprocess_string(claim)
print(processed_claim)
# Score every document in the BM25 index against the claim tokens.
bm25_scores = bm25.get_scores(processed_claim)



In [31]:
# Rank all evidence by BM25 score, best first, and keep the top 20.
# (The variable keeps its historical name although it holds 20 indices.)
top_5_indices = bm25_scores.argsort()[::-1][:20]
# Index the prebuilt `evidences` list (same order as evidence_data.values())
# instead of rebuilding that list once per index.
most_similar_evidence = [evidences[i] for i in top_5_indices]

# Print results
print("-----Processed claims-------")
print(processed_claim)

print("------BM25 Evidence-----")
for i, evidence in enumerate(most_similar_evidence):
    print(f'{i+1}. {evidence}')
print("------Original Evidence-----")
# Gold evidence for `claim`: train_df has no 'evidences' column (its columns
# are claim_text / evidence_text / label), so select the evidence_text rows
# belonging to this claim instead.
print(train_df.loc[train_df['claim_text'] == claim, 'evidence_text'].tolist())

-----Processed claims-------
['scientif', 'evid', 'pollut', 'higher', 'concentr', 'actual', 'help', 'ecosystem', 'support', 'plant', 'anim', 'life']
------BM25 Evidence-----
1. The whitebark pine ecosystem in these high elevations plays many essential roles, providing support to plant and animal life.
2. The concentration of pollutants in the flue gas is higher, making separation easier.
3. They study the various animals and plants that live within an ecosystem and the relationship between the two.
4. He helped to organize cultural life in Theresienstadt concentration camp.
5. Diverse forest plant life exists in the park and the area supports several wild animal species.
6. At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
7. This scientific evidence comes from many 

**BERT ATTEMPT**

In [None]:
# Toy query and passages, presumably kept for trying out the sentence encoder.
# NOTE(review): neither `query` nor `passages` is read by the visible cells
# below (they encode `claim` instead) — confirm whether this cell is still needed.
query = "What is the capital of France?"
passages = [
    "Paris is the capital of France.",
    "The Eiffel Tower is located in Paris.",
    "France is famous for its wine and cheese.",
    "The Louvre is a famous museum in Paris.",
    "The French Revolution started in 1789.",
]




In [32]:
from sentence_transformers import SentenceTransformer, util

# Use Apple's MPS backend when it is available and fall back to the CPU
# otherwise, so the cell also runs on machines without MPS.
# (The variable name is kept for the cells that reference it.)
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load the model
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b', device=mps_device)

# Encode the claim and the BM25 candidates (most_similar_evidence)
query_emb = model.encode(claim)
doc_emb = model.encode(most_similar_evidence)

# Dot-product score between the claim and every candidate embedding
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

# Pair each candidate with its score and sort by decreasing score
doc_score_pairs = sorted(
    zip(most_similar_evidence, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Output scores & passages
for doc, score in doc_score_pairs:
    print(score, doc)


99.15361022949219 At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
96.01148986816406 Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
91.10009765625 The whitebark pine ecosystem in these high elevations plays many essential roles, providing support to plant and animal life.
90.88199615478516 Scavengers play an important role in the ecosystem by consuming the dead animal and plant material.
90.53111267089844 Sustainable agriculture is the cultivation of plant and animal materials in a manner that preserves plant and animal ecosystems and that can improve soil health and soil fertility over the long term.
90.40798950195312 The concentration of pollutants in the flue gas is higher, making separation easier.
90.3914031982421

**Saved embeddings of the evidence documents**

In [None]:
# # Define batch size
# batch_size = 1000

# # Encode documents in batches
# doc_emb = []
# for i in range(0, len(evidences), batch_size):
#   start_time = time.time()
#   batch = evidences[i:i+batch_size]
#   batch_emb = model.encode(batch)
#   doc_emb.append(batch_emb)
#   print(f"Batch {i} processed time taken {time.time()- start_time}")
# doc_emb = np.concatenate(doc_emb, axis=0)
# # Save document embeddings as a NumPy array
# np.save('doc_emb.npy', doc_emb)

In [28]:
# Score the claim against the embeddings of the WHOLE evidence corpus.
# When the notebook is run top to bottom, `doc_emb` still holds only the
# embeddings of the BM25 candidates from the re-ranking cell, and zipping
# those scores with `evidences` silently mispairs passages and scores.
# Load the full-corpus array instead.
# NOTE(review): assumes 'doc_emb.npy' was written by the (commented-out)
# batch-encoding cell with this same model — confirm before relying on it.
doc_emb = np.load('doc_emb.npy')
assert len(doc_emb) == len(evidences), "doc_emb.npy must hold one embedding per evidence passage"

query_emb = model.encode(claim)
# Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

# Pair each passage with its score and sort by decreasing score
doc_score_pairs = sorted(
    zip(evidences, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Output the 11 best scores & passages (the previous loop stopped once i > 10,
# i.e. after printing indices 0..10).
for doc, score in doc_score_pairs[:11]:
    print(score, doc)

103.17140197753906 On July 21, 2011, while a guest on the show, he stated: "To suggest that CO2's a pollutant when it's an extremely important gas in the atmosphere for all plant life and therefore for the oxygen that's produced, is just nonsense.
100.86476135253906 Studies have shown that higher CO2 levels lead to reduced plant uptake of nitrogen (and a smaller number showing the same for trace elements such as zinc) resulting in crops with lower nutritional value.
99.15360260009766 At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
99.1375732421875 On the contrary, some microbes can deteriorate the environment: which can lead to elevated SO4 in the water and can also increase microbial production of hydrogen sulfide, a toxin for many aquatic plants and organisms.
98

In [42]:
# Evidence passages of the first training claim (rows 0-2 of train_df),
# hard-coded as a tuple of three strings.
# The former first line, train_df.iloc[0]['evidences'], is dropped: train_df
# has no 'evidences' column, so it raised KeyError, and its value was
# discarded anyway because it was not the last expression of the cell.
temp_evidences = (
    'At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.',
    'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.',
    'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.',
)
print(temp_evidences)

('At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.', 'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.')


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Use Apple's MPS backend when available, otherwise the CPU.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-f")
model.to(mps_device)
tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")

claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
# Not used by the code below, which classifies against temp_evidences.
evidence = "Many studies have shown that human activity, such as burning fossil fuels and deforestation, contributes significantly to climate change."

# Build one (claim, evidence) pair per passage. The tokenizer was previously
# given a single claim string together with a tuple of evidences, which is not
# a matching text / text_pair batch.
evidence_batch = list(temp_evidences)
inputs = tokenizer(
    [claim] * len(evidence_batch),
    evidence_batch,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
    max_length=512,
)
# The model lives on mps_device, so the input tensors must be moved there too.
inputs = inputs.to(mps_device)

model.eval()
with torch.no_grad():
    scores = model(**inputs).logits
    # NOTE(review): this label order is not established anywhere in the
    # notebook, and the checkpoint may not ship a trained classification
    # head — confirm both before trusting these predictions.
    label_mapping = ['not enough info', 'supports', 'refutes']
    labels = [label_mapping[class_idx] for class_idx in scores.argmax(dim=1).tolist()]
    print(labels)


: 

: 