In [1]:
import json
import pandas as pd

In [2]:
# Load the data
with open('data/train-claims.json') as f:
    train_data = json.load(f)

with open('data/dev-claims.json') as f:
    dev_data = json.load(f)

with open('data/test-claims-unlabelled.json') as f:
    test_data = json.load(f)

with open('data/evidence.json') as f:
    evidence_data = json.load(f)


In [3]:
# function to get the evidence for a claim
def get_evidence(evidence_id):
    return evidence_data[evidence_id]

# function to get the claim text
def get_claim_text(claim_id):
    return train_data[claim_id]['claim_text']

# function to get the label for a claim
def get_label(claim_id):
    return train_data[claim_id]['claim_label']

# function to create a dataframe with the claim, evidence and label
def create_dataframe(data):
    rows = []
    for claim_id in data:
        row = {
            'claim_text': get_claim_text(claim_id),
            'evidences': [get_evidence(evidence_id=evidence_id) for evidence_id in train_data[claim_id]['evidences']],
            'label': get_label(claim_id)
        }
        rows.append(row)
    df = pd.concat([pd.DataFrame([row]) for row in rows], ignore_index=True)
    return df


# create the dataframes
train_df = create_dataframe(train_data)
#dev_df = create_dataframe(dev_data)

#print entire evidence for a claim of the train_df
train_df


Unnamed: 0,claim_text,evidences,label
0,Not only is there no scientific evidence that ...,[At very high concentrations (100 times atmosp...,DISPUTED
1,El Niño drove record highs in global temperatu...,[While ‘climate change’ can be due to natural ...,REFUTES
2,"In 1946, PDO switched to a cool phase.",[There is evidence of reversals in the prevail...,SUPPORTS
3,Weather Channel co-founder John Coleman provid...,[There is no convincing scientific evidence th...,DISPUTED
4,"""January 2008 capped a 12 month period of glob...","[With average temperature +8.1 °C (47 °F)., Th...",NOT_ENOUGH_INFO
...,...,...,...
1223,Climate scientists say that aspects of the cas...,"[""It's a fact: climate change made Hurricane H...",SUPPORTS
1224,"In its 5th assessment report in 2013, the IPCC...","[The scientific consensus as of 2013[update], ...",SUPPORTS
1225,"Since the mid 1970s, global temperatures have ...","[Global Warming of 1.5 °C., Multiple independe...",NOT_ENOUGH_INFO
1226,But abnormal temperature spikes in February an...,[A lower air temperature of −94.7 °C (−138.5 °...,NOT_ENOUGH_INFO


In [4]:
# preprocess the data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# function to preprocess the text
def preprocess_text(text):
    text = text.lower()
    # remove stopwords
    text = ' '.join([lemma.lemmatize(word) for word in text.split() if word not in stopwords.words('english')])
    
    return text

In [5]:
#use tfidf to find the most important evidence for a claim
from sklearn.feature_extraction.text import TfidfVectorizer


claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

vectorizer = TfidfVectorizer()


stop_words = set(stopwords.words('english'))
preprocessed_corpus = [' '.join([lemma.lemmatize(word.lower()) for word in sentence.split() if word not in stop_words]) for key, sentence in evidence_data.items()]

# get the most important evidence for a claim
#get_most_important_evidence(claim, evidence_data)

print(len(preprocessed_corpus))



1208827


In [49]:
vectors = vectorizer.fit_transform(preprocessed_corpus)

In [50]:
# Process the claim
processed_claim = preprocess_text(claim)

# Get the tfidf vector for the claim
claim_vector = vectorizer.transform([processed_claim])

# Get the cosine similarity between the claim and the evidence
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(claim_vector, vectors)

# Get the top 5 most similar evidence
top_5_ = cosine_similarity(claim_vector, vectors).argsort()[0][-5:]
most_similar_evidence = [preprocessed_corpus[i] + "\n" for i in cosine_similarity(claim_vector, vectors).argsort()[0][-5:]]
# Get the top 5 most similar evidence unprocessed
most_similar_evidence_unprocessed = [list(evidence_data.values())[i] + "\n" for i in cosine_similarity(claim_vector, vectors).argsort()[0][-5:]]

print(top_5_)
print("-----Processed claims-------")
print(most_similar_evidence)
print("------Original Evidence-----")
print(most_similar_evidence_unprocessed)



[985452 364767 256886  66273 668884]
-----Processed claims-------
["the increased radiative forcing due increased co2 earth's atmosphere based physical property co2 non-saturated absorption window co2 absorbs outgoing long-wave energy.\n", 'it expected ecosystem affected higher atmospheric co2 level higher global temperatures.\n', 'the concentration co2 flue gas important key determine efficiency co2 capture technology.\n', 'higher atmospheric co2 concentration led increase dissolved co2, cause ocean acidification.\n', 'co2).\n']
------Original Evidence-----
["The increased radiative forcing due to increased CO2 in the Earth's atmosphere is based on the physical properties of CO2 and the non-saturated absorption windows where CO2 absorbs outgoing long-wave energy.\n", 'It is expected that most ecosystems will be affected by higher atmospheric CO2 levels and higher global temperatures.\n', 'The concentration of CO2 in the flue gas is an important key to determine the efficiency of CO2 c

In [9]:
train_df.iloc[0]['evidences']

['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.',
 'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.',
 'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.']

**DOC2VEC**

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.parsing.preprocessing import preprocess_documents

processed_corpus = preprocess_documents(evidence_data.values())

# Train Doc2Vec model on preprocessed corpus
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_corpus)]



**ORIGINAL**

In [7]:
model = Doc2Vec(documents, vector_size=20, window=4, min_count=2, workers=8)
model.save("models/doc2vec2.model")

In [9]:
from gensim.parsing import preprocess_string
#  Get the most similar evidence document to a claim
def get_most_similar_evidence(claim, evidence_data, model, top_n=5):
    # Preprocess claim
    processed_claim = preprocess_string(claim)

    # Get claim vector
    claim_vector = model.infer_vector(processed_claim)
    
    # similar docs
    similarity = model.dv.most_similar(claim_vector, topn=top_n)
    print(similarity)
    #top 5 indices
    top_n_indices = [index for index, score in similarity]

    # top 5 evidences
    top_n_evidences = [evidence_data[index] for index in top_n_indices]
    return  top_n_indices, top_n_evidences
evidences = list(evidence_data.values())
model = Doc2Vec.load("models/doc2vec2.model")
top_n_indices, top_n_evidences = get_most_similar_evidence(claim, evidences, model, top_n=5)


print(top_n_indices)
print("Claim: ", claim)
print("Top 5 results:")
for i, evidence in enumerate(top_n_evidences):
    print(f'{i}. {evidence}')


[(224207, 0.7665451169013977), (242918, 0.738102912902832), (118000, 0.7309796214103699), (1140304, 0.7275773882865906), (1074523, 0.7186175584793091)]
[224207, 242918, 118000, 1140304, 1074523]
Claim:  Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
Top 5 results:
0. Bioremediation is used to neutralize pollutants including Hydrocarbons, chlorinated compounds, nitrates, toxic metals and other pollutants through a variety of chemical mechanisms.
1. In practical terms, if continued, it reduces valuable resources to such low levels that their exploitation is no longer sustainable and can lead to the extinction of a species, in addition to having dramatic, unforeseen effects, on the ecosystem.
2. And from the year 2008 to the year 2012, environmental incidents and accidents reduced from 18 to 4.
3. As of 2015, there are 10 active phosphate mines in four states : Florida, North Carolina

In [28]:
train_df.iloc[0]['evidences']

['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.',
 'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.',
 'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.']

In [28]:
print(documents[0])

TaggedDocument<['john', 'bennet', 'law', 'english', 'entrepreneur', 'agricultur', 'scientist'], [0]>


**BM25 attempt**