<a href="https://colab.research.google.com/github/aaalexlit/omdena_climate_change_challenge_notebooks/blob/main/Index_claims_from_abstracts_for_searches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Index claims from abstracts for searches

## Download csv with the abstracts retrieved from OpenAlex using keyword search

This is a reduced version of the dataset that contains only about 10,300 abstracts.

The CSV was obtained from [OpenAlex](https://openalex.org/) using [this code](https://github.com/mcallaghan/NLP-climate-science-tutorial-CCAI/blob/main/A_obtaining_data.ipynb) with some minor modifications.

In [1]:
!gdown https://drive.google.com/uc?id=14hqOpm4HsNarlWiGnakz3sPsP8g24XWC -O 'openalex_abstracts.csv'

Downloading...
From: https://drive.google.com/uc?id=14hqOpm4HsNarlWiGnakz3sPsP8g24XWC
To: /content/openalex_abstracts.csv
100% 10.8M/10.8M [00:00<00:00, 22.5MB/s]


# Check if GPU is available to install gpu version of faiss

In [2]:
import torch
import os

# Pick the haystack extra that matches the hardware: the GPU build of faiss
# only when torch can see a CUDA device, the CPU build otherwise.
faiss_to_install = "faiss-gpu" if torch.cuda.is_available() else "faiss"

# Quote the requirement so the shell never interprets the [extra] brackets.
ret_code = os.system(f'pip install "farm-haystack[{faiss_to_install}]"')
if not ret_code:
  print(f"Installed {faiss_to_install}")
else:
  # Stop here instead of letting the failure resurface later as an
  # ImportError when the haystack modules are imported.
  raise RuntimeError(
      f"pip install farm-haystack[{faiss_to_install}] failed with status {ret_code}")

Installed faiss-gpu


# Index documents 

In [20]:
import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
import logging
from timeit import default_timer as timer
from haystack import Document
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, pipeline, \
RobertaForSequenceClassification, AutoModelForSequenceClassification
import gc
import itertools
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Model to use for semantic embeddings

In [21]:
# Sentence-embedding model name; handed to FAISSIndexer, which passes it to the
# EmbeddingRetriever as `embedding_model`.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# embedding size used by all-MiniLM-L6-v2; must match the FAISS index dimension
EMBEDDING_DIM = 384

# Number of CSV rows read (and indexed) per chunk by pd.read_csv(chunksize=...).
chunk_size = 1800
# Row offset used to resume an interrupted run; expressed as a whole number of
# chunks (0 = start from the beginning of the file).
start_from_row = 0 * chunk_size

# Only show warnings and above from the libraries' loggers.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
# Silence pandas' chained-assignment warning (get_claims_from_texts assigns to a
# column of a frame that was sliced out of a larger one).
pd.options.mode.chained_assignment = None


## Claim detection

In [22]:
# Tokenizer and sequence-classification model loaded from the
# "lucafrost/ClaimBuster-DeBERTaV2" checkpoint. Both objects are bound as the
# default `tokenizer` / `model` arguments of is_claim, so this cell has to run
# before is_claim is defined.
claimbuster_tokenizer = AutoTokenizer.from_pretrained("lucafrost/ClaimBuster-DeBERTaV2")
claimbuster_model = AutoModelForSequenceClassification.from_pretrained("lucafrost/ClaimBuster-DeBERTaV2")

In [23]:
def is_claim(sentences,
             model=claimbuster_model,
             tokenizer=claimbuster_tokenizer,
             debug=False):
    """Run the claim classifier over a list of sentences.

    Args:
        sentences: list of sentence strings to classify.
        model: sequence-classification model used by the pipeline.
        tokenizer: tokenizer paired with ``model``.
        debug: when True, print every predicted label, its score and the
            sentence it belongs to.

    Returns:
        A ``(label_ids, probs)`` tuple of two lists aligned with ``sentences``:
        the numeric id of the predicted label (looked up in
        ``model.config.label2id``) and the score the pipeline reported for it.
    """
    # On GPU: first CUDA device with batches of 64; otherwise CPU (device -1)
    # with one sentence at a time.
    device, batch_size = (0, 64) if torch.cuda.is_available() else (-1, 1)
    classifier = pipeline("text-classification", model=model,
                          tokenizer=tokenizer, device=device)

    predictions = list(classifier(sentences, batch_size=batch_size))
    labels = [prediction['label'] for prediction in predictions]
    probs = [prediction['score'] for prediction in predictions]

    if debug:
        for sentence, label, prob in zip(sentences, labels, probs):
            print(f"{label}({prob:.3f})")
            print(sentence)

    label_ids = [model.config.label2id[label] for label in labels]
    return label_ids, probs

In [24]:
def get_claims_from_texts(df, id_col='id', text_col='abstract', threshold=0.5, debug=False):
    """Extract the sentences classified as claims from each text in ``df``.

    Every text is split into sentences, each sentence is classified with
    ``is_claim``, and sentences whose predicted label id is 1 or 2 with a score
    above ``threshold`` are kept.
    (NOTE(review): ids 1 and 2 are presumably the two "factual statement"
    classes of the ClaimBuster model -- confirm against ``model.config``.)

    Args:
        df: frame holding at least ``id_col`` and ``text_col``. It is not
            modified.
        id_col: name of the column identifying each text.
        text_col: name of the column holding the text to split into sentences.
        threshold: minimum classifier score (exclusive) for a sentence to count.
        debug: forwarded to ``is_claim`` to print per-sentence predictions.

    Returns:
        DataFrame indexed by ``id_col`` with a single ``claims`` column holding
        the list of claim sentences of that text. Texts without any accepted
        claim have no row.
    """
    # Build a new frame instead of assigning into `df`, so the caller's data
    # keeps its original (un-tokenised) text column.
    sentences_df = (
        df[[id_col, text_col]]
        .assign(**{text_col: df[text_col].map(sent_tokenize)})
        .explode(text_col)
        # explode() turns an empty sentence list (e.g. an empty abstract) into
        # NaN; the classifier must only ever receive strings.
        .dropna(subset=[text_col])
    )
    sentences = sentences_df[text_col].tolist()
    ids = sentences_df[id_col].tolist()

    if sentences:
        predicted_class_ids, probs = is_claim(sentences, debug=debug)
    else:
        # Nothing to classify: skip building the pipeline altogether.
        predicted_class_ids, probs = [], []

    claims_df = pd.DataFrame(
        [(doc_id, sentence)
         for doc_id, sentence, label, prob in zip(ids, sentences, predicted_class_ids, probs)
         if label in (1, 2) and prob > threshold],
        columns=[id_col, 'claims'])
    # Collapse back to one row per text, gathering its claims into a list.
    claims_df = claims_df.groupby(id_col).agg({'claims': lambda x: x.tolist()})
    return claims_df

## Class that deals with initializing and writing to the FAISS document store

In [25]:
class FAISSIndexer():
    """Owns a FAISSDocumentStore plus the EmbeddingRetriever bound to it.

    The FAISS index is stored as ``<path_to_index_dir>/faiss_index``. Document
    contents and metadata go to the SQL database given by ``path_to_postgres``
    or, when that is not provided, to a SQLite file inside the index directory.
    """

    def __init__(self, path_to_index_dir, model_name, embedding_dim,
                 path_to_postgres=None) -> None:
        """Resolve the storage paths, then create (or reload) store and retriever.

        Args:
            path_to_index_dir: directory for the FAISS index; created if missing.
            model_name: embedding model name given to the retriever.
            embedding_dim: vector size used when a new document store is created.
            path_to_postgres: optional SQL url; when falsy a SQLite database in
                ``path_to_index_dir`` is used instead.
        """
        self.path_to_index_dir = path_to_index_dir
        if not path_to_postgres:
            # our db is SQLite: it lives next to the faiss index
            self._set_path_to_index_and_db()
        else:
            # our db is postgres, only need to set path to faiss index
            self.path_to_db = path_to_postgres
            self._set_path_to_index()
        self.embedding_dim = embedding_dim
        self.model_name = model_name
        self.document_store = self._init_document_store()
        self.retriever = self._init_retriever()

    def _set_path_to_index(self):
        """Make sure the index directory exists and set ``self.path_to_index``."""
        index_dir = self.path_to_index_dir
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        self.path_to_index = os.path.join(index_dir, "faiss_index")

    def _set_path_to_index_and_db(self):
        """Set the index path and point ``self.path_to_db`` at a local SQLite file."""
        self._set_path_to_index()
        sqlite_file = os.path.join(self.path_to_index_dir, 'faiss_document_store.db')
        self.path_to_db = f"sqlite:///{sqlite_file}"

    def _init_document_store(self):
        """Reload the store from an existing index file, or create a new one."""
        if not os.path.exists(self.path_to_index):
            return FAISSDocumentStore(
                sql_url=self.path_to_db,
                return_embedding=True,
                similarity='cosine',
                embedding_dim=self.embedding_dim,
                duplicate_documents='skip'
            )
        return FAISSDocumentStore.load(index_path=self.path_to_index)

    def _init_retriever(self, progress_bar=True):
        """Build the sentence-transformers retriever on top of the document store."""
        return EmbeddingRetriever(
            document_store=self.document_store,
            embedding_model=self.model_name,
            model_format='sentence_transformers',
            # include article title into the embedding
            embed_meta_fields=["title"],
            progress_bar=progress_bar
        )

    def write_documents(self, docs):
        """Store ``docs``, update embeddings and save the index to disk.

        ``update_embeddings`` is called with ``update_existing_embeddings=False``
        (presumably so documents that already have a vector are left alone).
        """
        store = self.document_store
        store.write_documents(docs)

        print('Updating embeddings ...')

        store.update_embeddings(
            retriever=self.retriever,
            update_existing_embeddings=False
        )

        embedding_count = store.get_embedding_count()
        print(f'current embedding count is {embedding_count}')
        store.save(index_path=self.path_to_index)

    def retrieve_matches_for_a_phrase(self, phrase, top_k=10):
        """Return the retriever's ``top_k`` matches for a single query phrase."""
        return self.retriever.retrieve(phrase, top_k=top_k)

    def retrieve_matches_for_phrases(self, phrases, top_k=10):
        """Return the retriever's ``top_k`` matches for each phrase in a batch."""
        return self.retriever.retrieve_batch(phrases, top_k=top_k)

## Functions for indexing

In [26]:
def convert_openalex_claims_to_haystack_document(row):
    """Turn one OpenAlex record into one haystack document dict per claim.

    Args:
        row: mapping with the keys ``title``, ``publication_year``, ``authors``,
            ``doi``, ``id`` and ``claims`` (an iterable of claim sentences).

    Returns:
        List of ``{'content': ..., 'meta': {...}}`` dicts, one per claim, each
        carrying its own copy of the article metadata.
    """
    documents = []
    for claim in row['claims']:
        # A fresh meta dict per document, so documents never share state.
        article_meta = {
            'title': row['title'],
            'publication_year': row['publication_year'],
            'authors': row['authors'],
            'doi': row['doi'],
            'open_alex_id': row['id'],
        }
        documents.append({'content': claim, 'meta': article_meta})
    return documents

In [27]:
def convert_openalex_rows_to_haystack_document(row):
    """Turn one OpenAlex record into one haystack document dict per sentence.

    The abstract is split with ``sent_tokenize`` and every sentence becomes a
    document carrying the article metadata.

    Args:
        row: mapping with the keys ``title``, ``publication_year``, ``authors``,
            ``doi``, ``id`` and ``abstract`` (a string).

    Returns:
        List of ``{'content': ..., 'meta': {...}}`` dicts, one per sentence.
    """
    # (meta key in the document, key in the source row)
    meta_sources = (
        ('title', 'title'),
        ('publication_year', 'publication_year'),
        ('authors', 'authors'),
        ('doi', 'doi'),
        ('open_alex_id', 'id'),
    )
    documents = []
    for sent in sent_tokenize(row['abstract']):
        article_meta = {meta_key: row[row_key] for meta_key, row_key in meta_sources}
        documents.append({'content': sent, 'meta': article_meta})
    return documents

In [28]:
def convert_abstracts_from_openalex_to_haystack_docs(filename, chunk_size, start_from_row,detect_claims=False):
    """Read the abstracts CSV in chunks and yield haystack documents per chunk.

    Args:
        filename: path of the CSV file with the OpenAlex records.
        chunk_size: number of CSV rows read per chunk.
        start_from_row: number of leading data rows to skip (0 reads the whole
            file); use it to resume after an interrupted run.
        detect_claims: when True only sentences classified as claims are
            turned into documents (records without claims are dropped);
            otherwise every sentence of every abstract becomes a document.

    Yields:
        For each chunk, a flat list of ``{'content': ..., 'meta': {...}}`` dicts.
    """
    id_col = 'id'
    # Line 0 of the file is the header and must be kept; data rows are lines
    # 1..N. Skipping the first `start_from_row` data rows therefore needs the
    # range to end at start_from_row + 1 (range() excludes its end), otherwise
    # the last already-processed row would be read again.
    rows_to_skip = range(1, start_from_row + 1)
    chunks = pd.read_csv(filename, chunksize=chunk_size, skiprows=rows_to_skip)
    for chunk_number, df in enumerate(chunks, start=1):
        print(f'starting to index chunk number {chunk_number}')
        # Missing values become empty strings so sentence splitting and the
        # metadata fields always receive strings rather than NaN.
        df = df.fillna("")
        if detect_claims:
            claims_df = get_claims_from_texts(df[[id_col, 'abstract']])
            print('Finished extracting claims')
            # Inner merge: records for which no claim was detected are dropped.
            df = df.merge(claims_df, on=id_col).drop(columns=['abstract'])
            # Release the intermediate frame before building the documents.
            del claims_df
            gc.collect()
            to_documents = convert_openalex_claims_to_haystack_document
        else:
            to_documents = convert_openalex_rows_to_haystack_document
        yield list(itertools.chain.from_iterable(
            to_documents(row) for row in df.to_dict('records')))


In [29]:
def index_docs_from_csv(filename, docs_extractor,
                        indexer,
                        chunk_size, start_from_row,
                        **extractor_kwargs):
  """Feed every batch of documents produced by ``docs_extractor`` to ``indexer``.

  Args:
      filename: path of the CSV file, forwarded to ``docs_extractor``.
      docs_extractor: callable ``(filename, chunk_size, start_from_row, **kwargs)``
          returning an iterable of document batches.
      indexer: object exposing ``write_documents(docs)``.
      chunk_size: forwarded to ``docs_extractor``.
      start_from_row: forwarded to ``docs_extractor``.
      **extractor_kwargs: optional extra keyword arguments forwarded to
          ``docs_extractor`` (for example ``detect_claims=True``), so extractor
          options are reachable from this entry point.
  """
  for docs in docs_extractor(filename, chunk_size, start_from_row, **extractor_kwargs):
    indexer.write_documents(docs)

## Launch indexing process

In [30]:
# Time the whole run: creating/loading the store plus indexing every chunk.
start = timer()
STORE_PATH = '/content/data/faiss/'
csv_path = '/content/openalex_abstracts.csv'

# Reloads the index found under STORE_PATH, or creates a new store there.
faiss_indexer = FAISSIndexer(
    path_to_index_dir=STORE_PATH,
    model_name=MODEL_NAME,
    embedding_dim=EMBEDDING_DIM,
)

# The extractor is called with its default detect_claims=False, so every
# sentence of every abstract is indexed, not only the detected claims.
index_docs_from_csv(
    filename=csv_path,
    docs_extractor=convert_abstracts_from_openalex_to_haystack_docs,
    indexer=faiss_indexer,
    chunk_size=chunk_size,
    start_from_row=start_from_row,
)

end = timer()
# Elapsed wall-clock time in seconds.
print(end - start)

starting to index chunk number 1


Writing Documents:   0%|          | 0/14067 [00:00<?, ?it/s]

Updating embeddings ...


Updating Embedding:   0%|          | 0/14067 [00:00<?, ? docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/128 [00:00<?, ?it/s]

current embedding count is 14067
starting to index chunk number 2


Writing Documents:   0%|          | 0/13893 [00:00<?, ?it/s]

Updating embeddings ...


Updating Embedding:   0%|          | 0/27960 [00:00<?, ? docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

current embedding count is 27960
starting to index chunk number 3


Writing Documents:   0%|          | 0/14135 [00:00<?, ?it/s]

Updating embeddings ...


Updating Embedding:   0%|          | 0/42095 [00:00<?, ? docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

current embedding count is 42095
starting to index chunk number 4


Writing Documents:   0%|          | 0/13600 [00:00<?, ?it/s]

Updating embeddings ...


Updating Embedding:   0%|          | 0/55695 [00:00<?, ? docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/113 [00:00<?, ?it/s]

current embedding count is 55695
starting to index chunk number 5


Writing Documents:   0%|          | 0/6685 [00:00<?, ?it/s]

Updating embeddings ...


Updating Embedding:   0%|          | 0/62380 [00:00<?, ? docs/s]

Batches:   0%|          | 0/209 [00:00<?, ?it/s]

current embedding count is 62380
230.66624144699995
