Todos: 
- Encoding: DONE
- Models: use a SOTA transformer model? T5
- Search/Ranking: Not that difficult once the model can produce predictions (use softmax/logit scores as relevancy score, can naively search pdf for location of citation)

In [1]:
!pip install simpletransformers
!pip install PyPDF2









In [1]:
from PyPDF2 import PdfReader
from os import listdir
from os.path import isfile, join
import re
from cleantext import clean
from simpletransformers.t5 import T5Model, T5Args
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
from nltk import word_tokenize
from collections import Counter, defaultdict
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def get_pdfs():
    source_pdfs = [] # single array of abstract text for sources
    cited_pdfs = defaultdict(lambda: defaultdict(lambda: [])) # source paper --> {relevant: [], nonrelevant: []}
    initial_path = './data/'
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        source_paper_path = join(initial_path, paper)
        pdfs = [join(source_paper_path, f) for f in listdir(source_paper_path) if isfile(join(source_paper_path, f))]
        source_pdfs.append(pdfs[0])
        source_paper_path += '/Cited/'
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            cited_paper_path = source_paper_path + rel
            cited_pdf_paths = [join(cited_paper_path, f) for f in listdir(cited_paper_path) if isfile(join(cited_paper_path, f))]
            cited_pdfs[paper][rel] = cited_pdf_paths
    return source_pdfs, cited_pdfs

source_pdf_paths, cited_pdf_paths = get_pdfs()


In [3]:
def add_page_text(input_list, path):
    reader = PdfReader(path)
    for page in reader.pages:
        text = page.extract_text()
        if re.search('abstract', text, flags=re.I):
            input_list.append(text)
            return
    input_list.append('none')

def get_abstract_page(source_pdf_paths, cited_pdf_paths):
    source_abstracts = [] # single array of abstract text for sources
    cited_abstracts = defaultdict(lambda: defaultdict(lambda: [])) # source paper --> {relevant: [], nonrelevant: []}
    for ppath in source_pdf_paths:
        add_page_text(source_abstracts, ppath)
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            page_text_array = []
            for ppath in cited_pdf_paths[paper][rel]:
                add_page_text(page_text_array, ppath)
            cited_abstracts[paper][rel] = page_text_array
    return source_abstracts, cited_abstracts
        
source_abstracts, cited_abstracts = get_abstract_page(source_pdf_paths, cited_pdf_paths)

In [4]:
def clean_text(text):
    return clean(text,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
        no_urls=False,                  # replace all URLs with a special token
        no_emails=False,                # replace all email addresses with a special token
        no_phone_numbers=False,         # replace all phone numbers with a special token
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=False,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"                       # set to 'de' for German special handling
    )

def clean_source_cite_pair(source_list, cited_dict):
    cleaned_source_list = list(map(clean_text, source_list))
    cleaned_cited_dict = defaultdict(lambda: defaultdict(lambda: []))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            cleaned_cited_dict[paper][rel] = list(map(clean_text, cited_dict[paper][rel]))
    return cleaned_source_list, cleaned_cited_dict

cleaned_source_abstracts, cleaned_cited_abstracts = clean_source_cite_pair(source_abstracts, cited_abstracts)

In [5]:
def truncate_helper(text):
    if text == 'none':
        return text
    else:
        array = text.split('abstract')
        if len(array[1]) > 512:
            return array[1]
        else:
            return array[1][:512]

def truncate_to_512(input_list, input_dict):
    output_list = []
    output_dict = defaultdict(lambda: defaultdict(lambda: []))
    for text in input_list:
        output_list.append(truncate_helper(text))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            truncated_text_list = []
            for text in input_dict[paper][rel]:
                truncated_text_list.append(truncate_helper(text))
            output_dict[paper][rel] = truncated_text_list
    return output_list, output_dict

source_abstract_text, cited_abstract_text = truncate_to_512(cleaned_source_abstracts, cleaned_cited_abstracts)

In [6]:
# Test abstract text w/ T5 model and see if stsb mode works well as a relevance classifier
# generate prediction input
prediction_input = []
prefix = 'stsb'
sent1 = 'sentence 1:'
sent2 = 'sentence 2:'
pred_input_dict = defaultdict(lambda: defaultdict(lambda: []))
for abs_text in source_abstract_text:
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            batch = []
            for text in cited_abstract_text[paper][rel]:
                batch.append(f'{prefix} {sent1} {abs_text}. {sent2} {text}')
            pred_input_dict[paper][rel] = batch
            

In [7]:
model_args = T5Args()
model_args.model_max_length = 1024
model = T5Model("t5", "t5-small", args=model_args)

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# predictions = defaultdict(lambda: defaultdict(lambda: []))
# papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
# for paper in papers:
#     relevance = ['Relevant', 'Less Relevant']
#     for rel in relevance:
#         preds = model.predict(pred_input_dict[paper][rel])
#         predictions[paper][rel] = preds
predictions = defaultdict(lambda: [])
papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
for paper in papers:
    preds = model.predict(pred_input_dict[paper]['Relevant'] + pred_input_dict[paper]['Less Relevant'])
    predictions[paper] = preds

Generating outputs:   0%|          | 0/3 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/23 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/5 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/35 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/3 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/24 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/2 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/10 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/4 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/2 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/11 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/2 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/11 [00:00<?, ?it/s]

In [9]:
print('stsb mode in T5 doesn\'t work well for distinguishing between abstracts')
print(predictions['paper1'])
print(predictions['paper2'])
print(predictions['paper3'])
print(predictions['paper4'])
print(predictions['paper5'])
print(predictions['paper6'])
print(predictions['paper7'])

stsb mode in T5 doesn't work well for distinguishing between abstracts
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']
['1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8', '1.8']


In [10]:
def tokenize_text(text):
    tokens = word_tokenize(text)
    stop_words = stopwords.words('english') + list(punctuation) + ['\n']
    returned_tokens = []
    for token in tokens:
        if token not in stop_words:
            returned_tokens.append(token)
    return returned_tokens

def tokenize_source_cite_pair(input_list, input_dict):
    tokenized_list = list(map(tokenize_text, input_list))
    tokenized_dict = defaultdict(lambda: defaultdict(lambda: []))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            tokenized_dict[paper][rel] = list(map(tokenize_text, input_dict[paper][rel]))
    return tokenized_list, tokenized_dict

tokenized_source_abstracts, tokenized_cited_abstracts = tokenize_source_cite_pair(cleaned_source_abstracts, cleaned_cited_abstracts)

In [11]:
def create_vocab_set(input_list, input_dict):
    vocab = set()
    for li in input_list:
        for text in li: 
            vocab.add(text)
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            for li in input_dict[paper][rel]:
                for text in li: 
                    vocab.add(text)
    return vocab

vocab = create_vocab_set(tokenized_source_abstracts, tokenized_cited_abstracts)

In [12]:
def get_term_freq(input_list, input_dict):
    source_tf_list = []
    for doc in input_list:
        source_tf_list.append(Counter(doc))
    cited_tf_dict = defaultdict(lambda: defaultdict(lambda: []))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            tf_list = []
            for doc in input_dict[paper][rel]:
                tf_list.append(Counter(doc))
            cited_tf_dict[paper][rel] = tf_list
    return source_tf_list, cited_tf_dict

source_tf_abstracts, cited_tf_abstracts = get_term_freq(tokenized_source_abstracts, tokenized_cited_abstracts)

In [13]:
doc_num = len(source_tf_abstracts)
papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
for paper in papers:
    relevance = ['Relevant', 'Less Relevant']
    for rel in relevance:
        doc_num += len(cited_tf_abstracts[paper][rel])

def get_doc_freq(word, input_list, input_dict):
    doc_freq = 0
    for doc in input_list:
        if doc[word] != 0:
            doc_freq += 1
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            for doc in input_dict[paper][rel]:
                if doc[word] != 0:
                    doc_freq += 1
    return doc_freq

def get_idf_freq(vocab, input_list, input_dict, doc_num):
    # pass in input_list/dict of counters for term counts
    idf_dict = {}
    for word in vocab:
        word_doc_freq = get_doc_freq(word, input_list, input_dict)
        for doc in input_list:
            idf_value = np.log2(doc_num/word_doc_freq)
            idf_dict[word] = idf_value
    return idf_dict

idf_dict = get_idf_freq(vocab, source_tf_abstracts, cited_tf_abstracts, doc_num)

In [14]:
def get_tfidf_dict(vocab, tf_scores, idf_dict):
    output_dict = {}
    for word in vocab:
        output_dict[word] = tf_scores[word]*idf_dict[word]
    return output_dict
    
def get_tfidf_list_dict(vocab, input_list, input_dict, idf_dict):
    source_tfidf_list = []
    for doc in input_list:
        source_tfidf_list.append(get_tfidf_dict(vocab, doc, idf_dict))
    cited_tfidf_dict = defaultdict(lambda: defaultdict(lambda: []))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            tfidf_list = []
            for doc in input_dict[paper][rel]:
                tfidf_list.append(get_tfidf_dict(vocab, doc, idf_dict))
            cited_tfidf_dict[paper][rel] = tfidf_list
    return source_tfidf_list, cited_tfidf_dict

source_tfidf_abstracts, cited_tfidf_abstracts = get_tfidf_list_dict(vocab, source_tf_abstracts, cited_tf_abstracts, idf_dict)

In [15]:
def convert_tfidf_dict_to_vector(input_dict, vocab):
    tfidf_vector = []
    for word in vocab:
        tfidf_vector.append(input_dict[word])
    return np.array(tfidf_vector)

def get_tfidf_vectorized_documents(input_list, input_dict):
    source_tfidf_vectors = []
    for doc in input_list:
        source_tfidf_vectors.append(convert_tfidf_dict_to_vector(doc, vocab))
    cited_tfidf_vectors = defaultdict(lambda: defaultdict(lambda: []))
    papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    for paper in papers:
        relevance = ['Relevant', 'Less Relevant']
        for rel in relevance:
            tfidf_vector_list = []
            for doc in input_dict[paper][rel]:
                tfidf_vector_list.append(convert_tfidf_dict_to_vector(doc, vocab))
            cited_tfidf_vectors[paper][rel] = tfidf_vector_list
    return source_tfidf_vectors, cited_tfidf_vectors

source_tfidf_vectors, cited_tfidf_vectors = get_tfidf_vectorized_documents(source_tfidf_abstracts, cited_tfidf_abstracts)

In [16]:
def cosine_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_most_relevant(source_document_vector, cited_document_vectors_relevant, cited_document_vectors_less_relevant):
    sim_scores = []
    for i, rel_doc in enumerate(cited_document_vectors_relevant):
        sim_scores.append(('Relevant', i, cosine_sim(source_document_vector, rel_doc)))
    for i, not_rel_doc in enumerate(cited_document_vectors_less_relevant):
        sim_scores.append(('Less Relevant', i, cosine_sim(source_document_vector, not_rel_doc)))
    return sorted(sim_scores, key=lambda x: x[2], reverse=True)

In [35]:
def map_index_to_cited_pdfs(index, paper, relevance, cited_pdf_paths):
    return cited_pdf_paths[paper][relevance][index]

def calculate_paper_relevance_scores(source_tfidf_vectors, cited_tfidf_vectors, source_pdf_paths, cited_pdf_paths):
    output_dict = {}
    for i, source_vector in enumerate(source_tfidf_vectors):
        papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
        for paper in papers:
            relevance_scores = get_most_relevant(source_vector, cited_tfidf_vectors[paper]['Relevant'], cited_tfidf_vectors[paper]['Less Relevant'])
            output_dict[source_pdf_paths[i]] = list(map(lambda x: (x[0], map_index_to_cited_pdfs(x[1], paper, x[0], cited_pdf_paths), x[2]), relevance_scores))
    return output_dict


In [36]:
relevance_dict = calculate_paper_relevance_scores(source_tfidf_vectors, cited_tfidf_vectors, source_pdf_paths, cited_pdf_paths)

In [37]:
def calculate_avg_prec(relevance_scores, num):
    prec_num = 0
    prec_denom = 0
    cumulative = 0
    for i in range(num):
        prec_denom += 1
        if relevance_scores[i][0] == 'Relevant':
            prec_num += 1
        cumulative += prec_num / prec_denom
    return cumulative / num

def calculate_map(relevance_dict, num):
    precisions = []
    for key in relevance_dict.keys():
        precisions.append(calculate_avg_prec(relevance_dict[key], num))
    return sum(precisions)/len(precisions)

print('MAP@4 for VSM model using TFIDF Vectors:', calculate_map(relevance_dict, 4))
# Individual AP@4
for key in relevance_dict.keys():
    print(f'AP@4 for {key}', calculate_avg_prec(relevance_dict[key], 4))

MAP@4 for VSM model using TFIDF Vectors: 0.7380952380952381
AP@4 for ./data/paper1\Shafahi2018.pdf 0.9375
AP@4 for ./data/paper2\Jagieski2020.pdf 1.0
AP@4 for ./data/paper3\Biggio2017.pdf 0.41666666666666663
AP@4 for ./data/paper4\Brusseau2022.pdf 0.8541666666666666
AP@4 for ./data/paper5\Dobrev2022.pdf 1.0
AP@4 for ./data/paper6\Morsa2022.pdf 0.47916666666666663
AP@4 for ./data/paper7\Gohar2022.pdf 0.47916666666666663


In [38]:
print('Below are the top4 most relevant documents according to the VSM model')
for key in relevance_dict.keys():
    print('')
    print(f'Top 4 most relevant documents for {key}')
    print(relevance_dict[key][:4])
    print('')

Below are the top4 most relevant documents according to the VSM model

Top 4 most relevant documents for ./data/paper1\Shafahi2018.pdf
[('Relevant', './data/paper7/Cited/Relevant\\Baker2020.pdf', 0.03478826233833435), ('Relevant', './data/paper7/Cited/Relevant\\Berner2019.pdf', 0.027017218337367855), ('Relevant', './data/paper7/Cited/Relevant\\Gong2014.pdf', 0.016867298059795817), ('Less Relevant', './data/paper7/Cited/Less Relevant\\Olimid12020.pdf', 0.015772062713223494)]


Top 4 most relevant documents for ./data/paper2\Jagieski2020.pdf
[('Relevant', './data/paper7/Cited/Relevant\\Hougardy2010.pdf', 0.029245724444194106), ('Relevant', './data/paper7/Cited/Relevant\\Baker2020.pdf', 0.022833135084655336), ('Relevant', './data/paper7/Cited/Relevant\\Gong2014.pdf', 0.01914741069739826), ('Relevant', './data/paper7/Cited/Relevant\\Berner2019.pdf', 0.01913156061278095)]


Top 4 most relevant documents for ./data/paper3\Biggio2017.pdf
[('Less Relevant', './data/paper7/Cited/Less Relevant\\

In [39]:
print('Below are the top4 most relevant documents according to the VSM model')
for key in relevance_dict.keys():
    print('')
    print(f'Top 4 most relevant documents for {key}')
    print(list(map(lambda x: (x[1], x[2]), relevance_dict[key][:4])))
    print('')

Below are the top4 most relevant documents according to the VSM model

Top 4 most relevant documents for ./data/paper1\Shafahi2018.pdf
[('./data/paper7/Cited/Relevant\\Baker2020.pdf', 0.03478826233833435), ('./data/paper7/Cited/Relevant\\Berner2019.pdf', 0.027017218337367855), ('./data/paper7/Cited/Relevant\\Gong2014.pdf', 0.016867298059795817), ('./data/paper7/Cited/Less Relevant\\Olimid12020.pdf', 0.015772062713223494)]


Top 4 most relevant documents for ./data/paper2\Jagieski2020.pdf
[('./data/paper7/Cited/Relevant\\Hougardy2010.pdf', 0.029245724444194106), ('./data/paper7/Cited/Relevant\\Baker2020.pdf', 0.022833135084655336), ('./data/paper7/Cited/Relevant\\Gong2014.pdf', 0.01914741069739826), ('./data/paper7/Cited/Relevant\\Berner2019.pdf', 0.01913156061278095)]


Top 4 most relevant documents for ./data/paper3\Biggio2017.pdf
[('./data/paper7/Cited/Less Relevant\\Olimid12020.pdf', 0.024665389345255286), ('./data/paper7/Cited/Relevant\\O’Sullivan2019.pdf', 0.017964017615181002), (