Todos: 
- Encoding: DONE
- Models: use a SOTA transformer model (e.g. T5)?
- Search/Ranking: Not that difficult once the model can produce predictions (use softmax/logit scores as the relevance score; can naively search the PDF for the location of each citation)

In [44]:
from PyPDF2 import PdfReader
from os import listdir
from os.path import isfile, join
from collections import defaultdict
import re
from cleantext import clean

In [56]:
def get_pdfs(initial_path='./data/', papers=None):
    """Collect the file paths of each source paper and of the papers it cites.

    The directory layout read under ``initial_path`` is::

        <paper>/<source file>
        <paper>/Cited/Relevant/<cited files>
        <paper>/Cited/Less Relevant/<cited files>

    Args:
        initial_path: Root directory holding one sub-directory per paper.
        papers: Names of the paper sub-directories to scan. Defaults to
            ``paper1`` through ``paper7``.

    Returns:
        A ``(source_pdfs, cited_pdfs)`` tuple. ``source_pdfs`` holds one file
        path per paper, in the order of ``papers`` (the first regular file of
        the paper directory in sorted order). ``cited_pdfs`` maps
        paper name -> relevance label -> list of cited file paths.

    Raises:
        FileNotFoundError: If a paper directory contains no regular file, or
            (raised by ``listdir``) if an expected directory does not exist.
    """
    if papers is None:
        papers = ['paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7']
    relevance = ['Relevant', 'Less Relevant']

    def list_files(directory):
        # listdir() returns entries in arbitrary order; sorting keeps the
        # chosen source file and the cited lists reproducible across runs.
        candidates = (join(directory, name) for name in sorted(listdir(directory)))
        return [path for path in candidates if isfile(path)]

    source_pdfs = []  # one source-paper file path per entry of `papers`
    cited_pdfs = defaultdict(lambda: defaultdict(list))  # paper -> {relevance label: [paths]}
    for paper in papers:
        source_paper_path = join(initial_path, paper)
        pdfs = list_files(source_paper_path)
        if not pdfs:
            raise FileNotFoundError(
                f'no source file found in {source_paper_path!r}'
            )
        source_pdfs.append(pdfs[0])
        for rel in relevance:
            cited_pdfs[paper][rel] = list_files(join(source_paper_path, 'Cited', rel))
    return source_pdfs, cited_pdfs

# Resolve the source-paper paths and the per-paper cited-paper paths from disk.
source_pdf_paths, cited_pdf_paths = get_pdfs()


In [57]:
def add_page_text(input_list, path):
    """Append the text of the first page mentioning 'abstract' to ``input_list``.

    Pages of the PDF at ``path`` are scanned in order and matched
    case-insensitively. If no page matches, the placeholder string
    ``'none'`` is appended instead. The list is modified in place and
    nothing is returned.
    """
    # Generator keeps extraction lazy: pages after the first hit are never read.
    page_texts = (page.extract_text() for page in PdfReader(path).pages)
    abstract_page = next(
        (text for text in page_texts if re.search('abstract', text, flags=re.I)),
        'none',
    )
    input_list.append(abstract_page)

def get_abstract_page(source_pdf_paths, cited_pdf_paths):
    """Extract the abstract page text for every source and cited PDF.

    Args:
        source_pdf_paths: Paths of the source papers.
        cited_pdf_paths: Mapping paper name -> relevance label -> list of
            cited-paper paths.

    Returns:
        ``(source_abstracts, cited_abstracts)``: a list with one page text
        per source path, and a nested mapping with the same shape as
        ``cited_pdf_paths`` holding one page text per cited path.
    """
    def collect_pages(pdf_paths):
        # add_page_text appends exactly one entry per path.
        pages = []
        for pdf_path in pdf_paths:
            add_page_text(pages, pdf_path)
        return pages

    paper_names = ('paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7')
    relevance_labels = ('Relevant', 'Less Relevant')

    source_abstracts = collect_pages(source_pdf_paths)
    cited_abstracts = defaultdict(lambda: defaultdict(lambda: []))
    for paper in paper_names:
        for rel in relevance_labels:
            cited_abstracts[paper][rel] = collect_pages(cited_pdf_paths[paper][rel])
    return source_abstracts, cited_abstracts
        
# Pull the abstract page text out of every source and cited PDF.
source_abstracts, cited_abstracts = get_abstract_page(source_pdf_paths, cited_pdf_paths)

In [63]:
def clean_text(text):
    """Return ``text`` normalized by ``clean`` with this project's settings.

    Unicode fixing, ASCII transliteration, lowercasing and line-break
    stripping are switched on; every replacement/removal option is off.
    """
    options = dict(
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,            # fully strip line breaks instead of normalizing them
        no_urls=False,                  # do not replace URLs with a special token
        no_emails=False,                # do not replace email addresses
        no_phone_numbers=False,         # do not replace phone numbers
        no_numbers=False,               # do not replace numbers
        no_digits=False,                # do not replace digits
        no_currency_symbols=False,      # do not replace currency symbols
        no_punct=False,                 # keep punctuation
        # Replacement tokens, used only if the matching option above is enabled.
        replace_with_punct="",
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                      # set to 'de' for German special handling
    )
    return clean(text, **options)

def clean_source_cite_pair(source_list, cited_dict):
    """Apply ``clean_text`` to every source text and every cited text.

    Args:
        source_list: Texts of the source papers.
        cited_dict: Mapping paper name -> relevance label -> list of texts.

    Returns:
        ``(cleaned_source_list, cleaned_cited_dict)`` with the same shapes
        as the inputs.
    """
    paper_names = ('paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7')
    relevance_labels = ('Relevant', 'Less Relevant')

    cleaned_source_list = [clean_text(text) for text in source_list]
    cleaned_cited_dict = defaultdict(lambda: defaultdict(lambda: []))
    for paper in paper_names:
        for rel in relevance_labels:
            cleaned_cited_dict[paper][rel] = [
                clean_text(text) for text in cited_dict[paper][rel]
            ]
    return cleaned_source_list, cleaned_cited_dict

# Normalize the extracted page texts (source list and cited mapping alike).
cleaned_source_abstracts, cleaned_cited_abstracts = clean_source_cite_pair(source_abstracts, cited_abstracts)

In [66]:
def truncate_helper(text, max_len=512):
    """Return at most ``max_len`` characters of the text after 'abstract'.

    Args:
        text: Page text, or the placeholder ``'none'``. The marker match is
            case-sensitive, so the text is expected to be lowercased already.
        max_len: Maximum number of characters to keep (default 512).

    Returns:
        ``'none'`` unchanged; otherwise the text following the first
        occurrence of ``'abstract'``, cut to ``max_len`` characters. If the
        marker does not occur, the start of ``text`` itself is used.
    """
    if text == 'none':
        return text
    # partition() splits on the first marker only, so a later "abstract"
    # (e.g. inside "abstraction") no longer cuts the abstract short, and a
    # missing marker cannot raise IndexError.
    before, marker, after = text.partition('abstract')
    body = after if marker else before
    # Always slice: the previous length check was inverted and returned
    # over-long text untruncated.
    return body[:max_len]

def truncate_to_512(input_list, input_dict):
    """Run ``truncate_helper`` over every source text and every cited text.

    Args:
        input_list: Texts of the source papers.
        input_dict: Mapping paper name -> relevance label -> list of texts.

    Returns:
        ``(output_list, output_dict)`` with the same shapes as the inputs,
        each text replaced by the result of ``truncate_helper``.
    """
    paper_names = ('paper1', 'paper2', 'paper3', 'paper4', 'paper5', 'paper6', 'paper7')
    relevance_labels = ('Relevant', 'Less Relevant')

    output_list = [truncate_helper(text) for text in input_list]
    output_dict = defaultdict(lambda: defaultdict(lambda: []))
    for paper in paper_names:
        for rel in relevance_labels:
            output_dict[paper][rel] = [
                truncate_helper(text) for text in input_dict[paper][rel]
            ]
    return output_list, output_dict

# Reduce each cleaned page to the text that follows its 'abstract' marker.
source_abstract_text, cited_abstract_text = truncate_to_512(cleaned_source_abstracts, cleaned_cited_abstracts)