Download pdfs

In [None]:
! git clone https://github.com/arpytanshu/parspec.git
! pip install -r parspec/requirements.txt
! mkdir -p parspec/data/test

In [20]:
! python parspec/download.py --meta_path=parspec/resources/parspec_test_data.csv --dst_dir_path=parspec/data/test --threads=25
! python parspec/prep_data.py --meta_path=parspec/resources/parspec_test_data.csv --src_dir=data/test --dst_file_path=parspec/data/test-dataset.csv

[|] Progress: [|                                                 ] 1% {1 / 80}[/] Progress: [|                                                 ] 2% {2 / 80}[-] Progress: [||                                                ] 4% {3 / 80}[\] Progress: [||                                                ] 5% {4 / 80}[|] Progress: [|||                                               ] 6% {5 / 80}[/] Progress: [||||                                              ] 8% {6 / 80}[-] Progress: [||||                                              ] 9% {7 / 80}[\] Progress: [|||||                                             ] 10% {8 / 80}[|] Progress: [||||||                                            ] 11% {9 / 80}[/] Progress: [||||||                                            ] 12% {10 / 80}[-] Progress: [|||||||                                           ] 14% {11 / 80}[\] Progress: [||||||||                                          ] 15% {12 / 80}[|] Progress: [||||||||                    

In [34]:

import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np

from pathlib import Path
from parspec.download import download_link
from parspec.prep_data import extract_text_from_pdf



def infer(model, tokenizer, url=None, dst_file=None, dbg=True):

    if (url == None) and (dst_file == None):
        print('No url or dst_file provided. Exiting')
        return

    if url != None:
        dst_file = 'temp.pdf'
        if os.path.exists(dst_file):
            os.remove(dst_file)

        if download_link(url, dst_file):
            if dbg: print('Download successful')

    try:
        text = extract_text_from_pdf(dst_file)
        if dbg: print('Text extraction successful')
    except Exception as e:
        text = ' '
        if dbg: print('Text extraction from pdf failed:', e)

    input_ids = tokenizer.encode(text)
    input_ids = input_ids[:min(len(input_ids), model.config.max_position_embeddings)]
    input_ids = torch.tensor(input_ids).view(1, -1)
    input_ids = input_ids.to(model.device)
    with torch.no_grad():

        outputs = model(input_ids)
    class_ix = outputs.logits.argmax(1).item()
    nl_class = {1: 'Is lighting product? YES', 0: 'Is lighting product? NO'}[class_ix]
    if dbg: print(nl_class)

    if os.path.exists('temp.pdf'):
        os.remove('temp.pdf')

    return class_ix


def evaluate(chkpt_path, files_basepath, test_meta_df='parspec/data/test-dataset.csv'):

    tokenizer = AutoTokenizer.from_pretrained(chkpt_path)
    model = AutoModelForSequenceClassification.from_pretrained(chkpt_path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    model.eval()

    files_basepath = Path(files_basepath)
    test_meta_df = pd.read_csv(test_meta_df)

    preds = []
    labels = []

    for ix, row in test_meta_df.iterrows():
        file_name = row.ID + '.pdf'
        file_path = files_basepath / file_name
        class_ix = infer(model, tokenizer, dst_file=str(file_path), dbg=False)
        # class_ix = infer(model, tokenizer, url=row.URL, dbg=False)
        true_label = row['Is lighting product?']
        # print(f'{ix} - {class_ix} - {true_label}')
        preds.append(class_ix)
        labels.append(true_label)

    preds = np.array(preds)
    labels = np.array(labels)

    correct = (preds == labels).sum().item()
    tp = ((preds == 1) & (labels == 1)).sum()
    tn = ((preds == 0) & (labels == 0)).sum()
    fp = ((preds == 1) & (labels == 0)).sum()
    fn = ((preds == 0) & (labels == 1)).sum()

    print(f'Accuracy: {correct / len(labels)}')

    print(f'TP: {tp} \t| FN: {fn}')
    print(f'FP: {fp} \t| TN: {tn}')


# RUN EVALUATION ON PROVIDED TEST DATA

In [35]:
evaluate(chkpt_path='/content/parspec/checkpoints/checkpoint-270', files_basepath='parspec/data/test', test_meta_df='parspec/data/test-dataset.csv')

Token indices sequence length is longer than the specified maximum sequence length for this model (2869 > 512). Running this sequence through the model will result in indexing errors


Accuracy: 0.875
TP: 16 	| FN: 4
FP: 6 	| TN: 54


# INFERENCE

In [39]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

chkpt_path = '/content/parspec/checkpoints/checkpoint-270'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(chkpt_path)
model = AutoModelForSequenceClassification.from_pretrained(chkpt_path).to(device)
model.eval()


url = 'https://www.cooperlighting.com/api/assets/v1/file/CLS/content/347f567de4414421a1dcad3f014a0c77/corelite-continua-sq4-brochure'
infer(model, tokenizer, url=url, dbg=True)

Successfully downloaded https://www.cooperlighting.com/api/assets/v1/file/CLS/content/347f567de4414421a1dcad3f014a0c77/corelite-continua-sq4-brochure
Download successful


Token indices sequence length is longer than the specified maximum sequence length for this model (2440 > 512). Running this sequence through the model will result in indexing errors


Text extraction successful
Is lighting product? YES


1