# Material Property Extractor Demo

In [1]:
import os
import sys
import json
import logging
import pandas as pd
from datetime import datetime
from transformers import HfArgumentParser, set_seed
from seqlbtoolkit.io import set_logging, logging_args, save_json

from pipeline.args import Arguments
from pipeline.crawler import crawl
from pipeline.parser import parse
from pipeline.heuristic_extractor import extract
from pipeline.dataset_constructor import construct_dataset

# Logger for this notebook's own messages, named after the current module (`__name__`).
logger = logging.getLogger(__name__)
# Configure logging through the seqlbtoolkit helper, passing None as its only argument.
# NOTE(review): presumably None means "no log file, console output only" — confirm
# against the seqlbtoolkit `set_logging` signature.
set_logging(None)
# Let records at INFO level and above through this notebook's logger.
logger.setLevel(logging.INFO)



## Initialize and assign arguments

The `Arguments` class was originally designed to accept arguments from the command line,
but it also works in a Jupyter environment.

In [3]:
# Create the configuration object with no constructor arguments, so every field starts at
# whatever default `Arguments` declares. The following cells set fields on this same
# object and pass it to each pipeline stage (`crawl`, `parse`, `extract`).
args = Arguments()

## Download articles

In [3]:
# Stage switch: `crawl` below only runs when this flag is truthy.
args.do_crawling = True
# NOTE(review): presumably the JSON file listing the DOIs to fetch — confirm the expected
# schema against pipeline.crawler.
args.doi_file_path = './resources/dois.json'
# NOTE(review): presumably the directory the downloaded articles are written to; the parse
# cell sets the same path again before calling `parse` — confirm.
args.raw_article_dir = './data/web_articles/'

# `crawl` receives the whole `args` object; which fields it reads is not visible here.
if args.do_crawling:
    crawl(args)

  0%|          | 0/32 [00:00<?, ?it/s]

[WDM] - Downloading: 100%|██████████| 7.26M/7.26M [00:00<00:00, 28.2MB/s]


## Parse articles

In [4]:
# Stage switch: `parse` below only runs when this flag is truthy.
args.do_parsing = True
# Same value as assigned in the download cell.
# NOTE(review): presumably repeated so this cell can run without the crawl cell — confirm.
args.raw_article_dir = './data/web_articles/'
# NOTE(review): presumably the output directory for the parsed articles — confirm against
# pipeline.parser.
args.processed_article_dir = './data/web_articles_parsed/'

# `parse` receives the whole `args` object; which fields it reads is not visible here.
if args.do_parsing:
    parse(args)

11/15/2022 13:59:11 - INFO - pipeline.parser -   Parsing file 10.1002&sl;adfm.201301345.html
11/15/2022 13:59:12 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:59:12 - INFO - pipeline.parser -   Parsing file 10.1002&sl;macp.201100248.html
11/15/2022 13:59:12 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:59:12 - INFO - pipeline.parser -   Parsing file 10.1007&sl;s10008-009-0968-1.html
11/15/2022 13:59:12 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:59:12 - INFO - pipeline.parser -   Parsing file 10.1007&sl;s11581-018-2788-8.html
11/15/2022 13:59:13 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:59:13 - INFO - pipeline.parser -   Parsing file 10.1016&sl;j.compositesb.2017.03.060.html
11/15/2022 13:59:14 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:59:14 - INFO - pipeline.parser -   Parsing file 10.1016&sl;j.electacta.2009.05.055.html
11/15/2022 13:59:14 - INFO - pipeline.parser -   Saving results...
11/15/2022 13:

## Information extraction

In [5]:
# Stage switch checked by the next cell before it calls `extract`.
args.do_extraction = True
# A list holding a single keyword-file path.
# NOTE(review): presumably more keyword files can be appended to target other
# properties — confirm against pipeline.heuristic_extractor.
args.keyword_path = [r"./resources/tensile.json"]
# Output-format switches; the recorded run log for `extract` reports
# "Saving HTML results..." and "Saving JSONL results..." with both enabled.
args.save_html = True
args.save_jsonl = True

In [6]:
# Run the extraction stage only when the `do_extraction` flag set in the previous cell is truthy.
# NOTE(review): the recorded run log shows a model being loaded from ./models/pet-mm-model,
# so that directory presumably has to exist before this cell is run — confirm.
if args.do_extraction:
    extract(args)


11/15/2022 14:00:13 - INFO - xfms.tk_cx.src.args -   Setting <class 'xfms.tk_cx.src.args.BertNERConfig'> parameters from ./models/pet-mm-model/bert_config.json.
11/15/2022 14:00:13 - INFO - xfms.tk_cx.src.train -   Loading model from ./models/pet-mm-model
11/15/2022 14:00:15 - INFO - pipeline.heuristic_extractor -   Processing file 10.1002&sl;adfm.201301345.pt
11/15/2022 14:00:15 - INFO - pipeline.heuristic_extractor -   Processing file 10.1002&sl;macp.201100248.pt
11/15/2022 14:00:15 - INFO - pipeline.heuristic_extractor -   Processing file 10.1007&sl;s10008-009-0968-1.pt
11/15/2022 14:00:15 - INFO - pipeline.heuristic_extractor -   Processing file 10.1007&sl;s11581-018-2788-8.pt
11/15/2022 14:00:16 - INFO - pipeline.heuristic_extractor -   Processing file 10.1016&sl;j.compositesb.2017.03.060.pt
11/15/2022 14:00:26 - INFO - pipeline.heuristic_extractor -   Saving HTML results...
11/15/2022 14:00:26 - INFO - pipeline.heuristic_extractor -   Saving JSONL results...
11/15/2022 14:00:26 -