In [15]:
from datasets import load_dataset
from collections import Counter
import spacy
import json

from src.preprocess import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
#!python3 -m spacy download en_core_web_sm

In [2]:
# Load the dataset registered under the id "adsabs/WIESP2022-NER" via
# `datasets.load_dataset`; the trailing bare expression displays its splits.
dataset = load_dataset("adsabs/WIESP2022-NER")
dataset

DatasetDict({
    train: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1753
    })
    validation: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1366
    })
    test: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 2505
    })
})

# Unbiased Domain Knowledge

In [3]:
# Preprocess the entity tags of the validation split.
# NOTE(review): process_entity_tag is presumably provided by the star import from
# src.preprocess; its return values are not documented here. Later cells index
# `ner_tokens` by tag name (e.g. "B-Organization") to obtain that tag's tokens —
# confirm the meaning of `processed_tags` and `text` against the helper.
processed_tags, ner_tokens, text = process_entity_tag(data=dataset['validation'])

In [6]:
# Display the target entity types. `entity_name` is not assigned in the cells
# above; presumably it is provided by the star import from src.preprocess.
entity_name

['Organization',
 'Observatory',
 'CelestialObject',
 'Event',
 'CelestialRegion',
 'Identifier']

In [4]:
# Display the target NER tags (the B-/I- prefixed form of each target entity).
# `ner_tags` is not assigned in the cells above; presumably it is provided by the
# star import from src.preprocess.
ner_tags

['B-Organization',
 'B-Observatory',
 'B-CelestialObject',
 'B-Event',
 'B-CelestialRegion',
 'B-Identifier',
 'I-Organization',
 'I-Observatory',
 'I-CelestialObject',
 'I-Event',
 'I-CelestialRegion',
 'I-Identifier']

## High frequency sub-token

For each entity tag (both the `B-` and `I-` forms), we generate a list of high-frequency sub-tokens containing:
- top 50 3-grams
- top 50 5-grams

In [20]:
# Collect, for every B-/I- tag of every target entity, the 50 most frequent
# character 3-grams and 5-grams; keys look like "B-Organization_3_grams".
sub_tokens = {}

for entity in entity_name:
    for bi in ("B", "I"):
        tag = f"{bi}-{entity}"
        tokens = ner_tokens[tag]
        for n in (3, 5):
            frequent = find_frequent_subword(tokens, n_gram=n, top=50)
            # Keep only the first element of each returned item.
            sub_tokens[f"{tag}_{n}_grams"] = [item[0] for item in frequent]

## Pattern Analysis

In [21]:
# Feature name -> regular-expression pattern (kept as a plain string). Filled in
# by the per-entity pattern-analysis cells and later written to JSON.
regex = {}

### Organization:
+ Start with capital letters
+ Name / Country / Org

In [10]:
# Rank the 100 most common tokens tagged B- or I-Organization.
entity = 'Organization'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('of', 1202),
 ('National', 591),
 ('Science', 557),
 ('University', 524),
 ('Research', 460),
 ('and', 452),
 ('for', 382),
 ('Institute', 320),
 ('de', 311),
 ('Space', 277),
 ('Foundation', 266),
 ('NASA', 253),
 ('Council', 252),
 ('University,', 229),
 ('NSF', 137),
 ('European', 129),
 ('Aeronautics', 115),
 ('Technology', 114),
 ('Laboratory,', 112),
 ('Center', 102),
 ('Astronomy', 102),
 ('Centre', 101),
 ('Ministry', 98),
 ('Foundation,', 97),
 ('California', 95),
 ('State', 92),
 ('Department', 90),
 ('the', 84),
 ('Universities', 82),
 ('STFC', 82),
 ('Technology,', 81),
 ('in', 80),
 ('Physics', 78),
 ('Foundation.', 77),
 ('Office', 75),
 ('Astrophysics', 72),
 ('Propulsion', 71),
 ('Energy', 71),
 ('Jet', 70),
 ('Association', 66),
 ('Australian', 65),
 ('Telescope', 65),
 ('Astronomy,', 65),
 ('University.', 61),
 ('Natural', 59),
 ('Administration.', 58),
 ('Facilities', 56),
 ('Data', 55),
 ('Max', 53),
 ('Academy', 53),
 ('Planck', 53),
 ('Sciences', 51),
 ('China',

### Observatory:
+ Contain 'Observatory'
+ Location
+ All alphabet chars are uppercase

In [11]:
# Rank the 100 most common tokens tagged B- or I-Observatory.
entity = 'Observatory'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('Observatory', 184),
 ('Observatory,', 93),
 ('National', 90),
 ('ALMA', 79),
 ('Astronomical', 56),
 ('ESO', 50),
 ('Astronomy', 48),
 ('Radio', 36),
 ('de', 35),
 ('of', 31),
 ('Astrophysical', 29),
 ('Swift', 27),
 ('Observatory.', 27),
 ('Smithsonian', 24),
 ('SDO', 24),
 ('Keck', 23),
 ('Fermi', 20),
 ('Observatorio', 20),
 ('ESO,', 19),
 ('Optical', 19),
 ('Paranal', 18),
 ('del', 18),
 ('NAOJ.', 17),
 ('La', 17),
 ('Joint', 16),
 ('Roque', 16),
 ('los', 16),
 ('Muchachos', 16),
 ('Silla', 15),
 ('Observatoire', 12),
 ('Cerro', 12),
 ('Solar', 12),
 ('W.', 11),
 ('M.', 11),
 ('Cumbres', 11),
 ('European', 10),
 ('Tololo', 10),
 ('Inter-American', 10),
 ('Gemini', 9),
 ('Southern', 9),
 ('Japan.', 9),
 ('East', 8),
 ('di', 8),
 ('(Las', 8),
 ('Asian', 8),
 ('United', 7),
 ('Arecibo', 7),
 ('States', 7),
 ('Naval', 7),
 ('Observatories', 7),
 ('South', 6),
 ('Shanghai', 6),
 ('Las', 6),
 ('African', 6),
 ('China,', 6),
 ('Japan,', 6),
 ('Osservatorio', 5),
 ('W.M.', 5),
 ('Observ

In [22]:
# Pattern for the substring "observatory" with a lower- or upper-case first letter.
regex['contain_obs'] = r"observatory|Observatory"

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Gala', 'Milky', 'solar'
+ Numbers + capital letters
+ All alphabet chars are uppercase
+ contain '-'

In [24]:
# Rank the 100 most common tokens tagged B- or I-CelestialObject.
entity = 'CelestialObject'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('NGC', 83),
 ('HD', 61),
 ('SN', 43),
 ('Cyg', 39),
 ('Galactic', 38),
 ('solar', 33),
 ('Sgr', 23),
 ('Milky', 23),
 ('GW170817', 20),
 ('SDSS', 20),
 ('Tau', 20),
 ('CMB', 18),
 ('KIC', 18),
 ('Leo', 17),
 ('UGC', 16),
 ('A', 16),
 ('ALESS', 16),
 ('X-1', 16),
 ('T', 15),
 ('Way', 15),
 ('M', 14),
 ('LMC', 14),
 ('Orion', 13),
 ('Sun', 12),
 ('A*', 12),
 ('I', 12),
 ('Upper', 11),
 ('Tuc', 11),
 ('Carina', 11),
 ('α', 11),
 ('S2', 10),
 ('PSR', 10),
 ('MW', 10),
 ('B', 10),
 ('SGR', 10),
 ('GG', 10),
 ('Local', 9),
 ('Sun.', 9),
 ('54', 9),
 ('Earth', 8),
 ('Galaxy', 8),
 ('FSR', 8),
 ('Solar', 8),
 ('Sun,', 8),
 ('B211', 8),
 ('III', 8),
 ('OB2', 8),
 ('122.1', 8),
 ('3198', 8),
 ('Cen', 7),
 ('IRAS', 7),
 ('UBC', 7),
 ('A,', 7),
 ('1H', 7),
 ('TXS', 7),
 ('SAX', 7),
 ('b', 7),
 ('Dor', 7),
 ('104', 7),
 ('Group', 7),
 ('100546', 7),
 ('Earth-like', 6),
 ('Taurus', 6),
 ('HH', 6),
 ('Kepler-93b', 6),
 ('PKS', 6),
 ('SNR', 6),
 ('DF', 6),
 ('1ES', 6),
 ('iPTF', 6),
 ('AI', 6),
 ('M

In [25]:
# Patterns: any of the substrings listed for CelestialObject, and one or more
# capital letters immediately followed by one or more digits (e.g. "GW170817").
regex['contain_co'] = r"Sun|Earth|Gala|Milky|solar"
regex['contain_cap_num'] = r"[A-Z]+\d+"

# Literal sub-tokens: the hyphen and a few Greek letters.
sub_tokens['-'] = ["-"]
sub_tokens['greek_letter'] = ['α', 'ε', 'θ']

### Event:
+ Contain year/date

In [13]:
# Rank the 100 most common tokens tagged B- or I-Event.
entity = 'Event'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('and', 11),
 ('in', 6),
 ('workshop', 5),
 ('of', 4),
 ('Program', 3),
 ('for', 3),
 ('la', 3),
 ('the', 3),
 ('Coronal', 3),
 ('de', 3),
 ('Summer', 3),
 ('Galactic', 2),
 ('ISIMA', 2),
 ('“Galactic', 2),
 ('Kavli', 2),
 ('Academy', 2),
 ('Magnetic', 2),
 ('conference', 2),
 ('Formation', 2),
 ('programme', 2),
 ('Archaeology', 2),
 ('Precision', 2),
 ('Stellar', 2),
 ('Astrophysics”', 2),
 ('Observations', 2),
 ('2015', 1),
 ('Cloud', 1),
 ('La', 1),
 ('deep', 1),
 ('Convocatoria', 1),
 ('“The', 1),
 ('“New', 1),
 ('“Decoding', 1),
 ('Meteoroid', 1),
 ('Estrategia', 1),
 ('supermassive', 1),
 ('‘Protoplanetary', 1),
 ('LSST:', 1),
 ('infrared', 1),
 ('“Solving', 1),
 ('ESANN-2018', 1),
 ('KITP', 1),
 ('2019', 1),
 ('‘Gaia', 1),
 ('Global', 1),
 ('Physics', 1),
 ('The', 1),
 ('2013', 1),
 ('3rd', 1),
 ('(International', 1),
 ('ISIMA,', 1),
 ('NSF', 1),
 ('‘The', 1),
 ('“Sub-arcsec', 1),
 ('“Implications', 1),
 ('EUROWD-21', 1),
 ('Dartmouth', 1),
 ('Astronomy', 1),
 ('Foreign', 1),


In [26]:
# Four consecutive digits. NOTE(review): this is not limited to plausible years —
# when used as a search it also matches inside longer digit runs such as "100546".
regex['contain_year'] = r"\d{4}"

### CelestialRegion
- contain '°', '′'
- contain '>', '<', '=', '|'
- single lower case character (not 'a')

In [27]:
# Rank the 100 most common tokens tagged B- or I-CelestialRegion.
entity = 'CelestialRegion'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('=', 11),
 ('Galactic', 10),
 ('b', 7),
 ('field', 7),
 ('extragalactic', 5),
 ('|', 5),
 ('BICEP2', 4),
 ('decl.', 4),
 ('COSMOS', 3),
 ('LR', 3),
 ('Universe', 3),
 ('l', 3),
 ('h', 3),
 ('m', 3),
 ('local', 2),
 ('Northern', 2),
 ('galactic', 2),
 ('R.A.', 2),
 ('Planck', 2),
 ('and', 2),
 ('longitude', 2),
 (')', 2),
 ('plane', 2),
 ('plane,', 2),
 ('region', 2),
 ('(2000)', 2),
 ('Common', 2),
 ('Field', 2),
 ('11', 2),
 ('54', 2),
 ('inner', 1),
 ('170°', 1),
 ('(', 1),
 ('(l,', 1),
 ('WISE', 1),
 ('Antlia', 1),
 ('anti-sunward', 1),
 ('sunward', 1),
 ('VPOS', 1),
 ('Boötes', 1),
 ('TESS', 1),
 ('CoRoT', 1),
 ('Cygnus', 1),
 ('Lyra.', 1),
 ('HXDF,', 1),
 ('GOODS-North', 1),
 ('(LR72).', 1),
 ('LR33).', 1),
 ('early', 1),
 ('“', 1),
 ('central', 1),
 ('GSE', 1),
 ('Geocentric', 1),
 ('(GSE)', 1),
 ('303°', 1),
 ('(R.A.', 1),
 ('−4.3°', 1),
 ('−3.3°', 1),
 ('349.2°,', 1),
 ('Hemisphere.', 1),
 ('Galaxy', 1),
 ('7R.A.', 1),
 ('175°,', 1),
 ('−3°', 1),
 ('3°).', 1),
 (',', 1),
 ('(

In [27]:
# Literal sub-tokens: degree/prime marks and the comparison, equals and bar symbols.
sub_tokens['degree'] = ['°', '′']
sub_tokens['operation'] = ['>', '<', '=', '|']
# Token that is exactly one of the single letters h, m or l.
regex['hml'] = r"^[hml]$"

### Identifier
- start with 'ADS/JAO'
- start with / contain '#'
- match the pattern `\d{4}\.\d{1,2}\.\d{5}` (e.g. `2013.1.00111`)
- all numbers

In [14]:
# Rank the 100 most common tokens tagged B- or I-Identifier.
entity = 'Identifier'
begin_tag, inside_tag = f"B-{entity}", f"I-{entity}"
word_counts = Counter(ner_tokens[begin_tag] + ner_tokens[inside_tag])
top_word = word_counts.most_common(100)
top_word

[('CBP1', 6),
 ('CBP2', 4),
 ('AR', 3),
 ('ADS/JAO.ALMA#', 3),
 ('(Version', 3),
 ('version', 2),
 ('NOAA', 2),
 ('CBP1,', 2),
 ('12290', 2),
 ('LIGO', 1),
 ('2016.1.01164.S.', 1),
 ('(v4.7.2,', 1),
 ('ADS/JAO.ALMA#2011.0.00876.S,', 1),
 ('ADS/JAO.ALMA#2012.00650.', 1),
 ('<inline-formula>', 1),
 ('V,', 1),
 ('ADS/JAO.ALMA#2013.1.00111.S.', 1),
 ('ADS/JAO.ALMA#2011.0.00172.S.', 1),
 ('24Jun2019_V6.26.1,', 1),
 ('ADS/JAO.ALMA#2013.1.00798.S.', 1),
 ('ADS/JAO.ALMA[2013.1.00486.S].', 1),
 ('ADS/JAO.ALMA#2013.1.00806.S.', 1),
 ('SOL2014-01-13T21:51M1.3,', 1),
 ('72877.', 1),
 ('ADS/JAO.ALMA#2012.1.00978.S.', 1),
 ('#', 1),
 ('RP200442)', 1),
 ('RP200576)', 1),
 ('HRL0000BABA', 1),
 ('FRT0000BABA', 1),
 ('(v8845;', 1),
 ('(v4.2.2;', 1),
 ('MJLSG32.', 1),
 ('#14125.', 1),
 ('ID', 1),
 ('#14125,', 1),
 ('#14125', 1),
 ('CBP2,', 1),
 ('(PFL1)', 1),
 ('PFL1,', 1),
 ('N03E05', 1),
 ('N03W18', 1),
 ('ADS/JAO.ALMA#2013.1.00151.S,', 1),
 ('ADS/JAO.ALMA#2013.1.00034.S', 1),
 ('ADS/JAO.ALMA#2012.1.00

In [28]:
# Literal sub-token: the hash sign seen in identifiers such as "#14125".
sub_tokens['#'] = ['#']

# Token begins with the "ADS/JAO" prefix.
regex['ADS/JAO'] = r"^ADS/JAO"
# Dotted numeric sequence such as "2013.1.00111". The digit class must be written
# `\d`; the earlier `/d` spelling matched a literal slash followed by the letter
# "d", so the pattern could never match these identifiers.
regex['num_seq'] = r'\d{4}\.\d{1,2}\.\d{5}'

In [30]:
# Bundle both feature tables under fixed keys and persist them as JSON in the
# current working directory.
features = {
    'sub_tokens': sub_tokens,
    'regex': regex,
}

with open('unbiased_domain_knowledge.json', 'w') as json_file:
    json.dump(features, json_file)

# Biased Domain Knowledge

In [32]:
# preprocess tags
processed_tags_200, ner_tokens_200, _ = process_entity_tag(data=dataset['train'][:200])
processed_tags_500, ner_tokens_500, _ = process_entity_tag(data=dataset['train'][:500])
processed_tags_1000, ner_tokens_1000, _ = process_entity_tag(data=dataset['train'][:1000])
processed_tags, ner_tokens, _ = process_entity_tag(data=dataset['train'])

KeyboardInterrupt: 

## High frequency sub-token

For each entity tag (both the `B-` and `I-` forms), we generate a list of high-frequency sub-tokens containing:
- top 50 3-grams
- top 50 5-grams

In [None]:
# Rebuild `sub_tokens` from scratch: for every B-/I- tag of every target entity,
# store the first element of each of the 50 most frequent 3-gram and 5-gram
# results; keys look like "B-Organization_3_grams".
sub_tokens = {}
# NOTE(review): this list is not used in the loop below, and the loop reads only
# `ner_tokens`; the ner_tokens_200 / _500 / _1000 subsets are never consulted
# here — confirm whether a per-subset loop was intended.
processed_tags_list = [processed_tags_200, processed_tags_500, processed_tags_1000]

for entity in entity_name:
    for bi in ["B","I"]:
        for n in [3, 5]:
            tokens = ner_tokens[f"{bi}-{entity}"]
            sub_tokens[f"{bi}-{entity}_{n}_grams"] =[i[0] for i in find_frequent_subword(tokens, n_gram=n, top=50)]