In [100]:
from datasets import load_dataset
from collections import Counter
import spacy
import json

from src.preprocess import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#!python3 -m spacy download en_core_web_sm

In [3]:
# Load the "adsabs/WIESP2022-NER" dataset via `datasets.load_dataset` and display its splits.
dataset = load_dataset("adsabs/WIESP2022-NER")
dataset

DatasetDict({
    train: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1753
    })
    validation: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1366
    })
    test: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 2505
    })
})

# Unbiased Domain Knowledge

In [3]:
# Preprocess the tags of the validation split.
validation_split = dataset['validation']
processed_tags, ner_tokens, text = process_entity_tag(data=validation_split)

In [6]:
# Target entity types. `entity_name` is not defined in the visible cells;
# presumably it comes from the star import of src.preprocess — TODO confirm.
entity_name

['Organization',
 'Observatory',
 'CelestialObject',
 'Event',
 'CelestialRegion',
 'Identifier']

In [4]:
# Target NER tags (B-/I- prefixed). `ner_tags` is not defined in the visible cells;
# presumably it comes from the star import of src.preprocess — TODO confirm.
ner_tags

['B-Organization',
 'B-Observatory',
 'B-CelestialObject',
 'B-Event',
 'B-CelestialRegion',
 'B-Identifier',
 'I-Organization',
 'I-Observatory',
 'I-CelestialObject',
 'I-Event',
 'I-CelestialRegion',
 'I-Identifier']

## High frequency sub-token

For each entity, we will generate a list of high-frequency sub-tokens containing:
- top 50 3-grams
- top 50 5-grams

In [20]:
# For every B-/I- entity tag, store the top-50 frequent 3-gram and 5-gram sub-tokens
# under the key "<tag>_<n>_grams".
sub_tokens = {}

for entity in entity_name:
    for prefix in ("B", "I"):
        tag = f"{prefix}-{entity}"
        tagged_tokens = ner_tokens[tag]
        for gram_size in (3, 5):
            ranked = find_frequent_subword(tagged_tokens, n_gram=gram_size, top=50)
            # Keep only the first element of each returned entry.
            sub_tokens[f"{tag}_{gram_size}_grams"] = [entry[0] for entry in ranked]

## Pattern Analysis

In [21]:
# Empty name -> pattern-string dict; the cells below fill it in.
regex = {}

### Organization:
+ Start with capital letters
+ Name / Country / Org

In [10]:
entity = 'Organization'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('of', 1202),
 ('National', 591),
 ('Science', 557),
 ('University', 524),
 ('Research', 460),
 ('and', 452),
 ('for', 382),
 ('Institute', 320),
 ('de', 311),
 ('Space', 277),
 ('Foundation', 266),
 ('NASA', 253),
 ('Council', 252),
 ('University,', 229),
 ('NSF', 137),
 ('European', 129),
 ('Aeronautics', 115),
 ('Technology', 114),
 ('Laboratory,', 112),
 ('Center', 102),
 ('Astronomy', 102),
 ('Centre', 101),
 ('Ministry', 98),
 ('Foundation,', 97),
 ('California', 95),
 ('State', 92),
 ('Department', 90),
 ('the', 84),
 ('Universities', 82),
 ('STFC', 82),
 ('Technology,', 81),
 ('in', 80),
 ('Physics', 78),
 ('Foundation.', 77),
 ('Office', 75),
 ('Astrophysics', 72),
 ('Propulsion', 71),
 ('Energy', 71),
 ('Jet', 70),
 ('Association', 66),
 ('Australian', 65),
 ('Telescope', 65),
 ('Astronomy,', 65),
 ('University.', 61),
 ('Natural', 59),
 ('Administration.', 58),
 ('Facilities', 56),
 ('Data', 55),
 ('Max', 53),
 ('Academy', 53),
 ('Planck', 53),
 ('Sciences', 51),
 ('China',

### Observatory:
+ Contain 'Observatory'
+ Location
+ All alphabet chars are uppercase

In [11]:
entity = 'Observatory'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Observatory', 184),
 ('Observatory,', 93),
 ('National', 90),
 ('ALMA', 79),
 ('Astronomical', 56),
 ('ESO', 50),
 ('Astronomy', 48),
 ('Radio', 36),
 ('de', 35),
 ('of', 31),
 ('Astrophysical', 29),
 ('Swift', 27),
 ('Observatory.', 27),
 ('Smithsonian', 24),
 ('SDO', 24),
 ('Keck', 23),
 ('Fermi', 20),
 ('Observatorio', 20),
 ('ESO,', 19),
 ('Optical', 19),
 ('Paranal', 18),
 ('del', 18),
 ('NAOJ.', 17),
 ('La', 17),
 ('Joint', 16),
 ('Roque', 16),
 ('los', 16),
 ('Muchachos', 16),
 ('Silla', 15),
 ('Observatoire', 12),
 ('Cerro', 12),
 ('Solar', 12),
 ('W.', 11),
 ('M.', 11),
 ('Cumbres', 11),
 ('European', 10),
 ('Tololo', 10),
 ('Inter-American', 10),
 ('Gemini', 9),
 ('Southern', 9),
 ('Japan.', 9),
 ('East', 8),
 ('di', 8),
 ('(Las', 8),
 ('Asian', 8),
 ('United', 7),
 ('Arecibo', 7),
 ('States', 7),
 ('Naval', 7),
 ('Observatories', 7),
 ('South', 6),
 ('Shanghai', 6),
 ('Las', 6),
 ('African', 6),
 ('China,', 6),
 ('Japan,', 6),
 ('Osservatorio', 5),
 ('W.M.', 5),
 ('Observ

In [22]:
# The word 'observatory' with either a lower- or upper-case first letter.
regex['contain_obs'] = r"observatory|Observatory"

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Gala', 'Milky', 'solar'
+ Numbers + capital letters
+ All alphabet chars are uppercase
+ contain '-'

In [24]:
entity = 'CelestialObject'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('NGC', 83),
 ('HD', 61),
 ('SN', 43),
 ('Cyg', 39),
 ('Galactic', 38),
 ('solar', 33),
 ('Sgr', 23),
 ('Milky', 23),
 ('GW170817', 20),
 ('SDSS', 20),
 ('Tau', 20),
 ('CMB', 18),
 ('KIC', 18),
 ('Leo', 17),
 ('UGC', 16),
 ('A', 16),
 ('ALESS', 16),
 ('X-1', 16),
 ('T', 15),
 ('Way', 15),
 ('M', 14),
 ('LMC', 14),
 ('Orion', 13),
 ('Sun', 12),
 ('A*', 12),
 ('I', 12),
 ('Upper', 11),
 ('Tuc', 11),
 ('Carina', 11),
 ('α', 11),
 ('S2', 10),
 ('PSR', 10),
 ('MW', 10),
 ('B', 10),
 ('SGR', 10),
 ('GG', 10),
 ('Local', 9),
 ('Sun.', 9),
 ('54', 9),
 ('Earth', 8),
 ('Galaxy', 8),
 ('FSR', 8),
 ('Solar', 8),
 ('Sun,', 8),
 ('B211', 8),
 ('III', 8),
 ('OB2', 8),
 ('122.1', 8),
 ('3198', 8),
 ('Cen', 7),
 ('IRAS', 7),
 ('UBC', 7),
 ('A,', 7),
 ('1H', 7),
 ('TXS', 7),
 ('SAX', 7),
 ('b', 7),
 ('Dor', 7),
 ('104', 7),
 ('Group', 7),
 ('100546', 7),
 ('Earth-like', 6),
 ('Taurus', 6),
 ('HH', 6),
 ('Kepler-93b', 6),
 ('PKS', 6),
 ('SNR', 6),
 ('DF', 6),
 ('1ES', 6),
 ('iPTF', 6),
 ('AI', 6),
 ('M

In [25]:
# CelestialObject cues: common name fragments and "capital letters followed by digits".
regex.update({
    'contain_co': r"Sun|Earth|Gala|Milky|solar",
    'contain_cap_num': r"[A-Z]+\d+",
})

# Literal sub-tokens: the hyphen and a few Greek letters.
sub_tokens.update({
    '-': ["-"],
    'greek_letter': ['α', 'ε', 'θ'],
})

### Event:
+ Contain year/date

In [13]:
entity = 'Event'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('and', 11),
 ('in', 6),
 ('workshop', 5),
 ('of', 4),
 ('Program', 3),
 ('for', 3),
 ('la', 3),
 ('the', 3),
 ('Coronal', 3),
 ('de', 3),
 ('Summer', 3),
 ('Galactic', 2),
 ('ISIMA', 2),
 ('“Galactic', 2),
 ('Kavli', 2),
 ('Academy', 2),
 ('Magnetic', 2),
 ('conference', 2),
 ('Formation', 2),
 ('programme', 2),
 ('Archaeology', 2),
 ('Precision', 2),
 ('Stellar', 2),
 ('Astrophysics”', 2),
 ('Observations', 2),
 ('2015', 1),
 ('Cloud', 1),
 ('La', 1),
 ('deep', 1),
 ('Convocatoria', 1),
 ('“The', 1),
 ('“New', 1),
 ('“Decoding', 1),
 ('Meteoroid', 1),
 ('Estrategia', 1),
 ('supermassive', 1),
 ('‘Protoplanetary', 1),
 ('LSST:', 1),
 ('infrared', 1),
 ('“Solving', 1),
 ('ESANN-2018', 1),
 ('KITP', 1),
 ('2019', 1),
 ('‘Gaia', 1),
 ('Global', 1),
 ('Physics', 1),
 ('The', 1),
 ('2013', 1),
 ('3rd', 1),
 ('(International', 1),
 ('ISIMA,', 1),
 ('NSF', 1),
 ('‘The', 1),
 ('“Sub-arcsec', 1),
 ('“Implications', 1),
 ('EUROWD-21', 1),
 ('Dartmouth', 1),
 ('Astronomy', 1),
 ('Foreign', 1),


In [26]:
# Four consecutive digits (e.g. '2015'), used as the year/date cue for events.
regex['contain_year'] = r"\d{4}"

### CelestialRegion
- contain '°', '′'
- contain '>', '<', '=', '|'
- single lower case character (not 'a')

In [27]:
entity = 'CelestialRegion'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('=', 11),
 ('Galactic', 10),
 ('b', 7),
 ('field', 7),
 ('extragalactic', 5),
 ('|', 5),
 ('BICEP2', 4),
 ('decl.', 4),
 ('COSMOS', 3),
 ('LR', 3),
 ('Universe', 3),
 ('l', 3),
 ('h', 3),
 ('m', 3),
 ('local', 2),
 ('Northern', 2),
 ('galactic', 2),
 ('R.A.', 2),
 ('Planck', 2),
 ('and', 2),
 ('longitude', 2),
 (')', 2),
 ('plane', 2),
 ('plane,', 2),
 ('region', 2),
 ('(2000)', 2),
 ('Common', 2),
 ('Field', 2),
 ('11', 2),
 ('54', 2),
 ('inner', 1),
 ('170°', 1),
 ('(', 1),
 ('(l,', 1),
 ('WISE', 1),
 ('Antlia', 1),
 ('anti-sunward', 1),
 ('sunward', 1),
 ('VPOS', 1),
 ('Boötes', 1),
 ('TESS', 1),
 ('CoRoT', 1),
 ('Cygnus', 1),
 ('Lyra.', 1),
 ('HXDF,', 1),
 ('GOODS-North', 1),
 ('(LR72).', 1),
 ('LR33).', 1),
 ('early', 1),
 ('“', 1),
 ('central', 1),
 ('GSE', 1),
 ('Geocentric', 1),
 ('(GSE)', 1),
 ('303°', 1),
 ('(R.A.', 1),
 ('−4.3°', 1),
 ('−3.3°', 1),
 ('349.2°,', 1),
 ('Hemisphere.', 1),
 ('Galaxy', 1),
 ('7R.A.', 1),
 ('175°,', 1),
 ('−3°', 1),
 ('3°).', 1),
 (',', 1),
 ('(

In [27]:
# CelestialRegion cues: angle marks and comparison/bar symbols as literal sub-tokens.
sub_tokens.update({
    'degree': ['°', '′'],
    'operation': ['>', '<', '=', '|'],
})
# A token that is exactly one of the letters h, m or l.
regex['hml'] = r"^[hml]$"

### Identifier
- start with 'ADS/JAO'
- start with / contain '#'
- match the regex `\d{4}\.\d{1,2}\.\d{5}` (e.g. '2013.1.00111')
- all numbers

In [14]:
entity = 'Identifier'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('CBP1', 6),
 ('CBP2', 4),
 ('AR', 3),
 ('ADS/JAO.ALMA#', 3),
 ('(Version', 3),
 ('version', 2),
 ('NOAA', 2),
 ('CBP1,', 2),
 ('12290', 2),
 ('LIGO', 1),
 ('2016.1.01164.S.', 1),
 ('(v4.7.2,', 1),
 ('ADS/JAO.ALMA#2011.0.00876.S,', 1),
 ('ADS/JAO.ALMA#2012.00650.', 1),
 ('<inline-formula>', 1),
 ('V,', 1),
 ('ADS/JAO.ALMA#2013.1.00111.S.', 1),
 ('ADS/JAO.ALMA#2011.0.00172.S.', 1),
 ('24Jun2019_V6.26.1,', 1),
 ('ADS/JAO.ALMA#2013.1.00798.S.', 1),
 ('ADS/JAO.ALMA[2013.1.00486.S].', 1),
 ('ADS/JAO.ALMA#2013.1.00806.S.', 1),
 ('SOL2014-01-13T21:51M1.3,', 1),
 ('72877.', 1),
 ('ADS/JAO.ALMA#2012.1.00978.S.', 1),
 ('#', 1),
 ('RP200442)', 1),
 ('RP200576)', 1),
 ('HRL0000BABA', 1),
 ('FRT0000BABA', 1),
 ('(v8845;', 1),
 ('(v4.2.2;', 1),
 ('MJLSG32.', 1),
 ('#14125.', 1),
 ('ID', 1),
 ('#14125,', 1),
 ('#14125', 1),
 ('CBP2,', 1),
 ('(PFL1)', 1),
 ('PFL1,', 1),
 ('N03E05', 1),
 ('N03W18', 1),
 ('ADS/JAO.ALMA#2013.1.00151.S,', 1),
 ('ADS/JAO.ALMA#2013.1.00034.S', 1),
 ('ADS/JAO.ALMA#2012.1.00

In [28]:
# Identifier cues: the literal '#' sub-token plus two regex patterns.
sub_tokens['#'] = ['#']

# Token begins with the literal prefix 'ADS/JAO'.
regex['ADS/JAO'] = r"^ADS/JAO"
# Dotted numeric code such as '2013.1.00111' (4 digits . 1-2 digits . 5 digits).
# The digit class is `\d`; the earlier `/d` spelling matched a literal slash
# followed by 'd', so the pattern could never match these identifiers.
regex['num_seq'] = r'\d{4}\.\d{1,2}\.\d{5}'

In [30]:
# Bundle the sub-token lists and regex patterns and write them out as JSON.
features = {'sub_tokens': sub_tokens, 'regex': regex}

with open('unbiased_domain_knowledge.json', 'w') as out_file:
    json.dump(features, out_file)

# Biased Domain Knowledge

In [101]:
# Preprocess the tags of the training split: once per `sample` setting (200, 500, 1000),
# then once more without passing `sample`. Only the first two results of each call are kept.
train_split = dataset['train']
processed_tags_200, ner_tokens_200, _ = process_entity_tag(data=train_split, sample=200)
processed_tags_500, ner_tokens_500, _ = process_entity_tag(data=train_split, sample=500)
processed_tags_1000, ner_tokens_1000, _ = process_entity_tag(data=train_split, sample=1000)
processed_tags, ner_tokens, _ = process_entity_tag(data=train_split)

In [102]:
# One empty feature bucket per training-sample setting; every bucket is its own dict.
features = {label: {} for label in ("200", "500", "1000", "all")}

## High frequency sub-token

For each entity, we will generate a list of high-frequency sub-tokens containing:
- top 50 3-grams
- top 50 5-grams

In [103]:
# Build the "<tag>_<n>_grams" sub-token lists once per sample setting and store each
# result in the matching feature bucket.
ner_tokens_list = [ner_tokens_200, ner_tokens_500, ner_tokens_1000, ner_tokens]
size_labels = ['200', '500', '1000', 'all']

for size_label, split_tokens in zip(size_labels, ner_tokens_list):
    sub_tokens = {}
    for entity in entity_name:
        for prefix in ("B", "I"):
            tag = f"{prefix}-{entity}"
            tagged_tokens = split_tokens[tag]
            for gram_size in (3, 5):
                ranked = find_frequent_subword(tagged_tokens, n_gram=gram_size, top=50)
                # Keep only the first element of each returned entry.
                sub_tokens[f"{tag}_{gram_size}_grams"] = [entry[0] for entry in ranked]
    features[size_label]['sub_tokens'] = sub_tokens

## Pattern Analysis 200

In [104]:
# Fresh, empty name -> pattern-string dict for the 200-sample analysis.
regex = {}

### Organization:
+ Start with capital letters
+ Location / Org

In [105]:
entity = 'Organization'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('of', 177),
 ('National', 88),
 ('University', 80),
 ('Science', 77),
 ('and', 64),
 ('for', 57),
 ('Research', 55),
 ('University,', 48),
 ('de', 45),
 ('Space', 38),
 ('NASA', 37),
 ('Foundation', 37),
 ('Institute', 36),
 ('Council', 26),
 ('NSF', 25),
 ('the', 20),
 ('European', 18),
 ('Department', 18),
 ('Foundation,', 18),
 ('State', 18),
 ('Center', 17),
 ('Foundation.', 17),
 ('Laboratory,', 17),
 ('Centre', 16),
 ('Astrophysics', 15),
 ('Aeronautics', 14),
 ('Office', 12),
 ('Astronomy', 12),
 ('California', 11),
 ('Australian', 11),
 ('Energy', 11),
 ('in', 11),
 ('Technology,', 11),
 ('für', 11),
 ('Alfred', 10),
 ('Instituto', 10),
 ('Ministry', 10),
 ('P.', 10),
 ('Sloan', 10),
 ('Max', 9),
 ('New', 9),
 ('Jet', 9),
 ('Data', 9),
 ('Agency', 9),
 ('Planck', 9),
 ('Technology', 9),
 ('Propulsion', 9),
 ('ERC', 8),
 ('STFC', 8),
 ('Natural', 8),
 ('Carnegie', 8),
 ('Universities', 8),
 ('ESA', 8),
 ('Sciences', 8),
 ('Astronomy,', 8),
 ('Inc.,', 8),
 ('Association', 7),
 

### Observatory:
+ Contain 'Observator'
+ Location
+ All alphabet chars are uppercase

In [106]:
entity = 'Observatory'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Observatory', 25),
 ('Swift', 13),
 ('ALMA', 12),
 ('National', 9),
 ('Astronomical', 8),
 ('Observatory,', 8),
 ('ESO', 6),
 ('NFO', 6),
 ('URSA', 5),
 ('Astronomy', 5),
 ('Radio', 4),
 ('of', 4),
 ('de', 4),
 ('Paranal', 3),
 ('Observatorio', 3),
 ('Joint', 3),
 ('ESO,', 3),
 ('NAOJ.', 3),
 ('Keck', 3),
 ('del', 3),
 ('Roque', 3),
 ('los', 3),
 ('Muchachos', 3),
 ('Observatory.', 3),
 ('Gemini', 2),
 ('South', 2),
 ('Fermi', 2),
 ('W.', 2),
 ('Japan.', 2),
 ('Astrophysical', 2),
 ('African', 2),
 ('Naval', 2),
 ('Bank', 2),
 ('M.', 2),
 ('German', 1),
 ('(SAAO).', 1),
 ('Mount', 1),
 ('United', 1),
 ('SDO/HMI', 1),
 ('Smithsonian', 1),
 ('U.S.', 1),
 ('Swift/XRT', 1),
 ('Carnegie', 1),
 ('Las', 1),
 ('European', 1),
 ('Jodrell', 1),
 ('OHP', 1),
 ('Neil', 1),
 ('IRAM,', 1),
 ('Green', 1),
 ('Long', 1),
 ('(LBO).', 1),
 ('LBO,', 1),
 ('Observatário', 1),
 ('Shanghai', 1),
 ('(NFO,', 1),
 ('Australian', 1),
 ('AAO,', 1),
 ('ALMA.', 1),
 ('La', 1),
 ('Karoo', 1),
 ('NRAO,', 1),
 ('Ob

In [107]:
# Stem 'observator' (either case of the first letter), so Observatory,
# Observatorio and Observatories all match.
regex['contain_obs'] = r"observator|Observator"

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Orion', 'Jupiter', 'Milky', 'solar'
+ length less than 3

In [108]:
entity = 'CelestialObject'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('NGC', 37),
 ('LMC', 13),
 ('HD', 10),
 ('Galactic', 9),
 ('SMC', 8),
 ('Gaia', 8),
 ('Milky', 7),
 ('M', 7),
 ('solar', 7),
 ('MW', 7),
 ('Kraken', 7),
 ('Perseus', 5),
 ('GRB', 5),
 ('MAXI', 5),
 ('SDSS', 5),
 ('MW-like', 5),
 ('b', 5),
 ('3', 5),
 ('J155258+273728', 5),
 ('1275', 5),
 ('-Enceladus,', 5),
 ('Sgr', 4),
 ('WASP-80', 4),
 ('Earth', 4),
 ('V506', 4),
 ('Cyg', 4),
 ('170817A', 4),
 ('Oph', 4),
 ('CMB', 3),
 ('IC', 3),
 ('Sextans', 3),
 ('quiet-Sun', 3),
 ('J0840', 3),
 ('(M', 3),
 ('planet', 3),
 ('Galaxy', 3),
 ('Kepler', 3),
 ('Antennae.', 3),
 ('SGS', 3),
 ('Helmi', 3),
 ('Kraken,', 3),
 ('31', 3),
 ('Way', 3),
 ('5272', 3),
 ('41641,', 3),
 ('1023', 3),
 ('4113C', 3),
 ('X-3', 3),
 ('G', 2),
 ('Cetus', 2),
 ('Tucana', 2),
 ('OJ', 2),
 ('Titan’s', 2),
 ('TRAPPIST-1', 2),
 ('Earth.', 2),
 ('2d:1e', 2),
 ('3d:1e.', 2),
 ('5d:2e', 2),
 ('HR', 2),
 ('4b:2c:1d', 2),
 ('Pisces', 2),
 ('Sun', 2),
 ('Antennae,', 2),
 ('L483', 2),
 ('PKS', 2),
 ('Local', 2),
 ('M31', 2),
 ('M

In [109]:
regex.update({
    # Common celestial-object name fragments.
    'contain_co': r"Sun|Earth|Orion|Jupiter|Milky|solar",
    # NOTE(review): this matches three-character tokens only, while the section
    # notes say "length less than 3" — confirm which one is intended.
    'contain_len3': r"^.{3}$",
})

### Event:
+ No clear patterns

In [110]:
entity = 'Event'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Workshop', 3),
 ('NYC', 1),
 ('program—Turbulence', 1),
 ('(ICTS/Prog-taly2018/01).', 1),
 ('Near/Far', 1),
 ('Gaia', 1),
 ('DR2', 1),
 ('on', 1),
 ('Magnetospheres', 1),
 ('of', 1),
 ('Neutron', 1),
 ('Stars', 1),
 ('and', 1),
 ('Black', 1),
 ('Holes', 1),
 ('from', 1),
 ('Angstroms', 1),
 ('to', 1),
 ('Light', 1),
 ('Years', 1),
 ('Globular', 1),
 ('Cluster', 1)]

### CelestialRegion
- no clear pattern

In [111]:
entity = 'CelestialRegion'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Galactic', 3),
 ('L', 1),
 ('south', 1),
 ('(', 1),
 ('local', 1),
 ('South', 1),
 ('GAMA', 1),
 ('MACS1149:', 1),
 ('center', 1),
 ('MS', 1),
 ('25.', 1),
 ('celestial', 1),
 ('pole.', 1),
 ('l,', 1),
 ('b', 1),
 (')', 1),
 ('=', 1),
 ('(309.3,', 1),
 ('−1.1),', 1),
 ('Universe', 1),
 ('Ecliptic', 1),
 ('Pole', 1),
 ('regions', 1)]

### Identifier
- start with 'ADS/JAO'

In [112]:
entity = 'Identifier'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_200[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Jessica', 2),
 ('obs', 1),
 ('AR11726,', 1),
 ('(MET', 1),
 ('557452805).', 1),
 ('(Version', 1),
 ('ADS/JAO.ALMA2016.1.00413.V.', 1),
 ('version', 1),
 ('ADS/JAO.ALMA#2017.1.00124.S.', 1),
 ('v5.1.1;', 1),
 ('191.D-0255.', 1),
 ('ROS-E/M/A/C-SPICE-6-V1.0', 1),
 ('ID', 1),
 ('00087498001).', 1),
 ('Lummene.2', 1),
 ('1', 1),
 ('239557417', 1),
 ('4.2;', 1),
 ('2013', 1)]

In [113]:
# Token begins with the literal prefix 'ADS/JAO'.
regex['ADS/JAO'] = r"^ADS/JAO"

In [114]:
# Store the patterns collected for the 200-sample analysis in its feature bucket.
features['200']['regex'] = regex

## Pattern Analysis 500

In [115]:
# Fresh, empty name -> pattern-string dict for the 500-sample analysis.
regex = {}

### Organization:
+ Start with capital letters
+ Location / Org

In [116]:
entity = 'Organization'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('of', 348),
 ('Science', 181),
 ('National', 175),
 ('Research', 151),
 ('and', 139),
 ('University', 128),
 ('for', 123),
 ('Foundation', 95),
 ('Space', 93),
 ('NASA', 92),
 ('Institute', 85),
 ('de', 85),
 ('Council', 71),
 ('University,', 67),
 ('NSF', 50),
 ('European', 43),
 ('Aeronautics', 38),
 ('California', 36),
 ('Astrophysics', 34),
 ('Centre', 33),
 ('Technology,', 33),
 ('Center', 32),
 ('Laboratory,', 32),
 ('Australian', 31),
 ('Astronomy', 30),
 ('Foundation.', 29),
 ('the', 29),
 ('Natural', 27),
 ('Foundation,', 27),
 ('Department', 26),
 ('in', 26),
 ('State', 26),
 ('Technology', 26),
 ('Jet', 23),
 ('Universities', 23),
 ('Propulsion', 23),
 ('Administration.', 23),
 ('Ministry', 22),
 ('Association', 21),
 ('Data', 21),
 ('Astronomy,', 21),
 ('STFC', 20),
 ('Academy', 20),
 ('Office', 19),
 ('Energy', 19),
 ('ERC', 18),
 ('Telescope', 18),
 ('Inc.,', 18),
 ('CDS,', 17),
 ('Agency', 17),
 ('Planck', 16),
 ('Instituto', 15),
 ('Max', 15),
 ('Scientific', 15),
 ('

### Observatory:
+ Contain 'Observator'
+ Location
+ All alphabet chars are uppercase

In [117]:
entity = 'Observatory'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Observatory', 56),
 ('ALMA', 37),
 ('Swift', 23),
 ('Observatory,', 23),
 ('National', 18),
 ('ESO', 18),
 ('Astronomical', 18),
 ('Keck', 10),
 ('of', 10),
 ('Astronomy', 10),
 ('Observatory.', 10),
 ('Radio', 9),
 ('SDO', 9),
 ('Joint', 8),
 ('ESO,', 8),
 ('Paranal', 7),
 ('NAOJ.', 7),
 ('W.', 7),
 ('de', 7),
 ('M.', 7),
 ('NFO', 6),
 ('URSA', 5),
 ('La', 5),
 ('Silla', 5),
 ('South', 4),
 ('Observatorio', 4),
 ('Japan.', 4),
 ('del', 4),
 ('Roque', 4),
 ('los', 4),
 ('Solar', 4),
 ('Fermi', 3),
 ('European', 3),
 ('Australian', 3),
 ('Lowell', 3),
 ('Astrophysical', 3),
 ('African', 3),
 ('Naval', 3),
 ('Muchachos', 3),
 ('Southern', 3),
 ('Observatories', 3),
 ('Gemini', 2),
 ('United', 2),
 ('Smithsonian', 2),
 ('Las', 2),
 ('Shanghai', 2),
 ('AAO,', 2),
 ('Observatoire', 2),
 ('Cerro', 2),
 ('Arecibo', 2),
 ('The', 2),
 ('University', 2),
 ('HINODE', 2),
 ('HINODE/SOT', 2),
 ('Yunnan', 2),
 ('observatory', 2),
 ('States', 2),
 ('Bank', 2),
 ('la', 2),
 ('Côte', 2),
 ('German',

In [118]:
regex.update({
    # Stem 'observator' with either case of the first letter.
    'contain_obs': r"observator|Observator",
    # One or more "capital letter + period" groups, e.g. 'W.' or 'M.'.
    'contain_cap.': r"([A-Z]\.)+",
})

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Orion', 'Jupiter', 'Milky', 'solar'
+ length less than 3

In [119]:
entity = 'CelestialObject'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('NGC', 62),
 ('HD', 37),
 ('Sgr', 35),
 ('Milky', 29),
 ('Galactic', 25),
 ('solar', 19),
 ('LMC', 19),
 ('Way', 18),
 ('M', 17),
 ('MW', 16),
 ('LA', 15),
 ('Jhelum', 15),
 ('Cyg', 14),
 ('4U', 14),
 ('A*', 14),
 ('3', 13),
 ('II', 13),
 ('FW', 12),
 ('Tau', 12),
 ('SMC', 11),
 ('Gaia', 11),
 ('A', 11),
 ('Perseus', 10),
 ('Fornax', 10),
 ('16', 10),
 ('κ', 10),
 ('Proxima', 9),
 ('Magellanic', 8),
 ('Jupiter', 7),
 ('Earth', 7),
 ('Galaxy', 7),
 ('KIC', 7),
 ('Kraken', 7),
 ('PSR', 7),
 ('B', 7),
 ('IRS', 7),
 ('GES', 7),
 ('b', 7),
 ('C', 7),
 ('GRB', 6),
 ('RECX', 6),
 ('Way,', 6),
 ('B2', 6),
 ('system', 6),
 ('Cen', 6),
 ('Cassiopeiae', 6),
 ('Earth.', 5),
 ('(M', 5),
 ('HR', 5),
 ('Sun', 5),
 ('MAXI', 5),
 ('SDSS', 5),
 ('MW-like', 5),
 ('Earth,', 5),
 ('Cygnus', 5),
 ('UFD1', 5),
 ('FSR', 5),
 ('OGLE', 5),
 ('J155258+273728', 5),
 ('1275', 5),
 ('-Enceladus,', 5),
 ('stream', 5),
 ('A*.', 5),
 ('189733b', 5),
 ('WASP-80', 4),
 ('IC', 4),
 ('SN', 4),
 ('V506', 4),
 ('Coma', 4)

In [120]:
regex.update({
    # Common celestial-object name fragments.
    'contain_co': r"Sun|Earth|Orion|Jupiter|Milky|solar",
    # NOTE(review): this matches three-character tokens only, while the section
    # notes say "length less than 3" — confirm which one is intended.
    'contain_len3': r"^.{3}$",
})

### Event:
+ No clear patterns

In [121]:
entity = 'Event'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Workshop', 3),
 ('of', 3),
 ('and', 3),
 ('Stars', 2),
 ('NYC', 1),
 ('program—Turbulence', 1),
 ('(ICTS/Prog-taly2018/01).', 1),
 ('Near/Far', 1),
 ('workshop', 1),
 ('East', 1),
 ('Origins', 1),
 ('Gaia', 1),
 ('DR2', 1),
 ('on', 1),
 ('Magnetospheres', 1),
 ('Neutron', 1),
 ('Black', 1),
 ('Holes', 1),
 ('from', 1),
 ('Angstroms', 1),
 ('to', 1),
 ('Light', 1),
 ('Years', 1),
 ('Globular', 1),
 ('Cluster', 1),
 ('‘Formation', 1),
 ('Massive', 1),
 ('Clusters', 1),
 ('in', 1),
 ('Dwarf', 1),
 ('Galaxies', 1),
 ('over', 1),
 ('Cosmic', 1),
 ('Time’', 1),
 ('Asia', 1),
 ('Pacific', 1),
 ('Summer', 1),
 ('Institutes).', 1),
 ('Habitable', 1),
 ('Planets', 1)]

### CelestialRegion
- contain '°'

In [122]:
entity = 'CelestialRegion'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Galactic', 8),
 ('Plane', 5),
 ('=', 4),
 ('field', 3),
 ('of', 3),
 ('south', 2),
 ('(', 2),
 ('South', 2),
 ('GAMA', 2),
 ('COSMOS', 2),
 ('LSST', 2),
 ('north', 2),
 ('b', 2),
 (')', 2),
 ('Pole', 2),
 ('regions', 2),
 ('fields', 2),
 ('fields.', 2),
 ('G', 2),
 ('δ', 2),
 ('L', 1),
 ('local', 1),
 ('MACS1149:', 1),
 ('(SCP),', 1),
 ('(GP),', 1),
 ('Wide-Fast-Deep', 1),
 ('(WFD).', 1),
 ('α', 1),
 ('(304,', 1),
 ('FFs', 1),
 ('center', 1),
 ('MS', 1),
 ('25.', 1),
 ('celestial', 1),
 ('pole.', 1),
 ('l,', 1),
 ('(309.3,', 1),
 ('−1.1),', 1),
 ('Universe', 1),
 ('Ecliptic', 1),
 ('field.', 1),
 ('630', 1),
 ('Celestial', 1),
 ('266.4°', 1),
 ('and', 1),
 ('−29.0°', 1),
 ('l', 1),
 (',', 1),
 ('(315,', 1),
 ('17)(52,', 1),
 ('15)', 1),
 ('22)(14,', 1),
 ('11)', 1),
 ('declination', 1),
 ('−53°', 1),
 ('19°', 1),
 ('declination,', 1),
 ('19°.', 1)]

In [123]:
# Token contains a degree sign.
regex['contain_deg'] = r"°"

### Identifier
- start with 'ADS/JAO'
- match the regex `AAE-\d{6}` (e.g. 'AAE-141220')
- start with '(v'

In [124]:
entity = 'Identifier'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_500[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('(v3)', 4),
 ('AAE-141220', 3),
 ('Jessica', 2),
 ('AAE-141220.', 2),
 ('AAE-061228,', 2),
 ('obs', 1),
 ('AR11726,', 1),
 ('(MET', 1),
 ('557452805).', 1),
 ('(Version', 1),
 ('ADS/JAO.ALMA2016.1.00413.V.', 1),
 ('version', 1),
 ('ADS/JAO.ALMA#2017.1.00124.S.', 1),
 ('v5.1.1;', 1),
 ('191.D-0255.', 1),
 ('ROS-E/M/A/C-SPICE-6-V1.0', 1),
 ('(v14.0.0,', 1),
 ('(v13.2),', 1),
 ('(v1.5.1,', 1),
 ('(v6.20,', 1),
 ('(v12.9.0k;', 1),
 ('ADS/JAO.ALMA#2013.1.00220.S,', 1),
 ('ADS/JAO.ALMA#2013.1.00226.S.', 1),
 ('ADS/JAO.ALMA#2012.1.00712.S', 1),
 ('ADS/JAO.ALMA#2013.1.00278.S.', 1),
 ('ADS/JAO.ALMA', 1),
 ('#2012.1.00437.S.', 1),
 ('v1.0.3', 1),
 ('ADS/JAO.ALMA#2015.1.00773.S.', 1),
 ('(v2),', 1),
 ('(v3.1),', 1),
 ('(v3),', 1),
 ('(v4)', 1),
 ('(version', 1),
 ('AR', 1),
 ('AAC-150108.', 1),
 ('ADS/JAO.', 1),
 ('ID', 1),
 ('00087498001).', 1),
 ('Lummene.2', 1),
 ('1', 1),
 ('239557417', 1),
 ('4.2;', 1),
 ('2013', 1),
 ('#2015.1.00633.S', 1),
 ('prod3b-v1)', 1),
 ('12673', 1),
 ('ALMA#2016

In [125]:
# Identifier patterns for the 500-sample analysis.
# Token begins with the literal prefix 'ADS/JAO'.
regex['ADS/JAO'] = r"^ADS/JAO"
# 'AAE-' followed by six digits, e.g. 'AAE-141220'. The digit class is `\d`;
# the earlier `/d` spelling matched a literal slash followed by 'd' and so
# never matched these identifiers.
regex['AAE'] = r'AAE-\d{6}'
# Token begins with '(v', e.g. '(v3)'.
regex['(v'] = r'^\(v'

In [126]:
# Store the patterns collected for the 500-sample analysis in its feature bucket.
features['500']['regex'] = regex

## Pattern Analysis 1000

In [127]:
# Fresh, empty name -> pattern-string dict for the 1000-sample analysis.
regex = {}

### Organization:
+ Start with capital letters
+ Location / Org

In [128]:
entity = 'Organization'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('of', 750),
 ('National', 366),
 ('Science', 366),
 ('University', 315),
 ('and', 305),
 ('Research', 302),
 ('for', 233),
 ('de', 210),
 ('Space', 190),
 ('Institute', 182),
 ('Foundation', 177),
 ('NASA', 175),
 ('Council', 156),
 ('University,', 144),
 ('NSF', 99),
 ('European', 87),
 ('Aeronautics', 79),
 ('Laboratory,', 73),
 ('California', 72),
 ('Technology,', 68),
 ('Centre', 63),
 ('Technology', 63),
 ('Astronomy', 62),
 ('Center', 61),
 ('the', 60),
 ('Natural', 58),
 ('Foundation.', 57),
 ('State', 55),
 ('Foundation,', 54),
 ('Department', 53),
 ('Australian', 53),
 ('Ministry', 51),
 ('Universities', 49),
 ('Jet', 48),
 ('Propulsion', 48),
 ('STFC', 47),
 ('in', 47),
 ('Astrophysics', 46),
 ('Astronomy,', 45),
 ('Data', 43),
 ('Office', 43),
 ('Association', 42),
 ('Sciences', 42),
 ('Agency', 41),
 ('Physics', 40),
 ('Telescope', 40),
 ('für', 39),
 ('Academy', 38),
 ('Inc.,', 36),
 ('Administration.', 36),
 ('Instituto', 34),
 ('Max', 32),
 ('Planck', 32),
 ('China', 3

### Observatory:
+ Contain 'Observator'
+ Location
+ All alphabet chars are uppercase

In [129]:
entity = 'Observatory'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Observatory', 100),
 ('ALMA', 60),
 ('Observatory,', 50),
 ('National', 37),
 ('Astronomical', 37),
 ('ESO', 34),
 ('Swift', 32),
 ('of', 21),
 ('SDO', 18),
 ('Astronomy', 18),
 ('de', 18),
 ('Keck', 17),
 ('Radio', 16),
 ('Observatory.', 16),
 ('Paranal', 13),
 ('Joint', 12),
 ('ESO,', 12),
 ('W.', 12),
 ('M.', 12),
 ('Observatorio', 11),
 ('La', 11),
 ('Astrophysical', 11),
 ('Silla', 11),
 ('Gemini', 10),
 ('NAOJ.', 10),
 ('Roque', 10),
 ('del', 10),
 ('los', 10),
 ('Smithsonian', 9),
 ('Japan.', 9),
 ('Muchachos', 9),
 ('Solar', 8),
 ('Shanghai', 7),
 ('Fermi', 6),
 ('NFO', 6),
 ('Australian', 6),
 ('Observatories', 6),
 ('Las', 5),
 ('European', 5),
 ('URSA', 5),
 ('Observatoire', 5),
 ('Lowell', 5),
 ('Rome/PSPT', 5),
 ('Observatory;', 5),
 ('Southern', 5),
 ('China,', 5),
 ('Nacional/MCTI,', 5),
 ('South', 4),
 ('Observatário', 4),
 ('Strasbourg', 4),
 ('Naval', 4),
 ('Optical', 4),
 ('United', 3),
 ('AAO,', 3),
 ('Cerro', 3),
 ('Arecibo', 3),
 ('NAOJ', 3),
 ('W.M.', 3),
 ('P

In [130]:
regex.update({
    # Stem 'observator' with either case of the first letter.
    'contain_obs': r"observator|Observator",
    # One or more "capital letter + period" groups, e.g. 'W.' or 'W.M.'.
    'contain_cap.': r"([A-Z]\.)+",
})

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Orion', 'Jupiter', 'Milky', 'solar'
+ Contain 'κ', 'e', '*'

In [131]:
entity = 'CelestialObject'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('NGC', 123),
 ('HD', 59),
 ('solar', 47),
 ('Milky', 42),
 ('Galactic', 40),
 ('GRB', 39),
 ('Sgr', 38),
 ('LMC', 31),
 ('Way', 26),
 ('SN', 24),
 ('MW', 24),
 ('Cyg', 24),
 ('Abell', 22),
 ('M', 20),
 ('SDSS', 17),
 ('4U', 17),
 ('A', 16),
 ('II', 16),
 ('SMC', 15),
 ('PSR', 15),
 ('LA', 15),
 ('Jhelum', 15),
 ('Sun', 14),
 ('3', 14),
 ('A*', 14),
 ('Perseus', 12),
 ('16', 12),
 ('B', 12),
 ('FW', 12),
 ('Tau', 12),
 ('Fornax', 11),
 ('Earth', 11),
 ('Gaia', 11),
 ('Cen', 11),
 ('Galaxy', 10),
 ('Magellanic', 10),
 ('κ', 10),
 ('e', 10),
 ('system', 10),
 ('b', 10),
 ('KIC', 9),
 ('Cygnus', 9),
 ('Orion', 9),
 ('Proxima', 9),
 ('Way,', 9),
 ('Jupiter', 8),
 ('Earth’s', 8),
 ('3C', 8),
 ('Earth,', 8),
 ('Mrk', 8),
 ('170817A', 8),
 ('M31', 7),
 ('Kraken', 7),
 ('MW-like', 7),
 ('Ret', 7),
 ('IRS', 7),
 ('GES', 7),
 ('ALS', 7),
 ('C', 7),
 ('A,', 7),
 ('Aur', 7),
 ('GJ', 6),
 ('Earth.', 6),
 ('Local', 6),
 ('Virgo', 6),
 ('Sun.', 6),
 ('RECX', 6),
 ('B,', 6),
 ('2MASS', 6),
 ('HR8799'

In [132]:
regex.update({
    # Common celestial-object name fragments.
    'contain_co': r"Sun|Earth|Orion|Jupiter|Milky|solar",
    # NOTE(review): when used as a search pattern, this character class also hits
    # every token that merely contains the letter 'e' — confirm whether only the
    # standalone tokens 'κ', 'e' and '*' were meant.
    'contain_spe_char': r"[κe*]",
})

### Event:
+ No clear patterns

In [133]:
entity = 'Event'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Workshop', 4),
 ('Gaia', 4),
 ('of', 4),
 ('and', 4),
 ('Summer', 3),
 ('workshops', 3),
 ('Stars', 2),
 ('Black', 2),
 ('Holes', 2),
 ('-ESO', 2),
 ('Program', 2),
 ('at', 2),
 ('NYC', 1),
 ('program—Turbulence', 1),
 ('(ICTS/Prog-taly2018/01).', 1),
 ('Near/Far', 1),
 ('workshop', 1),
 ('East', 1),
 ('Origins', 1),
 ('Time-delay', 1),
 ('Astro', 1),
 ('“The', 1),
 ('Star', 1),
 ('Exoplanet', 1),
 ('2017', 1),
 ('Data', 1),
 ('Aspen', 1),
 ('DR2', 1),
 ('on', 1),
 ('Magnetospheres', 1),
 ('Neutron', 1),
 ('from', 1),
 ('Angstroms', 1),
 ('to', 1),
 ('Light', 1),
 ('Years', 1),
 ('Globular', 1),
 ('Cluster', 1),
 ('‘Formation', 1),
 ('Massive', 1),
 ('Clusters', 1),
 ('in', 1),
 ('Dwarf', 1),
 ('Galaxies', 1),
 ('over', 1),
 ('Cosmic', 1),
 ('Time’', 1),
 ('Asia', 1),
 ('Pacific', 1),
 ('Institutes).', 1),
 ('Habitable', 1),
 ('Planets', 1),
 ('conferences', 1),
 ('Lens', 1),
 ('Modelling', 1),
 ('Challenge', 1),
 ('Hack', 1),
 ('Week', 1),
 ('X-ray', 1),
 ('Spectral-Timing', 1),
 (

### CelestialRegion
- contain '°'
- contain [()=><]
- a single alphabetic character

In [134]:
entity = 'CelestialRegion'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('Galactic', 10),
 ('=', 10),
 ('COSMOS', 7),
 ('field', 7),
 ('Field', 6),
 ('b', 5),
 ('Plane', 5),
 ('A', 5),
 ('l', 4),
 ('δ', 4),
 ('UDS', 4),
 ('footprint', 4),
 ('fields', 3),
 ('fields.', 3),
 ('of', 3),
 ('south', 2),
 ('(', 2),
 ('local', 2),
 ('South', 2),
 ('GAMA', 2),
 ('LSST', 2),
 ('north', 2),
 ('VEXAS-SMW', 2),
 ('solar', 2),
 ('high', 2),
 (')', 2),
 ('Pole', 2),
 ('regions', 2),
 ('field.', 2),
 ('G', 2),
 ('>', 2),
 ('15,', 2),
 ('latitude', 2),
 ('L', 1),
 ('MACS1149:', 1),
 ('(SCP),', 1),
 ('(GP),', 1),
 ('Wide-Fast-Deep', 1),
 ('(WFD).', 1),
 ('α', 1),
 ('(304,', 1),
 ('FFs', 1),
 ('IKN', 1),
 ('VEXAS-DESW', 1),
 ('VEXAS-PSW', 1),
 ('declinations', 1),
 ('UDSz', 1),
 ('CANDELS', 1),
 ('AEGIS,', 1),
 ('COSMOS,', 1),
 ('UDS,', 1),
 ('GOODS-N,', 1),
 ('GOODS-S', 1),
 ('PHAT', 1),
 ('HXDF.', 1),
 ('center', 1),
 ('MS', 1),
 ('25.', 1),
 ('celestial', 1),
 ('pole.', 1),
 ('l,', 1),
 ('(309.3,', 1),
 ('−1.1),', 1),
 ('Universe', 1),
 ('Ecliptic', 1),
 ('630', 1),
 ('C

In [135]:
regex.update({
    # Token contains a degree sign.
    'contain_deg': r"°",
    # Token contains a parenthesis or one of '=', '>', '<'.
    'contain_ope_char': r"[\(\)=><]",
    # Token is exactly one ASCII letter.
    'single_char': r"^[A-Za-z]$",
})

### Identifier
- start with 'ADS/JAO'
- match the regex `AAE-\d{6}` (e.g. 'AAE-141220')
- start with '(v'
- contain #
- match the regex `\d{4}\.\d{1,2}\.\d{5}` (e.g. '2016.1.00459')

In [136]:
entity = 'Identifier'
# Tally the B- and I- tagged tokens of this entity in one Counter (B- tokens first).
word_counts = Counter(
    token
    for prefix in ("B", "I")
    for token in ner_tokens_1000[f"{prefix}-{entity}"]
)
top_word = word_counts.most_common(100)
top_word

[('(v3)', 4),
 ('ADS/JAO.ALMA', 3),
 ('AAE-141220', 3),
 ('Jessica', 2),
 ('(Version', 2),
 ('version', 2),
 ('(version', 2),
 ('AAE-141220.', 2),
 ('AAE-061228,', 2),
 ('obs', 1),
 ('AR11726,', 1),
 ('(MET', 1),
 ('557452805).', 1),
 ('ADS/JAO.ALMA2016.1.00413.V.', 1),
 ('ADS/JAO.ALMA#2017.1.00124.S.', 1),
 ('v5.1.1;', 1),
 ('191.D-0255.', 1),
 ('ROS-E/M/A/C-SPICE-6-V1.0', 1),
 ('(v14.0.0,', 1),
 ('(v13.2),', 1),
 ('(v1.5.1,', 1),
 ('(v6.20,', 1),
 ('(v12.9.0k;', 1),
 ('ADS/JAO.ALMA#2013.1.00220.S,', 1),
 ('ADS/JAO.ALMA#2013.1.00226.S.', 1),
 ('ADS/JAO.ALMA#2012.1.00712.S', 1),
 ('ADS/JAO.ALMA#2013.1.00278.S.', 1),
 ('#2012.1.00437.S.', 1),
 ('v1.0.3', 1),
 ('ADS/JAO.ALMA#2015.1.00773.S.', 1),
 ('(v2),', 1),
 ('(v3.1),', 1),
 ('(v3),', 1),
 ('(v4)', 1),
 ('AR', 1),
 ('AAC-150108.', 1),
 ('ADS/JAO.', 1),
 ('ADS/JAO.ALMA#2016.1.00459.S.', 1),
 ('340.S.', 1),
 ('ADS/JAO.ALMA#2011.0.00419.S.', 1),
 ('10.17909/t9-my41-h234', 1),
 ('XMM5', 1),
 ('ADS/JAO.ALMA#[2015.1.01572.S]', 1),
 ('ADS/J

In [137]:
# Identifier token patterns.
# `\d` is the regex digit class. The earlier `/d` spelling matched a literal
# "/d", so 'AAE' and 'num_seq' could never match tokens such as 'AAE-141220'
# or 'ADS/JAO.ALMA#2017.1.00124.S.'.
regex['ADS/JAO'] = r"^ADS/JAO"  # token starts with 'ADS/JAO'
regex['AAE'] = r'AAE-\d{6}'  # 'AAE-' followed by six digits
regex['(v'] = r'^\(v'  # token starts with '(v', e.g. '(v3)'
regex['num_seq'] = r'\d{4}\.\d{1,2}\.\d{5}'  # e.g. '2017.1.00124'
regex['#'] = r'#'  # token contains '#'

In [138]:
# Store the pattern dict filled in by the cells above under the '1000' entry of `features`.
features['1000']['regex'] = regex

## Pattern Analysis All

In [139]:
# Rebind `regex` to a new empty dict; the dict already stored in `features` is not modified.
regex = {}

### Organization:
+ Start with capital letters
+ Location / Org

In [140]:
entity = 'Organization'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('of', 1406),
 ('Science', 720),
 ('National', 706),
 ('University', 596),
 ('and', 565),
 ('Research', 549),
 ('for', 458),
 ('Institute', 367),
 ('de', 359),
 ('Space', 345),
 ('NASA', 334),
 ('Foundation', 334),
 ('Council', 299),
 ('University,', 294),
 ('NSF', 182),
 ('European', 159),
 ('Technology', 142),
 ('Aeronautics', 142),
 ('Laboratory,', 136),
 ('California', 123),
 ('Centre', 122),
 ('Astronomy', 118),
 ('Department', 116),
 ('Technology,', 116),
 ('Center', 111),
 ('State', 110),
 ('Foundation,', 107),
 ('the', 102),
 ('Ministry', 101),
 ('Astrophysics', 99),
 ('Foundation.', 99),
 ('in', 99),
 ('Natural', 98),
 ('Universities', 94),
 ('Jet', 89),
 ('Astronomy,', 89),
 ('Propulsion', 89),
 ('STFC', 85),
 ('Association', 84),
 ('Australian', 84),
 ('Office', 81),
 ('Planck', 81),
 ('Max', 79),
 ('Sciences', 76),
 ('Telescope', 75),
 ('Data', 71),
 ('Energy', 68),
 ('Physics', 68),
 ('Academy', 67),
 ('University.', 67),
 ('Agency', 66),
 ('Inc.,', 64),
 ('Facilities', 6

### Observatory:
+ Contain 'Observator'
+ Location
+ All alphabet chars are uppercase

In [141]:
entity = 'Observatory'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('Observatory', 206),
 ('ALMA', 128),
 ('Observatory,', 89),
 ('National', 83),
 ('ESO', 67),
 ('Astronomical', 62),
 ('Swift', 40),
 ('Astronomy', 39),
 ('Observatory.', 38),
 ('Radio', 32),
 ('of', 32),
 ('de', 32),
 ('ESO,', 28),
 ('Astrophysical', 27),
 ('Joint', 26),
 ('SDO', 26),
 ('Paranal', 25),
 ('Smithsonian', 24),
 ('NAOJ.', 24),
 ('La', 24),
 ('Observatorio', 23),
 ('Roque', 21),
 ('del', 21),
 ('los', 21),
 ('Silla', 21),
 ('Keck', 20),
 ('Gemini', 19),
 ('Muchachos', 19),
 ('Solar', 17),
 ('Las', 16),
 ('European', 16),
 ('Southern', 16),
 ('W.', 14),
 ('M.', 14),
 ('Japan.', 13),
 ('Shanghai', 12),
 ('Optical', 12),
 ('Observatories', 12),
 ('Fermi', 11),
 ('Paris', 9),
 ('Lowell', 9),
 ('Naval', 9),
 ('Cumbres', 9),
 ('Observatoire', 8),
 ('Global', 8),
 ('Telescope', 8),
 ('Network', 8),
 ('Australian', 7),
 ('Observatory;', 7),
 ('China,', 7),
 ('Observatories,', 7),
 ('Campanas', 7),
 ('Incorporated,', 7),
 ('United', 6),
 ('NFO', 6),
 ('AAO,', 6),
 ('Murchison', 6)

In [142]:
# Observatory patterns: the stem 'observator' in either capitalisation, and
# one or more "capital letter + dot" groups (e.g. 'W.', 'M.').
regex.update({
    'contain_obs': r"observator|Observator",
    'contain_cap.': r"([A-Z]\.)+",
})

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Orion', 'Jupiter', 'Milky', 'solar'
+ length less than 3 all capital
+ Capital and number

In [143]:
entity = 'CelestialObject'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('NGC', 185),
 ('solar', 100),
 ('HD', 80),
 ('Sgr', 64),
 ('Galactic', 64),
 ('Milky', 63),
 ('GRB', 48),
 ('LMC', 39),
 ('Way', 39),
 ('SN', 38),
 ('MW', 33),
 ('Earth', 31),
 ('Cyg', 31),
 ('A', 31),
 ('Sun', 30),
 ('PSR', 29),
 ('GJ', 23),
 ('Abell', 23),
 ('M', 22),
 ('PKS', 22),
 ('Mrk', 22),
 ('A*', 19),
 ('3C', 18),
 ('4U', 18),
 ('SMC', 17),
 ('SDSS', 17),
 ('Orion', 17),
 ('MAXI', 16),
 ('KIC', 16),
 ('b', 16),
 ('II', 16),
 ('Perseus', 15),
 ('LA', 15),
 ('Jhelum', 15),
 ('Jupiter', 14),
 ('Earth’s', 14),
 ('M87', 14),
 ('3', 14),
 ('Earth.', 13),
 ('Magellanic', 13),
 ('16', 13),
 ('B', 13),
 ('R', 13),
 ('system', 13),
 ('876', 13),
 ('Virgo', 12),
 ('FW', 12),
 ('S', 12),
 ('Mon', 12),
 ('Tau', 12),
 ('A,', 12),
 ('Scl', 12),
 ('IC', 11),
 ('Fornax', 11),
 ('Galaxy', 11),
 ('Gaia', 11),
 ('Earth,', 11),
 ('Sun.', 11),
 ('Cen', 11),
 ('V1534', 11),
 ('Way,', 11),
 ('1', 11),
 ('Sextans', 10),
 ('Local', 10),
 ('κ', 10),
 ('e', 10),
 ('DDO', 10),
 ('WASP-113', 10),
 ('Sco'

In [144]:
# CelestialObject patterns, registered in one call:
#   contain_co       - any of the listed object names
#   contain_len3     - token is exactly three capital letters
#   contain_spe_char - token contains 'κ', 'e' or '*'
#   cap_num          - capitals followed by digits, or digits followed by capitals
regex.update({
    'contain_co': r"Sun|Earth|Orion|Jupiter|Milky|solar",
    'contain_len3': r"^[A-Z]{3}$",
    'contain_spe_char': r"[κe*]",
    'cap_num': r"([A-Z]+\d+|\d+[A-Z]+)",
})

### Event:
+ contain year
+ contain /

In [145]:
entity = 'Event'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('of', 8),
 ('Workshop', 6),
 ('Gaia', 6),
 ('and', 6),
 ('workshop', 5),
 ('Summer', 5),
 ('on', 4),
 ('workshops', 3),
 ('2018', 3),
 ('in', 3),
 ('Program', 3),
 ('NYC', 2),
 ('“The', 2),
 ('Aspen', 2),
 ('Coronal', 2),
 ('Cluster', 2),
 ('Neutron', 2),
 ('Stars', 2),
 ('Black', 2),
 ('Holes', 2),
 ('-ESO', 2),
 ('at', 2),
 ('Sprint,', 2),
 ('Loops', 2),
 ('for', 2),
 ('FRB', 2),
 ('program—Turbulence', 1),
 ('(ICTS/Prog-taly2018/01).', 1),
 ('Near/Far', 1),
 ('East', 1),
 ('Origins', 1),
 ('Time-delay', 1),
 ('Astro', 1),
 ('Star', 1),
 ('Exoplanet', 1),
 ('2017', 1),
 ('Data', 1),
 ('ZTF', 1),
 ('Astrochemistry', 1),
 ('‘Galactic', 1),
 ('Galaxy', 1),
 ('International', 1),
 ('Chamonix', 1),
 ('CIDER', 1),
 ('SPHERIC2018', 1),
 ('“Observational', 1),
 ('MIAPP', 1),
 ('VOSS', 1),
 ('WWU', 1),
 ('Kavli', 1),
 ('(CFC)', 1),
 ('final', 1),
 ('2017−2019', 1),
 ('7th', 1),
 ('DR2', 1),
 ('Magnetospheres', 1),
 ('from', 1),
 ('Angstroms', 1),
 ('to', 1),
 ('Light', 1),
 ('Years', 1),
 (

In [146]:
# Event token patterns.
# `\d{4}` matches four consecutive digits (e.g. '2018'). The earlier `/d{4}`
# spelling matched a literal '/' followed by four 'd' characters, so it never
# matched a year.
regex['year'] = r"\d{4}"
regex['/'] = r"\/"  # token contains a forward slash

### CelestialRegion
- contain '°'
- contain [()=><-]
- one alphabet
- contain field

In [147]:
entity = 'CelestialRegion'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('Galactic', 28),
 ('=', 15),
 ('halo', 12),
 ('field', 11),
 ('COSMOS', 9),
 ('Field', 8),
 ('local', 6),
 ('l', 6),
 ('±', 6),
 ('MW', 5),
 ('latitude', 5),
 ('b', 5),
 ('Plane', 5),
 ('fields', 5),
 ('and', 5),
 ('A', 5),
 ('δ', 4),
 ('UDS', 4),
 ('disc', 4),
 ('BOSS1441', 4),
 ('Universe', 4),
 ('of', 4),
 ('footprint', 4),
 ('longitude', 4),
 ('GAMA', 3),
 ('high', 3),
 ('GOODS-S', 3),
 ('ORPH-F3', 3),
 ('CDF-S', 3),
 ('supergalactic', 3),
 ('regions', 3),
 ('field.', 3),
 ('fields.', 3),
 ('field,', 3),
 ('constellation.', 3),
 ('h', 3),
 ('−1', 3),
 ('south', 2),
 ('(', 2),
 ('South', 2),
 ('LSST', 2),
 ('north', 2),
 ('VEXAS-SMW', 2),
 ('solar', 2),
 ('CANDELS', 2),
 ('LAGER-COSMOS', 2),
 ('R.A.', 2),
 (')', 2),
 ('Ecliptic', 2),
 ('Pole', 2),
 ('G', 2),
 ('>', 2),
 ('15,', 2),
 ('disc,', 2),
 ('CMZ', 2),
 ('star', 2),
 ('halo.', 2),
 ('decl.', 2),
 ('0.6;', 2),
 ('0.8)', 2),
 ('Cartesian', 2),
 ('coordinates', 2),
 ('L', 1),
 ('MACS1149:', 1),
 ('(SCP),', 1),
 ('(GP),', 1),
 

In [148]:
# CelestialRegion patterns: a degree sign, any of ( ) = > < -, a token that is
# exactly one ASCII letter, and the lowercase substring 'field'.
regex.update({
    'contain_deg': r"°",
    'contain_ope_char': r"[\(\)=><-]",
    'single_char': r"^[A-Za-z]$",
    'field': r"field",
})

### Identifier
- start with 'ADS/JAO'
- r'AAE-\d{6}'
- start with '(v'
- contain #
- r'\d{4}\.\d{1,2}\.\d{5}'

In [149]:
entity = 'Identifier'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('ADS/JAO.ALMA', 4),
 ('(v3)', 4),
 ('NOAA', 4),
 ('(version', 3),
 ('AAE-141220', 3),
 ('Jessica', 2),
 ('(Version', 2),
 ('version', 2),
 ('AAE-141220.', 2),
 ('AAE-061228,', 2),
 ('3.6.8', 2),
 ('1', 2),
 ('12335,', 2),
 ('obs', 1),
 ('AR11726,', 1),
 ('(MET', 1),
 ('557452805).', 1),
 ('ADS/JAO.ALMA2016.1.00413.V.', 1),
 ('ADS/JAO.ALMA#2017.1.00124.S.', 1),
 ('v5.1.1;', 1),
 ('191.D-0255.', 1),
 ('ROS-E/M/A/C-SPICE-6-V1.0', 1),
 ('(v14.0.0,', 1),
 ('(v13.2),', 1),
 ('(v1.5.1,', 1),
 ('(v6.20,', 1),
 ('(v12.9.0k;', 1),
 ('ADS/JAO.ALMA#2013.1.00220.S,', 1),
 ('ADS/JAO.ALMA#2013.1.00226.S.', 1),
 ('ADS/JAO.ALMA#2012.1.00712.S', 1),
 ('ADS/JAO.ALMA#2013.1.00278.S.', 1),
 ('#2012.1.00437.S.', 1),
 ('v1.0.3', 1),
 ('ADS/JAO.ALMA#2015.1.00773.S.', 1),
 ('(v2),', 1),
 ('(v3.1),', 1),
 ('(v3),', 1),
 ('(v4)', 1),
 ('AR', 1),
 ('AAC-150108.', 1),
 ('ADS/JAO.', 1),
 ('ADS/JAO.ALMA#2016.1.00459.S.', 1),
 ('340.S.', 1),
 ('ADS/JAO.ALMA#2011.0.00419.S.', 1),
 ('10.17909/t9-my41-h234', 1),
 ('XM

In [150]:
# Identifier token patterns.
# `\d` is the regex digit class. The earlier `/d` spelling matched a literal
# "/d", so 'AAE' and 'num_seq' could never match tokens such as 'AAE-141220'
# or 'ADS/JAO.ALMA#2017.1.00124.S.'.
regex['ADS/JAO'] = r"^ADS/JAO"  # token starts with 'ADS/JAO'
regex['AAE'] = r'AAE-\d{6}'  # 'AAE-' followed by six digits
regex['(v'] = r'^\(v'  # token starts with '(v', e.g. '(v3)'
regex['num_seq'] = r'\d{4}\.\d{1,2}\.\d{5}'  # e.g. '2017.1.00124'
regex['#'] = r'#'  # token contains '#'

In [151]:
# Store the pattern dict filled in by the cells above under the 'all' entry of `features`.
features['all']['regex'] = regex

In [152]:
# Write `features` as JSON to a file in the current working directory.
with open('biased_domain_knowledge.json', mode='w') as fp:
    fp.write(json.dumps(features))

# Wholistic Domain Knowledge

In [155]:
# Pool every record from all three splits (validation, then train, then test)
# into one list and run the tag preprocessing over the combined data.
# NOTE(review): the test split is included here - confirm that this is intended.
entire_dataset = [
    record
    for split_name in ('validation', 'train', 'test')
    for record in dataset[split_name]
]

processed_tags, ner_tokens, text = process_entity_tag(data=entire_dataset)

In [157]:
# Sanity check: number of records pooled from the validation, train and test splits.
len(entire_dataset)

5624

## High frequency sub-token

for each entity we will generate a list of high frequency subtokens containing:
- top 50 3-grams
- top 50 5-grams

In [156]:
# For every B-/I- tag of every entity, keep the sub-word strings returned by
# `find_frequent_subword` (top 50) for 3-grams and for 5-grams, under the key
# "<tag>_<n>_grams".
sub_tokens = {}

for entity in entity_name:
    for bi in ("B", "I"):
        tag = f"{bi}-{entity}"
        tokens = ner_tokens[tag]
        for n in (3, 5):
            frequent = find_frequent_subword(tokens, n_gram=n, top=50)
            # Keep only the first element of each returned item.
            sub_tokens[f"{tag}_{n}_grams"] = [pair[0] for pair in frequent]

## Pattern Analysis

In [158]:
# Rebind `regex` to a new empty dict before collecting this section's patterns.
regex = {}

### Organization:
+ Start with capital letters
+ Name / Country / Org

In [159]:
entity = 'Organization'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('of', 4638),
 ('Science', 2337),
 ('National', 2316),
 ('University', 1972),
 ('Research', 1860),
 ('and', 1807),
 ('for', 1522),
 ('Institute', 1256),
 ('de', 1171),
 ('Space', 1150),
 ('Foundation', 1124),
 ('NASA', 1087),
 ('Council', 1015),
 ('University,', 943),
 ('European', 576),
 ('NSF', 564),
 ('Technology', 457),
 ('Aeronautics', 452),
 ('Laboratory,', 417),
 ('Centre', 393),
 ('Center', 386),
 ('California', 377),
 ('Astronomy', 376),
 ('Ministry', 364),
 ('Department', 355),
 ('Technology,', 355),
 ('State', 345),
 ('Foundation,', 345),
 ('in', 343),
 ('STFC', 322),
 ('the', 321),
 ('Foundation.', 319),
 ('Universities', 313),
 ('Astrophysics', 313),
 ('Astronomy,', 288),
 ('Natural', 279),
 ('Propulsion', 278),
 ('Jet', 274),
 ('Australian', 271),
 ('Office', 270),
 ('Association', 261),
 ('Telescope', 251),
 ('Max', 244),
 ('Planck', 244),
 ('Energy', 237),
 ('Sciences', 236),
 ('Physics', 234),
 ('University.', 231),
 ('Facilities', 229),
 ('Academy', 222),
 ('ERC', 21

### Observatory:
+ Contain 'Observator' (covers 'Observatory', 'Observatorio', 'Observatoire')
+ Location
+ All alphabet chars are uppercase

In [160]:
entity = 'Observatory'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('Observatory', 684),
 ('ALMA', 372),
 ('Observatory,', 294),
 ('National', 282),
 ('ESO', 206),
 ('Astronomical', 187),
 ('Astronomy', 146),
 ('de', 123),
 ('Swift', 112),
 ('Observatory.', 112),
 ('Radio', 111),
 ('Astrophysical', 111),
 ('of', 100),
 ('Smithsonian', 94),
 ('SDO', 93),
 ('ESO,', 80),
 ('Observatorio', 80),
 ('Paranal', 75),
 ('La', 74),
 ('Joint', 72),
 ('NAOJ.', 72),
 ('Fermi', 72),
 ('del', 71),
 ('Silla', 69),
 ('Keck', 68),
 ('Roque', 66),
 ('los', 66),
 ('Muchachos', 60),
 ('European', 50),
 ('Gemini', 50),
 ('Optical', 50),
 ('Southern', 49),
 ('Las', 43),
 ('Observatoire', 41),
 ('Solar', 39),
 ('W.', 35),
 ('M.', 35),
 ('Observatories', 33),
 ('Cumbres', 32),
 ('Observatories,', 30),
 ('Shanghai', 28),
 ('Japan.', 28),
 ('di', 27),
 ('Telescope', 26),
 ('Naval', 25),
 ('Cerro', 23),
 ('NAOC', 21),
 ('United', 21),
 ('W.M.', 21),
 ('States', 21),
 ('NAOJ', 20),
 ('Tololo', 20),
 ('Global', 20),
 ('Network', 20),
 ('China,', 19),
 ('Campanas', 19),
 ('South', 

In [161]:
# Matches the stem 'observator' / 'Observator' anywhere in the token.
regex['contain_obs'] = r"observator|Observator"

### CelestialObject:
+ Contain 'Sun', 'Earth', 'Gala', 'Milky', 'solar'
+ Numbers + capital letters
+ All alphabet chars are uppercase
+ contain '-'

In [162]:
entity = 'CelestialObject'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('NGC', 471),
 ('solar', 278),
 ('HD', 253),
 ('Galactic', 210),
 ('SN', 174),
 ('Milky', 145),
 ('Sgr', 144),
 ('GRB', 116),
 ('LMC', 107),
 ('Cyg', 98),
 ('Sun', 94),
 ('Way', 89),
 ('BAT', 81),
 ('Earth', 78),
 ('A', 78),
 ('MW', 76),
 ('PSR', 66),
 ('M', 62),
 ('KIC', 60),
 ('Tau', 55),
 ('3C', 48),
 ('SDSS', 46),
 ('Orion', 46),
 ('A*', 45),
 ('UGC', 43),
 ('Cen', 43),
 ('T', 43),
 ('4U', 41),
 ('Sun.', 41),
 ('B', 40),
 ('SMC', 40),
 ('Jupiter', 39),
 ('system', 38),
 ('Sun,', 37),
 ('Galaxy', 36),
 ('Solar', 36),
 ('PKS', 36),
 ('IRAS', 36),
 ('GJ', 35),
 ('b', 32),
 ('IC', 31),
 ('GW170817', 29),
 ('Magellanic', 29),
 ('Leo', 28),
 ('M31', 27),
 ('Mrk', 27),
 ('+', 27),
 ('X-1', 27),
 ('Earth,', 26),
 ('Perseus', 26),
 ('A,', 26),
 ('Abell', 26),
 ('AT', 26),
 ('MAXI', 25),
 ('II', 25),
 ('Way,', 25),
 ('Tauri', 24),
 ('R', 23),
 ('Sco', 23),
 ('CMB', 22),
 ('Local', 22),
 ('Virgo', 22),
 ('M87', 22),
 ('1', 22),
 ('Aur', 22),
 ('BYF', 22),
 ('Earth’s', 21),
 ('Cygnus', 21),
 

In [163]:
# CelestialObject token patterns.
regex['contain_co'] = r"Sun|Earth|Gala|Milky|solar|Jupiter"  # common object names / stems
regex['contain_cap_num'] = r"([A-Z]+\d+|\d+[A-Z]+)"  # capitals+digits or digits+capitals
# Anchored so the pattern only matches tokens of one to three characters.
# Unanchored, `.{1,3}` matches every non-empty token under re.search/re.match;
# with the anchors the result is the same for search, match and fullmatch.
regex['contain_less_3'] = r"^.{1,3}$"

# Literal sub-strings (not regexes) to look for inside a token.
sub_tokens['-'] = ["-"]
sub_tokens['greek_letter'] = ['α', 'ε', 'θ','ω']

### Event:
+ Contain year/date
+ Contain 'workshop' or 'meeting'

In [164]:
entity = 'Event'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('and', 32),
 ('workshop', 27),
 ('of', 24),
 ('in', 23),
 ('Gaia', 18),
 ('Summer', 12),
 ('workshops', 12),
 ('on', 12),
 ('the', 10),
 ('-ESO', 10),
 ('conferences', 9),
 ('Program', 8),
 ('for', 8),
 ('Workshop', 7),
 ('Coronal', 6),
 ('Kavli', 5),
 ('NYC', 5),
 ('2017', 5),
 ('2018', 5),
 ('Formation', 5),
 ('at', 5),
 ('“The', 4),
 ('Physics', 4),
 ('Star', 4),
 ('International', 4),
 ('2016', 4),
 ('meetings', 4),
 ('conference', 4),
 ('Research', 4),
 ('meeting', 4),
 ('Observations', 4),
 ('Astrophysics', 4),
 ('Sprint,', 4),
 ('Team', 4),
 ('Solar', 4),
 ('2015', 3),
 ('Galactic', 3),
 ('‘The', 3),
 ('Galaxy', 3),
 ('la', 3),
 ('Magnetic', 3),
 ('de', 3),
 ('June', 3),
 ('Disks', 3),
 ('Archaeology', 3),
 ('Precision', 3),
 ('Stellar', 3),
 ('Black', 3),
 ('Institute', 3),
 ('Modelling', 3),
 ('from', 3),
 ('MHD', 3),
 ('“New', 2),
 ('KITP', 2),
 ('ISIMA', 2),
 ('“Galactic', 2),
 ('The', 2),
 ('Data', 2),
 ('Aspen', 2),
 ('MIAPP', 2),
 ('Cluster', 2),
 ('‘Dynamic', 2),
 ('‘C

In [165]:
# Event patterns: four consecutive digits, and the words workshop/meeting in
# either capitalisation.
regex.update({
    'contain_year': r"\d{4}",
    'event': r"workshop|meeting|Workshop|Meeting",
})

### CelestialRegion
- contain '°', '′'
- contain '>', '<', '=', '|'
- single lower case character (not 'a')

In [167]:
entity = 'CelestialRegion'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('=', 74),
 ('Galactic', 53),
 ('field', 47),
 ('COSMOS', 30),
 ('b', 22),
 ('and', 22),
 ('l', 18),
 ('h', 17),
 ('from', 17),
 ('to', 17),
 ('δ', 16),
 ('decl.', 16),
 ('GAMA', 14),
 ('Field', 14),
 ('R.A.', 13),
 ('halo', 13),
 ('regions', 13),
 ('m', 13),
 ('α', 12),
 ('latitude', 12),
 (')', 11),
 ('of', 11),
 ('local', 10),
 ('(', 10),
 ('Universe', 10),
 (',', 9),
 ('field,', 9),
 ('fields', 9),
 ('J2000', 8),
 ('−12', 8),
 ('field.', 8),
 ('2000', 8),
 ('Southern', 7),
 ('Deep', 7),
 ('fields.', 7),
 ('±', 7),
 ('area', 7),
 ('Northern', 6),
 ('XXL-S', 6),
 ('XXL-N', 6),
 ('longitude', 6),
 ('fields,', 6),
 ('galactic', 5),
 ('extragalactic', 5),
 ('J1030', 5),
 ('RA', 5),
 ('high', 5),
 ('MW', 5),
 ('disc', 5),
 ('XXL', 5),
 ('|', 5),
 ('region', 5),
 ('Plane', 5),
 ('footprint', 5),
 ('A', 5),
 ('BICEP2', 4),
 ('low', 4),
 ('solar', 4),
 ('UDS', 4),
 ('BOSS1441', 4),
 ('E-CDFS', 4),
 ('gb1', 4),
 ('Sky', 4),
 ('footprint,', 4),
 ('371,', 4),
 ('371.', 4),
 ('l,', 4),
 ('Dec'

In [168]:
# CelestialRegion cues. Literal sub-strings go into `sub_tokens`; the
# single-letter test (exactly 'h', 'm' or 'l') is a regex.
sub_tokens.update({
    'degree': ['°', '′'],
    'operation': ['>', '<', '=', '|', "(", ")", '±'],
    'field': ["field", "Field"],
})
regex.update({'hml': r"^[hml]$"})

### Identifier
- start with 'ADS/JAO'
- contain 'version'
- start with '(v'
- start with / contain '#'
- r'\d{4}\.\d{1,2}\.\d{5}'
- all numbers
- AAE-\d{6}

In [169]:
entity = 'Identifier'
# Tally B- tokens first, then I- tokens, so equal counts keep first-seen order.
word_counts = Counter()
word_counts.update(ner_tokens[f"B-{entity}"])
word_counts.update(ner_tokens[f"I-{entity}"])
top_word = word_counts.most_common(100)  # (token, count) pairs, most frequent first
top_word

[('version', 13),
 ('(version', 13),
 ('AR', 12),
 ('143,', 10),
 ('cycle', 9),
 ('23.', 9),
 ('ADS/JAO.ALMA', 8),
 ('NOAA', 7),
 ('CBP1', 6),
 ('(Version', 6),
 ('23', 6),
 ('ADS/JAO.ALMA#', 5),
 ('AAS', 5),
 ('AAS,', 5),
 ('CBP2', 4),
 ('(v3)', 4),
 ('ID', 3),
 ('AAE-141220', 3),
 ('(v.', 3),
 ('solar', 3),
 ('2;', 3),
 ('LIGO', 2),
 ('2016.1.01164.S.', 2),
 ('CBP1,', 2),
 ('ADS/JAO.ALMA#2012.1.00453.S.', 2),
 ('Jessica', 2),
 ('v5.1.1;', 2),
 ('AAE-141220.', 2),
 ('AAE-061228,', 2),
 ('3.6.8', 2),
 ('(v10108;', 2),
 ('2011.0.00902.S', 2),
 ('(v4.8;', 2),
 ('(v12.9.1;', 2),
 ('(ObsID:', 2),
 ('ver.', 2),
 ('(ObsIDs', 2),
 ('(Obs.ID', 2),
 ('12290', 2),
 ('1', 2),
 ('12335,', 2),
 ('No.', 2),
 ('24', 2),
 ('12127', 2),
 ('11610', 2),
 ('11630', 2),
 ('(v4.7.2,', 1),
 ('ADS/JAO.ALMA#2011.0.00876.S,', 1),
 ('ADS/JAO.ALMA#2012.00650.', 1),
 ('<inline-formula>', 1),
 ('V,', 1),
 ('ADS/JAO.ALMA#2013.1.00111.S.', 1),
 ('ADS/JAO.ALMA#2011.0.00172.S.', 1),
 ('24Jun2019_V6.26.1,', 1),
 ('ADS/J

In [170]:
# Identifier cues.
sub_tokens['#'] = ['#']  # literal sub-string to look for inside a token

regex['ADS/JAO'] = r"^ADS/JAO"  # token starts with 'ADS/JAO'
# `\d` is the regex digit class. The earlier `/d` spelling matched a literal
# "/d", so this pattern never matched sequences such as '2016.1.01164'.
regex['num_seq'] = r'\d{4}\.\d{1,2}\.\d{5}'
regex['version'] = r"version|Version"  # token contains 'version' / 'Version'
regex['(v'] = r"^\(v"  # token starts with '(v', e.g. '(v3)'
regex['AAE'] = r'^AAE-\d{6}'  # token starts with 'AAE-' plus six digits

In [171]:
# Bundle the literal sub-token lists and the regex patterns, then write them
# out as a single JSON document.
features = {
    'sub_tokens': sub_tokens,
    'regex': regex,
}

with open('domain_knowledge/wholistic_domain_knowledge.json', mode='w') as fp:
    fp.write(json.dumps(features))