In [1]:
import json
import re
import pickle

## Gather Data

Use nuhil's [bangladesh-geocode](https://github.com/nuhil/bangladesh-geocode) repository to collect the names of geopolitical entities (GPEs) of Bangladesh. The data is fairly accurate.

In [2]:
# Fetches the repository's default branch at whatever commit is current; no tag or
# commit is pinned, so the counts shown below may change if the upstream data changes.
!git clone https://github.com/nuhil/bangladesh-geocode.git

Cloning into 'bangladesh-geocode'...
remote: Enumerating objects: 22, done.
remote: Counting objects: 100% (22/22), done.
remote: Compressing objects: 100% (21/21), done.
remote: Total 182 (delta 8), reused 0 (delta 0), pack-reused 160
Receiving objects: 100% (182/182), 670.16 KiB | 598.00 KiB/s, done.
Resolving deltas: 100% (79/79), done.


In [3]:
in_prefix = 'bangladesh-geocode'
out_prefix = 'interim'

!mkdir -p interim

## Extract all hierarchies of GPE

In [4]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode the non-ASCII `bn_name` values.
with open(in_prefix + '/districts/districts.json', encoding='utf-8') as f:
    content = json.load(f)

# Rows are read from the 'data' list of the third top-level element
# (presumably the table payload of the export; verify against the repo).
# Only each row's 'bn_name' is kept.
districts = [d['bn_name'] for d in content[2]['data']]

'Districts: {}'.format(len(districts))

'Districts: 64'

In [5]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode the non-ASCII `bn_name` values.
with open(in_prefix + '/divisions/divisions.json', encoding='utf-8') as f:
    content = json.load(f)

# Rows are read from the 'data' list of the third top-level element
# (presumably the table payload of the export; verify against the repo).
# Only each row's 'bn_name' is kept.
divisions = [d['bn_name'] for d in content[2]['data']]

'Divisions: {}'.format(len(divisions))

'Divisions: 8'

In [6]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode the non-ASCII `bn_name` values.
with open(in_prefix + '/unions/unions.json', encoding='utf-8') as f:
    content = json.load(f)

# Rows are read from the 'data' list of the third top-level element
# (presumably the table payload of the export; verify against the repo).
# Only each row's 'bn_name' is kept.
unions = [d['bn_name'] for d in content[2]['data']]

'Unions: {}'.format(len(unions))

'Unions: 4540'

In [7]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode the non-ASCII `bn_name` values.
with open(in_prefix + '/upazilas/upazilas.json', encoding='utf-8') as f:
    content = json.load(f)

# Rows are read from the 'data' list of the third top-level element
# (presumably the table payload of the export; verify against the repo).
# Only each row's 'bn_name' is kept.
upazilas = [d['bn_name'] for d in content[2]['data']]

'Upzillas: {}'.format(len(upazilas))

'Upzillas: 491'

In [8]:
import itertools

# Pool the names from all four administrative levels into one set;
# a name that appears at more than one level is kept only once.
unique_gpe = {
    gpe_name
    for gpe_name in itertools.chain(divisions, districts, unions, upazilas)
}

'Total Unique GPE: {}'.format(len(unique_gpe))

'Total Unique GPE: 4351'

## Transform Special Names

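For names that carry a parenthesised qualifier, also add the reordered form: e.g. নাখালপাড়া (পশ্চিম) additionally yields পশ্চিম নাখালপাড়া. The original spelling is kept in the set.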

In [9]:
import re

# Raw string: `\s` and `\(` are regex escapes, not Python string escapes, so a
# plain literal triggers an "invalid escape sequence" warning.
# Group 1 is the text before the parenthesis, group 2 the text inside it.
r = re.compile(r'(.*)\s*\((.*)\)')

# For every name of the form 'X (Y)', add the variant 'Y X' next to the original.
# Names the pattern does not match (including one with '(' but no ')') are
# skipped instead of raising IndexError.
# The list is built completely before update() runs, so the set is not
# modified while it is being iterated.
unique_gpe.update([
    ' '.join(m.groups()[::-1]).strip()
    for m in map(r.search, unique_gpe)
    if m is not None
])

'Total Unique GPE: {}'.format(len(unique_gpe))

'Total Unique GPE: 4434'

## Dump into Pickle

Dump the names into a pickle for future use in other projects.

In [10]:
import pickle

# Write the whole name set to <out_prefix>/unique_gpe.pickle in binary mode.
with open('{}/unique_gpe.pickle'.format(out_prefix), 'wb') as f:
    pickle.dump(unique_gpe, f)

## Convert to spaCy Matchers

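Turn each name into a token-level pattern: a list with one `{'TEXT': token}` entry per whitespace-separated token of the name. We will train spaCy NER with these patterns.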

In [11]:
def make_pattern(name):
    """Build the token pattern for a single name.

    The name is split on whitespace and every piece becomes one
    ``{'TEXT': piece}`` dict, in order. An empty or all-whitespace
    name yields an empty list.
    """
    token_specs = []
    for word in name.split():
        token_specs.append({'TEXT': word})
    return token_specs

def make_patterns(entities):
    """Return a list with one token pattern per entity.

    Each entity is passed to ``make_pattern``; results follow the
    iteration order of ``entities``.
    """
    return list(map(make_pattern, entities))

# One token pattern per collected name.
# NOTE(review): unique_gpe is a set, so the order of `patterns` follows set
# iteration order; sort the names first if a stable order is needed.
patterns = make_patterns(unique_gpe)

In [12]:
# Write the list of token patterns to <out_prefix>/patterns.pickle in binary mode.
with open('{}/patterns.pickle'.format(out_prefix), 'wb') as f:
    pickle.dump(patterns, f)