# Geoparsing Package

## 1. Python libs needed
### 1.1. Python libs: 
psycopg2 >= 2.5, nltk >= 3.2, numpy >= 1.7, shapely >= 1.5
### 1.2. Database: 
PostgreSQL >= 9.3 and PostGIS >= 2.1 database loaded with database image containing global cities
### 1.3. NLTK corpora:
	import nltk
	nltk.download()
	==> install all or at least stopwords, names and wordnet

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


KeyboardInterrupt: 

**showing info** https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
**showing info** https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
**D:\Program Files\nltk_data**  3.25 GB

## 2. Databases needed for geoparsing
### 2.1. Download SQL tar dumps (failed)
	au_nz_places.tar [1 Mb]
	north_america_places.tar [18 Mb]
	europe_places.tar [137 Mb]
	global_cities10.tar [4131 Mb]
### 2.2. Connect to PostgreSQL and create the database with the required PostGIS and hstore extensions
	CREATE DATABASE openstreetmap;
	\connect openstreetmap
	CREATE EXTENSION IF NOT EXISTS postgis;
	CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
	CREATE EXTENSION IF NOT EXISTS postgis_tiger_geocoder;
	CREATE EXTENSION IF NOT EXISTS hstore;
	CREATE SCHEMA IF NOT EXISTS reveal;
### 2.3. Import the precomputed database tables for global cities and places
	pg_restore -F t -1 -d openstreetmap global_cities10.tar
	pg_restore -F t -1 -d openstreetmap europe_places.tar
	pg_restore -F t -1 -d openstreetmap north_america_places.tar
	pg_restore -F t -1 -d openstreetmap au_nz_places.tar

## 3. Example code geoparse
Geoparse some text using the default focus areas in the Postgres database. A fully documented example PY file can be found at geoparsepy.example_geoparse.py. Note: loading 1,000,000+ global locations into memory at startup is slow (up to 60 minutes), but subsequent geoparsing of text is very fast (real-time speeds).

In [7]:
import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json

In [9]:
import geoparsepy

ModuleNotFoundError: No module named 'config_helper'

In [None]:
# Configure logging so that each record is emitted as its bare message text
# at INFO level, then obtain this module's logger and confirm it is working.
LOG_FORMAT = '%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('logging started')

# Build the geoparse configuration dictionary that is handed to every
# geoparsepy call below: English only, no custom corpus directory, and
# explicit whitespace / sentence-separator / punctuation character sets.
dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config(
    lang_codes=['en'],
    logger=logger,
    corpus_dir=None,
    whitespace=u'"\u201a\u201b\u201c\u201d()',
    sent_token_seps=['\n', '\r\n', '\f', u'\u2026'],
    # The backslash is escaped explicitly: a bare '\/' is an invalid escape
    # sequence that newer Python versions warn about. The resulting string
    # is unchanged (backslash followed by slash).
    punctuation=',;\\/:+-#~&*=!?',
)

# Open the PostgreSQL connection used to load the preprocessed locations.
# NOTE(review): the positional arguments look like user, password, host,
# port, database and a timeout in seconds — confirm against PostgresqlHandler.
databaseHandle = geoparsepy.PostgresqlHandler.PostgresqlHandler('postgres', 'postgres', 'localhost', 5432, 'openstreetmap', 600)

# Request every geometry table (admin / poly / line / point) of each focus
# area. Each table gets its own fresh [-1, -1] pair.
# NOTE(review): presumably [-1, -1] means "no location-id range restriction"
# — confirm against cache_preprocessed_locations.
listFocusArea = ['global_cities10', 'europe_places', 'north_america_places', 'au_nz_places']
dictLocationIDs = {
    strFocusArea + strSuffix: [-1, -1]
    for strFocusArea in listFocusArea
    for strSuffix in ('_admin', '_poly', '_line', '_point')
}

# Load the locations from the 'reveal' schema. The connection is released in
# a finally clause so that it is not leaked when the load raises.
try:
    cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations(databaseHandle, dictLocationIDs, 'reveal', dictGeospatialConfig)
finally:
    databaseHandle.close()

logger.info('number of cached locations = %s', len(cached_locations))

# Derive the in-memory lookup structures from the cached locations and report
# the size of each index.

# Phrase index, used by geoparse_token_set.
indexed_locations = geoparsepy.geo_parse_lib.calc_inverted_index(cached_locations, dictGeospatialConfig)
nPhraseCount = len(indexed_locations.keys())
logger.info('number of indexed phrases = %s', nPhraseCount)

# Geometry index, used by reverse_geocode_geom.
indexed_geoms = geoparsepy.geo_parse_lib.calc_geom_index(cached_locations)
nGeomCount = len(indexed_geoms.keys())
logger.info('number of indexed geoms = %s', nGeomCount)

# Maps an OSM id tuple to the indexes of its entries in cached_locations.
osmid_lookup = geoparsepy.geo_parse_lib.calc_osmid_lookup(cached_locations)

# Starts empty; passed as geom_cache to filter_matches_by_confidence.
dictGeomResultsCache = dict()

# Example sentences to geoparse.
listText = [
    u'hello New York, USA its Bill from Bassett calling',
    u'live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview',
]

# Tokenize every sentence; listGeotags holds one (initially absent) geotag
# per sentence, in the same order as listText.
listTokenSets = [
    geoparsepy.common_parse_lib.unigram_tokenize_microblog_text(strUTF8Text, dictGeospatialConfig)
    for strUTF8Text in listText
]
listGeotags = [None] * len(listText)

# Match the token sets against the indexed location phrases. The result is
# indexed in parallel with listText.
listMatchSet = geoparsepy.geo_parse_lib.geoparse_token_set(listTokenSets, indexed_locations, dictGeospatialConfig)

# Attach a point geotag to the first sentence.
strGeom = 'POINT(-1.4052268 50.9369033)'
listGeotags[0] = strGeom

# Reverse geocode that single geometry and log the name of every cached
# location it resolves to. Iterating an empty result is a no-op, so no
# explicit emptiness check is needed.
listMatchGeotag = geoparsepy.geo_parse_lib.reverse_geocode_geom([strGeom], indexed_geoms, dictGeospatialConfig)
for tupleOSMIDs in listMatchGeotag[0]:
    for nIndexLoc in osmid_lookup[tupleOSMIDs]:
        # Element 1 of a cached location entry is used as its name.
        logger.info(
            'Reverse geocoded geotag location [index %s osmid %r] = %s',
            nIndexLoc, tupleOSMIDs, cached_locations[nIndexLoc][1],
        )

# For every geoparsed sentence: log the raw phrase matches, then build the
# matched-location list, run the three filters over it and log each surviving
# location once per OSM id.
for nIndex, listMatch in enumerate(listMatchSet):
    logger.info('Text = %s', listText[nIndex])

    # Geotag of this sentence (None when it has none); handed to the
    # confidence filter as geom_context.
    strGeom = listGeotags[nIndex]

    # Raw matches: tupleMatch is read as (token start, token end,
    # OSM id tuples, phrase tokens).
    for tupleMatch in listMatch:
        nTokenStart = tupleMatch[0]
        nTokenEnd = tupleMatch[1]
        tuplePhrase = tupleMatch[3]
        for tupleOSMIDs in tupleMatch[2]:
            for nIndexLoc in osmid_lookup[tupleOSMIDs]:
                logger.info(
                    'Location [index %s osmid %r @ %s : %s] = %s',
                    nIndexLoc, tupleOSMIDs, nTokenStart, nTokenEnd, ' '.join(tuplePhrase),
                )
                # Only the first location index per OSM id tuple is logged.
                break

    # The filter calls' return values are not used.
    # NOTE(review): presumably they prune listLocMatches in place — confirm.
    listLocMatches = geoparsepy.geo_parse_lib.create_matched_location_list(listMatch, cached_locations, osmid_lookup)
    geoparsepy.geo_parse_lib.filter_matches_by_confidence(listLocMatches, dictGeospatialConfig, geom_context=strGeom, geom_cache=dictGeomResultsCache)
    geoparsepy.geo_parse_lib.filter_matches_by_geom_area(listLocMatches, dictGeospatialConfig)
    # NOTE(review): -148838 and -62149 look like OSM ids of the regions of
    # interest — confirm which regions they denote.
    geoparsepy.geo_parse_lib.filter_matches_by_region_of_interest(listLocMatches, [-148838, -62149], dictGeospatialConfig)

    # OSM ids already reported for this sentence, so each is logged once.
    setOSMID = set()
    for nMatchIndex, tupleLocMatch in enumerate(listLocMatches):
        tupleOSMID = tupleLocMatch[5]
        if tupleOSMID in setOSMID:
            continue
        setOSMID.add(tupleOSMID)

        nTokenStart = tupleLocMatch[1]
        nTokenEnd = tupleLocMatch[2]
        # Kept in its own name so the sentence geotag in strGeom is not
        # overwritten by a matched location's geometry.
        strMatchGeom = tupleLocMatch[4]
        dictOSMTags = tupleLocMatch[6]

        listNameMultilingual = geoparsepy.geo_parse_lib.calc_multilingual_osm_name_set(dictOSMTags, dictGeospatialConfig)
        strNameList = ';'.join(listNameMultilingual)
        strOSMURI = geoparsepy.geo_parse_lib.calc_OSM_uri(tupleOSMID, strMatchGeom)
        logger.info(
            'Disambiguated Location [index %s osmid %r @ %s : %s] = %s : %s',
            nMatchIndex, tupleOSMID, nTokenStart, nTokenEnd, strNameList, strOSMURI,
        )

## 4. Databases needed for preprocessing focus areas
To preprocess your own focus areas (e.g. a city with all its streets and buildings) you need a local deployment of the planet OpenStreetMap database.

Once a focus area is preprocessed a database table will be created for it. This can be used in the geoparse just like the 'global_cities10' focus area is in the previous example.

[Osm2pgsql](https://wiki.openstreetmap.org/wiki/Osm2pgsql#From_the_package_manager)
**Osm2pgsql** is a command-line based program that converts OpenStreetMap data to postGIS-enabled PostgreSQL databases.

[Planet.osm](https://wiki.openstreetmap.org/wiki/Planet.osm)
**Planet.osm** is the OpenStreetMap data in one file: all the nodes, ways and relations that make up our map. 

A new version is released every week. It's a big file (on 2018-04-21, the plain OSM XML variant takes over 913 GB when uncompressed from the 66.6 GB bzip2-compressed or 39.6 GB PBF-compressed downloaded data file).

## 5. Example code preprocess focus area
Preprocessing new focus area tables in the Postgres database. Fully documented example PY file can be found at geoparsepy.example_preprocess_focus_area.py

In [None]:
import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json
import geoparsepy.config_helper, geoparsepy.common_parse_lib, geoparsepy.PostgresqlHandler, geoparsepy.geo_parse_lib, geoparsepy.geo_preprocess_lib

# Configure logging so that each record is emitted as its bare message text
# at INFO level, then obtain this module's logger and confirm it is working.
LOG_FORMAT = '%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('logging started')

# Focus areas to preprocess, keyed by focus-area name. Each value is the
# specification handed to the geoparsepy preprocessing calls.
dictFocusAreaSpec = {
    'southampton': {
        'focus_area_id': 'southampton',
        # NOTE(review): looks like the admin region names from most to least
        # specific — confirm the ordering the library expects.
        'admin': [
            'southampton',
            'south east england',
            'united kingdom',
        ],
        # Existing table against which the admin names are looked up.
        'admin_lookup_table': 'global_cities10_admin',
    },
}

# No global specification is used in this example.
dictGlobalSpec = None

# Build the geoparse configuration dictionary used by the preprocessing
# calls: English only, no custom corpus directory, and explicit
# whitespace / sentence-separator / punctuation character sets.
dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config(
    lang_codes=['en'],
    logger=logger,
    corpus_dir=None,
    whitespace=u'"\u201a\u201b\u201c\u201d()',
    sent_token_seps=['\n', '\r\n', '\f', u'\u2026'],
    # The backslash is escaped explicitly: a bare '\/' is an invalid escape
    # sequence that newer Python versions warn about. The resulting string
    # is unchanged (backslash followed by slash).
    punctuation=',;\\/:+-#~&*=!?',
)

# Preprocess every focus area in dictFocusAreaSpec, using one dedicated
# database connection per geometry kind ('admin', 'point', 'poly', 'line').
dbHandlerPool = {}
try:
    # The connections are opened inside the try block so that the ones
    # already opened are still closed if a later connection attempt fails.
    for strHandlerKey in ('admin', 'point', 'poly', 'line'):
        dbHandlerPool[strHandlerKey] = geoparsepy.PostgresqlHandler.PostgresqlHandler('postgres', 'postgres', 'localhost', 5432, 'openstreetmap')

    for strFocusArea, jsonFocusArea in dictFocusAreaSpec.items():
        logger.info('starting focus area %s', strFocusArea)
        # Create the focus area's tables in the 'reveal' schema without
        # deleting existing contents, then run the preprocessing itself.
        geoparsepy.geo_preprocess_lib.create_preprocessing_tables(jsonFocusArea, dbHandlerPool['admin'], 'reveal', delete_contents=False, logger=logger)
        dictNewLocations = geoparsepy.geo_preprocess_lib.execute_preprocessing_focus_area(jsonFocusArea, dbHandlerPool, 'reveal', logger=logger)
        logger.info('finished focus area %s', strFocusArea)
        logger.info('location id range : %r', dictNewLocations)
finally:
    # Release every connection, including when preprocessing raised.
    for dbHandler in dbHandlerPool.values():
        dbHandler.close()