# PostGIS on Greenplum Database
## 8.1 System preparation
### 8.1.1 Set system variables, connection string, Greenplum Database auto stats, etc.

In [1]:
import os, re
from IPython.display import display_html

import pygments.lexers
from pygments import highlight
from pygments.formatters import HtmlFormatter

CONNECTION_STRING = os.getenv('AWSGPDBCONN')

cs = re.match('^postgresql:\/\/(\S+):(\S+)@(\S+):(\S+)\/(\S+)$', CONNECTION_STRING)

DB_USER   = cs.group(1)
DB_PWD    = cs.group(2)
DB_SERVER = cs.group(3)
DB_PORT   = cs.group(4)
DB_NAME   = cs.group(5)

%reload_ext sql
%sql $CONNECTION_STRING

'Connected: gpadmin@gpadmin'

In [2]:
%%sql $DB_USER@$DB_SERVER
SHOW gp_autostats_mode;
ALTER DATABASE gpadmin SET gp_autostats_mode TO 'NONE';
SHOW gp_autostats_mode;

1 rows affected.
Done.
1 rows affected.


gp_autostats_mode
ON_NO_STATS


### 8.1.2 Prepare AWS System and setup awscli library via pip

In [3]:
shfilecode = !pygmentize -f html -O full,style=friendly -l shell script/1-1-system-prepare.sh
display_html('\n'.join(shfilecode), raw=True)

In [4]:
!ssh-keygen -R $DB_SERVER
!ssh-keyscan $DB_SERVER >> ~/.ssh/known_hosts
!scp -i ~/.ssh/aws-gp.pem script/1-1-system-prepare.sh $DB_USER@$DB_SERVER:system-prepare.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./system-prepare.sh'
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'sudo ./system-prepare.sh'

Host ec2-35-176-68-36.eu-west-2.compute.amazonaws.com not found in /root/.ssh/known_hosts
# ec2-35-176-68-36.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
# ec2-35-176-68-36.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
# ec2-35-176-68-36.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
1-1-system-prepare.sh                         100%  712    80.6KB/s   00:00    
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1734k  100 1734k    0     0  13.3M      0 --:--:-- --:--:-- --:--:-- 13.3M
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support
Collecting pip
  Downl

Installing collected packages: docutils, PyYAML, pyasn1, rsa, jmespath, six, python-dateutil, urllib3, botocore, colorama, futures, s3transfer, awscli
  Found existing installation: docutils 0.15.2
    Uninstalling docutils-0.15.2:
      Successfully uninstalled docutils-0.15.2
  Found existing installation: PyYAML 3.10
***********************
* Get AWS CLI version *
***********************
ERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.
aws-cli/1.15.30 Python/2.7.5 Linux/3.10.0-957.21.3.el7.x86_64 botocore/1.10.30


### 8.1.3 Provide AWS Access Key ID & Secret Access Key

In [5]:
shfilecode = !pygmentize -f html -O full,style=friendly -l bash script/1-2-aws-configure.sh
display_html('\n'.join(shfilecode), raw=True)

In [6]:
import getpass

!scp -i ~/.ssh/aws-gp.pem script/1-2-aws-configure.sh $DB_USER@$DB_SERVER:aws-configure.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./aws-configure.sh'

cmd = 'sudo ./aws-configure.sh ' 
cmd = cmd + getpass.getpass("AWS Access Key ID [None]:") 
cmd = cmd + ' ' + getpass.getpass("AWS Secret Access Key [None]:")

!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

1-2-aws-configure.sh                          100%  484    63.8KB/s   00:00    
AWS Access Key ID [None]:········
AWS Secret Access Key [None]:········
AWS S3 Configuration setup correctly


### 8.1.4 Create Greenplum Database Schema and Tables for Demo

In [151]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-0-db-maintenance.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [152]:
query = !cat script/8-0-db-maintenance.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.


[]

In [154]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-1-create-schema-tables.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [155]:
query = !cat script/8-1-create-schema-tables.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.
Done.
Done.
Done.
13 rows affected.


[]

In [156]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-2-create-london-lsoa-table.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [157]:
query = !cat script/8-2-create-london-lsoa-table.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.


[]

### 8.1.5 Copy source files from AWS S3

In [158]:
shfilecode = !pygmentize -f html -O full,style=friendly -l bash script/8-3-copy-data-from-s3.sh
display_html('\n'.join(shfilecode), raw=True)

In [159]:
!scp -i ~/.ssh/aws-gp.pem script/8-3-copy-data-from-s3.sh $DB_USER@$DB_SERVER:copy-data-from-s3.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./copy-data-from-s3.sh'
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "sudo ./copy-data-from-s3.sh"

8-3-copy-data-from-s3.sh                      100%  853   111.1KB/s   00:00    
total 4
drwxr-xr-x   2 gpadmin root    6 Oct 23 14:33 ./
drwxr-xr-x. 21 root    root 4096 Oct 23 14:33 ../
download: s3://uber-movement-ldn/PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv to ./PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv
download: s3://uber-movement-ldn/uber-movement-ldn.tar.gz to ./uber-movement-ldn.tar.gz
download: s3://uber-movement-ldn/ukpostcodes.csv to ./ukpostcodes.csv
download: s3://uber-movement-ldn/2018-MRDB-minimal.shp to ./2018-MRDB-minimal.shp
download: s3://uber-movement-ldn/2018-MRDB-minimal.shx to ./2018-MRDB-minimal.shx
download: s3://uber-movement-ldn/2018-MRDB-minimal.dbf to ./2018-MRDB-minimal.dbf


### 8.1.6 Load data
- **Load `uber-movement-ldn.tar.gz` file into `demo.uber_mov_ldn` table using GPLoad**

In [160]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-uber-movement-ldn.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [161]:
!scp -i ~/.ssh/aws-gp.pem script/8-4-load-uber-movement-ldn.yaml $DB_USER@$DB_SERVER:load-uber-movement-ldn.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-uber-movement-ldn.log ]; then rm ./load-uber-movement-ldn.log; fi'

cmd = "gpload -d {0} -f ./load-uber-movement-ldn.yaml -l ./load-uber-movement-ldn.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-uber-movement-ldn.yaml               100%  354    48.6KB/s   00:00    
2019-10-23 14:34:14|INFO|gpload session started 2019-10-23 14:34:14
2019-10-23 14:34:14|INFO|no host supplied, defaulting to localhost
2019-10-23 14:34:14|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/uber-movement-ldn.tar.gz" -t 30 -m 1000000
2019-10-23 14:34:19|INFO|reusing external table ext_gpload_reusable_ba92e9a0_f56f_11e9_8ee2_0646937a68fc
2019-10-23 14:35:18|WARN|24 bad rows
2019-10-23 14:35:18|WARN|Please use following query to access the detailed error
2019-10-23 14:35:18|WARN|select * from gp_read_error_log('ext_gpload_reusable_ba92e9a0_f56f_11e9_8ee2_0646937a68fc') where cmdtime > to_timestamp('1571837654.34')
2019-10-23 14:35:18|INFO|running time: 64.35 seconds
2019-10-23 14:35:18|INFO|rows Inserted          = 68939823
2019-10-23 14:35:18|INFO|rows Updated           = 0
2019-10-23 14:35:18|INFO|data formatting errors = 24


- **Load `PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv` file into `demo.postcode_lookup` table using GPLoad**

In [162]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-postcodes.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [163]:
!scp -i ~/.ssh/aws-gp.pem script/8-4-load-postcodes.yaml $DB_USER@$DB_SERVER:load-postcodes.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-postcodes.log ]; then rm ./load-postcodes.log; fi'

cmd = "gpload -d {0} -f ./load-postcodes.yaml -l ./load-postcodes.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-postcodes.yaml                       100%  369    52.7KB/s   00:00    
2019-10-23 14:35:28|INFO|gpload session started 2019-10-23 14:35:28
2019-10-23 14:35:28|INFO|no host supplied, defaulting to localhost
2019-10-23 14:35:28|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv" -t 30 -m 1000000
2019-10-23 14:35:28|INFO|reusing external table ext_gpload_reusable_fd555fa2_f56f_11e9_b84a_0646937a68fc
2019-10-23 14:35:29|INFO|running time: 1.00 seconds
2019-10-23 14:35:29|INFO|rows Inserted          = 2632804
2019-10-23 14:35:29|INFO|rows Updated           = 0
2019-10-23 14:35:29|INFO|data formatting errors = 0
2019-10-23 14:35:29|INFO|gpload succeeded


- **Load `ukpostcodes.csv` file into `demo.postcodelatlng` table using GPLoad**

In [164]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-postcodes-coord.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [165]:
!scp -i ~/.ssh/aws-gp.pem script/8-4-load-postcodes-coord.yaml $DB_USER@$DB_SERVER:load-postcodes-coord.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-postcodes-coord.log ]; then rm ./load-postcodes-coord.log; fi'

cmd = "gpload -d {0} -f ./load-postcodes-coord.yaml -l ./load-postcodes-coord.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-postcodes-coord.yaml                 100%  347    43.9KB/s   00:00    
2019-10-23 14:35:36|INFO|gpload session started 2019-10-23 14:35:36
2019-10-23 14:35:36|INFO|no host supplied, defaulting to localhost
2019-10-23 14:35:36|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/ukpostcodes.csv" -t 30 -m 1000000
2019-10-23 14:35:36|INFO|reusing external table ext_gpload_reusable_780315f4_f571_11e9_8192_0646937a68fc
2019-10-23 14:35:36|INFO|running time: 0.45 seconds
2019-10-23 14:35:36|INFO|rows Inserted          = 1762397
2019-10-23 14:35:36|INFO|rows Updated           = 0
2019-10-23 14:35:36|INFO|data formatting errors = 0
2019-10-23 14:35:36|INFO|gpload succeeded


In [166]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-4-load-postcodes-alter.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [167]:
#query = "ALTER TABLE demo.postcode_lookup \
#	ALTER COLUMN dointr TYPE INTEGER USING COALESCE(NULLIF(dointr, ''), '0')::INT; \
# \
#ALTER TABLE demo.postcode_lookup \
#	ALTER COLUMN doterm TYPE INTEGER USING COALESCE(NULLIF(doterm, ''), '0')::INT; \
# \
#ALTER TABLE demo.postcode_lookup \
#	ALTER COLUMN usertype TYPE SMALLINT USING COALESCE(NULLIF(usertype, ''), '0')::SMALLINT;"
query = !cat script/8-4-load-postcodes-alter.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.
Done.


[]

- **Load Lower Layer Super Output Area information from `london-lsoa.json` file into `demo.london_LSOA` table**

In [168]:
import boto3
import json

s3 = boto3.resource('s3')

content_object = s3.Object('uber-movement-ldn', 'london-lsoa.json')
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)

data = json.dumps(json_content)

query = "DELETE FROM demo.london_LSOA_TXT; \
INSERT INTO demo.london_LSOA_TXT (input) VALUES('{0}')".format(str(data))
%sql $DB_USER@$DB_SERVER {''.join(query)}

query2 = "INSERT INTO demo.london_LSOA \
SELECT \
    dat_features->>'type' AS dat_type, \
    ST_GeomFromGeoJSON(dat_features->>'geometry') AS geometry, \
    ((dat_features->>'properties')::json)->>'msoa_code' AS msoa_code, \
    ((dat_features->>'properties')::json)->>'msoa_name' AS msoa_name, \
    ((dat_features->>'properties')::json)->>'la_code' AS la_code, \
    ((dat_features->>'properties')::json)->>'la_name' AS la_name, \
    (((dat_features->>'properties')::json)->>'geoeast')::INT AS geoeast, \
    (((dat_features->>'properties')::json)->>'geonorth')::INT AS geonorth, \
    (((dat_features->>'properties')::json)->>'popeast')::INT AS popeast, \
    (((dat_features->>'properties')::json)->>'popnorth')::INT AS popnorth, \
    (((dat_features->>'properties')::json)->>'area_km2')::NUMERIC AS area_km2, \
    (((dat_features->>'properties')::json)->>'MOVEMENT_ID')::INT AS movement_id, \
    ((dat_features->>'properties')::json)->>'DISPLAY_NAME' AS display_name \
FROM ( \
    SELECT JSON_ARRAY_ELEMENTS(dat->'features')::json AS dat_features \
    FROM ( \
        SELECT input::JSON AS dat \
        FROM demo.london_LSOA_TXT \
    ) A \
) foo"
%sql $DB_USER@$DB_SERVER {query2}

0 rows affected.
1 rows affected.
983 rows affected.


[]

- **Load shape/geography data from `2018-MRDB-minimal.shp` file into `demo.major_roads_network` table using SHP2PGSQL and psql utilities**

In [169]:
shp2pgsqlcode = !pygmentize -f html -O full,style=friendly -l shell script/8-5-major-road-network-shp2pgsql.sh
display_html('\n'.join(shp2pgsqlcode), raw=True)

In [170]:
shp2pgsqlcmd = !cat script/8-5-major-road-network-shp2pgsql.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "{''.join(shp2pgsqlcmd)}"

Shapefile type: Arc
Postgis type: MULTILINESTRING[2]


In [171]:
psqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-5-major-road-network-psql.sh
display_html('\n'.join(psqlfilecode), raw=True)

In [172]:
psql_cmd = !cat script/8-5-major-road-network-psql.sh
psql_cmd = ''.join(psql_cmd).format(DB_USER)
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "{psql_cmd}"

SET
SET
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:3: ERROR:  column not found in geometry_columns table
CONTEXT:  SQL statement "SELECT DropGeometryColumn('', $1 , $2 , $3 )"
PL/pgSQL function "dropgeometrycolumn" line 4 at SQL statement
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:4: ERROR:  table "major_roads_network" does not exist
BEGIN
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:8: NOTICE:  CREATE TABLE will create implicit sequence "major_roads_network_gid_seq" for serial column "major_roads_network.gid"
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:8: NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'gid' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:9: NOTICE:  ALTER TABLE / ADD PRIMARY KEY will create implicit index "major_road

In [173]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-5-major-road-network-check.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [174]:
query = !cat script/8-5-major-road-network-check.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

1 rows affected.


count
18193


## 8.2 PostGIS

In [None]:
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER '

## Scratchpad