# PostGIS on Greenplum Database
## 8.1 System preparation
### 8.1.1 Set system variables, connection string, Greenplum Database auto stats, etc.

In [1]:
import os, re
from IPython.display import display_html

import pygments.lexers
from pygments import highlight
from pygments.formatters import HtmlFormatter

CONNECTION_STRING = os.getenv('AWSGPDBCONN')

cs = re.match('^postgresql:\/\/(\S+):(\S+)@(\S+):(\S+)\/(\S+)$', CONNECTION_STRING)

DB_USER   = cs.group(1)
DB_PWD    = cs.group(2)
DB_SERVER = cs.group(3)
DB_PORT   = cs.group(4)
DB_NAME   = cs.group(5)

%reload_ext sql
%sql $CONNECTION_STRING

'Connected: gpadmin@gpadmin'

In [2]:
%%sql $DB_USER@$DB_SERVER
SHOW gp_autostats_mode;
ALTER DATABASE gpadmin SET gp_autostats_mode TO 'NONE';
SHOW gp_autostats_mode;

1 rows affected.
Done.
1 rows affected.


gp_autostats_mode
ON_NO_STATS


### 8.1.2 Prepare AWS System and setup awscli library via pip

In [3]:
shfilecode = !pygmentize -f html -O full,style=friendly -l shell script/1-1-system-prepare.sh
display_html('\n'.join(shfilecode), raw=True)

In [4]:
!ssh-keygen -R $DB_SERVER
!ssh-keyscan $DB_SERVER >> ~/.ssh/known_hosts
!scp -i ~/.ssh/aws-gp.pem script/1-1-system-prepare.sh $DB_USER@$DB_SERVER:system-prepare.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./system-prepare.sh'
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'sudo ./system-prepare.sh'

Host ec2-35-178-31-55.eu-west-2.compute.amazonaws.com not found in /root/.ssh/known_hosts
# ec2-35-178-31-55.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
# ec2-35-178-31-55.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
# ec2-35-178-31-55.eu-west-2.compute.amazonaws.com:22 SSH-2.0-OpenSSH_7.4
1-1-system-prepare.sh                         100%  712    94.3KB/s   00:00    
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1734k  100 1734k    0     0  12.6M      0 --:--:-- --:--:-- --:--:-- 12.5M
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support
Collecting pip
  Downl

Installing collected packages: docutils, PyYAML, pyasn1, rsa, colorama, futures, jmespath, six, python-dateutil, urllib3, botocore, s3transfer, awscli
  Found existing installation: docutils 0.15.2
    Uninstalling docutils-0.15.2:
      Successfully uninstalled docutils-0.15.2
  Found existing installation: PyYAML 3.10
ERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.
***********************
* Get AWS CLI version *
***********************
aws-cli/1.15.30 Python/2.7.5 Linux/3.10.0-957.21.3.el7.x86_64 botocore/1.10.30


### 8.1.3 Provide AWS Access Key ID & Secret Access Key

In [5]:
shfilecode = !pygmentize -f html -O full,style=friendly -l bash script/1-2-aws-configure.sh
display_html('\n'.join(shfilecode), raw=True)

In [6]:
import getpass

!scp -i ~/.ssh/aws-gp.pem script/1-2-aws-configure.sh $DB_USER@$DB_SERVER:aws-configure.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./aws-configure.sh'

cmd = 'sudo ./aws-configure.sh ' 
cmd = cmd + getpass.getpass("AWS Access Key ID [None]:") 
cmd = cmd + ' ' + getpass.getpass("AWS Secret Access Key [None]:")

!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

1-2-aws-configure.sh                          100%  484    74.5KB/s   00:00    
AWS Access Key ID [None]:········
AWS Secret Access Key [None]:········
AWS S3 Configuration setup correctly


### 8.1.4 Create Greenplum Database Schema and Tables for Demo

In [132]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-0-db-maintenance.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [133]:
query = !cat script/8-0-db-maintenance.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.


[]

In [134]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-1-create-schema-tables.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [135]:
query = !cat script/8-1-create-schema-tables.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

Done.
Done.


[]

### 8.1.5 Copy source files from AWS S3

In [112]:
shfilecode = !pygmentize -f html -O full,style=friendly -l bash script/8-3-copy-data-from-s3.sh
display_html('\n'.join(shfilecode), raw=True)

In [113]:
!scp -i ~/.ssh/aws-gp.pem script/8-3-copy-data-from-s3.sh $DB_USER@$DB_SERVER:copy-data-from-s3.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'chmod +x ./copy-data-from-s3.sh'
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "sudo ./copy-data-from-s3.sh"

8-3-copy-data-from-s3.sh                      100%  892   148.7KB/s   00:00    
total 4
drwxr-xr-x   2 gpadmin root    6 Oct 24 12:03 ./
drwxr-xr-x. 21 root    root 4096 Oct 24 12:03 ../
download: s3://uber-movement-ldn/PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv to ./PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv
download: s3://uber-movement-ldn/uber-movement-ldn.tar.gz to ./uber-movement-ldn.tar.gz
download: s3://uber-movement-ldn/ukpostcodes.csv to ./ukpostcodes.csv
download: s3://uber-movement-ldn/2018-MRDB-minimal.shp to ./2018-MRDB-minimal.shp
download: s3://uber-movement-ldn/2018-MRDB-minimal.shx to ./2018-MRDB-minimal.shx
download: s3://uber-movement-ldn/2018-MRDB-minimal.dbf to ./2018-MRDB-minimal.dbf
download: s3://uber-movement-ldn/dft_traffic_counts_raw_counts.tar.gz to ./dft_traffic_counts_raw_counts.tar.gz


### 8.1.6 Load data
- **Load Uber Movement (London) data from `uber-movement-ldn.tar.gz` file into `demo.uber_mov_ldn` table using GPLoad**

In [165]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-uber-movement-ldn.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [166]:
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-uber-movement-ldn.yaml ]; then rm ./load-uber-movement-ldn.yaml; fi'
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-uber-movement-ldn.log ]; then rm ./load-uber-movement-ldn.log; fi'

!scp -i ~/.ssh/aws-gp.pem script/8-4-load-uber-movement-ldn.yaml $DB_USER@$DB_SERVER:load-uber-movement-ldn.yaml
cmd = "gpload -d {0} -f ./load-uber-movement-ldn.yaml -l ./load-uber-movement-ldn.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-uber-movement-ldn.yaml               100%  357    51.7KB/s   00:00    
2019-10-24 13:02:44|INFO|gpload session started 2019-10-24 13:02:44
2019-10-24 13:02:44|INFO|no host supplied, defaulting to localhost
2019-10-24 13:02:44|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/uber-movement-ldn.tar.gz" -t 30 -m 1000000
2019-10-24 13:02:49|INFO|reusing external table ext_gpload_reusable_dff0c132_f63b_11e9_9caf_06d6193fe600
2019-10-24 13:03:36|WARN|24 bad rows
2019-10-24 13:03:36|WARN|Please use following query to access the detailed error
2019-10-24 13:03:36|WARN|select * from gp_read_error_log('ext_gpload_reusable_dff0c132_f63b_11e9_9caf_06d6193fe600') where cmdtime > to_timestamp('1571918564.33')
2019-10-24 13:03:36|INFO|running time: 52.18 seconds
2019-10-24 13:03:36|INFO|rows Inserted          = 68939823
2019-10-24 13:03:36|INFO|rows Updated           = 0
2019-10-24 13:03:36|INFO|data formatting errors = 24


- **Load UK Postcodes/Lower-/Middle Layer Super Output Area Information from `PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv` file into `demo.postcode_lookup` table using GPLoad**

In [169]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-postcodes.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [170]:
!scp -i ~/.ssh/aws-gp.pem script/8-4-load-postcodes.yaml $DB_USER@$DB_SERVER:load-postcodes.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-postcodes.log ]; then rm ./load-postcodes.log; fi'

cmd = "gpload -d {0} -f ./load-postcodes.yaml -l ./load-postcodes.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-postcodes.yaml                       100%  742   134.7KB/s   00:00    
2019-10-24 13:04:29|INFO|gpload session started 2019-10-24 13:04:29
2019-10-24 13:04:29|INFO|no host supplied, defaulting to localhost
2019-10-24 13:04:29|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv" -t 30 -m 1000000
2019-10-24 13:04:29|INFO|reusing external table ext_gpload_reusable_071eb26e_f63c_11e9_9fc6_06d6193fe600
2019-10-24 13:04:35|INFO|running time: 6.51 seconds
2019-10-24 13:04:35|INFO|rows Inserted          = 2632804
2019-10-24 13:04:35|INFO|rows Updated           = 0
2019-10-24 13:04:35|INFO|data formatting errors = 0
2019-10-24 13:04:35|INFO|gpload succeeded


- **Load UK Postcodes Geogaphical Information `ukpostcodes.csv` file into `demo.postcodelatlng` table using GPLoad**

In [171]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-4-load-postcodes-coord.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [172]:
!scp -i ~/.ssh/aws-gp.pem script/8-4-load-postcodes-coord.yaml $DB_USER@$DB_SERVER:load-postcodes-coord.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-postcodes-coord.log ]; then rm ./load-postcodes-coord.log; fi'

cmd = "gpload -d {0} -f ./load-postcodes-coord.yaml -l ./load-postcodes-coord.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-4-load-postcodes-coord.yaml                 100%  351    42.7KB/s   00:00    
2019-10-24 13:04:57|INFO|gpload session started 2019-10-24 13:04:57
2019-10-24 13:04:57|INFO|no host supplied, defaulting to localhost
2019-10-24 13:04:57|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/ukpostcodes.csv" -t 30 -m 1000000
2019-10-24 13:04:57|INFO|reusing external table ext_gpload_reusable_0a974140_f63c_11e9_846c_06d6193fe600
2019-10-24 13:04:57|INFO|running time: 0.57 seconds
2019-10-24 13:04:57|INFO|rows Inserted          = 1762397
2019-10-24 13:04:57|INFO|rows Updated           = 0
2019-10-24 13:04:57|INFO|data formatting errors = 0
2019-10-24 13:04:57|INFO|gpload succeeded


- **Load Lower Layer Super Output Area information from `london-lsoa.json` file into `demo.london_LSOA` table**

In [173]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-4-load-london-lsoa-txt.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [174]:
import boto3
import json

s3 = boto3.resource('s3')

content_object = s3.Object('uber-movement-ldn', 'london-lsoa.json')
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)

data = json.dumps(json_content)

lnd_lsoa_txt_cmd = !cat script/8-4-load-london-lsoa-txt.sql
lnd_lsoa_txt_cmd = ''.join(lnd_lsoa_txt_cmd).format(str(data))

%sql $DB_USER@$DB_SERVER {lnd_lsoa_txt_cmd}

0 rows affected.
1 rows affected.


[]

In [175]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-4-load-london-lsoa.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [176]:
lnd_lsoa_cmd = !cat script/8-4-load-london-lsoa.sql

%sql $DB_USER@$DB_SERVER {''.join(lnd_lsoa_cmd)}

983 rows affected.
1 rows affected.


count
983


- **Load Shape file of the major road network Information from `2018-MRDB-minimal.shp` file into `demo.major_roads_network` table using SHP2PGSQL and psql utilities**

In [177]:
shp2pgsqlcode = !pygmentize -f html -O full,style=friendly -l shell script/8-5-major-road-network-shp2pgsql.sh
display_html('\n'.join(shp2pgsqlcode), raw=True)

In [178]:
shp2pgsqlcmd = !cat script/8-5-major-road-network-shp2pgsql.sh
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "{''.join(shp2pgsqlcmd)}"

Shapefile type: Arc
Postgis type: MULTILINESTRING[2]


In [179]:
psqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-5-major-road-network-psql.sh
display_html('\n'.join(psqlfilecode), raw=True)

In [180]:
psql_cmd = !cat script/8-5-major-road-network-psql.sh
psql_cmd = ''.join(psql_cmd).format(DB_USER)
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER "{psql_cmd}"

SET
SET
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:3: ERROR:  column not found in geometry_columns table
CONTEXT:  SQL statement "SELECT DropGeometryColumn('', $1 , $2 , $3 )"
PL/pgSQL function "dropgeometrycolumn" line 4 at SQL statement
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:4: ERROR:  table "major_roads_network" does not exist
BEGIN
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:8: NOTICE:  CREATE TABLE will create implicit sequence "major_roads_network_gid_seq" for serial column "major_roads_network.gid"
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:8: NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'gid' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
psql:/var/tmp_s3_data/2018-MRDB-minimal.sql:9: NOTICE:  ALTER TABLE / ADD PRIMARY KEY will create implicit index "major_road

In [181]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l postgres script/8-5-major-road-network-check.sql
display_html('\n'.join(sqlfilecode), raw=True)

In [182]:
query = !cat script/8-5-major-road-network-check.sql
%sql $DB_USER@$DB_SERVER {''.join(query)}

1 rows affected.


count
18193


- **Load Vehicle counts recorded on major and minor roads (2000 - 2018) from `dft_traffic_counts_raw_counts.zip` file into `demo.dft_traffic_counts_raw` table using GPLoad**

In [183]:
sqlfilecode = !pygmentize -f html -O full,style=friendly -l yaml script/8-6-load-raw-traffic-counts.yaml
display_html('\n'.join(sqlfilecode), raw=True)

In [184]:
!scp -i ~/.ssh/aws-gp.pem script/8-6-load-raw-traffic-counts.yaml $DB_USER@$DB_SERVER:load-raw-traffic-counts.yaml
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER 'if [ -f ./load-raw-traffic-counts.log ]; then rm ./load-raw-traffic-counts.log; fi'

cmd = "gpload -d {0} -f ./load-raw-traffic-counts.yaml -l ./load-raw-traffic-counts.log 2>&1".format(DB_USER) 
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER $cmd

8-6-load-raw-traffic-counts.yaml              100%  401    89.4KB/s   00:00    
2019-10-24 13:05:59|INFO|gpload session started 2019-10-24 13:05:59
2019-10-24 13:05:59|INFO|no host supplied, defaulting to localhost
2019-10-24 13:05:59|INFO|started gpfdist -p 8000 -P 9000 -f "/var/tmp_s3_data/dft_traffic_counts_raw_counts.tar.gz" -t 30 -m 1000000
2019-10-24 13:05:59|INFO|reusing external table ext_gpload_reusable_d7ec740c_f647_11e9_9df4_06d6193fe600
2019-10-24 13:06:04|WARN|1 bad row
2019-10-24 13:06:04|WARN|Please use following query to access the detailed error
2019-10-24 13:06:04|WARN|select * from gp_read_error_log('ext_gpload_reusable_d7ec740c_f647_11e9_9df4_06d6193fe600') where cmdtime > to_timestamp('1571918759.63')
2019-10-24 13:06:04|INFO|running time: 4.57 seconds
2019-10-24 13:06:04|INFO|rows Inserted          = 3919788
2019-10-24 13:06:04|INFO|rows Updated           = 0
2019-10-24 13:06:04|INFO|data formatting errors = 1


## 8.2 PostGIS

In [33]:
!ssh -i ~/.ssh/aws-gp.pem $DB_USER@$DB_SERVER '

/bin/sh: 1: Syntax error: Unterminated quoted string


## Scratchpad