###OGC GetCapabilities

Do they even contain text? Not so much, no.

In [2]:
%matplotlib inline
import pandas as pd
import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from IPython.display import display
from IPython.display import Image

In [3]:
# grab the clean text from the rds
with open('../local/big_rds.conf', 'r') as f:
    conf = js.loads(f.read())

# our connection
engine = sqla.create_engine(conf.get('connection'))

In [3]:
# average completeness (tokens / expected)
sql = """
select round(avg(json_array_length(tokens) / expected_total::numeric * 100.), 2) as mean_pct,
    round(avg(json_array_length(tokens)), 2) as mean_elements
from ogc_tokens;
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,mean_pct,mean_elements
0,32.02,6.33


So a mean of 30% - of the 19 (wms service) or 22 (ows serviceidentification/serviceprovider) elements we're counting, we get an average of ~6 elements included.

We know that the presence of an element doesn't mean the presence of informative text. The Name element is notorious for this, often defaulting to "WMS" only. Token count is also not great for a lot of these elements. A lot of the Provider information is limited to one token conceptually. 

So what happens if we look at title, abstract and keywords (our high-information elements)?

In [6]:
# how many, ignoring NAME, title or abstract elements are simply 'W*S' 
# where * is a wildcard

sql = """
with t as (
	select id, response_id, 
		trim(both '"' from (j.value->'tag')::text) as tag, 
		trim(both '"' from (j.value->'values')::text) as tokens
	from ogc_tokens, json_array_elements(tokens) j
	where trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/Title' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/KeywordList/%%'
)
select count(*) as num_elements, 
	round(count(*) / 24248. * 100., 2) as pct_elements,
	count(distinct t.id) as num_responses,
	round(count(distinct t.id) / 3828. * 100., 2) as pct_responses
from t
where (t.tag ilike '%%/Abstract' or t.tag ilike '%%/Title') 
	and (t.tokens ilike 'W%%S' or t.tokens ='CSW' or t.tokens = 'SOS' or t.tokens ilike 'OGC:W%%S');
"""

# we have, from the set, a total of 24,248 extracted elements for our three patterns
# across 3828 responses (getcapabilities only)
df = pd.read_sql(sql, engine)
df

Unnamed: 0,num_elements,pct_elements,num_responses,pct_responses
0,724,2.99,481,12.57


Alright, that's just shy of 20% of the responses have the main identification elements defaulting to the service acronym.

We'll exclude those from the token counts.

In [8]:
# average number of tokens per bucket

sql = """
with t as (
	select id, response_id, 
		trim(both '"' from (j.value->'tag')::text) as original_tag, 
		case
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract' then 'abstract'
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Title' then 'title'
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Keyword%%' then 'keyword'
		end as tag,
		replace(trim(both '"' from (j.value->'values')::text), ' | ', ' ') as token_string,
		regexp_split_to_array(replace(trim(both '"' from (j.value->'values')::text), ' | ', ' '), E'\\\s+') as tokens
	from ogc_tokens, json_array_elements(tokens) j
	where trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%Title' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/Keyword%%'
)
select t.tag, 
	round(avg(array_length(t.tokens, 1)), 2) as average_tokens,
	min(array_length(t.tokens, 1)) as min_tokens,
	max(array_length(t.tokens, 1)) as max_tokens
from t
where not((t.tag ilike 'abstract' or t.tag ilike 'title') 
	and (t.token_string ilike 'W%%S' or t.token_string ='CSW' or t.token_string = 'SOS' or t.token_string ilike 'OGC:W%%S'))
group by t.tag
order by average_tokens DESC;
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,tag,average_tokens,min_tokens,max_tokens
0,abstract,25.27,1,3161
1,keyword,6.9,1,111
2,title,2.13,1,21


In [9]:
# average tokens per element

sql = """
with t as (
	select id, response_id, 
		trim(both '"' from (j.value->'tag')::text) as original_tag, 
		(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'))[array_upper(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'), 1)] as tag,
		regexp_split_to_array(replace(trim(both '"' from (j.value->'values')::text), ' | ', ' '), E'\\\s+') as tokens
	from ogc_tokens, json_array_elements(tokens) j
)
select t.tag, 
	round(avg(array_length(t.tokens, 1)), 2) as average_tokens,
	min(array_length(t.tokens, 1)) as min_tokens,
	max(array_length(t.tokens, 1)) as max_tokens
from t
group by t.tag
order by average_tokens DESC;
"""

df = pd.read_sql(sql, engine)
df

Unnamed: 0,tag,average_tokens,min_tokens,max_tokens
0,Abstract,19.03,1,3161
1,Name,15.32,1,603
2,AccessConstraints,8.01,1,288
3,Keyword,6.9,1,111
4,DeliveryPoint,5.69,1,11
5,Address,5.66,1,11
6,HoursOfService,4.35,1,24
7,ProviderName,4.22,1,17
8,ContactInstructions,3.64,1,16
9,ContactOrganization,3.59,1,19


Except these kinds of buckets don't recognize the distributions of the subsets - how much of the skew is related to a third of the responses coming from one host, for example?


Number of Hosts serving responses that have varying token counts for a Tag

| Tag                       | Hosts with Varying Token Counts | Percent of OGC Hosts | 
|---------------------------|---------------------------------|----------------------| 
| Abstract                  | 50                              | 22.6                 | 
| AccessConstraints         | 9                               | 4.1                  | 
| Address                   | 6                               | 2.7                  | 
| AddressType               | 6                               | 2.7                  | 
| ContactFacsimileTelephone | 2                               | 0.9                  | 
| ContactInstructions       | 1                               | 0.5                  | 
| ContactOrganization       | 9                               | 4.1                  | 
| ContactPerson             | 10                              | 4.5                  | 
| ContactPosition           | 11                              | 5.0                  | 
| ContactVoiceTelephone     | 3                               | 1.4                  | 
| Country                   | 5                               | 2.3                  | 
| DeliveryPoint             | 2                               | 0.9                  | 
| Fees                      | 2                               | 0.9                  | 
| HoursOfService            | 3                               | 1.4                  | 
| IndividualName            | 3                               | 1.4                  | 
| Keyword                   | 26                              | 11.8                 | 
| Name                      | 29                              | 13.1                 | 
| PositionName              | 6                               | 2.7                  | 
| PostalCode                | 1                               | 0.5                  | 
| PostCode                  | 1                               | 0.5                  | 
| ProviderName              | 2                               | 0.9                  | 
| ServiceType               | 12                              | 5.4                  | 
| ServiceTypeVersion        | 5                               | 2.3                  | 
| StateOrProvince           | 3                               | 1.4                  | 
| Title                     | 42                              | 19.0                 | 
| Voice                     | 3                               | 1.4                  | 


(The OGC includes responses from 221 hosts total.)






In [7]:
# thoughts on the variation - where is it *not* found?
# excluding the contact infor because that is expected
# to be the same publisher info

sql = """
with t as (
	select id, response_id, 
		trim(both '"' from (j.value->'tag')::text) as original_tag, 
		(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'))[array_upper(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'), 1)] as tag,
		trim(both '"' from (j.value->'values')::text) as tokens
	from ogc_tokens, json_array_elements(tokens) j
)
select distinct t.tag, t.tokens, count(t.tokens) as num_per_tag
from t
where t.original_tag not ilike '%%/Contact%%' and t.tag != '@href'
group by t.tag, t.tokens
having count(t.tokens) >= 10
order by t.tag, num_per_tag DESC
;
"""
df = pd.read_sql(sql, engine)
with pd.option_context('display.max_rows', 200):
    display(df)

Unnamed: 0,tag,tokens,num_per_tag
0,Abstract,WMS,396
1,Abstract,Scientific Data,319
2,Abstract,A compliant implementation of WMS plus most of...,19
3,Abstract,This is the reference implementation of WFS 1....,18
4,Abstract,Map service to support the Coastal Flood Expos...,13
5,AccessConstraints,none,788
6,AccessConstraints,,110
7,AccessConstraints,NONE,95
8,AccessConstraints,Although these data have been processed succes...,17
9,AccessConstraints,None. The Kansas Biological Survey (KBS) and i...,13


In [8]:
# elements/expected elements binned for validity
sql = """
with k as 
(
	with t as (
		select id, response_id, expected_total,
			trim(both '"' from (j.value->'tag')::text) as tag, 
			trim(both '"' from (j.value->'values')::text) as tokens
		from ogc_tokens, json_array_elements(tokens) j
	)
	select response_id, count(response_id) as num_tags, 
		round(count(response_id) / max(expected_total)::numeric * 100., 2) as pct_expected
	from t
	group by response_id
)
select r.host, v.valid,
	round(avg(pct_expected), 2) as avg_completed,
	min(pct_expected) as min_expected,
	max(pct_expected) as max_expected
from k
	join responses r on k.response_id = r.id
	left outer join validations v on v.response_id = r.id
group by r.host, v.valid
order by r.host, avg_completed DESC;
"""
df = pd.read_sql(sql, engine)
with pd.option_context('display.max_rows', 200):
    display(df)

Unnamed: 0,host,valid,avg_completed,min_expected,max_expected
0,107.20.228.18,True,21.05,21.05,21.05
1,acdisc.sci.gsfc.nasa.gov,True,5.26,5.26,5.26
2,apps.fs.fed.us,False,21.05,21.05,21.05
3,apps.fs.fed.us,True,20.38,13.64,21.05
4,apps.fs.usda.gov,True,21.05,21.05,21.05
5,apps.geomatics.gov.nt.ca,True,13.64,13.64,13.64
6,apps.nd.gov,False,59.09,59.09,59.09
7,arcgis.sd.gov,True,13.64,13.64,13.64
8,atlas.geog.pdx.edu,True,14.21,13.64,18.18
9,atlas.resources.ca.gov,False,21.05,21.05,21.05




<img src="screenshots/ogc_elements_invalid.png" width="300" style="float:left;margin-right:50px;"/>
<img src="screenshots/ogc_elements_valid.png" width="350" style="float:left;"/>

These are a little goofy. On the left, average, minimum and maximum elements included/expected elements from **invalid** responses from a server. On the right, average, minimum and maximum elements included/expected elements from **valid** responses from a server. 

The areas that aren't sort of orange are where there's variation in the number of elements emmitted as part of the GetCapabilities. So orange-ish is no variation in the presence of elements in the responses for a server; yellow/green blocks are servers that do vary in what is included in the GetCapabilities. There's not a good pattern, like low completion percent is automatically invalid. The validity is based on the entire document, not just the service information. 