### OGC GetCapabilities

Do they even contain text? Not so much, no.

In [1]:
%matplotlib inline
import pandas as pd
import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from IPython.display import display
from IPython.display import Image

In [2]:
# Load the connection settings: the conf file is JSON with a 'connection' entry.
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# Build the SQLAlchemy engine that the queries below run against.
connection_string = conf.get('connection')
engine = sqla.create_engine(connection_string)

In [3]:
# Average completeness (tokens / expected) over every row of ogc_tokens.
#   mean_pct      - per row, the number of entries in the `tokens` JSON array
#                   divided by `expected_total`, as a percentage; then averaged
#                   and rounded to 2 decimals
#   mean_elements - the average number of entries in `tokens`, rounded to
#                   2 decimals
# The ::numeric cast on expected_total makes the division numeric rather than
# integer division.
sql = """
select round(avg(json_array_length(tokens) / expected_total::numeric * 100.), 2) as mean_pct,
    round(avg(json_array_length(tokens)), 2) as mean_elements
from ogc_tokens;
"""
df = pd.read_sql(sql, engine)
# bare last expression so the notebook renders the one-row result
df

Unnamed: 0,mean_pct,mean_elements
0,32.02,6.33


So a mean of ~32%: of the 19 (WMS Service) or 22 (OWS ServiceIdentification/ServiceProvider) elements we're counting, an average of ~6 elements are actually included.

We know that the presence of an element doesn't mean the presence of informative text. The Name element is notorious for this, often defaulting to "WMS" only. Token count is also not great for a lot of these elements. A lot of the Provider information is limited to one token conceptually. 

So what happens if we look at title, abstract and keywords (our high-information elements)?

In [6]:
# How many Title or Abstract elements are simply the service acronym ('W*S',
# where * is a wildcard)? Name elements are ignored here.
#
# CTE `t`: one row per entry of the `tokens` JSON array whose tag ends in
# /Abstract or /Title, or contains /KeywordList/. `tag` and `tokens` are the
# JSON text of the entry's 'tag' and 'values' with the surrounding double
# quotes trimmed.
# Outer query: keeps only the Title/Abstract rows whose whole value matches one
# of the acronym patterns, then reports the row count, the number of distinct
# ogc_tokens ids, and both as percentages of the hard-coded totals.
#
# NOTE(review): '%%' is presumably doubled so the DB driver's %-style parameter
#   formatting leaves a single SQL '%' wildcard - confirm against the driver.
# NOTE(review): assuming the server sees 'W%S', that pattern matches ANY value
#   starting with W and ending with S (ILIKE '%' matches any run of characters),
#   not only three-letter acronyms - confirm this is intended; '_' would match
#   exactly one character.
# NOTE(review): the KeywordList rows selected by the CTE are never counted
#   below, because the outer filter only keeps Abstract/Title tags.

sql = """
with t as (
	select id, response_id, 
		trim(both '"' from (j.value->'tag')::text) as tag, 
		trim(both '"' from (j.value->'values')::text) as tokens
	from ogc_tokens, json_array_elements(tokens) j
	where trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/Title' 
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/KeywordList/%%'
)
select count(*) as num_elements, 
	round(count(*) / 24248. * 100., 2) as pct_elements,
	count(distinct t.id) as num_responses,
	round(count(distinct t.id) / 3828. * 100., 2) as pct_responses
from t
where (t.tag ilike '%%/Abstract' or t.tag ilike '%%/Title') 
	and (t.tokens ilike 'W%%S' or t.tokens ='CSW' or t.tokens = 'SOS' or t.tokens ilike 'OGC:W%%S');
"""

# The denominators in the query are hard-coded: from the set we have a total of
# 24,248 extracted elements for our three patterns, across 3828 responses
# (getcapabilities only).
# NOTE(review): these totals go stale if ogc_tokens changes - re-derive them
#   before trusting the percentages.
df = pd.read_sql(sql, engine)
df

Unnamed: 0,num_elements,pct_elements,num_responses,pct_responses
0,724,2.99,481,12.57


Alright, so roughly 13% of the responses (12.57%) have one of the main identification elements (title or abstract) defaulting to the service acronym.

We'll exclude those from the token counts.

In [8]:
# Average number of tokens per bucket (abstract / title / keyword), leaving out
# Title and Abstract values that are nothing but a service acronym.
#
# CTE `t`: one row per entry of the `tokens` JSON array whose tag falls in one
# of the three buckets.
#   original_tag - the entry's tag with the JSON double quotes trimmed
#   tag          - the bucket name assigned by the CASE
#   token_string - the entry's text with ' | ' separators replaced by spaces
#   tokens       - token_string split on runs of whitespace
#
# The WHERE patterns mirror the CASE patterns ('%%/Title' with the slash in
# both places), so every selected row lands in a named bucket; a pattern
# without the slash would admit tags that the CASE leaves as NULL.
#
# The whitespace regex is spelled E'\\\\s+' here so the server receives
# E'\\s+' (the regex \s+) without depending on Python passing an unrecognized
# '\s' escape through unchanged, which is deprecated.
#
# NOTE(review): 'W%%S' presumably reaches the server as 'W%S', which matches any
#   value starting with W and ending with S, not only acronyms - confirm.

sql = """
with t as (
	select id, response_id,
		trim(both '"' from (j.value->'tag')::text) as original_tag,
		case
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract' then 'abstract'
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Title' then 'title'
			when trim(both '"' from (j.value->'tag')::text) ilike '%%/Keyword%%' then 'keyword'
		end as tag,
		replace(trim(both '"' from (j.value->'values')::text), ' | ', ' ') as token_string,
		regexp_split_to_array(replace(trim(both '"' from (j.value->'values')::text), ' | ', ' '), E'\\\\s+') as tokens
	from ogc_tokens, json_array_elements(tokens) j
	where trim(both '"' from (j.value->'tag')::text) ilike '%%/Abstract'
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/Title'
		or trim(both '"' from (j.value->'tag')::text) ilike '%%/Keyword%%'
)
select t.tag,
	round(avg(array_length(t.tokens, 1)), 2) as average_tokens,
	min(array_length(t.tokens, 1)) as min_tokens,
	max(array_length(t.tokens, 1)) as max_tokens
from t
where not((t.tag ilike 'abstract' or t.tag ilike 'title')
	and (t.token_string ilike 'W%%S' or t.token_string ='CSW' or t.token_string = 'SOS' or t.token_string ilike 'OGC:W%%S'))
group by t.tag
order by average_tokens DESC;
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,tag,average_tokens,min_tokens,max_tokens
0,abstract,25.27,1,3161
1,keyword,6.9,1,111
2,title,2.13,1,21


In [9]:
# Average tokens per element type, over every extracted element (no filtering).
#
# CTE `t`: one row per entry of the `tokens` JSON array.
#   original_tag - the entry's tag with the JSON double quotes trimmed
#   tag          - the last '/'-separated segment of that tag
#   tokens       - the entry's text with ' | ' separators replaced by spaces,
#                  split on runs of whitespace
#
# The whitespace regex is spelled E'\\\\s+' here so the server receives
# E'\\s+' (the regex \s+) without depending on Python passing an unrecognized
# '\s' escape through unchanged, which is deprecated.

sql = """
with t as (
	select id, response_id,
		trim(both '"' from (j.value->'tag')::text) as original_tag,
		(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'))[array_upper(string_to_array(trim(both '"' from (j.value->'tag')::text), '/'), 1)] as tag,
		regexp_split_to_array(replace(trim(both '"' from (j.value->'values')::text), ' | ', ' '), E'\\\\s+') as tokens
	from ogc_tokens, json_array_elements(tokens) j
)
select t.tag,
	round(avg(array_length(t.tokens, 1)), 2) as average_tokens,
	min(array_length(t.tokens, 1)) as min_tokens,
	max(array_length(t.tokens, 1)) as max_tokens
from t
group by t.tag
order by average_tokens DESC;
"""

df = pd.read_sql(sql, engine)
df

Unnamed: 0,tag,average_tokens,min_tokens,max_tokens
0,Abstract,19.03,1,3161
1,Name,15.32,1,603
2,AccessConstraints,8.01,1,288
3,Keyword,6.9,1,111
4,DeliveryPoint,5.69,1,11
5,Address,5.66,1,11
6,HoursOfService,4.35,1,24
7,ProviderName,4.22,1,17
8,ContactInstructions,3.64,1,16
9,ContactOrganization,3.59,1,19
