### Linkrot check of known web service URLs

We start with any response identified as an OGC or OpenSearch service (9,379 responses).

In [2]:
import csv
# NOTE: original import order kept below; the wildcard import can rebind
# names imported before it, so reordering is not guaranteed to be safe.
import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import *
from sqlalchemy import and_
from mpp.models import Response
from datetime import datetime
import requests

In [3]:
# Read the postgres connection settings (JSON) from the local config file.
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# Create the engine from the 'connection' entry and open the session that
# the queries in the following cells run against.
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [4]:
def head(url, timeout=30, allow_redirects=False):
    """Send a HEAD request and summarise the outcome as a CSV-friendly tuple.

    Args:
        url: address to check.
        timeout: seconds to wait for the server (passed to requests).
        allow_redirects: whether redirects are followed. The default,
            False, is what requests.head() itself uses when the argument
            is omitted, so with the default a redirecting server is
            reported as its 3xx status and the returned URL is that of
            the first response, not the redirect target.

    Returns:
        A (status_code, url, error) tuple. On success error is ''. If the
        request raises for any reason, status_code is the sentinel 900,
        url is '' and error is the exception text with commas replaced by
        ';' and newlines by spaces so it fits in a single CSV field.
    """
    try:
        rsp = requests.head(url, timeout=timeout,
                            allow_redirects=allow_redirects)
    except Exception as ex:
        # deliberate best effort: one bad URL must not stop the whole run,
        # so every failure is recorded as a 900 row instead of raising
        return 900, '', str(ex).replace(',', ';').replace('\n', ' ')
    return rsp.status_code, rsp.url, ''
    
def get(url, timeout=30):
    """Send a GET request and summarise the outcome as a CSV-friendly tuple.

    Args:
        url: address to check.
        timeout: seconds to wait for the server (passed to requests).

    Returns:
        A (status_code, url, error) tuple, where url is the URL of the
        response that was finally received. On success error is ''. If the
        request raises for any reason, status_code is the sentinel 900,
        url is '' and error is the exception text with commas replaced by
        ';' and newlines by spaces so it fits in a single CSV field.
    """
    try:
        rsp = requests.get(url, timeout=timeout)
    except Exception as ex:
        # deliberate best effort: one bad URL must not stop the whole run,
        # so every failure is recorded as a 900 row instead of raising
        return 900, '', str(ex).replace(',', ';').replace('\n', ' ')
    return rsp.status_code, rsp.url, ''

In [4]:
# Query selecting the responses whose source URLs are checked.
# The CTE `i` expands every non-null identities.identity value (cast to
# jsonb) into one row per array element. The outer query joins those
# elements to responses and keeps each element whose 'protocol' is neither
# 'FGDC' nor 'ISO', returning id, source_url, initial_harvest_date and the
# protocol.
# NOTE(review): there is no DISTINCT, so a response with several matching
# identity elements is returned (and later checked) once per element;
# elements without a 'protocol' key compare as NULL and are dropped -
# confirm both are intended.
sql = """
with i
as (
    select d.response_id, jsonb_array_elements(d.identity::jsonb) ident
    from identities d
    where d.identity is not null
)

select r.id, r.source_url, r.initial_harvest_date, i.ident->'protocol' as protocol
from responses r join i on i.response_id = r.id
where i.ident->>'protocol' != 'FGDC' and i.ident->>'protocol' != 'ISO';
"""

In [11]:
# Check every selected source URL with a HEAD request and log one CSV row
# per response: response_id, status, redirect, error, date_verified.
cnt = 0
with open('outputs/webservice_linkrot.csv', 'w') as f:
    # header text is kept exactly as it was (note the space after each
    # comma) so existing consumers of the file see the same column names
    f.write('response_id, status, redirect, error, date_verified\n')

    # csv.writer quotes any field containing a comma, quote or newline, so
    # a redirect URL with a comma in it can no longer shift the columns of
    # its row; fields without such characters are written unchanged
    writer = csv.writer(f, lineterminator='\n')

    rows = session.execute(sql)
    # harvest_date and protocol are selected by the query but not used here
    for cnt, (response_id, source_url, harvest_date, protocol) in enumerate(rows, 1):
        status, redirect, error = head(source_url)
        writer.writerow([response_id, status, redirect, error,
                         datetime.now().isoformat()])

        # the file is now opened once instead of once per row; flushing
        # after each row keeps partial results on disk if the run is
        # interrupted, as the per-row reopen did
        f.flush()

        if cnt % 1000 == 0:
            # prints the same text under Python 2 and Python 3
            print('completed %d' % cnt)
    

## Revising for OGC

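Some systems don't respond correctly to HEAD requests (returning a 4xx status where a GET returns 200), so the OGC responses with a non-success HEAD status are re-checked here with a GET request.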

In [6]:
# Query selecting the OGC responses to re-check with GET.
# The CTE `i` expands every non-null identities.identity value (cast to
# jsonb) into one row per array element. The outer query joins those
# elements to responses and to service_linkrot, and keeps rows where the
# element's protocol is 'OGC' and the recorded status, rounded to the
# nearest hundred, is above 200 and below 900. The protocol column is
# returned as the constant 'OGC'.
# NOTE(review): service_linkrot presumably holds the results of the HEAD
# run; 900 is the value head()/get() return when the request raised, so
# those rows are excluded here - confirm that is intended.
sql = """
with i
as (
    select d.response_id, jsonb_array_elements(d.identity::jsonb) ident
    from identities d
    where d.identity is not null
)

select r.id,
  r.source_url,
  r.initial_harvest_date, 'OGC' as protocol
from responses r join i on r.id = i.response_id
  join service_linkrot s on s.response_id = r.id
where i.ident->>'protocol' = 'OGC' and round(s.status, -2) > 200 and round(s.status, -2) < 900;
"""

# Re-check every selected OGC source URL with a GET request and log one
# CSV row per response: response_id, status, redirect, error, date_verified.
cnt = 0
with open('outputs/webservice_ogc_linkrot.csv', 'w') as f:
    # header text is kept exactly as it was (note the space after each
    # comma) so existing consumers of the file see the same column names
    f.write('response_id, status, redirect, error, date_verified\n')

    # QUOTE_NONNUMERIC keeps redirect and error always quoted, as the
    # hand-built rows were (the timestamp is now quoted as well). The
    # writer also escapes an embedded double quote by doubling it, so the
    # redirect URL is safe and the error text no longer needs its quotes
    # rewritten to apostrophes.
    writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')

    rows = session.execute(sql)
    # harvest_date and protocol are selected by the query but not used here
    for cnt, (response_id, source_url, harvest_date, protocol) in enumerate(rows, 1):
        status, redirect, error = get(source_url)
        writer.writerow([response_id, status, redirect, error,
                         datetime.now().isoformat()])

        # the file is now opened once instead of once per row; flushing
        # after each row keeps partial results on disk if the run is
        # interrupted, as the per-row reopen did
        f.flush()

        if cnt % 100 == 0:
            # prints the same text under Python 2 and Python 3
            print('completed %d' % cnt)

completed 100
completed 200
completed 300
completed 400
completed 500
completed 600
completed 700
completed 800
completed 900
completed 1000
completed 1100
completed 1200
completed 1300
completed 1400
completed 1500
completed 1600


In [None]:
# TODO: update the previous linkrot records with the new results
# regenerate the pivot tables 