<a href="https://colab.research.google.com/github/anjackson/contentminer/blob/master/openVirus_EThOS_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using the openVirus EThOS API

The indexed theses can be queried using the [Solr API](https://solr.apache.org/guide/solr/latest/query-guide/query-syntax-and-parsers.html), although some limitations are in place to avoid overloading the server.

Here, we look for all the theses that mention 'coronavirus' or 'coronaviruses', and count the hits we get.

In [14]:
import requests
import json

# Search API:
surl = "https://services.anjackson.net/ov/solr/discovery/select"
def get_docs(q, rows=999999, timeout=60):
  """Run a query against the search API and return its 'response' section.

  Args:
    q: The query string. It is matched against the 'text' field unless the
      query itself names another field.
    rows: Maximum number of documents to ask for. The default is deliberately
      huge so that as many matches as possible come back in one request
      (TODO up the limit).
    timeout: Seconds to wait for the server before giving up, passed straight
      to requests. Use None to wait indefinitely.

  Returns:
    The 'response' object of the JSON reply; callers in this notebook read its
    'numFound' and 'docs' entries.

  Raises:
    Exception: If the server answers with any status other than 200. The
      message is the raw response body.
  """
  params = {
      'q': q,
      'df': 'text',   # This field contains the text to search.
      'rows': rows,
  }
  # Only supports GET or POST requests, but POST is better if the queries get long.
  # The timeout stops a stalled server from hanging the notebook forever.
  r = requests.post(surl, data=params, timeout=timeout)
  if r.status_code != 200:
    raise Exception(r.text)
  return r.json()['response']

# Run a sample query and report the total number of matches:
results = get_docs('coronavirus OR coronaviruses')
hit_count = results['numFound']
print("Found %s hits!" % hit_count)

# When anything matched, dump the first document in full as indented JSON:
if hit_count > 0:
  r1 = results['docs'][0]
  pretty = json.dumps(r1, indent=2)
  print("The first result looks like this:\n%s" % pretty)


Found 154 hits!
The first result looks like this:
{
  "url_type": [
    "normal"
  ],
  "timestamp": [
    20200405050342
  ],
  "public_suffix": [
    "ac.uk"
  ],
  "type": [
    "Document"
  ],
  "content_language": [
    "en"
  ],
  "host_surt": [
    "(uk,",
    "(uk,ac,",
    "(uk,ac,reading,",
    "(uk,ac,reading,centaur,"
  ],
  "hash": [
    "sha1:6KWE4OGWG4JMSRJP2DMIC4NIPFZNYUPU"
  ],
  "content_type_norm": [
    "pdf"
  ],
  "url": [
    "http://centaur.reading.ac.uk/85135/1/22835680_Bukhari_thesis_redacted.pdf"
  ],
  "warc_key_id": [
    "<urn:uuid:d6d94697-e0d7-413e-9fd5-a4356f5e4ad7>"
  ],
  "content_text_length": [
    303363
  ],
  "content_type_served": [
    "application/pdf"
  ],
  "content_type_full": [
    "application/pdf"
  ],
  "content_metadata": [
    "date=2019-07-18T14:50:55Z",
    "pdf:unmappedUnicodeCharsPerPage=0",
    "pdf:PDFVersion=1.7",
    "pdf:docinfo:title=Microsoft Word - Thesis-12-06-2019.docx",
    "xmp:CreatorTool=Word",
    "pdf:hasXFA=false"

We can now build a simple search form around that function, and use it to get lists of matching URLs.

In [8]:
#@title Search EThOS { vertical-output: true, display-mode: "form" }

from google.colab import files
import requests
import ipywidgets as widgets
from IPython.display import display, HTML
# Form widgets: the query box, the two action buttons, and the output area
# that the click handlers print into.
query = widgets.Text(description="Query:", value="coronavirus OR coronaviruses")
# Link to hits.txt, the file written by on_download_clicked. The href must be
# a cleanly quoted attribute; a stray '>' inside it breaks the anchor.
local_file = HTML("<a href=\"./hits.txt\">Download matching URLs</a>")
button = widgets.Button(description="Search!")
download_button = widgets.Button(description="Download!")
output = widgets.Output()

# Interactive code:
def on_button_clicked(b):
  """Search for the text currently in the query box and list the matching URLs.

  Args:
    b: The button that was clicked (supplied by on_click; not used).
  """
  with output:
    # Start from a clean output area on every click.
    output.clear_output()
    # Look up every document matching whatever is in the query box:
    response = get_docs(query.value)
    print("Found %s hits..." % response['numFound'])
    # One line per match: the first entry of each document's 'url' list.
    for record in response['docs']:
      first_url = record['url'][0]
      print(first_url)

def on_download_clicked(b):
  """Save the URLs matching the current query to hits.txt and download it.

  Args:
    b: The button that was clicked (supplied by on_click; not used).
  """
  with output:
    output.clear_output()
    # Run the query before touching the file: if the request fails, an
    # existing hits.txt is left intact instead of being truncated to nothing.
    resp = get_docs(query.value)
    print("Found %s hits..." % resp['numFound'])
    # Explicit UTF-8 so URLs containing non-ASCII characters are written the
    # same way regardless of the platform's default encoding.
    with open('hits.txt', 'w', encoding='utf-8') as f:
      # One URL per line: the first entry of each document's 'url' list.
      for doc in resp['docs']:
        f.write("%s\n" % doc['url'][0])
    # Download:
    files.download('hits.txt')

# Attach each button to its click handler, then render the whole form.
for widget, handler in (
    (button, on_button_clicked),
    (download_button, on_download_clicked),
):
  widget.on_click(handler)
display(query, button, download_button, output)


Text(value='coronavirus OR coronaviruses', description='Query:')

Button(description='Search!', style=ButtonStyle())

Button(description='Download!', style=ButtonStyle())

Output()