In [3]:
import urllib
import urllib.parse

import pandas as pd
import requests
from requests_html import HTML
from requests_html import HTMLSession

In [4]:
def get_source(url, timeout=30):
    """Fetch a page and return the requests_html response for it.

    Args:
        url (string): URL of the page to scrape.
        timeout (float or tuple, optional): Seconds to wait for the server,
            forwarded to ``session.get``. Defaults to 30 so that a stalled
            connection cannot hang the notebook indefinitely. Pass ``None``
            to wait without limit.

    Returns:
        response (object): HTTP response object from requests_html, or
        ``None`` when the request raised a ``RequestException`` (the error
        is printed, not re-raised). Callers must check for ``None``.
    """

    try:
        session = HTMLSession()
        response = session.get(url, timeout=timeout)
        return response

    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the failure and signal it with an explicit
        # None instead of silently falling off the end of the function.
        print(e)
        return None

In [9]:
def scrape_gov(query):
    """Search Digital Marketplace opportunities and return the other links on the page.

    Args:
        query (string): Free-text search term; it is URL-encoded before use.

    Returns:
        list: Absolute URLs found on the results page, excluding every URL
        that starts with the opportunities listing address. Empty when the
        page could not be fetched (``get_source`` returned ``None``).
    """

    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities?q=" + query)

    # get_source returns None after a failed request; there is nothing to scan.
    if response is None:
        return []

    # A single string prefix (str.startswith would also accept a tuple of them).
    # NOTE(review): this filter drops the opportunity pages themselves and keeps
    # only the remaining site links - confirm that is the intended direction.
    listing_prefix = 'https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities'

    return [url for url in response.html.absolute_links if not url.startswith(listing_prefix)]

In [10]:
# Example run: links returned by scrape_gov for the search term "data"
scrape_gov("data")

['https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/',
 'https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/',
 'https://www.gov.uk/guidance/g-cloud-buyers-guide',
 'https://www.gov.uk/guidance/how-to-sell-your-digital-outcomes-and-specialists-services',
 'https://www.gov.uk/guidance/the-crown-hosting-data-centres-framework-on-the-digital-marketplace',
 'https://www.gov.uk/guidance/digital-outcomes-and-specialists-buyers-guide',
 'https://www.digitalmarketplace.service.gov.uk/privacy-notice',
 'https://www.digitalmarketplace.service.gov.uk/accessibility-statement',
 'https://www.gov.uk/guidance/digital-marketplace-buyers-guide',
 'https://www.gov.uk/guidance/g-cloud-suppliers-guide',
 'https://www.gov.uk/guidance/digital-marketplace-suppliers-guide',
 'https://www.digitalmarketplace.service.gov.uk/terms-and-conditions',
 'https://www.digitalmarketplace.service.gov.uk/',


def get_results(query, lot="digital-outcomes", status="open"):
    """Fetch a filtered page of Digital Outcomes and Specialists opportunities.

    Args:
        query (string): Free-text search term.
        lot (string, optional): Value sent as the ``lot`` query parameter.
            Defaults to "digital-outcomes".
        status (string, optional): Value sent as the ``statusOpenClosed``
            query parameter. Defaults to "open".

    Returns:
        Whatever ``get_source`` returns for the constructed URL.
    """
    # NOTE: this notebook defines get_results again in a later cell; whichever
    # definition was executed last is the one in effect.

    # urlencode applies quote_plus to every value, so the default call produces
    # the same URL as the previous hand-concatenated string.
    params = urllib.parse.urlencode({"q": query, "lot": lot, "statusOpenClosed": status})
    response = get_source("https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities?" + params)

    return response

In [21]:
def get_results(query, lot=None, status=None):
    """Fetch the Digital Outcomes and Specialists opportunities page for a query.

    This definition supersedes the earlier ``get_results`` in this notebook
    when the cells are run top to bottom.

    Args:
        query (string): Free-text search term.
        lot (string, optional): If given, sent as the ``lot`` query parameter.
        status (string, optional): If given, sent as the ``statusOpenClosed``
            query parameter.

    Returns:
        Whatever ``get_source`` returns for the constructed URL.
    """

    # Only the search term is mandatory; filters are added when supplied, so
    # get_results(query) requests exactly "...opportunities?q=<query>".
    params = {"q": query}
    if lot is not None:
        params["lot"] = lot
    if status is not None:
        params["statusOpenClosed"] = status

    response = get_source("https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities?" + urllib.parse.urlencode(params))

    return response

In [22]:
def parse_results(response,
                  css_identifier_result=".tF2Cxc",
                  css_identifier_title="h3",
                  css_identifier_link=".yuRUbf a",
                  css_identifier_text=".VwiC3b"):
    """Extract title, link and text from each result block of a fetched page.

    Args:
        response (object): Response whose ``html`` attribute supports
            ``find(selector)``; ``None`` is accepted and yields no results.
        css_identifier_result (string, optional): Selector matching one
            result block.
        css_identifier_title (string, optional): Selector for the title
            element inside a result block.
        css_identifier_link (string, optional): Selector for the anchor
            element inside a result block.
        css_identifier_text (string, optional): Selector for the descriptive
            text element inside a result block.

    Returns:
        list: One dict per result block with keys ``'title'``, ``'link'`` and
        ``'text'``. A value is ``None`` when the corresponding element (or the
        anchor's ``href``) is missing from that block.
    """
    # NOTE(review): the default selectors do not resemble the `govuk-` / `dm-`
    # class names visible in the Digital Marketplace page dumped later in this
    # notebook, so they may match nothing there - confirm the right selectors
    # and pass them in.

    if response is None:
        return []

    output = []

    for result in response.html.find(css_identifier_result):
        # find(..., first=True) gives None when nothing matches, so every
        # lookup is guarded instead of dereferenced blindly.
        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        text = result.find(css_identifier_text, first=True)

        output.append({
            'title': title.text if title is not None else None,
            'link': link.attrs.get('href') if link is not None else None,
            'text': text.text if text is not None else None,
        })

    return output

In [23]:
def gov_dos_search(query):
    """Search Digital Outcomes and Specialists opportunities for ``query``.

    Fetches the results page through ``get_results`` and returns whatever
    ``parse_results`` extracts from that response.
    """
    return parse_results(get_results(query))

In [27]:
# Fetch the page for "data"; the value displayed is the raw response object
# returned by get_results, not a list of parsed results.
results = get_results("data")
results

<Response [200]>

In [44]:
from bs4 import BeautifulSoup
query = 'data'
status = 'closed'

# Pass the filters via `params` so requests builds and URL-encodes the query
# string; a search term containing spaces or '&' no longer corrupts the URL.
# The timeout keeps a stalled connection from hanging the notebook.
page = requests.get(
    "https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities",
    params={"q": query, "lot": "digital-outcomes", "statusOpenClosed": status},
    timeout=30,
)
# Stop here on an HTTP error instead of parsing an error page as if it were results.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Extract title of page
page_title = soup.title

# Extract body of page
page_body = soup.body

# Extract head of page
page_head = soup.head

# print the result
print(page_title, page_head)

<title>
  Supplier opportunities – Digital Outcomes and Specialists – Digital Marketplace
</title> <head>
<meta charset="utf-8"/>
<title>
  Supplier opportunities – Digital Outcomes and Specialists – Digital Marketplace
</title>
<meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
<meta content="#0b0c0c" name="theme-color"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<link href="/static/images/favicon.ico" rel="shortcut icon" sizes="16x16 32x32 48x48" type="image/x-icon"/>
<link color="#0b0c0c" href="/static/images/govuk-mask-icon.svg" rel="mask-icon"/>
<link href="/static/images/govuk-apple-touch-icon-180x180.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/static/images/govuk-apple-touch-icon-167x167.png" rel="apple-touch-icon" sizes="167x167"/>
<link href="/static/images/govuk-apple-touch-icon-152x152.png" rel="apple-touch-icon" sizes="152x152"/>
<link href="/static/images/govuk-apple-touch-icon.png" rel="apple-touch-icon"/>

In [45]:
# Text content of the first <h1> element found in the parsed page
first_h1 = soup.select('h1')[0].get_text()
(page_title, first_h1)

(<title>
   Supplier opportunities – Digital Outcomes and Specialists – Digital Marketplace
 </title>,
 'Digital Outcomes and Specialists opportunities')

In [46]:
# Display the parsed <body> element captured from the soup in the earlier cell
page_body

<body class="govuk-template__body">
<script>document.body.className = ((document.body.className) ? document.body.className + ' js-enabled' : 'js-enabled');</script>
<a class="govuk-skip-link" href="#main-content">Skip to main content</a>
<div aria-describedby="dm-cookie-banner__heading" aria-label="Cookie Banner" class="dm-cookie-banner govuk-width-container" data-module="dm-cookie-banner" id="dm-cookie-banner" role="region">
<div class="dm-cookie-banner__wrapper">
<h2 class="dm-cookie-banner__heading govuk-heading-m" id="dm-cookie-banner__heading">
      Can we store analytics cookies on your device?
    </h2>
<p class="govuk-body">
      Analytics cookies help us understand how our website is being used.
    </p>
<div class="dm-cookie-banner__buttons">
<button aria-describedby="dm-cookie-banner__heading" class="govuk-button dm-cookie-banner__button dm-cookie-banner__button--accept" data-accept-cookies="true" data-module="govuk-button" type="submit">
        Yes<span class="govuk-visu

In [48]:
# Collect the text of every <a> element in the soup (the text, not the tag
# objects). Whitespace is kept exactly as it appears in the markup.
all_a_tags = [element.text for element in soup.select('a')]

all_a_tags

['Skip to main content',
 'How Digital Marketplace uses cookies',
 'change your cookie settings',
 '\n\n\n\n\n\n\n\n\n            GOV.UK\n          \n\n\n          Digital Marketplace\n        \n',
 '\n              \n                Guidance\n              \n                ',
 '\n              \n                Help\n              \n                ',
 '\n              \n                Log in\n              \n                ',
 'send your feedback',
 'Digital Marketplace',
 'Supplier opportunities',
 'All categories',
 'Digital specialists (786)',
 'User research participants (30)',
 'Clear filters',
 '\n\n\n\n\n\n',
 'Download data',
 'info@crowncommercial.gov.uk',
 'Social care data access project, user needs discovery - BC16292',
 'Designing Data Stewardship Models for Artificial Intelligence (AI) R&D',
 'Technical support outputs for the Maritime Digital Delivery Services (MDDS) (ND-0269)',
 'OUPA10903 - Digital Partner Procurement',
 'UK SBS DDaT22595 BEIS - Climate Change Agr