In [67]:
import ssl, OpenSSL
import json
import requests
from bs4 import BeautifulSoup as bs

In [62]:
def get_ssl(url):
    """
    Given the URL of a website, return a JSON representing:
    - If there is an SSL certificate present
    - If yes, who issued it
    - Cryptographic details about the certificate
    """
    cert = ssl.get_server_certificate((url, 443))
    x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, 
                                          cert)

    issuer = x509.get_issuer().get_components()
    issuer_dict = {key.decode('utf-8'): value.decode('utf-8') 
                   for (key, value) in dict(issuer).items()}
    not_after = x509.get_notAfter()
    not_before = x509.get_notBefore()
    alg = x509.get_signature_algorithm()
    
    return json.dumps({"algorithm": str(alg, "utf-8"), 
                   "not_after": str(not_after, "utf-8"),
                   "not_before": str(not_before, "utf-8"),
                   "issuer": issuer_dict})
    

In [63]:
get_ssl("theguardian.com")

'{"algorithm": "sha256WithRSAEncryption", "not_after": "20190608220149Z", "not_before": "20180607220149Z", "issuer": {"C": "BE", "O": "GlobalSign nv-sa", "CN": "GlobalSign CloudSSL CA - SHA256 - G3"}}'

In [137]:
def get_privacy_policy(url):
    r_page = requests.get("http://{}".format(url)).text
    bs_page = bs(r_page, "lxml")
    policy_url = bs_page.find(lambda x: x.name == "a" and has_privacy(x.text))
    try:
        link = policy_url["href"]
        if link.startswith("/"):
            return url + link
        else:
            return url
    except:
        return ""

def has_privacy(text):
    """ Returnt true if text refers to privacy policy """
    return ("privacy policy" in text.lower()) or ("privacy" in text.lower())

In [140]:
get_privacy_policy("theguardian.com")

'theguardian.com'