In [161]:
import ssl, OpenSSL
import json
import requests
from bs4 import BeautifulSoup as bs
import re

In [62]:
def get_ssl(url):
    """
    Given the URL of a website, return a JSON representing:
    - If there is an SSL certificate present
    - If yes, who issued it
    - Cryptographic details about the certificate
    """
    cert = ssl.get_server_certificate((url, 443))
    x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, 
                                          cert)

    issuer = x509.get_issuer().get_components()
    issuer_dict = {key.decode('utf-8'): value.decode('utf-8') 
                   for (key, value) in dict(issuer).items()}
    not_after = x509.get_notAfter()
    not_before = x509.get_notBefore()
    alg = x509.get_signature_algorithm()
    
    return json.dumps({"algorithm": str(alg, "utf-8"), 
                   "not_after": str(not_after, "utf-8"),
                   "not_before": str(not_before, "utf-8"),
                   "issuer": issuer_dict})
    

In [63]:
get_ssl("theguardian.com")

'{"algorithm": "sha256WithRSAEncryption", "not_after": "20190608220149Z", "not_before": "20180607220149Z", "issuer": {"C": "BE", "O": "GlobalSign nv-sa", "CN": "GlobalSign CloudSSL CA - SHA256 - G3"}}'

In [245]:
def get_privacy_policy_link(url):
    r_page = requests.get("https://duckduckgo.com/html/?q=privacy policy site:{}".format(url)).text
    bs_page = bs(r_page, "lxml")
    return bs_page.find_all("a", attrs = {"class": "result__url"})[0].text.strip()

def find_emails(text):
    """ Find all emails present in a string """
    return set(re.findall(r'[\w.+-]+@[^\W_]+[.-][A-Za-z0-9.-]+', text))

In [246]:
policy_url = get_privacy_policy_link("phys.org")

if not(policy_url.startswith("http")):
    policy_url = "http://" + policy_url
    
bs_page = bs(requests.get(policy_url).text, "lxml")
find_emails(str(bs_page))

set()

In [247]:
policy_url

'http://phys.org/tags/privacy policy/'