From 9cd5417ebb3a47f2c771870da8d909fb3d5aef45 Mon Sep 17 00:00:00 2001 From: KnitCode Date: Wed, 27 Mar 2019 13:02:42 -0400 Subject: [PATCH 1/8] implements an IDNA-encoded version of publicsuffix2. includes additional functionality for strict checks, ignoring wildcards, and finding eTLD only. maps main function of get_public_suffix() to get_sld() for clarity. --- MANIFEST.in | 1 + README.rst | 94 +++++++++++++++-- publicsuffix2.LICENSE | 4 + setup.py | 8 +- src/publicsuffix2/__init__.py | 185 +++++++++++++++++++++++++++++----- tests.py | 47 ++++++++- 6 files changed, 302 insertions(+), 37 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 580f88e..70aa444 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,5 +2,6 @@ graft src include CHANGELOG.rst include README.rst +include publicsuffix2.LICENSE global-exclude *.py[co] __pycache__ *.so *.pyd diff --git a/README.rst b/README.rst index 9dd3487..55b8ed4 100644 --- a/README.rst +++ b/README.rst @@ -1,21 +1,37 @@ Public Suffix List module for Python ==================================== -This module allows you to get the public suffix of a domain name using the -Public Suffix List from http://publicsuffix.org +This module allows you to get the public suffix, as well as the registrable domain, + of a domain name using the Public Suffix List from http://publicsuffix.org A public suffix is a domain suffix under which you can register domain -names. Some examples of public suffixes are ".com", ".co.uk" and "pvt.k12.wy.us". +names. It is sometimes referred to as the extended TLD (eTLD). +Some examples of public suffixes are ".com", ".co.uk" and "pvt.k12.wy.us". Accurately knowing the public suffix of a domain is useful when handling web browser cookies, highlighting the most important part of a domain name in a user interface or sorting URLs by web site. +This module builds the public suffix list as a Trie structure, making it more efficient +than other string-based modules available for the same purpose. 
It can be used +effectively in large-scale distributed environments, such as PySpark. + This Python module includes with a copy of the Public Suffix List so that it is usable out of the box. Newer versions try to provide reasonably fresh copies of this list. It also includes a convenience method to fetch the latest list. -The code is a fork of the publicsuffix package and uses the same base API. -You just need to import publicsuffix2 instead +The code is a fork of the publicsuffix2 package and includes the same base API. In +addition, it contains a few variants useful for certain use cases, such as the option to +ignore wildcards or return only the extended TLD (eTLD). +Publicsuffix2 is a an extension of publicsuffix, and uses the same base API. +You just need to import publicsuffix2 instead. + +The public suffix list is now provided in UTF-8 format. To correctly process +IDNA-encoded domains, either the query or the list must be converted. This module +contains the option to IDNA-encode the public suffix list upon creating the Trie; this +is set to happen by default. If your use case includes UTF-8 domains, e.g., '食狮.com.cn', +you'll need to set the IDNA-encoding flag to False on instantiation (see examples below). +Failure to use the correct encoding for your use case can lead to incorrect results for +domains that utilize unicode characters. The code is MIT-licensed and the publicsuffix data list is MPL-2.0-licensed. @@ -31,6 +47,10 @@ The code is MIT-licensed and the publicsuffix data list is MPL-2.0-licensed. Usage ----- +To install from source, first build the package file: + python setup.py build sdist +and then pip install from the dist directory. + Install with:: pip install publicsuffix2 @@ -103,6 +123,63 @@ You can use it this way:: Note that the once loaded, the data file is cached and therefore fetched only once. +If using this library in large-scale pyspark processing, you should instantiate the class as +a global variable, not within a user function. 
The class methods can then be used within user +functions for distributed processing. + +Changes in this Fork +-------------------- + +This fork of publicsuffix2 addresses a change in the format to the standard public suffix list, +which was previously IDNA-encoded and now is in UTF-8 format, as well as some additional +functionality useful to certain use cases. These additions include the ability to ignore +wildcards and to require strict adherence to the TLDs included in the list. Lastly, we include +some convenience functions for obtaining only the extended TLD (eTLD) rather than the +registrable domain (SLD). These are outlined below. + +IDNA-encoding. The public suffix list is now provided in UTF-8 format. For those use cases that +include IDNA-encoded domains, the module will not return accurate results unless the list is +converted. In this fork, IDNA encoding is included as a parameter in the class and is on by +default.:: + + >>> from publicsuffix2 import PublicSuffixList + >>> psl = PublicSuffixList(idna=True) # on by default + >>> psl.get_public_suffix('www.google.com') + 'google.com' + >>> psl = PublicSuffixList(idna=False) # use UTF-8 encodings + >>> psl.get_public_suffix('食狮.com.cn') + '食狮.com.cn' + +Ignore wildcards. In some use cases, particularly those related to large-scale domain processing, +the user might want to ignore wildcards to create more aggregation. This is possible by setting +the parameter wildcard=False. + +Require valid eTLDs (strict). In the publicsuffix2 module, a domain with an invalid TLD will still return +a public suffix, e.g,:: + + >>> psl.get_public_suffix('www.mine.local') + 'mine.local' + + +This is useful for many use cases, while in others, we want to ensure that the domain includes a +valid eTLD. In this case, the boolean parameter strict provides a solution. If this flag is set, +an invalid TLD will return None.:: + + >>> psl.get_public_suffix('www.mine.local', strict=True) is None + True + +Return eTLD only. 
The standard use case for publicsuffix2 is to return the registrable domain +according to the public suffix list. In some cases, however, we only wish to find the eTLD +itself. In this fork, this is available via the get_tld() method.:: + + >>> psl.get_tld('www.google.com') + 'com' + +All of the methods and functions include the wildcard and strict parameters. + +For convenience, the public method get_sld() is available. This is identical to the method +get_public_suffix() and is intended to clarify the output for some users. + Source ------ @@ -116,7 +193,11 @@ branch:: History ------- -This code is forked from Tomaž Šolc's fork of David Wilson's code originally at: +This code is forked from NexB's fork of Tomaž Šolc's fork of David Wilson's code. + +The original publicsuffix2 code is Copyright (c) 2015 nexB Inc. + +David Wilson's code originally at: https://www.tablix.org/~avian/git/publicsuffix.git @@ -138,6 +219,7 @@ License The code is MIT-licensed. The vendored public suffix list data from Mozilla is under the MPL-2.0. +Copyright (c) 2019 Renée Burton Copyright (c) 2015 nexB Inc. diff --git a/publicsuffix2.LICENSE b/publicsuffix2.LICENSE index 101aff9..d4dc33c 100644 --- a/publicsuffix2.LICENSE +++ b/publicsuffix2.LICENSE @@ -1,3 +1,7 @@ +Copyright (c) 2019 Renée Burton +This code is based on nexB Inc. code found at: +https://www.github.com/nexB/python-publicsuffix2 + Copyright (c) 2015 nexB Inc. This code is based on Tomaž Šolc fork of David Wilson code originally at https://www.tablix.org/~avian/git/publicsuffix.git diff --git a/setup.py b/setup.py index 127fdc0..dea7830 100644 --- a/setup.py +++ b/setup.py @@ -81,14 +81,14 @@ def run(self): setup( name='publicsuffix2', - version='2.20190205', + version='2.20190328', license='MIT and MPL-2.0', description='Get a public suffix for a domain name using the Public Suffix ' 'List. 
Forked from and using the same API as the publicsuffix package.', long_description='%s\n%s' % (read('README.rst'), read('CHANGELOG.rst')), - author='nexB Inc., Tomaz Solc and David Wilson', - author_email='info@nexb.com', - url='https://github.com/nexB/python-publicsuffix2', + author='Renée Burton, nexB Inc., Tomaz Solc and David Wilson', + author_email='', + url='https://github.com/KnitCode/python-publicsuffix2', packages=find_packages('src'), package_dir={'': 'src'}, py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], diff --git a/src/publicsuffix2/__init__.py b/src/publicsuffix2/__init__.py index a755bd9..f0d97b2 100644 --- a/src/publicsuffix2/__init__.py +++ b/src/publicsuffix2/__init__.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +# +# Copyright (c) 2019 nexB Inc. and Renée Burton # Copyright (c) 2015 nexB Inc. # This code is based on Tomaž Šolc's fork of David Wilson's code originally at # https://www.tablix.org/~avian/git/publicsuffix.git @@ -41,9 +43,8 @@ from __future__ import unicode_literals import codecs -from contextlib import closing -from datetime import datetime import os.path +import warnings try: from urllib.request import urlopen, Request @@ -61,7 +62,7 @@ class PublicSuffixList(object): - def __init__(self, psl_file=None): + def __init__(self, psl_file=None, idna=True): """ Read and parse a public suffix list. `psl_file` is either a file location string, or a file-like object, or an iterable of lines from a @@ -70,7 +71,14 @@ def __init__(self, psl_file=None): If psl_file is None, the vendored file named "public_suffix_list.dat" is loaded. It is stored side by side with this Python package. + The Mozilla public suffix list is no longer IDNA-encoded, it is UTF-8. For use cases with domains that are IDNA + encoded, choose idna=True and the list will be converted upon loading. The wrong encoding will provide incorrect + answers in either use case. 
+ The file format is described at http://publicsuffix.org/ + + :param psl_file: string or None + :param idna: boolean, whether to convert file to IDNA-encoded strings """ # Note: we test for None as we accept empty lists as inputs if psl_file is None or isinstance(psl_file, str): @@ -79,14 +87,26 @@ def __init__(self, psl_file=None): else: # assume file-like psl = psl_file - root = self._build_structure(psl) + root = self._build_structure(psl, idna) self.root = self._simplify(root) def _find_node(self, parent, parts): + """ + recursive processing of a line of the publc suffix list to build the Trie. Each line is processed into a + dictionary, which may contain sub-Trie, and nodes terminate in node of either 0 or 1 (negate). + + This method takes the current parent Trie, and searches it for the next part in the line (child). If not found, + it adds a node to the Trie, creating a new branch with the [0]. If found, the existing sub-Trie is passed for + the next part. + + :param parent: current Trie, form is Tuple (negate, dict of Trie) + :param parts: list of strings + :return: recursive search for remaining domain parts + """ if not parts: return parent - if len(parent) == 1: + if len(parent) == 1: # this initiates the Trie from a new node as [negate, dict()] parent.append({}) assert len(parent) == 2 @@ -94,14 +114,22 @@ def _find_node(self, parent, parts): child = parts.pop() - child_node = children.get(child, None) + child_node = children.get(child, None) # if child already exists as a node, grab the sub-Trie - if not child_node: + if not child_node: # if it doesn't exist, creates a new node and initialized with [0] children[child] = child_node = [0] return self._find_node(child_node, parts) def _add_rule(self, root, rule): + """ + initial setup for a line of the public suffix list. If it starts with ! that is a negation operation. this + calls the find_node() method recursively to build out the Trie for this rule. 
+ + :param root: root Trie + :param rule: string, line of public suffixlist + :return: None + """ if rule.startswith('!'): negate = 1 rule = rule[1:] @@ -112,24 +140,60 @@ def _add_rule(self, root, rule): self._find_node(root, parts)[0] = negate def _simplify(self, node): + """ + condenses the lines of the Trie + + :param node: node in the Trie, either 0/1 or a subTrie + :return: simplified Trie, form Tuple + """ if len(node) == 1: return node[0] return (node[0], dict((k, self._simplify(v)) for (k, v) in node[1].items())) - def _build_structure(self, fp): + def _build_structure(self, fp, idna): + """ + build a Trie from the public suffix list. If idna==True, idna-encode each line before building. + + The Trie is comprised of tuples that encode whether the line is a negation line (0 or 1), and terminate with 0. + Each node is represented with two-tuple of the form (negate, dict of children / sub-Trie). + A partial subTrie therefore looks like: + (0, {'ac': 0, 'co': (0, {'blogspot': 0}), 'gv': 0,....}) + where each tuple starts with the negation encoding, and each leaf in the Trie as a dictionary element returns 0. + + :param fp: pointer for the public suffix list + :param idna: boolean, convert lines to idna-encoded strings + :return: Trie + """ root = [0] for line in fp: line = line.strip() if not line or line.startswith('//'): continue + if idna: + line = line.encode('idna').decode() self._add_rule(root, line.split()[0].lstrip('.')) return root - def _lookup_node(self, matches, depth, parent, parts): + def _lookup_node(self, matches, depth, parent, parts, wildcard): + """ + traverses the Trie recursively to find the parts. By default, the traverse follows wildcards, as appropriate for + the public suffix list, but if wildcard is set to False, it will stop at wildcard leaves. This can be useful for + summarizing complex wildcard domains like those under amazonaws.com. 
+ + the lookup is tracked via a list, initially set to all None, that marks the negation flags of nodes it matches. + each match will be marked for later composition of the SLD. + + :param matches: list, parts long, None (initial), 0, or 1 + :param depth: int, how far in the Trie this run is + :param parent: Tuple, the current subTrie + :param parts: list of domain parts, strings + :param wildcard: boolean, whether to process wildcard nodes + :return: None, recursive call + """ if parent in (0, 1): negate = parent children = None @@ -142,37 +206,71 @@ def _lookup_node(self, matches, depth, parent, parts): for name in ('*', parts[-depth]): child = children.get(name, None) if child is not None: - self._lookup_node(matches, depth + 1, child, parts) + if wildcard: + self._lookup_node(matches, depth + 1, child, parts, wildcard) + elif name != '*': + self._lookup_node(matches, depth + 1, child, parts, wildcard) - def get_public_suffix(self, domain): + def get_sld(self, domain, wildcard=True, strict=False): """ - Return the public suffix for a `domain` DNS name string. + gets the second-level-domain (SLD) or private suffix of a given domain according to the public suffix list. + The public suffix list includes wildcards, so if wildcard is set to True, this will follow the wildcard on + traversal, otherwise it will stop at wildcard nodes. - For example:: - >>> get_public_suffix("www.example.com") - "example.com" + The logic does not check by default whether the TLD is in the Trie, so for example, 'www.this.local' will + return 'this.local'. If you want to ensure the TLD is in the public suffix list, use strict=True. - Note that for internationalized domains the list at - http://publicsuffix.org uses decoded names, so it is - up to the caller to decode any Punycode-encoded names as unicode. 
+ :param domain: string, needs to match the encoding of the PSL (idna or UTF8) + :param wildcard: boolean, follow wildcard patterns + :param strict: boolean, check the TLD is valid, return None if not + :return: string, the SLD for the domain """ - + if not domain: + return None parts = domain.lower().strip('.').split('.') hits = [None] * len(parts) + if strict and parts[-1] not in self.root[1].keys(): + return None - self._lookup_node(hits, 1, self.root, parts) + self._lookup_node(hits, 1, self.root, parts, wildcard) for i, what in enumerate(hits): if what is not None and what == 0: return '.'.join(parts[i:]) + def get_public_suffix(self, domain, wildcard=True, strict=False): + return self.get_sld(domain, wildcard, strict) + + def get_tld(self, domain, wildcard=True, strict=False): + """ + gets the TLD, or public suffix, of a domain using the public suffix list. uses wildcards if set, and checks for + valid top TLD is strict=True. + + this will return the domain itself when it is an ICANN TLD, e.g., 'com' returns 'com', for follow on processing, + while 'co.uk' return 'uk'. On the other hand, more complicated domains will return their public suffix, e.g., + 'google.co.uk' will return 'co.uk'. Root ('.') will return empty string. + + :param domain: string + :param wildcard: boolean, follow wildcards in Trie + :param strict: boolean, check that top TLD is valid in Trie + :return: string, the TLD for the domain + """ + sld = self.get_sld(domain, wildcard, strict) + if sld is None: + return None + elif sld.count(".") > 0: + return ".".join(sld.split(".")[1:]) + else: + return sld + _PSL = None -def get_public_suffix(domain, psl_file=None): +def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False): """ - Return the public suffix for a `domain` DNS name string. + Return the private suffix or SLD for a `domain` DNS name string. 
The original publicsuffix2 library used the method + get_public_suffix() for this purpose, but get_private_suffix() is more proper. Convenience function that builds and caches a PublicSuffixList object. Optionally read, and parse a public suffix list. `psl_file` is either a file @@ -185,9 +283,48 @@ def get_public_suffix(domain, psl_file=None): The file format is described at http://publicsuffix.org/ """ global _PSL - _PSL = _PSL or PublicSuffixList(psl_file) - return _PSL.get_public_suffix(domain) + _PSL = _PSL or PublicSuffixList(psl_file, idna=idna) + return _PSL.get_sld(domain, wildcard=wildcard, strict=strict) + +def get_tld(domain, psl_file=None, wildcard=True, idna=True, strict=False): + """ + Return the public suffix for a `domain` DNS name string. (this is actually the private suffix that is returned) + Convenience function that builds and caches a PublicSuffixList object. + + Optionally read, and parse a public suffix list. `psl_file` is either a file + location string, or a file-like object, or an iterable of lines from a + public suffix data file. + + If psl_file is None, the vendored file named "public_suffix_list.dat" is + loaded. It is stored side by side with this Python package. + + The file format is described at http://publicsuffix.org/ + """ + global _PSL + _PSL = _PSL or PublicSuffixList(psl_file, idna=idna) + return _PSL.get_tld(domain, wildcard=wildcard, strict=strict) + + +def get_public_suffix(domain, psl_file=None, wildcard=True, idna=True, strict=False): + """ + Included for compatibility with the original publicsuffix2 library -- this function returns the private + suffix or SLD of the domain. To get the public suffix, use get_tld(). + Convenience function that builds and caches a PublicSuffixList object. + + Optionally read, and parse a public suffix list. `psl_file` is either a file + location string, or a file-like object, or an iterable of lines from a + public suffix data file. 
+ + If psl_file is None, the vendored file named "public_suffix_list.dat" is + loaded. It is stored side by side with this Python package. + + The file format is described at http://publicsuffix.org/ + """ + warnings.warn("method returns private suffix, SLD, or registrable domain. " + "equivalent to method get_sld(). to get the public suffix itself, use get_tld().", + UserWarning) + return get_sld(domain, psl_file, wildcard, idna, strict) def fetch(): diff --git a/tests.py b/tests.py index 7bc5972..d8623fc 100644 --- a/tests.py +++ b/tests.py @@ -89,7 +89,7 @@ def test_get_public_suffix_from_list_with_fqdn(self): assert 'example.com' == psl.get_public_suffix('example.com.') def test_get_public_suffix_from_list_with_unicode(self): - psl = publicsuffix.PublicSuffixList([u('\u0440\u0444')]) + psl = publicsuffix.PublicSuffixList([u('\u0440\u0444')], idna=False) assert u('\u0440\u0444') == psl.get_public_suffix(u('\u0440\u0444')) assert u('example.\u0440\u0444') == psl.get_public_suffix(u('example.\u0440\u0444')) @@ -182,8 +182,49 @@ def test_get_public_suffix_from_builtin_full_publicsuffix_org(self): assert 'test.k12.ak.us' == psl.get_public_suffix('test.k12.ak.us') assert 'test.k12.ak.us' == psl.get_public_suffix('www.test.k12.ak.us') - # unicode - assert u('www.\u9999\u6e2f') == psl.get_public_suffix(u('www.\u9999\u6e2f')) + +class TestPublicSuffixIdna(unittest.TestCase): + + def test_idna_encoded(self): + psl = publicsuffix.PublicSuffixList(idna=True) # actually the default + assert 'xn--85x722f.com.cn' == psl.get_public_suffix('xn--85x722f.com.cn') + assert 'xn--85x722f.xn--55qx5d.cn' == psl.get_public_suffix('xn--85x722f.xn--55qx5d.cn') + assert 'xn--85x722f.xn--55qx5d.cn' == psl.get_public_suffix('www.xn--85x722f.xn--55qx5d.cn') + assert 'shishi.xn--55qx5d.cn' == psl.get_public_suffix('shishi.xn--55qx5d.cn') + + def test_utf8_encoded(self): + psl = publicsuffix.PublicSuffixList(idna=False) # uses the list provided utf-8 defaults + assert '食狮.com.cn' == 
psl.get_public_suffix('食狮.com.cn') + assert '食狮.公司.cn' == psl.get_public_suffix('食狮.公司.cn') + assert '食狮.公司.cn' == psl.get_public_suffix('www.食狮.公司.cn') + assert 'shishi.公司.cn' == psl.get_public_suffix('shishi.公司.cn') + + def test_exceptions(self): + psl = publicsuffix.PublicSuffixList() + assert 'www.ck' == psl.get_public_suffix('www.www.ck') # www is the exception + assert 'this.that.ck' == psl.get_public_suffix('this.that.ck') + + def test_no_wildcard(self): + psl = publicsuffix.PublicSuffixList() + # test completion when no wildcards should be processed + assert 'com.pg' == psl.get_public_suffix('telinet.com.pg', wildcard=False) + assert 'ap-southeast-1.elb.amazonaws.com' == psl.get_public_suffix('blah.ap-southeast-1.elb.amazonaws.com', + wildcard=False) + + def test_convenience_functions(self): + psl = publicsuffix.PublicSuffixList() + # these functions should be identical + assert psl.get_sld('www.google.com') == psl.get_public_suffix('www.google.com') + assert psl.get_sld('www.test.ak.us') == psl.get_public_suffix('www.test.ak.us') + + def test_tld_function(self): + psl = publicsuffix.PublicSuffixList() + # checks that the eTLD or TLD is produced + assert psl.get_tld('com') == 'com' + assert psl.get_tld('telinet.com.pg', wildcard=True) == 'com.pg' + assert psl.get_tld('telinet.com.pg', wildcard=False) == 'pg' + assert psl.get_tld('telinet.co.uk', wildcard=False) == 'co.uk' + assert psl.get_tld('blah.local', strict=True) is None if __name__ == '__main__': From 93c5b46b4b4d8f0aacd5220333c0644dcfc91b2c Mon Sep 17 00:00:00 2001 From: Vito Piserchia Date: Fri, 31 May 2019 10:00:54 +0200 Subject: [PATCH 2/8] make test compatible with python2 --- tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests.py b/tests.py index d8623fc..61c80c1 100644 --- a/tests.py +++ b/tests.py @@ -194,10 +194,10 @@ def test_idna_encoded(self): def test_utf8_encoded(self): psl = publicsuffix.PublicSuffixList(idna=False) # uses the list provided utf-8 
defaults - assert '食狮.com.cn' == psl.get_public_suffix('食狮.com.cn') - assert '食狮.公司.cn' == psl.get_public_suffix('食狮.公司.cn') - assert '食狮.公司.cn' == psl.get_public_suffix('www.食狮.公司.cn') - assert 'shishi.公司.cn' == psl.get_public_suffix('shishi.公司.cn') + assert u'食狮.com.cn' == psl.get_public_suffix(u'食狮.com.cn') + assert u'食狮.公司.cn' == psl.get_public_suffix(u'食狮.公司.cn') + assert u'食狮.公司.cn' == psl.get_public_suffix(u'www.食狮.公司.cn') + assert u'shishi.公司.cn' == psl.get_public_suffix(u'shishi.公司.cn') def test_exceptions(self): psl = publicsuffix.PublicSuffixList() From 423738e8d1fece05f1857dac34d32bda9a04486b Mon Sep 17 00:00:00 2001 From: KnitCode Date: Mon, 10 Jun 2019 07:14:38 -0400 Subject: [PATCH 3/8] fixes bug in RST file; updates to single publicsuffix2 documentation for merge into publicsuffix2 library. --- README.rst | 126 ++++++++++++++++++------------------------ publicsuffix2.LICENSE | 4 -- setup.py | 9 +-- 3 files changed, 60 insertions(+), 79 deletions(-) diff --git a/README.rst b/README.rst index 55b8ed4..2e03477 100644 --- a/README.rst +++ b/README.rst @@ -15,20 +15,18 @@ This module builds the public suffix list as a Trie structure, making it more ef than other string-based modules available for the same purpose. It can be used effectively in large-scale distributed environments, such as PySpark. -This Python module includes with a copy of the Public Suffix List so that it is +This Python module includes with a copy of the Public Suffix List (PSL) so that it is usable out of the box. Newer versions try to provide reasonably fresh copies of -this list. It also includes a convenience method to fetch the latest list. +this list. It also includes a convenience method to fetch the latest list. The PSL does +change regularly. -The code is a fork of the publicsuffix2 package and includes the same base API. In +The code is a fork of the publicsuffix package and includes the same base API. 
In addition, it contains a few variants useful for certain use cases, such as the option to -ignore wildcards or return only the extended TLD (eTLD). -Publicsuffix2 is a an extension of publicsuffix, and uses the same base API. -You just need to import publicsuffix2 instead. +ignore wildcards or return only the extended TLD (eTLD). You just need to import publicsuffix2 instead. The public suffix list is now provided in UTF-8 format. To correctly process -IDNA-encoded domains, either the query or the list must be converted. This module -contains the option to IDNA-encode the public suffix list upon creating the Trie; this -is set to happen by default. If your use case includes UTF-8 domains, e.g., '食狮.com.cn', +IDNA-encoded domains, either the query or the list must be converted. By default, the +module converts the PSL. If your use case includes UTF-8 domains, e.g., '食狮.com.cn', you'll need to set the IDNA-encoding flag to False on instantiation (see examples below). Failure to use the correct encoding for your use case can lead to incorrect results for domains that utilize unicode characters. @@ -43,29 +41,34 @@ The code is MIT-licensed and the publicsuffix data list is MPL-2.0-licensed. :target: https://travis-ci.org/nexB/python-publicsuffix2 :alt: develop branch tests status - Usage ----- -To install from source, first build the package file: - python setup.py build sdist -and then pip install from the dist directory. - Install with:: pip install publicsuffix2 -The module provides a function to query a domain name:: +The module provides functions to obtain the base domain, or sld, of an fqdn, as well as one +to get just the public suffix. In addition, the functions a number of boolean parameters that +control how wildcards are handled. In addition to the functions, the module exposes a class that +parses the PSL, and allows for more control. 
+ +The module provides two equivalent functions to query a domain name, and return the base domain, +or second-level-doamin; get_public_suffix() and get_sld():: >>> from publicsuffix2 import get_public_suffix >>> get_public_suffix('www.example.com') 'example.com' + >>> get_sld('www.example.com') + 'example.com' >>> get_public_suffix('www.example.co.uk') 'example.co.uk' >>> get_public_suffix('www.super.example.co.uk') 'example.co.uk' -This function loads and caches the public suffix list. +This function loads and caches the public suffix list. To obtain the latest version of the +PSL, use the fetch() function to first download the latest version. Alternatively, you can pass +a custom list. For more control, there is also a class that parses a Public Suffix List and allows the same queries on individual domain names:: @@ -85,7 +88,6 @@ combination with a port number or a username, etc.). It is up to the caller to ensure only domain names are passed to the get_public_suffix() method. - The get_public_suffix() function and the PublicSuffixList class initializer accept an optional argument pointing to a public suffix file. This can either be a file path, an iterable of public suffix lines, or a file-like object pointing to an @@ -102,45 +104,10 @@ suffix list data. This will use the cached latest loaded above:: >>> get_public_suffix('www.example.co.uk') 'example.co.uk' - -To update the bundled suffix list use the provided setup.py command:: - - python setup.py update_psl - -The update list will be saved in `src/publicsuffix2/public_suffix_list.dat` -and you can build a new wheel with this bundled data. 
- -Alternatively, there is a fetch() function that will fetch the latest version -of a Public Suffix data file from https://publicsuffix.org/list/public_suffix_list.dat -You can use it this way:: - - >>> from publicsuffix import get_public_suffix - >>> from publicsuffix import fetch - >>> psl_file = fetch() - >>> get_public_suffix('www.example.com', psl_file) - 'example.com' - -Note that the once loaded, the data file is cached and therefore fetched only -once. - -If using this library in large-scale pyspark processing, you should instantiate the class as -a global variable, not within a user function. The class methods can then be used within user -functions for distributed processing. - -Changes in this Fork --------------------- - -This fork of publicsuffix2 addresses a change in the format to the standard public suffix list, -which was previously IDNA-encoded and now is in UTF-8 format, as well as some additional -functionality useful to certain use cases. These additions include the ability to ignore -wildcards and to require strict adherence to the TLDs included in the list. Lastly, we include -some convenience functions for obtaining only the extended TLD (eTLD) rather than the -registrable domain (SLD). These are outlined below. - -IDNA-encoding. The public suffix list is now provided in UTF-8 format. For those use cases that -include IDNA-encoded domains, the module will not return accurate results unless the list is -converted. In this fork, IDNA encoding is included as a parameter in the class and is on by -default.:: +IDNA-encoding. The public suffix list is now in UTF-8 format. For those use cases that +include IDNA-encoded domains, the list must be converted. Publicsuffix2 includes idna +encoding as a parameter of the PublicSuffixList initialization and is true by +default. 
For UTF-8 use cases, set the idna parameter to False:: >>> from publicsuffix2 import PublicSuffixList >>> psl = PublicSuffixList(idna=True) # on by default @@ -155,7 +122,7 @@ the user might want to ignore wildcards to create more aggregation. This is poss the parameter wildcard=False. Require valid eTLDs (strict). In the publicsuffix2 module, a domain with an invalid TLD will still return -a public suffix, e.g,:: +return a base domain, e.g,:: >>> psl.get_public_suffix('www.mine.local') 'mine.local' @@ -168,7 +135,8 @@ an invalid TLD will return None.:: >>> psl.get_public_suffix('www.mine.local', strict=True) is None True -Return eTLD only. The standard use case for publicsuffix2 is to return the registrable domain +Return eTLD only. The standard use case for publicsuffix2 is to return the registrable, +or base, domain according to the public suffix list. In some cases, however, we only wish to find the eTLD itself. In this fork, this is available via the get_tld() method.:: @@ -180,11 +148,34 @@ All of the methods and functions include the wildcard and strict parameters. For convenience, the public method get_sld() is available. This is identical to the method get_public_suffix() and is intended to clarify the output for some users. +To update the bundled suffix list use the provided setup.py command:: + + python setup.py update_psl + +The update list will be saved in `src/publicsuffix2/public_suffix_list.dat` +and you can build a new wheel with this bundled data. + +Alternatively, there is a fetch() function that will fetch the latest version +of a Public Suffix data file from https://publicsuffix.org/list/public_suffix_list.dat +You can use it this way:: + + >>> from publicsuffix import get_public_suffix + >>> from publicsuffix import fetch + >>> psl_file = fetch() + >>> get_public_suffix('www.example.com', psl_file) + 'example.com' + +Note that the once loaded, the data file is cached and therefore fetched only +once. 
+ +If using this library in large-scale pyspark processing, you should instantiate the class as +a global variable, not within a user function. The class methods can then be used within user +functions for distributed processing. Source ------ -Get a local copy of the development repository. The development takes +Get a local copy of the development repository. The development takes place in the ``develop`` branch. Stable releases are tagged in the ``master`` branch:: @@ -193,9 +184,7 @@ branch:: History ------- -This code is forked from NexB's fork of Tomaž Šolc's fork of David Wilson's code. - -The original publicsuffix2 code is Copyright (c) 2015 nexB Inc. +This code is forked from Tomaž Šolc's fork of David Wilson's code. David Wilson's code originally at: @@ -203,9 +192,6 @@ https://www.tablix.org/~avian/git/publicsuffix.git Copyright (c) 2014 Tomaž Šolc -The API is essentially the same as publicsuffix including using the same package -name to allow a straight forward replacement. - David Wilson's code was originally at: from http://code.google.com/p/python-public-suffix-list/ @@ -216,27 +202,25 @@ Copyright (c) 2009 David Wilson License ------- -The code is MIT-licensed. +The code is MIT-licensed. The vendored public suffix list data from Mozilla is under the MPL-2.0. -Copyright (c) 2019 Renée Burton - Copyright (c) 2015 nexB Inc. Copyright (c) 2014 Tomaž Šolc Copyright (c) 2009 David Wilson - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/publicsuffix2.LICENSE b/publicsuffix2.LICENSE index d4dc33c..101aff9 100644 --- a/publicsuffix2.LICENSE +++ b/publicsuffix2.LICENSE @@ -1,7 +1,3 @@ -Copyright (c) 2019 Renée Burton -This code is based on nexB Inc. code found at: -https://www.github.com/nexB/python-publicsuffix2 - Copyright (c) 2015 nexB Inc. This code is based on Tomaž Šolc fork of David Wilson code originally at https://www.tablix.org/~avian/git/publicsuffix.git diff --git a/setup.py b/setup.py index dea7830..bd7bb9c 100644 --- a/setup.py +++ b/setup.py @@ -81,14 +81,15 @@ def run(self): setup( name='publicsuffix2', - version='2.20190328', + version='2.20190610', license='MIT and MPL-2.0', description='Get a public suffix for a domain name using the Public Suffix ' 'List. Forked from and using the same API as the publicsuffix package.', + long_description_content_type = 'text/x-rst', long_description='%s\n%s' % (read('README.rst'), read('CHANGELOG.rst')), - author='Renée Burton, nexB Inc., Tomaz Solc and David Wilson', + author='nexB Inc., Tomaz Solc and David Wilson', author_email='', - url='https://github.com/KnitCode/python-publicsuffix2', + url='https://github.com/nextb/python-publicsuffix2', packages=find_packages('src'), package_dir={'': 'src'}, py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], @@ -109,7 +110,7 @@ def run(self): 'Development Status :: 5 - Production/Stable', ], keywords=[ - 'domain', 'public suffix', 'suffix', 'dns', 'tld', + 'domain', 'public suffix', 'suffix', 'dns', 'tld', 'sld', 'psl', ], cmdclass={'update_psl': UpdatePslCommand}, ) From ee43c539fa82309346e4f965468117274c44ea5d Mon Sep 17 00:00:00 2001 From: KnitCode Date: Mon, 10 Jun 2019 07:38:26 -0400 Subject: [PATCH 4/8] doc fix --- README.rst | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2e03477..9f76f32 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@ Public Suffix List module for Python ==================================== This module allows you to get the public suffix, as well as the registrable domain, - of a domain name using the Public Suffix List from http://publicsuffix.org +of a domain name using the Public Suffix List from http://publicsuffix.org A public suffix is a domain suffix under which you can register domain names. It is sometimes referred to as the extended TLD (eTLD). From ddacf163c94dce012b1fc06258fbf9c64053e61e Mon Sep 17 00:00:00 2001 From: KnitCode Date: Mon, 10 Jun 2019 07:42:55 -0400 Subject: [PATCH 5/8] doc fix --- README.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9f76f32..06ed9f3 100644 --- a/README.rst +++ b/README.rst @@ -5,11 +5,15 @@ This module allows you to get the public suffix, as well as the registrable doma of a domain name using the Public Suffix List from http://publicsuffix.org A public suffix is a domain suffix under which you can register domain -names. It is sometimes referred to as the extended TLD (eTLD). -Some examples of public suffixes are ".com", ".co.uk" and "pvt.k12.wy.us". +names, or under which the suffix owner does not control the subdomains. +Some examples of public suffixes in the former case are ".com", +".co.uk" and "pvt.k12.wy.us"; examples of the latter case are "github.io" and +"blogspot.com". The public suffix is sometimes referred to as the effective +or extended TLD (eTLD). Accurately knowing the public suffix of a domain is useful when handling web browser cookies, highlighting the most important part of a domain name -in a user interface or sorting URLs by web site. +in a user interface or sorting URLs by web site. It is also used in a wide range +of research and applications that leverage Domain Name System (DNS) data.
This module builds the public suffix list as a Trie structure, making it more efficient than other string-based modules available for the same purpose. It can be used From 3497e84bb0a622d06525f31d3f36a6b63e93f04e Mon Sep 17 00:00:00 2001 From: KnitCode Date: Mon, 10 Jun 2019 07:51:18 -0400 Subject: [PATCH 6/8] more doc imporovements --- README.rst | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 06ed9f3..13e5903 100644 --- a/README.rst +++ b/README.rst @@ -85,6 +85,8 @@ Suffix List and allows the same queries on individual domain names:: 'example.co.uk' >>> psl.get_public_suffix('www.super.example.co.uk') 'example.co.uk' + >>> psl.get_sld('www.super.example.co.uk') + 'example.co.uk' Note that the ``host`` part of an URL can contain strings that are not plain DNS domain names (IP addresses, Punycode-encoded names, name in @@ -108,7 +110,7 @@ suffix list data. This will use the cached latest loaded above:: >>> get_public_suffix('www.example.co.uk') 'example.co.uk' -IDNA-encoding. The public suffix list is now in UTF-8 format. For those use cases that +**IDNA-encoding.** The public suffix list is now in UTF-8 format. For those use cases that include IDNA-encoded domains, the list must be converted. Publicsuffix2 includes idna encoding as a parameter of the PublicSuffixList initialization and is true by default. For UTF-8 use cases, set the idna parameter to False:: @@ -121,17 +123,21 @@ default. For UTF-8 use cases, set the idna parameter to False:: >>> psl.get_public_suffix('食狮.com.cn') '食狮.com.cn' -Ignore wildcards. In some use cases, particularly those related to large-scale domain processing, +**Ignore wildcards.** In some use cases, particularly those related to large-scale domain processing, the user might want to ignore wildcards to create more aggregation. This is possible by setting -the parameter wildcard=False. 
+the parameter wildcard=False.:: + + >>> psl.get_public_suffix('telinet.com.pg', wildcard=False) + 'com.pg' + >>> psl.get_public_suffix('telinet.com.pg', wildcard=True) + 'telinet.com.pg' -Require valid eTLDs (strict). In the publicsuffix2 module, a domain with an invalid TLD will still return +**Require valid eTLDs (strict).** In the publicsuffix2 module, a domain with an invalid TLD will still return a base domain, e.g,:: >>> psl.get_public_suffix('www.mine.local') 'mine.local' - This is useful for many use cases, while in others, we want to ensure that the domain includes a valid eTLD. In this case, the boolean parameter strict provides a solution. If this flag is set, an invalid TLD will return None.:: @@ -139,20 +145,22 @@ an invalid TLD will return None.:: >>> psl.get_public_suffix('www.mine.local', strict=True) is None True -Return eTLD only. The standard use case for publicsuffix2 is to return the registrable, +**Return eTLD only.** The standard use case for publicsuffix2 is to return the registrable, or base, domain according to the public suffix list. In some cases, however, we only wish to find the eTLD -itself. In this fork, this is available via the get_tld() method.:: +itself. This is available via the get_tld() method.:: >>> psl.get_tld('www.google.com') 'com' + >>> psl.get_tld('www.google.co.uk') + 'co.uk' All of the methods and functions include the wildcard and strict parameters. For convenience, the public method get_sld() is available. This is identical to the method get_public_suffix() and is intended to clarify the output for some users.
-To update the bundled suffix list use the provided setup.py command:: +To **update the bundled suffix list** use the provided setup.py command:: python setup.py update_psl From 2d5dafe45307b6a3e96447f45ac3a4c9f19352a5 Mon Sep 17 00:00:00 2001 From: KnitCode Date: Thu, 8 Aug 2019 18:13:51 -0500 Subject: [PATCH 7/8] adds instance attribute 'tlds' per issue ticket request for a way to access the 'list' of TLDs. fixes some documentation typos. version bump to 20190808. --- CHANGELOG.rst | 4 ++++ README.rst | 18 +++++++++++++++--- setup.py | 2 +- src/publicsuffix2/__init__.py | 5 +++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c76a17f..8058e4b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,10 @@ Changelog --------- +2019-08-08 publicsuffix2 2.20190808 + + * adds additional functionality and handles change to PSL format + * adds attribute to retrieve the PSL as a list 2019-02-05 publicsuffix2 2.201902051213 diff --git a/README.rst b/README.rst index 13e5903..c4aa385 100644 --- a/README.rst +++ b/README.rst @@ -99,7 +99,7 @@ an optional argument pointing to a public suffix file. 
This can either be a file path, an iterable of public suffix lines, or a file-like object pointing to an opened list:: - >>> from publicsuffix import get_public_suffix + >>> from publicsuffix2 import get_public_suffix >>> psl_file = 'path to some psl data file' >>> get_public_suffix('www.example.com', psl_file) 'example.com' @@ -171,8 +171,8 @@ Alternatively, there is a fetch() function that will fetch the latest version of a Public Suffix data file from https://publicsuffix.org/list/public_suffix_list.dat You can use it this way:: - >>> from publicsuffix import get_public_suffix - >>> from publicsuffix import fetch + >>> from publicsuffix2 import get_public_suffix + >>> from publicsuffix2 import fetch >>> psl_file = fetch() >>> get_public_suffix('www.example.com', psl_file) 'example.com' @@ -180,6 +180,18 @@ You can use it this way:: Note that the once loaded, the data file is cached and therefore fetched only once. +The extracted public suffix list, that is, the TLDs and their modifiers, is stored in +an instance attribute, tlds, which can be accessed directly.:: + + >>> psl = PublicSuffixList() + >>> psl.tlds[:5] + ['ac', + 'com.ac', + 'edu.ac', + 'gov.ac', + 'net.ac'] + +**Using the module in large-scale processing** If using this library in large-scale pyspark processing, you should instantiate the class as a global variable, not within a user function. The class methods can then be used within user functions for distributed processing. diff --git a/setup.py b/setup.py index bd7bb9c..9cdb887 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ def run(self): setup( name='publicsuffix2', - version='2.20190610', + version='2.20190808', license='MIT and MPL-2.0', description='Get a public suffix for a domain name using the Public Suffix ' 'List.
Forked from and using the same API as the publicsuffix package.', diff --git a/src/publicsuffix2/__init__.py b/src/publicsuffix2/__init__.py index f0d97b2..0eb6573 100644 --- a/src/publicsuffix2/__init__.py +++ b/src/publicsuffix2/__init__.py @@ -161,11 +161,15 @@ def _build_structure(self, fp, idna): (0, {'ac': 0, 'co': (0, {'blogspot': 0}), 'gv': 0,....}) where each tuple starts with the negation encoding, and each leaf in the Trie as a dictionary element returns 0. + Also creates an instance attribute, tlds, which simply contains the publicsuffix list, with the modifiers such + as wildcards, as a list. This can be accessed for post-processing by the application. + :param fp: pointer for the public suffix list :param idna: boolean, convert lines to idna-encoded strings :return: Trie """ root = [0] + tlds = [] for line in fp: line = line.strip() @@ -176,6 +180,7 @@ def _build_structure(self, fp, idna): self._add_rule(root, line.split()[0].lstrip('.')) + self.tlds = tlds # a list of eTLDs with their modifiers, e.g., * return root def _lookup_node(self, matches, depth, parent, parts, wildcard): From be418d2474593e8aa9859aa4aaefefd0b13ff74d Mon Sep 17 00:00:00 2001 From: KnitCode Date: Fri, 9 Aug 2019 14:31:37 -0500 Subject: [PATCH 8/8] corrects the url and email address --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9cdb887..6ee0962 100644 --- a/setup.py +++ b/setup.py @@ -88,8 +88,8 @@ def run(self): long_description_content_type = 'text/x-rst', long_description='%s\n%s' % (read('README.rst'), read('CHANGELOG.rst')), author='nexB Inc., Tomaz Solc and David Wilson', - author_email='', - url='https://github.com/nextb/python-publicsuffix2', + author_email='info@nexb.com', + url='https://github.com/nexb/python-publicsuffix2', packages=find_packages('src'), package_dir={'': 'src'}, py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],