In [28]:
import json
import networkx as nx
import pandas as pd

from collections import namedtuple

#### Licence categories

In [None]:
# we have several categories of license.
# In the paper "Open Source License Inconsistencies on GitHub", they distinguish between:
# 1. permissive
# 2. weak copyleft
# 3. (strong) copyleft
# 4. other

In [3]:
# scancodedb
scancodedb_path = '../data/scancode_index.json'
db = json.load(open(scancodedb_path, 'r'))

In [5]:
# example
db[0]

{'license_key': '389-exception',
 'category': 'Copyleft Limited',
 'spdx_license_key': '389-exception',
 'other_spdx_license_keys': [],
 'is_exception': True,
 'is_deprecated': False,
 'json': '389-exception.json',
 'yaml': '389-exception.yml',
 'html': '389-exception.html',
 'license': '389-exception.LICENSE'}

In [8]:
# store all license categories
license_categories = set([data['category'] for data in db])

In [9]:
license_categories

{'CLA',
 'Commercial',
 'Copyleft',
 'Copyleft Limited',
 'Free Restricted',
 'Patent License',
 'Permissive',
 'Proprietary Free',
 'Public Domain',
 'Source-available',
 'Unstated License'}

In [None]:
# https://www.gnu.org/philosophy/categories.html
# FREE
# 'Public Domain' > 'Permissive' > 'Copyleft Limited' or 'Weak Copyleft' > 'Copyleft' or 'Strong Copyleft' 
#
# - 'CLA' (Contributor License Agreement) used with Copyleft licenses (https://en.wikipedia.org/wiki/Contributor_License_Agreement)
# - 'Proprietary Free'?
#
# NON-FREE
# 'Commercial'
# 'Source-available' ( The source code or installers are publicly available, but they do not guarantee the same freedom of open-source)
# 'Patent License'
# 'Free Restricted'?
#
# UNSTATED
# 'Unstated License'

In [10]:
FREE_LICENSES = ['Public Domain', 'Permissive', 'Copyleft Limited', 'Copyleft', 'Proprietary Free']
NON_FREE_LICENSES = ['Commercial', 'Source-available', 'Patent License', 'Free Restricted']

#### Inconsistency detector

In [20]:
# Find whether there is an inconsistency between two licenses
# - The term inconsistency refers to the use of two different licenses within the same project.
def find_inconsistency(licence_type_1, licence_type_2):
    hybrid_inconsistence = False
    unstated_license = False

    License_inconsistency = namedtuple('License_inconsistency', ['inconsistency', 'hybrid_inconsistency', 'unstated_license'])

    # no inconsistency if both licences are equal
    if licence_type_1 == licence_type_2:
        return License_inconsistency(False, hybrid_inconsistence, unstated_license)
    
    # flag unstated license
    if licence_type_1 == 'Unstated License' or licence_type_2 == 'Unstated License':
        unstated_license = True
    
    # no inconsistency if one of the licences is public domain
    if licence_type_1 == 'Public Domain' or licence_type_2 == 'Public Domain':
        return License_inconsistency(False, hybrid_inconsistence, unstated_license)
    else:
        # there is an inconsistency if both licenses are free or non-free, but are not the same
        if licence_type_1 in FREE_LICENSES and licence_type_2 in FREE_LICENSES:
            return License_inconsistency(True, hybrid_inconsistence, unstated_license)
        elif licence_type_1 in NON_FREE_LICENSES and licence_type_2 in NON_FREE_LICENSES:
            return License_inconsistency(True, hybrid_inconsistence, unstated_license)
        else:
            # there is an inconsistency if one of the licenses is free and the other is non-free
            hybrid_inconsistence = True
            return License_inconsistency(True, hybrid_inconsistence, unstated_license)

In [25]:
print(find_inconsistency('Permissive', 'Permissive'))
print(find_inconsistency('Permissive', 'Unstated License'))
print(find_inconsistency('Permissive', 'Copyleft Limited'))
print(find_inconsistency('Copyleft Limited', 'Public Domain'))
print(find_inconsistency('Copyleft Limited', 'Commercial'))

License_inconsistency(inconsistency=False, hybrid_inconsistency=False, unstated_license=False)
License_inconsistency(inconsistency=True, hybrid_inconsistency=True, unstated_license=True)
License_inconsistency(inconsistency=True, hybrid_inconsistency=False, unstated_license=False)
License_inconsistency(inconsistency=False, hybrid_inconsistency=False, unstated_license=False)
License_inconsistency(inconsistency=True, hybrid_inconsistency=True, unstated_license=False)


#### Conflict detector

In [None]:
# we need to build a direct graph to represent the dependencies between licenses
# if there is a direct path between two nodes, then the two licenses are compatible
# we focus on the compatibility of the most common FOSS (free and open-source sw) licenses
# - spdx_license_key
# - compatibilit graph: https://en.wikipedia.org/wiki/License_compatibility

In [42]:
compatibility_data = pd.read_csv('../data/compatibility_graph.graph')
compatibility_graph = nx.from_pandas_edgelist(compatibility_data, source='license_1', target='license_2', create_using=nx.DiGraph())

In [45]:
def find_conflict(license_1, license_2, compatibility_graph):
    if not license_1 in compatibility_graph.nodes() or not license_2 in compatibility_graph.nodes():
        print('One of the licenses is not among the most common FOSS licenses or is misspelled')
        return False
    
    # if there is a path between license_1 and license_2, then they are compatible
    if nx.has_path(compatibility_graph, license_1, license_2):
        return True
    else:
        print('The two licenses are not compatible')
        return False

In [46]:
print(find_conflict('MIT', 'GPL-3.0-only', compatibility_graph))
print(find_conflict('MIT', 'GPL-3.0', compatibility_graph))
print(find_conflict('GPL-3.0-or-later', 'MIT', compatibility_graph))

True
One of the licenses is not among the most common FOSS licenses or is misspelled
False
The two licenses are not compatible
False
