# Analyse Library of Congress Subject Headings

Get the number of concepts and the depth of the ontology


You can download the latest version of LC Subject Headings (LCSH) (SKOS/RDF only) from https://id.loc.gov/download/

WARNING: it is important to download the **ndjson** version

In [3]:
import json
from collections import deque

In [1]:
# Read the ndjson dump: every non-empty line is expected to hold one JSON document.
# The encoding is passed explicitly because the default used by open() depends on
# the platform locale.
with open('lcsh.skos.ndjson', 'r', encoding='utf-8') as f:
    # Strip the trailing newline / surrounding whitespace from each line and drop
    # lines that are blank, since json.loads() in the following cells rejects them.
    content = [line.strip() for line in f if line.strip()]

In [2]:
# Peek at the first raw (still unparsed) line of the dump.
print(content[0])

{"@context": {"cs": "http://purl.org/vocab/changeset/schema#", "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdfs": "http://www.w3.org/2000/01/rdf-schema#", "rdfs1": "http://www.w3.org/1999/02/22-rdf-schema#", "skos": "http://www.w3.org/2004/02/skos/core#", "skosxl": "http://www.w3.org/2008/05/skos-xl#", "xsd": "http://www.w3.org/2001/XMLSchema#", "about": "http://id.loc.gov/authorities/subjects/sh2009116899"}, "@graph": [{"@id": "http://id.loc.gov/authorities/subjects/sh2009116899", "@type": "skos:Concept", "skos:changeNote": [{"@id": "_:N4bc468bfb040444196abfe01db09e2c2"}, {"@id": "_:N84d2f5a19189411c8ec0500588920d6e"}], "skos:editorial": "[Record generated for validation purposes.]", "skos:inScheme": {"@id": "http://id.loc.gov/authorities/subjects"}, "skos:prefLabel": {"@language": "en", "@value": "Bee culture--Juvenile literature"}}, {"@id": "_:N4bc468bfb040444196abfe01db09e2c2", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": {"@type": "xsd:dateTi

In [4]:
# Parse the first line as JSON and show its "@context" entry; the loops below
# read the record's subject URI from the "about" key of that entry.
this = json.loads(content[0])
print(this["@context"])

{'cs': 'http://purl.org/vocab/changeset/schema#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'rdfs1': 'http://www.w3.org/1999/02/22-rdf-schema#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'skosxl': 'http://www.w3.org/2008/05/skos-xl#', 'xsd': 'http://www.w3.org/2001/XMLSchema#', 'about': 'http://id.loc.gov/authorities/subjects/sh2009116899'}


In [5]:
# Collect the id of every record in the dump, reporting ids that occur more than once.
# The stored value 1 is the starting depth; the depth cell further down overwrites it.
concepts = dict()
for row_s in content:
    row = json.loads(row_s)
    subject = row['@context']['about']
    # The concept id is the last '/'-separated segment of the subject URI.
    r_subject = subject.rsplit('/', 1)[-1]

    if r_subject not in concepts:
        concepts[r_subject] = 1
    else:
        print('Found a copy of {}'.format(r_subject))

In [6]:
# Report how many distinct concept ids were collected.
print("Number of unique concepts: {}".format(len(concepts)))

Number of unique concepts: 434265


In [7]:
# Maps a concept id to the list of ids of its narrower concepts (filled in below).
hier = dict()

def get_sbuject(subject):
    """Return the last '/'-separated segment of ``subject``.

    For a URI such as ``http://id.loc.gov/authorities/subjects/sh2009116899``
    this is the bare identifier ``sh2009116899``. A string without any '/' is
    returned unchanged, and a string ending in '/' yields an empty string.

    The misspelled name is kept on purpose: other cells call it by this name.
    """
    # Split once from the right instead of splitting the whole string twice.
    return subject.rsplit('/', 1)[-1]

# Fill `hier`: for every record, store the ids found under "skos:narrower"
# against the record's own concept id.
for row_s in content:
    row = json.loads(row_s)

    subject = row['@context']['about']
    # Use the shared helper rather than repeating the URI-splitting expression.
    r_subject = get_sbuject(subject)

    for node in row['@graph']:
        if "skos:narrower" not in node:
            continue

        narrowers = node["skos:narrower"]
        # The value may be a single object instead of a list; normalise to a list.
        if isinstance(narrowers, dict):
            narrowers = [narrowers]

        # The entry is created even if none of the narrower items can be read.
        children = hier.setdefault(r_subject, list())

        for narrower in narrowers:
            try:
                narrid = narrower['@id']
                children.append(get_sbuject(narrid))
            except TypeError:
                # The item could not be handled as an {'@id': ...} object:
                # show the whole value and carry on with the next item.
                print(narrowers)

        # Only the first node carrying "skos:narrower" is used for each record.
        break

In [16]:
# print(len(hier))
# key = list(hier.keys())[0]
# print('{}: {}'.format(key, hier[key]))

In [9]:
# Inverse of the "skos:narrower" relation: maps a concept id to the list of ids of
# the records that name it as a narrower concept (i.e. its broader concepts).
unhier = dict()

for row_s in content:
    row = json.loads(row_s)

    subject = row['@context']['about']
    # Use the shared helper rather than repeating the URI-splitting expression.
    r_subject = get_sbuject(subject)

    for node in row['@graph']:
        if "skos:narrower" not in node:
            continue

        narrowers = node["skos:narrower"]
        # The value may be a single object instead of a list; normalise to a list.
        if isinstance(narrowers, dict):
            narrowers = [narrowers]

        for narrower in narrowers:
            try:
                narrid = narrower['@id']
                r_narrid = get_sbuject(narrid)
                # Create the list on first sight of this id, then record the broader concept.
                unhier.setdefault(r_narrid, list()).append(r_subject)
            except TypeError:
                # The item could not be handled as an {'@id': ...} object:
                # show the whole value and carry on with the next item.
                print(narrowers)

        # Only the first node carrying "skos:narrower" is used for each record.
        break

In [15]:
# print(len(unhier))
# key = list(unhier.keys())[0]
# print('{}: {}'.format(key, unhier[key]))

# Getting info about the depth

In [11]:
# Depth of a concept = number of concepts on the longest chain that starts at the
# concept and climbs through its broader concepts (`unhier`). A concept without a
# broader concept has depth 1.
#
# NOTE(review): visited concepts are not deduplicated, so a concept reachable through
# several chains is expanded once per chain. That is what makes the result the
# *longest* chain, but it also means a cycle in `unhier` would never terminate.
for concept in list(concepts):
    # Always start counting from 1 instead of from the value currently stored in
    # `concepts`: the stored value is overwritten below, so seeding from it would
    # inflate every depth when this cell is executed a second time.
    max_depth = 1
    queue = deque()
    queue.append((concept, 1))
    while queue:
        term, depth = queue.popleft()
        if term in unhier:
            new_depth = depth + 1
            if new_depth > max_depth:
                max_depth = new_depth
            for broader in unhier[term]:
                queue.append((broader, new_depth))

    concepts[concept] = max_depth

In [13]:
# NOTE(review): consider moving this import next to the other imports at the top of the notebook.
import pandas as pd
# One row per concept id (used as the index) with its computed depth in the 'depth' column.
list_of_depths = pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])

In [14]:
# Deepest concepts first. The sorted frame is rebound to the name rather than
# sorted with inplace=True, which keeps the statement chainable.
list_of_depths = list_of_depths.sort_values('depth', ascending=False)
# Show the five deepest concepts.
list_of_depths.head()

Unnamed: 0,depth
sh85099276,27
sh85102150,26
sh85046895,26
sh85020019,26
sh2006003411,26


# Testing the longest path

In [17]:
#getpath
# Walk every chain of broader concepts starting at one concept, printing each
# dequeued entry ("t": concept id, "d": depth, "p": the id it was reached from),
# and finally print the largest depth seen.

concept = "sh85099276" # this is the concept with highest depth
value = 1


max_depth = value
queue = deque([{"t": concept, "d": value, "p": 'null'}])
while queue:
    dequeued = queue.popleft()
    print(json.dumps(dequeued))

    current = dequeued["t"]
    if current not in unhier:
        # No broader concept recorded: this chain ends here.
        continue

    new_depth = dequeued["d"] + 1
    max_depth = max(max_depth, new_depth)
    # Enqueue every broader concept, remembering which id led to it.
    queue.extend(
        {"t": broader, "d": new_depth, "p": current}
        for broader in unhier[current]
    )

print(max_depth)

{"t": "sh85099276", "d": 1, "p": "null"}
{"t": "sh85059402", "d": 2, "p": "sh85099276"}
{"t": "sh85099278", "d": 2, "p": "sh85099276"}
{"t": "sh85099275", "d": 2, "p": "sh85099276"}
{"t": "sh85050435", "d": 3, "p": "sh85059402"}
{"t": "sh85086562", "d": 3, "p": "sh85099278"}
{"t": "sh85046633", "d": 3, "p": "sh85099278"}
{"t": "sh85099278", "d": 3, "p": "sh85099275"}
{"t": "sh85047658", "d": 3, "p": "sh85099275"}
{"t": "sh85102839", "d": 4, "p": "sh85050435"}
{"t": "sh85048088", "d": 4, "p": "sh85050435"}
{"t": "sh85047658", "d": 4, "p": "sh85050435"}
{"t": "sh85022900", "d": 4, "p": "sh85086562"}
{"t": "sh85080664", "d": 4, "p": "sh85046633"}
{"t": "sh85105988", "d": 4, "p": "sh85046633"}
{"t": "sh85060668", "d": 4, "p": "sh85046633"}
{"t": "sh85086562", "d": 4, "p": "sh85099278"}
{"t": "sh85046633", "d": 4, "p": "sh85099278"}
{"t": "sh85047259", "d": 4, "p": "sh85047658"}
{"t": "sh2003007697", "d": 5, "p": "sh85102839"}
{"t": "sh85034249", "d": 5, "p": "sh85048088"}
{"t": "sh85047259