# Installing needed Libraries

##### This will install rdflib to create the RDF version of the IEEE Taxonomy

In [None]:
!pip install rdflib==6.3.1

##### This will install PyPDF2 to extract text from IEEE Taxonomy PDF file

###### Note: After extracting the text from the IEEE Taxonomy PDF file, it was manually analysed to eliminate any possible errors and to obtain a clean text file of the IEEE Taxonomy

In [None]:
!pip install PyPDF2==3.0.1

# Importing

In [20]:
import json
import os
import re
import urllib.parse
from collections import defaultdict

import urllib3
from PyPDF2 import PdfReader
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS, namespace
from rdflib.namespace import FOAF, XSD, SKOS, OWL, RDFS

#### Processing IEEE Taxonomy PDF File and extracting text from it

In [24]:
# Placeholder value: replace with the location of the IEEE Taxonomy PDF before running.
input_file = 'Path to IEEE Taxonomy PDF File'

In [5]:
# Open the PDF with PyPDF2; `reader.pages` is iterated afterwards to pull out the text.
reader = PdfReader(input_file)

In [6]:
# Concatenate the extracted text of every page, terminating each page's
# text with a newline.
text = "".join(page.extract_text() + "\n" for page in reader.pages)

In [33]:
# Persist the raw extraction so it can be cleaned up by hand afterwards.
# The encoding is explicit so that non-ASCII characters from the PDF are
# written identically on every platform instead of using the locale default.
with open('Path to IEEE Taxonomy Text File', 'w', encoding='utf-8') as f:
    f.write(text)

# Functions

In [51]:
def lower_case(topic:str)->str:
    """Return ``topic`` with all cased characters converted to lower case."""
    lowered = topic.lower()
    return lowered

def replace_char(topic:str)->str:
    """Return ``topic`` with every space character turned into an underscore."""
    pieces = topic.split(" ")
    return "_".join(pieces)

def eliminate_doublespace(topic:str)->str:
    """Collapse every run of two or more spaces in ``topic`` into one space.

    A single ``str.replace("  ", " ")`` pass only halves each run, so three
    or more consecutive spaces would still leave a double space behind; the
    regular expression removes the whole run in one step.
    """
    return re.sub(r" {2,}", " ", topic)

def clean_topic(topic:str)->str:
    """Return ``topic`` without its leading and trailing ``.`` characters."""
    without_leading = topic.lstrip(".")
    return without_leading.rstrip(".")

def clean_label(topic:str)->str:
    """Return ``topic`` with every underscore turned back into a space."""
    words = topic.split("_")
    return " ".join(words)

def escape(topic:str)->str:
    """Percent-encode ``topic`` (as UTF-8 bytes) with no character kept safe
    beyond the ones ``urllib.parse`` never quotes."""
    encoded_bytes = topic.encode('utf8')
    return urllib.parse.quote_from_bytes(encoded_bytes, safe='')

# Extracting Taxonomy from Text File

In [52]:
# Read the cleaned taxonomy and split it into one entry per line.
# The context manager guarantees the file handle is closed once the
# content has been read (the bare open() call never closed it).
with open("Path to IEEE Taxonomy Cleaned Text File", "r") as txt_file:
    lines = txt_file.read().split("\n")

In [53]:
# Sanity check: show the first ten entries to confirm the leading-dot layout.
print(lines[:10])

['Aerospace and electronic systems', '....Aerospace control', '........Air traffic control', '........Attitude control', '........Ground support', '....Aerospace engineering', '........Aerospace biophysics', '........Aerospace electronics', '........Aerospace safety', '............Air safety']


In [54]:
# Topic name -> its relations. A topic seen for the first time starts with
# two independent empty sets, one per relation kind.
ieee_taxonomy = defaultdict(
    lambda: {relation: set() for relation in ("broader", "narrower")}
)

In [55]:
def _leading_dots(line):
    """Count the '.' characters at the very start of ``line``."""
    return len(line) - len(line.lstrip('.'))


# One depth value per input line, in the same order as `lines`.
positions_levels = list(map(_leading_dots, lines))

# Starts empty; filled with (topic, level) pairs while the lines are walked.
parent_stack = []

In [None]:
# Hierarchy depth is encoded by the number of leading dots on each line. The
# number of dots per level is not consistent across taxonomy releases, so
# `tax_year` selects which step the parsing loop uses ("2024" -> 5 dots per
# level, any other value -> 4 dots per level).
tax_year = "2024"

In [None]:
# Walk the indented lines once and record broader/narrower relations.
# `parent_stack` holds the (topic, level) chain of ancestors of the line
# currently being processed.

# Number of extra leading dots that separates a topic from its direct parent.
level_step = 5 if tax_year == "2024" else 4

for raw_line, current_level in zip(lines, positions_levels):
    # Skip blank or dots-only lines (e.g. the empty string produced by a
    # trailing newline) so that no empty topic is added to the taxonomy.
    if not raw_line.strip(". \t\r"):
        continue

    # Collapse double spaces BEFORE spaces are turned into underscores;
    # once they are underscores there is nothing left for
    # eliminate_doublespace to act on.
    current_topic = lower_case(replace_char(eliminate_doublespace(clean_topic(raw_line))))

    # Discard stack entries at the same or a deeper level: they cannot be
    # ancestors of the current line.
    while parent_stack and parent_stack[-1][1] >= current_level:
        parent_stack.pop()

    broader_topics = set()
    for parent_topic, parent_level in parent_stack:
        # Only an entry exactly one step above counts as the direct parent.
        if current_level == parent_level + level_step:
            ieee_taxonomy[current_topic]["broader"].add(parent_topic)
            ieee_taxonomy[parent_topic]["narrower"].add(current_topic)
            broader_topics.add(parent_topic)
            # Inherit everything the parent already lists as broader.
            broader_topics.update(ieee_taxonomy[parent_topic]["broader"])

    # Also registers topics without a parent (creates their entry).
    ieee_taxonomy[current_topic]["broader"].update(broader_topics)
    parent_stack.append((current_topic, current_level))

In [60]:
# inspection
key = list(ieee_taxonomy.keys())[69]
print(f"{key}:{ieee_taxonomy[key]}")

antenna_radiation_patterns:{'broader': {'antennas', 'antennas_and_propagation'}, 'narrower': {'near-field_radiation_pattern'}}


# Creating RDF

In [61]:
# Fresh graph plus the base IRI shared by every taxonomy resource.
g = Graph()
schema_url = "https://ieee-taxonomy.org/"

# IRI of the class that each topic resource is typed with.
topic_url = URIRef(f"{str(schema_url)}schema#Topic")
ieee = Namespace(schema_url)

# Register the namespace prefixes on the graph.
for _prefix, _vocabulary in (("ieee", ieee), ("owl", OWL), ("skos", SKOS), ("rdf", RDF)):
    g.bind(_prefix, _vocabulary)

In [62]:
# adding definition of topic
g.add((topic_url, RDF.type, OWL.Class))
g.add((topic_url, RDFS.subClassOf, SKOS.Concept))

<Graph identifier=N2d86601c31c14a84af56682ba05e0a2b (<class 'rdflib.graph.Graph'>)>

In [63]:
def _topic_uri(name):
    """Build the resource IRI for a taxonomy topic name."""
    return URIRef(f"{schema_url}{escape(name)}")


def _link(child_uri, parent_uri):
    """Assert skos:broader from child to parent, then the inverse skos:narrower."""
    g.add((child_uri, SKOS.broader, parent_uri))
    g.add((parent_uri, SKOS.narrower, child_uri))


# One labelled, typed resource per topic, plus both directions of every
# broader/narrower relation recorded for it.
for topic, relations in ieee_taxonomy.items():
    topic_uri = _topic_uri(topic)
    g.add((topic_uri, RDFS.label, Literal(clean_label(topic))))
    g.add((topic_uri, RDF.type, topic_url))

    # The current topic is the child of each of its broader topics ...
    for broader in relations["broader"]:
        _link(topic_uri, _topic_uri(broader))

    # ... and the parent of each of its narrower topics.
    for narrower in relations["narrower"]:
        _link(_topic_uri(narrower), topic_uri)

In [64]:
# rdflib serialisation format name -> file extension used for the output file.
rdf_formats = dict(
    [
        ("turtle", "ttl"),
        ("nt", "nt"),
        ("xml", "xml"),
    ]
)

In [65]:
# Write the graph once per configured serialisation format into ./rdf/.
# The output directory is created first so the cell also works when the
# folder does not exist yet (serialising into a missing directory fails).
os.makedirs('./rdf', exist_ok=True)

print("---- Printing RDF versions ----")
for fmt, ext in rdf_formats.items():
    g.serialize(destination=f'./rdf/ieee-taxonomy.{ext}', format=fmt)
    print(f"Saved: ieee-taxonomy.{ext}")

---- Printing RDF versions ----
Saved: ieee-taxonomy.ttl
Saved: ieee-taxonomy.nt
Saved: ieee-taxonomy.xml
