# Installing needed Libraries

##### Install rdflib, which is used to build the RDF version of the IEEE Taxonomy

In [None]:
!pip install rdflib==6.3.1

##### Install PyPDF2, which is used to extract the text from the IEEE Taxonomy PDF file

###### Note: the text extracted from the IEEE Taxonomy PDF file was afterwards reviewed and corrected manually to remove extraction errors and obtain a clean text file of the IEEE Taxonomy

In [None]:
!pip install PyPDF2==3.0.1

# Importing

In [2]:
import json
import os
import re
import urllib.parse

import urllib3
from PyPDF2 import PdfReader
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS, namespace
from rdflib.namespace import FOAF, XSD, SKOS, OWL, RDFS

#### Processing IEEE Taxonomy PDF File and extracting text from it

In [24]:
# Location of the IEEE Taxonomy PDF to process (placeholder: replace with a real path).
input_file = "Path to IEEE Taxonomy PDF File"

In [5]:
# Open the PDF located at `input_file`; its pages are read in the next cell.
reader = PdfReader(input_file)

In [6]:
# Concatenate the text extracted from every page, ending each page with a newline.
text = "".join(page.extract_text() + "\n" for page in reader.pages)

In [33]:
# Save the raw extracted text so that it can be reviewed and cleaned by hand.
# The encoding is set explicitly so the write does not depend on the platform's
# default codec, which may be unable to encode characters found in the PDF.
with open('Path to IEEE Taxonomy Text File', 'w', encoding='utf-8') as f:
    f.write(text)

# Functions

In [7]:
def lower_case(topic:str)->str:
    """Return ``topic`` converted to lower case."""
    lowered = str.lower(topic)
    return lowered

def replace_char(topic:str)->str:
    """Return ``topic`` with every space character replaced by an underscore."""
    pieces = topic.split(" ")
    return "_".join(pieces)

def clean_topic(topic:str)->str:
    """Return ``topic`` without its leading and trailing ``.`` characters."""
    without_leading = topic.lstrip(".")
    return without_leading.rstrip(".")

def get_level_of_topic(topic:str)->int:
    """Return the number of ``.`` characters at the start of ``topic``.

    The count of leading dots is used as the depth of the topic in the
    taxonomy text file.

    Args:
        topic: one line of the taxonomy text file.

    Returns:
        The length of the leading run of dots; ``0`` for an empty string or
        a string that does not start with a dot.
    """
    # Measuring the stripped prefix avoids indexing past the end of the string,
    # which raised IndexError for empty lines and for lines made only of dots.
    return len(topic) - len(topic.lstrip("."))

def check_position_levels(positions_levels:list)->bool:
    """Return True when at least one level is not a multiple of 4, else False."""
    return any(level % 4 != 0 for level in positions_levels)

def clean_label(topic:str)->str:
    """Return ``topic`` with every underscore replaced by a space."""
    words = topic.split("_")
    return " ".join(words)

def escape(topic:str)->str:
    """Percent-encode ``topic`` (as UTF-8) with no character treated as safe."""
    utf8_bytes = topic.encode("utf8")
    return urllib.parse.quote_from_bytes(utf8_bytes, safe="")

# Extracting Taxonomy from Text File

In [8]:
# Load the manually cleaned taxonomy text file, one topic per line.
# The `with` block closes the file handle once the content has been read, and
# the explicit encoding keeps decoding independent of the platform default.
with open("Path to IEEE Taxonomy Cleaned Text File", "r", encoding="utf-8") as txt_file:
    # Blank lines (such as the empty string left after a trailing newline)
    # carry no topic and are dropped so they are not processed as entries.
    lines = [line for line in txt_file.read().split("\n") if line.strip()]

In [9]:
# Compute the level (number of leading dots) of every line, then report
# whether all of them are multiples of 4.
positions_levels = [get_level_of_topic(line) for line in lines]

print(
    "Misalignment between levels. Check the input file"
    if check_position_levels(positions_levels)
    else "All levels are well formatted!"
)

All levels are well formatted!


In [10]:
# Build a mapping {normalized parent topic: [normalized direct children]}.
# A line is a direct child of an earlier line when it lies inside that line's
# subtree and its level is exactly one step (LEVEL_STEP dots) deeper.
LEVEL_STEP = 4

ieee_taxonomy = dict()
for parent_index in range(len(lines)):
    parent_level = positions_levels[parent_index]
    children = list()
    for child_index in range(parent_index + 1, len(lines)):
        child_level = positions_levels[child_index]
        if child_level <= parent_level:
            # Reached a sibling or an ancestor: the parent's subtree is over.
            break
        if child_level == parent_level + LEVEL_STEP:
            children.append(lower_case(replace_char(clean_topic(lines[child_index]))))
        # Deeper lines belong to one of the children and are handled when
        # that child is visited as a parent.

    if children:
        parent = lower_case(replace_char(clean_topic(lines[parent_index])))
        # The same topic name can occur more than once in the file. Merge the
        # children of every occurrence instead of letting a later occurrence
        # overwrite the children collected for an earlier one.
        merged_children = ieee_taxonomy.setdefault(parent, list())
        for child in children:
            if child not in merged_children:
                merged_children.append(child)

In [11]:
# Inspection: show the first topic of the taxonomy together with its children.
key = list(ieee_taxonomy)[0]
print(key, ieee_taxonomy[key], sep=":")

aerospace_and_electronic_systems:['aerospace_control', 'aerospace_engineering', 'aerospace_materials', 'aircraft_manufacture', 'aircraft_navigation', 'aircraft_propulsion', 'command_and_control_systems', 'electronic_warfare', 'military_equipment', 'sensor_systems', 'sonar', 'telemetry']


# Creating RDF

In [17]:
# Base URL shared by the namespace, the Topic class and every topic URI.
schema_url = "https://ieee-taxonomy.org/"
ieee = Namespace(schema_url)
topic_url = URIRef(schema_url + "schema#Topic")

# Create the graph and register the prefixes used in the serialized output.
g = Graph()
for prefix, bound_namespace in (("ieee", ieee), ("owl", OWL), ("skos", SKOS), ("rdf", RDF)):
    g.bind(prefix, bound_namespace)

In [None]:
# Define the Topic class: an owl:Class that is a subclass of skos:Concept.
topic_is_class = (topic_url, RDF.type, OWL.Class)
topic_is_concept = (topic_url, RDFS.subClassOf, SKOS.Concept)
g.add(topic_is_class)
g.add(topic_is_concept)

In [19]:
def _add_topic(graph, name):
    """Describe the topic ``name`` in ``graph`` and return its URI.

    Two triples are added for the topic: an ``rdfs:label`` holding the name
    with underscores turned back into spaces, and an ``rdf:type`` pointing at
    ``topic_url``.

    Args:
        graph: the rdflib graph that receives the triples.
        name: normalized topic name, as stored in ``ieee_taxonomy``.

    Returns:
        The ``URIRef`` built from ``schema_url`` and the percent-encoded name.
    """
    node = URIRef(f"{schema_url}{escape(name)}")
    # Triples are passed as tuples, consistently with every other add() call.
    graph.add((node, RDFS.label, Literal(clean_label(name))))
    graph.add((node, RDF.type, topic_url))
    return node


# Add every parent topic and each of its children to the graph. A topic may be
# described more than once, as a parent and again as a child of another topic.
for topic, subtopics in ieee_taxonomy.items():
    topic_node = _add_topic(g, topic)

    for subtopic in subtopics:
        subtopic_node = _add_topic(g, subtopic)

        # Record the hierarchy in both directions.
        g.add((subtopic_node, SKOS.broader, topic_node))
        g.add((topic_node, SKOS.narrower, subtopic_node))

In [25]:
print("--- printing rdf versions")
# Make sure the output folder exists: serializing into a missing directory
# fails, e.g. on a fresh checkout that has no ./rdf folder yet.
os.makedirs('./rdf', exist_ok=True)

# Write the graph in three formats. The Turtle format is named explicitly so
# the .ttl file does not depend on the library's default serialization format.
g.serialize(destination='./rdf/ieee-taxonomy.ttl', format="turtle")
g.serialize(destination='./rdf/ieee-taxonomy.nt', format="nt")
g.serialize(destination='./rdf/ieee-taxonomy.xml', format="xml")

--- printing rdf versions


In [None]:
# Optional: uncomment the next line to also export the graph as JSON-LD.
# g.serialize(destination='./rdf/ieee-taxonomy.jsonld', format='json-ld', indent=4)