In [2]:
import os
from rdflib import Graph
from pathlib import Path
from fins.dag import *
import pandas as pd

In [19]:
# Root of the local factory-space checkout ("~" expanded, then made absolute).
cwd = Path("~/workspace/projects/spacefactory/factory-space").expanduser().resolve()

# Merge every Turtle file found below the root into one in-memory graph.
graph = Graph()
home_dir = Path.home()
# rglob("*/*.ttl") only matches files that sit at least one directory below
# cwd, so .ttl files placed directly in cwd are not picked up by this loop.
for ttl_path in cwd.rglob("*/*.ttl"):
    # Log the path relative to the home directory to keep the output short.
    print(ttl_path.relative_to(home_dir))
    graph.parse(ttl_path.as_posix(), format="turtle")

workspace/projects/spacefactory/factory-space/inspace-economy/output/human-spaceflight/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/in-space-manufacturing/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/surface-habitats/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/in-space-transportation/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/space-utilities/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/miscellaneous/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/space-stations/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/cargo-transportation/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/space-resources/data.ttl
workspace/projects/spacefactory/factory-space/inspace-economy/output/surface-spacecraft/data.ttl
workspace/projects/spacefactory/facto

In [20]:
# Replace every "child parent_category parent" triple with the inverse
# "parent sub_category child" triple.
#
# The matching triples are materialised into a list first: removing from and
# adding to the graph while iterating over it is not safe (depending on the
# store it can raise or silently skip triples). Querying by predicate also
# avoids scanning every triple in the graph.
# NOTE(review): `Space` is not defined in the visible cells; presumably it
# comes from the star import of fins.dag — confirm.
parent_links = list(graph.triples((None, Space.parent_category, None)))
for child, predicate, parent in parent_links:
    graph.remove((child, predicate, parent))
    graph.add((parent, Space.sub_category, child))

In [21]:
# Write the merged graph as Turtle into the project root. The call is left as
# the last expression of the cell, so its return value (the graph) is echoed.
turtle_path = cwd / "factory-space.ttl"
graph.serialize(destination=turtle_path, format="turtle")

<Graph identifier=N94e86253bff64ac6a6e06be956cf8f6a (<class 'rdflib.graph.Graph'>)>

In [23]:
# Write the same graph as UTF-8 encoded N-Triples into the project root. The
# call stays the last expression of the cell, so the graph is echoed.
ntriples_path = cwd / "factory-space.nt"
graph.serialize(destination=ntriples_path, format="ntriples", encoding="utf-8")

<Graph identifier=N94e86253bff64ac6a6e06be956cf8f6a (<class 'rdflib.graph.Graph'>)>

In [25]:
# Select every space:Company together with its space:url value.
# NOTE(review): `kgns` is not defined in the visible cells; presumably it
# comes from the star import of fins.dag — confirm.
query = """
PREFIX space: <%s>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT *
WHERE {
    ?company a space:Company ;
        space:url ?url .
}
""" % (
    kgns.prefix2ns["space"]
)

results = graph.query(query)

# One plain-string record per (company, url) binding.
data = [
    {
        "company": str(row["company"]),
        "url": str(row["url"]),
    }
    for row in results
]

# Fix the column set on the frame itself rather than through to_csv(header=...):
# with zero result rows the frame would otherwise have no columns and to_csv
# would raise because the two header aliases do not match zero columns. With
# explicit columns an empty result still produces a header-only CSV.
df = pd.DataFrame(data, columns=["company", "url"])
df.to_csv(cwd / "companies.csv", index=False)

In [26]:
# Namespace IRI registered for the "space" prefix; it is substituted into the
# PREFIX declaration of the query below.
space_ns = kgns.prefix2ns["space"]

# Every space:Category with its label, plus (when present) its comment and the
# label of each category that points at it through space:sub_category. A
# category with several parents comes back as one row per parent.
query = """
PREFIX space: <%s>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT *
WHERE {
    ?cat a space:Category .
    ?cat rdfs:label ?cat_label .
    OPTIONAL { ?cat rdfs:comment ?cat_comment . }

    OPTIONAL {
        ?parent_cat space:sub_category ?cat .
        ?parent_cat rdfs:label ?parent_cat_label .
    }
}
""" % space_ns

results = graph.query(query)

In [27]:
# Flatten the SPARQL result rows into plain records. The OPTIONAL variables
# (comment, parent label) are stored as None when they are missing or empty.
data = [
    {
        "cat_label": str(row.cat_label),
        "cat_comment": str(row.cat_comment) if row.cat_comment else None,
        "parent_cat_label": (
            str(row.parent_cat_label) if row.parent_cat_label else None
        ),
    }
    for row in results
]

# Name the columns explicitly so that an empty result set still yields a frame
# with the expected columns; otherwise the groupby below fails with a KeyError
# on "cat_label".
df = pd.DataFrame(data, columns=["cat_label", "cat_comment", "parent_cat_label"])

# Collapse the rows of each category into a single row whose parent labels are
# joined into one comma-separated string (an empty string when the category
# has no parent).
df_grouped = (
    df.groupby("cat_label")["parent_cat_label"]
    .apply(lambda labels: ", ".join(labels.dropna().unique().tolist()))
    .rename("parent_cat_labels")
    .reset_index()
)

# Re-attach the comment column. A category that carries several distinct
# comments still produces one output row per comment.
df_merged = pd.merge(
    df_grouped,
    df[["cat_label", "cat_comment"]].drop_duplicates(),
    on="cat_label",
    how="left",
)

df_merged.to_csv(cwd / "grouped_categories.csv", index=False)