# Semantic Graph Analysis
Creating a graph of relationships between data fields based on vocabulary definitions.

In [7]:
# Import required libraries
import csv
import json
import os

import matplotlib.pyplot as plt
import networkx as nx
import openai
import pandas as pd
from dotenv import load_dotenv

In [3]:
# Function to open a CSV file
def read_csv_file(filename):
    """Load a ';'-delimited CSV file into a Pandas DataFrame.

    Parameters:
        filename (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame | None: The parsed table, or None (after printing an
        error message) when the file does not exist.
    """
    try:
        # The separator is passed explicitly instead of relying on the ',' default.
        return pd.read_csv(filename, engine='python', sep=';')
    except FileNotFoundError:
        print(f"Error: The file '{filename}' was not found.")
    return None
    

In [13]:
def call_chatgpt_api(outputNodes, vocabulary, headers, prompt, model="gpt-4", max_tokens=1000):
    """
    Calls the OpenAI ChatGPT API with a prompt, outputNodes DataFrame, vocabulary dict, and headers list.

    Parameters:
        outputNodes (pd.DataFrame): The DataFrame containing nodes
        vocabulary (dict): Maps each concept to its related terms. A value may be
            a list of terms or a single string; terms are converted with str().
        headers (list): List of headers from the parquet file
        prompt (str): The text prompt for ChatGPT
        model (str): Chat model name; defaults to the previously hard-coded "gpt-4".
        max_tokens (int): Upper bound on the completion length; defaults to the
            previously hard-coded 1000.

    Returns:
        str: The ChatGPT response

    Raises:
        ValueError: If OPENAI_API_KEY is not available in the environment,
            even after attempting to load a .env file.
    """
    api_key = os.getenv("OPENAI_API_KEY")  # Load API key securely from environment
    if not api_key:
        # The error message below points the user at a .env file, so read that
        # file on demand before giving up. load_dotenv() does not override
        # variables that are already set.
        try:
            from dotenv import load_dotenv
        except ImportError:
            pass  # python-dotenv unavailable: fall through to the error below
        else:
            load_dotenv()
            api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Error: API key not found! Ensure it's set in the .env file.")

    client = openai.OpenAI(api_key=api_key)

    # Convert DataFrame to string
    output_nodes_str = outputNodes.to_string(index=False)

    # Convert vocabulary dict to formatted string, one "concept: term, term" line each
    vocab_lines = []
    for key, values in vocabulary.items():
        if isinstance(values, str):
            # A bare string would otherwise be joined character by character.
            values = [values]
        vocab_lines.append(f"{key}: {', '.join(str(value) for value in values)}\n")
    vocab_str = "Vocabulary relationships:\n" + "".join(vocab_lines)

    # Convert headers list to string
    headers_str = "Headers from parquet file:\n" + "\n".join(f"- {header}" for header in headers)

    # Combine input into the final prompt
    full_prompt = f"""
    Given the following dataset:
    {output_nodes_str}

    And the following headers from the parquet file:
    {headers_str}

    With the following vocabulary relationships:
    {vocab_str}

    And the prompt:
    {prompt}
    """

    # Call OpenAI API
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": full_prompt}
        ],
        max_tokens=max_tokens
    )

    return response.choices[0].message.content

In [9]:
# Load the processed orders and list the column names they carry
df = pd.read_parquet('processed_orders.parquet')
headers = list(df.columns)
print("Headers from parquet file:")
for header in headers:
    print(f"- {header}")

Headers from parquet file:
- Kenmerk
- Actie
- Klantnr
- Naam
- Land
- Straat
- Postcode
- Plaats
- Gewicht
- Euro DV
- Blok DV


In [10]:
# Read and process vocabulary file: the first column of each row is the
# semantic concept, the remaining non-empty columns are its related terms.
vocabulary = {}
# newline='' leaves line-ending handling to the csv module, as its
# documentation requires for files passed to csv.reader.
with open('Vocabulary2.csv', 'r', newline='') as file:
    for row in csv.reader(file, delimiter=';'):
        if not row:  # Skip empty rows
            continue
        key, *related = row
        # Filter out empty values
        values = [v for v in related if v]
        if values:  # Only add if there are related terms
            vocabulary[key] = values

print("Vocabulary relationships loaded:")
print(f"Number of semantic concepts: {len(vocabulary)}")
print(vocabulary)

Vocabulary relationships loaded:
Number of semantic concepts: 37
{'actorLegalPerson': ['SIMPLE:actorname'], 'actorRole': ['SIMPLE:actorRoles'], 'address': ['SIMPLE: actorAddress', 'SIMPLE:addressName'], 'addressNumber': ['SIMPLE:addressNumber'], 'cargoTemperature': ['SIMPLE:temperature'], 'city': ['SIMPLE:city'], 'containerNumber': ['SIMPLE:equipmentID'], 'containerType': ['SIMPLE:codeITU'], 'country': ['SIMPLE:country'], 'damageRemarks': ['SIMPLE:damageRemarks'], 'digitalContactId': ['SIMPLE:actorWebsite', 'SIMPLE:actorEmail', 'SIMPLE:actorPhoneNumber'], 'documentID': ['SIMPLE:externalConsignmentId', 'SIMPLE:externalShipmentId'], 'externalReference': ['Flowertrucks:Kenmerk'], 'Goods': ['Flowertrucks:goods'], 'goodsDescription': ['SIMPLE:goodDescription'], 'goodsNumbers': ['SIMPLE:idNumber'], 'goodsTypeCode': ['SIMPLE:goodTypecode'], 'grossWeight': ['SIMPLE:grossWeight', 'SIMPLE:goodGrossMass'], 'hasDangerousGoodsRegulationCode': ['SIMPLE:dangerousGoods'], 'hasNumberOfCollies': ['Flowe

In [6]:
# Load the target-structure table and preview its first 10 rows.
# read_csv_file returns None when the file is missing, in which case the
# .head() call below fails with AttributeError.
outputNodes = read_csv_file('outputNodes.csv')
outputNodes.head(10)

Unnamed: 0,Concept,Unnamed: 1,Unnamed: 2,Property,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,Container,UUID,equipmentTypeCode,containerNumber,containerSize,containerType,sealIndicator,isEmpty,isFull,equipmentProperties,equipmentCategoryCode,equipmentContainerITUCode,hasNumberOfCollies,,,,
1,Goods,UUID,goodsTypeCode,typeOfCargo,natureOfCargo,PackageTypeNumericCode,packageCode,packageTypeName,shippingMarks,numberOfTEU,numberofPackages,goodsDescription,goodsNumbers,,,,
2,BusinessActivity,UUID,hasDocumentType,documentID,documentVersion,hasMRN_Number,,,,,,,,,,,
3,Vessel,UUID,vesselName,transportMeansMode,vesselType,voyageNumber,vesselId,,,,,,,,,,
4,Truck,UUID,hasVIN,hasTransportmeansNationality,transportMeansMode,truckLicensePlate,,,,,,,,,,,
5,Wagon,UUID,wagonBrakeType,wagonBrakeWeight,wagonMaximumSpeed,wagonNrAxel,wagonId,,,,,,,,,,
6,DangerousGoods,UUID,hazardouseMaterialDetail,UNDGCode,dangerousGoodsRegulationCode,hazmatUNCode,packageLevelType,packageProperties,packageTypeNumericCode,packagingDangerLevelCode,hasFlashpointTemperature,productName,productProperties,properShippingName,hasDangerousGoodsRegulationCode,,
7,Locomotive,UUID,locomotiveBrakeType,locomotiveBrakeWeight,locomotiveDriverIndication,locomotiveTractionType,locomotiveTypeNumber,transportMeansMode,locomotiveId,,,,,,,,
8,Trailer,UUID,trailerLicensePlate,trailerId,,,,,,,,,,,,,
9,Seal,UUID,sealProperties,sealQuantity,hasSealConditionCode,,,,,,,,,,,,


In [11]:
# Task prompt for the first LLM call: map the source labels onto terms from outputNodes.
# NOTE(review): the text below speaks of a "normalized labels list" and "similarities",
# while call_chatgpt_api labels those inputs "headers from the parquet file" and
# "vocabulary relationships" — confirm the model is expected to bridge the two namings.
prompt = """Instructions for the system settings are described here below.

Persona: 
You are an advanced Data Engineer with special expertise in Semantic Web modelling and Linked Data. 
Moreover, you are able to perform data transformations from various data formats (csv, json, excel) to RDF and back. 
The specific domain you do this for is the logistics domain. 
The specific use case now is for a Finnish organisation importing goods from Urk and Kampen, who intends to use the FEDeRATED format to share data with external organisations.

Workflow of this application: 
I will give you as input outputNodes, which includes the terms in FEDeRATED, so the target structure.
Moreover, you will receive a normalized labels list, which is a list of the extracted terms from an input file of the Finnish organisation.
Lastly, you will be given a term with similarities. These are terms from other datasets that include the same or similar terms. This helps you in translation
The task at hand: Take the outputNodes, normalized labels list, and try to replace the values in the 
normalized labels list with the term that is most suited from the outputNodes file. 

Give me as output the replaced terms in the normalized labels list. Be decisive. 
Also, tell me your reasoning.

Some additional information:
- Ignore all UUID's of the outputNodes, those are not to be used yet.
- Use the similarities as an aid to confirm your assumptions or to turn to when you are unable to find the appropriate FEDeRATED term.
- message is never mapped, so don't include it. 
- The messagetype hints at the type of FEDeRATED event we are about to create in the next step but is not included. 
- Only map to terms that are available to you in the outputNodes set"""

In [14]:
# First LLM call: send the target nodes, vocabulary, parquet headers and the
# mapping prompt, then show the model's answer.
response = call_chatgpt_api(outputNodes, vocabulary, headers, prompt)
print(response)

Looking at the information provided, especially the vocabulary relationships, I can try to find the terms in the normalized labels list that can correspond to the outputNodes. Here are the replacement I recommend:

- "Kenmerk" -> externalReference (as per vocabulary relationships)
- "Actie" -> Not found in the outputNodes or helped by the vocabulary relationships, no clear replacement.
- "Klantnr" -> ActorId or UUID in some context, based on a assumption that 'Klantnr' could mean customer number which typically would map to an identifier
- "Naam" -> name (as per vocabulary relationships)
- "Land" -> country (as per vocabulary relationships)
- "Straat" -> address (inferred as 'Street' makes up part of an address)
- "Postcode" -> zipCode (as per vocabulary relationships)
- "Plaats" -> city (making an assumption here - 'Plaats' in Dutch can mean place or city)
- "Gewicht" -> grossWeight (as per vocabulary relationships)
- "Euro DV" -> packageTypeName (as per vocabulary relationships)
- "B

In [15]:
def call_chatgpt_api_with_parsed_response(response_text, data, prompt, model="gpt-4", max_tokens=3000):
    """
    Calls the OpenAI ChatGPT API with a parsed response, JSON data, and a prompt.

    Parameters:
        response_text (str): The raw response text from the system.
        data (dict | list | pd.DataFrame): The data to be included in the prompt.
            Dicts and lists are dumped as JSON; a DataFrame is dumped as a JSON
            list with one object per row. Any other value is embedded via str().
        prompt (str): The text prompt for ChatGPT.
        model (str): Chat model name; defaults to the previously hard-coded "gpt-4".
        max_tokens (int): Upper bound on the completion length; defaults to the
            previously hard-coded 3000.

    Returns:
        str: The ChatGPT response.

    Raises:
        ValueError: If OPENAI_API_KEY is not available in the environment,
            even after attempting to load a .env file.
    """
    api_key = os.getenv("OPENAI_API_KEY")  # Load API key securely
    if not api_key:
        # The error message below points the user at a .env file, so read that
        # file on demand before giving up. load_dotenv() does not override
        # variables that are already set.
        try:
            from dotenv import load_dotenv
        except ImportError:
            pass  # python-dotenv unavailable: fall through to the error below
        else:
            load_dotenv()
            api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Error: API key not found! Ensure it's set in the .env file.")

    client = openai.OpenAI(api_key=api_key)

    # Convert structured data to a JSON string.
    if isinstance(data, pd.DataFrame):
        # str(DataFrame) is a fixed-width table that pandas elides for long or
        # wide frames; serialise every row as a JSON object instead.
        # Every row is sent, so callers should pass a slice for large frames.
        data = json.loads(data.to_json(orient="records", date_format="iso"))
    if isinstance(data, (dict, list)):
        # ensure_ascii=False keeps non-ASCII text readable in the prompt;
        # default=str covers values json cannot encode natively.
        data_str = json.dumps(data, indent=4, ensure_ascii=False, default=str)
    else:
        data_str = str(data)

    # Combine input into the final prompt
    full_prompt = f"""
    Given the previous processed response:
    {response_text}

    And the following structured JSON data:
    {data_str}

    Now, using the context above, perform the following request:
    {prompt}
    """

    # Call OpenAI API
    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": full_prompt}
        ],
        max_tokens=max_tokens
    )

    return chat_response.choices[0].message.content  # Extract and return the response


In [20]:
# Task prompt for the second LLM call: apply the mappings from the previous
# response to the keys of the data and return only the resulting JSON.
# The persona paragraph repeats the one used in `prompt`.
prompt_text = """Instructions for the system settings are described here below.

Persona: 
You are an advanced Data Engineer with special expertise in Semantic Web modelling and Linked Data. 
Moreover, you are able to perform data transformations from various data formats (csv, json, excel) to RDF and back. 
The specific domain you do this for is the logistics domain. 
The specific use case now is for a Finnish organisation importing goods from Urk and Kampen, who intends to use the FEDeRATED format to share data with external organisations.

Workflow of this application: 
I will give the input, three things: 1. the response from a previous result, containing the mappings of my data conform the target dataset, 2. the json data, and 3. this prompt
The idea is that you replace the terms that are given in the response in the JSON data. Where no result is found, you do not make any change
The output is my json data with the terms replaced. DO NOT give anything else as output. This is a hard requirement, do not output any reasoning, only a JSON structure.

Restrictions:
1. Keep unmappable keys with their original name, so do never call it 'undefined' or 'unmappable'
"""

In [21]:
# Second LLM call: pass the mapping answer from the first call, the DataFrame
# loaded from processed_orders.parquet, and the replacement prompt.
chat_response = call_chatgpt_api_with_parsed_response(response, df, prompt_text)
print(chat_response)

Based on your requirements, here is how your input JSON data would look like with the terms replaced as per the mappings from the previous response:

```json
[
    {
        "externalReference": "80305396",
        "Actie": "Lossen",
        "ActorId": "0000011637",
        "name": "Don Morris Palvelut Oy",
        "country": "FIN",
        "address": "Klaneettitie 12",
        "zipCode": 420,
        "city": "HELSINKI",
        "grossWeight": 1056,
        "packageTypeName": 2,
        "Blok DV": null
    },
    {
        "externalReference": "80305560",
        "Actie": "Lossen",
        "ActorId": "0000020523",
        "name": "Aminah OY",
        "country": "FIN",
        "address": "Luutnantintie 5",
        "zipCode": 410,
        "city": "HELSINKI",
        "grossWeight": 746,
        "packageTypeName": 1,
        "Blok DV": null
    },
    ...
]
```

This is just a sample of how the first two entries in your JSON data would look like after replacements. The keys in the input JS