# Configure the Gremlin server

In [None]:
%%capture output
%%graph_notebook_config
{
  "host": "127.0.0.1",
  "port": 8182,
  "ssl": false,
  "gremlin": {
    "message_serializer": "graphbinary"
  }
}

# Create Schema
Create the graph schema. This is done by adding vertices and edges in gremlin to match the schema of the graph that will be loaded.

## ~metadata Vertex
A special metadata vertex is required when creating a schema. It is added as a vertex with a `T.id` of `~metadata`.

This vertex has the following properties:

| Property            | Description                                                                     | Required | Default | Type    |
|---------------------|---------------------------------------------------------------------------------|----------|---------|---------|
| `replicationFactor` | The replication factor of Aerospike that will be used.                          | Yes      | N/A     | Integer |
| `maxEdgeCacheSize`  | The max number of edges that will be packed into a single Aerospike record.     | No       | 8000    | Integer |
| `edgePackSize`      | The max number of edges that will be packed into an edge record.                | No       | 10      | Integer |
| `vertexLabelSindex` | True if a secondary index on the vertex label will be created, otherwise false. | No       | False   | Boolean |

In general, `maxEdgeCacheSize` and `edgePackSize` should not be changed from their defaults.

An example that sets the `replicationFactor` to 2, the `edgePackSize` to 12, leaves `maxEdgeCacheSize` as 8000, and enables `vertexLabelSindex`:
```
g.addV().property(T.id, "~metadata").
  property("replicationFactor", 2).
  property("edgePackSize", 12).
  property("vertexLabelSindex", true).iterate()
```

## Special Properties
There are a few special properties that can be added to vertices and edges to signal how they are stored in Aerospike.
These properties are only applicable to vertices and edges that are part of the schema, not the `~metadata` vertex.

| Property                    | Description                                                                     | Required                          | Default | Type    |
|-----------------------------|---------------------------------------------------------------------------------|-----------------------------------|---------|---------|
| `<label>.count`             | The number of the relevant vertex or edge expected in the graph.                | Yes                               | N/A     | Integer |
| `<property_key>`            | The value of the `<property_key>` should be the type of the property.           | Yes                               | N/A     | String  |
| `<property_key>.sindexed`   | True if a secondary index on the property will be created.                      | No                                | False   | Boolean |
| `<property_key>.likelihood` | The likelihood that a property will exist, where 1.0 is 100%, 0.0 is 0%.        | No                                | 1.0     | Double  |
| `<property_key>.valueSize`  | The number of bytes the specified type will take up on average.                 | Only for String and byte[] types. | N/A     | Integer |

An example vertex with label `Person` that is expected to occur 1000 times in the graph, has a property `name` that is a String, has a secondary index on `name`, and has a likelihood of 100%, where names are on average 15 characters, and has a property `age` that is an Integer, has a likelihood of 50%, and is not secondary indexed:

```
person = g.addV("Person").property(T.id, "Person").
  property("Person.count", 1000).
  property("name", "String").
  property("name.sindexed", true).
  property("name.valueSize", 15).
  property("age", "Integer").
  property("age.likelihood", 0.5).
  next()
```

An example edge with label `KNOWS` that is expected to occur 100 times per Person, totalling 100,000 times in the graph, has a property `since` that is a Long, has a likelihood of 100%, and is not secondary indexed:

```
g.addE("KNOWS").from(person).to(person).
  property("KNOWS.count", 100000).
  property("since", "Long").
  next()
```

## A Complete Schema Example
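In the example below an identity graph will be modelled, linking consolidated identities and partner-supplied identities to the identifiers associated with them. The schema contains the following vertices:
- IpAddress
- Household
- Email
- Account
- SubAccount
- SSN
- ConsolidatedIdentity
- PhoneNumber
- PartnerIdentity
- Device
- Cookie

With the following edges:
- HAS_PHONE_NUMBER (from ConsolidatedIdentity to PhoneNumber)
- HAS_IP_ADDR (from ConsolidatedIdentity to IpAddress, and from Device to IpAddress)
- HAS_IDENTITY (from ConsolidatedIdentity to PartnerIdentity)
- HAS_COOKIE (from ConsolidatedIdentity to Cookie)
- HAS_SSN (from ConsolidatedIdentity to SSN)
- HAS_DEVICE_ID (from ConsolidatedIdentity to Device)
- HAS_ACCOUNT (from ConsolidatedIdentity to Account)
- HAS_SUB_ACCOUNT (from Account to SubAccount)
- HAS_HOUSEHOLD (from ConsolidatedIdentity to Household)
- HAS_EMAIL (from ConsolidatedIdentity to Email)
- PROVIDED_PHONE_NUMBER (from PartnerIdentity to PhoneNumber)
- PROVIDED_IP_ADDR (from PartnerIdentity to IpAddress)
- PROVIDED_COOKIE (from PartnerIdentity to Cookie)
- PROVIDED_SSN (from PartnerIdentity to SSN)
- PROVIDED_DEVICE_ID (from PartnerIdentity to Device)
- PROVIDED_ACCOUNT (from PartnerIdentity to Account)
- PROVIDED_HOUSEHOLD (from PartnerIdentity to Household)
- PROVIDED_EMAIL (from PartnerIdentity to Email)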

In [None]:
%%gremlin
// Start from an empty graph so that only schema elements are present.
g.V().drop().iterate()

// The special ~metadata vertex carries the graph-wide settings.
g.addV().
  property(T.id, "~metadata").
  property("replicationFactor", 2).
  property("vertexLabelSindex", true).
  iterate()


// ---- Vertex schema --------------------------------------------------------
// One vertex per label. Each carries "<label>.count" plus one property per
// property key whose value names the property's type; String-typed keys also
// get a "<key>.valueSize". Each vertex is kept in a variable so the edge
// definitions later in this cell can reference it.

ip = g.addV("IpAddress").
  property("IpAddress.count", 1).
  property("ip", "String").
  property("ip.valueSize", 15).
  next()

household = g.addV("Household").
  property("Household.count", 1).
  property("address", "String").
  property("address.valueSize", 15).
  next()

email = g.addV("Email").
  property("Email.count", 1).
  property("email_hash", "String").
  property("email_hash.valueSize", 15).
  next()

account = g.addV("Account").
  property("Account.count", 1).
  property("account_number", "String").
  property("account_number.valueSize", 15).
  next()

subAccount = g.addV("SubAccount").
  property("SubAccount.count", 1).
  property("account_number", "String").
  property("account_number.valueSize", 15).
  next()

ssn = g.addV("SSN").
  property("SSN.count", 1).
  property("ssn_hash", "String").
  property("ssn_hash.valueSize", 15).
  next()

individual = g.addV("ConsolidatedIdentity").
  property("ConsolidatedIdentity.count", 1).
  property("id", "String").
  property("id.valueSize", 15).
  property("date_of_birth_year", "Long").
  property("date_of_birth_month", "Long").
  property("date_of_birth_day", "Long").
  property("first_name", "String").
  property("first_name.valueSize", 15).
  property("last_name", "String").
  property("last_name.valueSize", 15).
  property("score", "Integer").
  next()

phone_number = g.addV("PhoneNumber").
  property("PhoneNumber.count", 1).
  property("phone_number", "String").
  property("phone_number.valueSize", 15).
  property("wireless", "Boolean").
  next()

// partner_individual_id: the id of the individual in the partner's database.
// partner_id: the id of the partner for us.
partner_identity = g.addV("PartnerIdentity").
  property("PartnerIdentity.count", 1).
  property("audiences", "List").
  property("audiences.valueSize", 15).
  property("partner_individual_id", "String").
  property("partner_individual_id.valueSize", 15).
  property("partner_id", "String").
  property("partner_id.valueSize", 15).
  property("score", "Integer").
  next()

// The device id is T.id.
// "type" should take a discrete list of values, e.g. Mobile device,
// Connected TV, Laptop, Desktop, Tablet.
deviceId = g.addV("Device").
  property("Device.count", 1).
  property("type", "String").
  property("type.valueSize", 15).
  next()

// Cookie has no properties beyond its count.
cookie = g.addV("Cookie").
  property("Cookie.count", 1).
  next()

// ---- HAS_* edge schema ----------------------------------------------------
// Edges from a ConsolidatedIdentity (plus Account -> SubAccount and
// Device -> IpAddress) to the identifiers it owns. Every edge carries its own
// "<label>.count"; other properties name the type of an edge property.
// Each statement ends in iterate() because only the side effect is needed.

g.addE("HAS_PHONE_NUMBER").from(individual).to(phone_number).
  property("HAS_PHONE_NUMBER.count", 1).
  property("is_current", "Boolean").
  iterate()

g.addE("HAS_IP_ADDR").from(individual).to(ip).
  property("HAS_IP_ADDR.count", 1).
  iterate()

g.addE("HAS_IDENTITY").from(individual).to(partner_identity).
  property("HAS_IDENTITY.count", 1).
  property("weight", "Long").
  iterate()

g.addE("HAS_COOKIE").from(individual).to(cookie).
  property("HAS_COOKIE.count", 1).
  iterate()

g.addE("HAS_SSN").from(individual).to(ssn).
  property("HAS_SSN.count", 1).
  iterate()

g.addE("HAS_DEVICE_ID").from(individual).to(deviceId).
  property("HAS_DEVICE_ID.count", 1).
  iterate()

g.addE("HAS_ACCOUNT").from(individual).to(account).
  property("HAS_ACCOUNT.count", 1).
  iterate()

g.addE("HAS_SUB_ACCOUNT").from(account).to(subAccount).
  property("HAS_SUB_ACCOUNT.count", 1).
  iterate()

g.addE("HAS_HOUSEHOLD").from(individual).to(household).
  property("HAS_HOUSEHOLD.count", 1).
  property("is_current", "Boolean").
  iterate()

// HAS_IP_ADDR is declared a second time: a Device can also have an IP address.
g.addE("HAS_IP_ADDR").from(deviceId).to(ip).
  property("HAS_IP_ADDR.count", 1).
  iterate()

// The count key must match the edge's own label (it was previously
// "HAS_IP_ADDR.count", which left HAS_EMAIL without its required count).
g.addE("HAS_EMAIL").from(individual).to(email).
  property("HAS_EMAIL.count", 1).
  iterate()

// ---- PROVIDED_* edge schema -----------------------------------------------
// Edges from a PartnerIdentity to each identifier vertex. Each one declares
// its "<label>.count" and an Integer "score"; the phone-number and household
// edges additionally declare a Boolean "is_current".

g.addE("PROVIDED_PHONE_NUMBER").from(partner_identity).to(phone_number).
  property("PROVIDED_PHONE_NUMBER.count", 1).
  property("is_current", "Boolean").
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_IP_ADDR").from(partner_identity).to(ip).
  property("PROVIDED_IP_ADDR.count", 1).
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_COOKIE").from(partner_identity).to(cookie).
  property("PROVIDED_COOKIE.count", 1).
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_SSN").from(partner_identity).to(ssn).
  property("PROVIDED_SSN.count", 1).
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_DEVICE_ID").from(partner_identity).to(deviceId).
  property("PROVIDED_DEVICE_ID.count", 1).
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_ACCOUNT").from(partner_identity).to(account).
  property("PROVIDED_ACCOUNT.count", 1).
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_HOUSEHOLD").from(partner_identity).to(household).
  property("PROVIDED_HOUSEHOLD.count", 1).
  property("is_current", "Boolean").
  property("score", "Integer").
  iterate()

g.addE("PROVIDED_EMAIL").from(partner_identity).to(email).
  property("PROVIDED_EMAIL.count", 1).
  property("score", "Integer").
  iterate()

// Give the cell a visible result once every statement has run.
return "Success"

# Visualize the Schema
Using the following query the schema can be visualized. After executing the query, select Graph from the result.

From here you can select the details button on the upper right side of the visualization and then select any graph element to get more details.

In [None]:
%%gremlin --edge-label-max-length 30 --label-max-length 30 -p v,oute,inv
// Emit a traverser when either
//   * the vertex has no outgoing and no incoming neighbours, or
//   * exactly one repeat() iteration (one outE().inV() hop) has completed,
// then render each emitted path with the element map of every step.
// NOTE(review): repeat() has no until()/times() bound, so this presumably
// relies on the schema having no cycles - confirm before adding one.
g.V().
  emit(
    __.or(
      __.and(
        __.out().count().is(P.eq(0)),
        __.in().count().is(P.eq(0))),
      loops().is(P.eq(1)))).
  repeat(outE().inV()).
  path().
    by(elementMap())

# Size the Graph
If the schema is correct and everything is ready, run the cell below to get details for sizing your Aerospike cluster for the graph.

In [None]:
%%gremlin
// Call the "sizing-tool" service and return its result.
g.call("sizing-tool").
  next()