?p ?o .}")
+ assert len(batch_df.index) == len(final_df.index) + 50
+
+
+def _create_dummy_vertex() -> Dict[str, Any]:
+ data = dict()
+ data["~id"] = str(uuid.uuid4())
+ data["~label"] = "foo"
+ data["int"] = random.randint(0, 1000)
+ data["str"] = "".join(random.choice(string.ascii_lowercase) for i in range(10))
+ data["list"] = [random.randint(0, 1000), random.randint(0, 1000)]
+ return data
+
+
+def _create_dummy_edge() -> Dict[str, Any]:
+ data = dict()
+ data["~id"] = str(uuid.uuid4())
+ data["~label"] = "bar"
+ data["~to"] = str(uuid.uuid4())
+ data["~from"] = str(uuid.uuid4())
+ data["int"] = random.randint(0, 1000)
+ data["str"] = "".join(random.choice(string.ascii_lowercase) for i in range(10))
+ return data
+
+
+def _create_dummy_triple() -> Dict[str, Any]:
+ data = dict()
+ data["s"] = "foo"
+ data["p"] = str(uuid.uuid4())
+ data["o"] = random.randint(0, 1000)
+ return data
+
+
+def _create_dummy_quad() -> Dict[str, Any]:
+ data = _create_dummy_triple()
+ data["g"] = "bar"
+ return data
diff --git a/tests/test_neptune_parsing.py b/tests/test_neptune_parsing.py
new file mode 100644
index 000000000..1650ff682
--- /dev/null
+++ b/tests/test_neptune_parsing.py
@@ -0,0 +1,211 @@
+import logging
+
+import pandas as pd
+import pytest # type: ignore
+from gremlin_python.process.traversal import T
+from gremlin_python.structure.graph import Edge, Path, Property, Vertex, VertexProperty
+
+import awswrangler as wr
+
+logging.getLogger("awswrangler").setLevel(logging.DEBUG)
+
+
+@pytest.fixture(scope="session")
+def gremlin_parser() -> wr.neptune.GremlinParser:
+ c = object.__new__(wr.neptune.GremlinParser)
+ return c
+
+
+# parse Vertex elements
+def test_parse_gremlin_vertex_elements(gremlin_parser):
+ # parse vertex elements
+ v = Vertex("foo")
+ input = [v]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 2)
+ assert row["id"] == "foo"
+ assert row["label"] == "vertex"
+
+ # parse multiple vertex elements
+ v1 = Vertex("bar")
+ input = [v, v1]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[1]
+ assert df.shape == (2, 2)
+ assert row["id"] == "bar"
+ assert row["label"] == "vertex"
+
+
+# parse Edge elements
+def test_parse_gremlin_edge_elements(gremlin_parser):
+ # parse edge elements
+ v = Edge("foo", "out1", "label", "in1")
+ input = [v]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 4)
+ assert row["id"] == "foo"
+ assert row["outV"] == "out1"
+ assert row["label"] == "label"
+ assert row["inV"] == "in1"
+
+ # parse multiple edge elements
+ v1 = Edge("bar", "out1", "label", "in2")
+ input = [v, v1]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[1]
+ assert df.shape == (2, 4)
+ assert row["id"] == "bar"
+ assert row["outV"] == "out1"
+ assert row["label"] == "label"
+ assert row["inV"] == "in2"
+
+
+# parse Property elements
+def test_parse_gremlin_property_elements(gremlin_parser):
+ # parse VertexProperty elements
+ v = VertexProperty("foo", "name", "bar", "v1")
+ input = [v]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 5)
+ assert row["id"] == "foo"
+ assert row["label"] == "name"
+ assert row["value"] == "bar"
+ assert row["key"] == "name"
+ assert row["vertex"] == "v1"
+
+ v = Property("foo", "name", "bar")
+ input = [v]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 3)
+ assert row["element"] == "bar"
+ assert row["value"] == "name"
+ assert row["key"] == "foo"
+
+
+# parse Path elements
+def test_parse_gremlin_path_elements(gremlin_parser):
+ # parse path with elements
+ v = Vertex("foo")
+ v2 = Vertex("bar")
+ e1 = Edge("e1", "foo", "label", "bar")
+ p = Path(labels=["vertex", "label", "vertex"], objects=[v, e1, v2])
+ out = gremlin_parser.gremlin_results_to_dict([p])
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 3)
+ assert row[0] == {"id": "foo", "label": "vertex"}
+ assert row[1] == {"id": "e1", "label": "label", "outV": "foo", "inV": "bar"}
+ assert row[2] == {"id": "bar", "label": "vertex"}
+
+ # parse path with multiple elements
+ e2 = Edge("bar", "out1", "label", "in2")
+ v3 = Vertex("in2")
+ p1 = Path(labels=["vertex", "label", "vertex"], objects=[v2, e2, v3])
+ out = gremlin_parser.gremlin_results_to_dict([p, p1])
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[1]
+ assert df.shape == (2, 3)
+ assert row[0] == {"id": "bar", "label": "vertex"}
+ assert row[1] == {"id": "bar", "label": "label", "outV": "out1", "inV": "in2"}
+ assert row[2] == {"id": "in2", "label": "vertex"}
+
+ # parse path with maps
+ p = Path(
+ labels=["vertex", "label", "vertex"],
+ objects=[{"name": "foo", "age": 29}, {"dist": 32}, {"name": "bar", "age": 40}],
+ )
+ out = gremlin_parser.gremlin_results_to_dict([p])
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 3)
+ assert row[0]["name"] == "foo"
+ assert row[0]["age"] == 29
+ assert row[1]["dist"] == 32
+ assert row[2]["name"] == "bar"
+ assert row[2]["age"] == 40
+
+ # parse path with mixed elements and maps
+ p = Path(
+ labels=["vertex", "label", "vertex"],
+ objects=[{"name": "foo", "age": 29}, Edge("bar", "out1", "label", "in2"), {"name": "bar", "age": 40}],
+ )
+ out = gremlin_parser.gremlin_results_to_dict([p])
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 3)
+ assert row[0]["name"] == "foo"
+ assert row[0]["age"] == 29
+ assert row[1] == {"id": "bar", "label": "label", "outV": "out1", "inV": "in2"}
+ assert row[2]["name"] == "bar"
+ assert row[2]["age"] == 40
+
+
+# parse vertex valueMap
+def test_parse_gremlin_maps(gremlin_parser):
+ # parse map
+ m = {"name": "foo", "age": 29}
+ input = [m]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 2)
+ assert row["name"] == "foo"
+ assert row["age"] == 29
+
+ # parse multiple maps with T
+ m1 = {"name": ["foo"], T.id: "2", "age": [40], T.label: "vertex"}
+ input = [m, m1]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[1]
+ assert df.shape == (2, 4)
+ assert row["name"] == "foo"
+ assert row["age"] == 40
+ assert row[T.id] == "2"
+ assert row[T.label] == "vertex"
+ m2 = {"name": ["foo", "bar"], T.id: "2", T.label: "vertex"}
+ input = [m, m1, m2]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[2]
+ assert df.shape == (3, 4)
+ assert row["name"] == ["foo", "bar"]
+ assert row[T.id] == "2"
+ assert row[T.label] == "vertex"
+
+
+# parse scalar
+def test_parse_gremlin_scalar(gremlin_parser):
+ # parse map
+ m = 12
+ n = "Foo"
+ input = [m, n]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (2, 1)
+ assert row[0] == 12
+ row = df.iloc[1]
+ assert row[0] == "Foo"
+
+
+# parse subgraph
+def test_parse_gremlin_subgraph(gremlin_parser):
+ m = {"@type": "tinker:graph", "@value": {"vertices": ["v[45]", "v[9]"], "edges": ["e[3990][9-route->45]"]}}
+ input = [m]
+ out = gremlin_parser.gremlin_results_to_dict(input)
+ df = pd.DataFrame.from_records(out)
+ row = df.iloc[0]
+ assert df.shape == (1, 2)
+ assert row["@type"] == "tinker:graph"
+ assert row["@value"] == {"vertices": ["v[45]", "v[9]"], "edges": ["e[3990][9-route->45]"]}
diff --git a/tutorials/033 - Amazon Neptune.ipynb b/tutorials/033 - Amazon Neptune.ipynb
new file mode 100644
index 000000000..e5cf3e4c3
--- /dev/null
+++ b/tutorials/033 - Amazon Neptune.ipynb
@@ -0,0 +1,522 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b0ee9a28",
+ "metadata": {},
+ "source": [
+ "[](https://github.com/awslabs/aws-data-wrangler)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a2a7b51",
+ "metadata": {},
+ "source": [
+ "# 33 - Amazon Neptune"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42724a76",
+ "metadata": {},
+ "source": [
+ "## Initialize\n",
+ "\n",
+ "The first step to using AWS Data Wrangler with Amazon Neptune is to import the library and create a client connection.\n",
+ "\n",
+    "Note: Connecting to Amazon Neptune requires that the application you are running has access to the Private VPC where Neptune is located.\n",
+    "Without this access you will not be able to connect using AWS Data Wrangler."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd098b2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import awswrangler as wr\n",
+ "import pandas as pd\n",
+ "\n",
+ "url='' # The Neptune Cluster endpoint\n",
+ "iam_enabled = False # Set to True/False based on the configuration of your cluster\n",
+ "neptune_port = 8182 # Set to the Neptune Cluster Port, Default is 8182\n",
+ "client = wr.neptune.connect(url, neptune_port, iam_enabled=iam_enabled)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e9499ea",
+ "metadata": {},
+ "source": [
+ "## Return the status of the cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57903cf4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(client.status())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f13f0cb",
+ "metadata": {},
+ "source": [
+ "## Retrieve Data from Neptune using AWS Data Wrangler\n",
+ "\n",
+ "AWS Data Wrangler supports querying Amazon Neptune using TinkerPop Gremlin and openCypher for property graph data or SPARQL for RDF data.\n",
+ "\n",
+ "### Gremlin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2801f447",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"g.E().project('source', 'target').by(outV().id()).by(inV().id()).limit(5)\"\n",
+ "df = wr.neptune.execute_gremlin(client, query)\n",
+ "display(df.head(5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7666d80",
+ "metadata": {},
+ "source": [
+ "### SPARQL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "91b52363",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"SELECT ?s ?o WHERE { ?s ?p ?o .} LIMIT 5\"\n",
+ "df = wr.neptune.execute_sparql(client, query)\n",
+ "display(df.head(5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "367791b9",
+ "metadata": {},
+ "source": [
+ "### openCypher"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce5df2ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"MATCH (n)-[r]->(d) RETURN id(n) as source, id(d) as target LIMIT 5\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "display(df.head(5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f91b967c",
+ "metadata": {},
+ "source": [
+ "## Saving Data using AWS Data Wrangler\n",
+ "\n",
+ "AWS Data Wrangler supports saving Pandas DataFrames into Amazon Neptune using either a property graph or RDF data model. \n",
+ "\n",
+ "### Property Graph\n",
+ "\n",
+ "If writing to a property graph then DataFrames for vertices and edges must be written separately. DataFrames for vertices must have a `~label` column with the label and a `~id` column for the vertex id.\n",
+ "\n",
+    "If the `~id` column does not exist, the specified id does not exist, or is empty then a new vertex will be added.\n",
+ "\n",
+ "If no `~label` column exists then writing to the graph will be treated as an update of the element with the specified `~id` value.\n",
+ "\n",
+    "DataFrames for edges must have a `~id`, `~label`, `~to`, and `~from` column. If the `~id` column does not exist, the specified id does not exist, or is empty then a new edge will be added. If no `~label`, `~to`, or `~from` column exists an exception will be thrown.\n",
+ "\n",
+ "#### Add Vertices/Nodes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "579fd9c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import uuid\n",
+ "import random\n",
+ "import string\n",
+ "def _create_dummy_vertex():\n",
+ " data = dict()\n",
+ " data[\"~id\"] = uuid.uuid4()\n",
+ " data[\"~label\"] = \"foo\"\n",
+ " data[\"int\"] = random.randint(0, 1000)\n",
+ " data[\"str\"] = \"\".join(random.choice(string.ascii_lowercase) for i in range(10))\n",
+ " data[\"list\"] = [random.randint(0, 1000), random.randint(0, 1000)]\n",
+ " return data\n",
+ "\n",
+ "data = [_create_dummy_vertex(), _create_dummy_vertex(), _create_dummy_vertex()]\n",
+ "df = pd.DataFrame(data)\n",
+ "res = wr.neptune.to_property_graph(client, df)\n",
+ "query = f\"MATCH (s) WHERE id(s)='{data[0]['~id']}' RETURN s\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd5fc8a2",
+ "metadata": {},
+ "source": [
+ "#### Add Edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "515f7a0f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import uuid\n",
+ "import random\n",
+ "import string\n",
+ "def _create_dummy_edge():\n",
+ " data = dict()\n",
+ " data[\"~id\"] = uuid.uuid4()\n",
+ " data[\"~label\"] = \"bar\"\n",
+ " data[\"~to\"] = uuid.uuid4()\n",
+ " data[\"~from\"] = uuid.uuid4()\n",
+ " data[\"int\"] = random.randint(0, 1000)\n",
+ " data[\"str\"] = \"\".join(random.choice(string.ascii_lowercase) for i in range(10))\n",
+ " return data\n",
+ "\n",
+ "data = [_create_dummy_edge(), _create_dummy_edge(), _create_dummy_edge()]\n",
+ "df = pd.DataFrame(data)\n",
+ "res = wr.neptune.to_property_graph(client, df)\n",
+ "query = f\"MATCH (s)-[r]->(d) WHERE id(r)='{data[0]['~id']}' RETURN r\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "efe6eaaf",
+ "metadata": {},
+ "source": [
+ "#### Update Existing Nodes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d831c7a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "idval=uuid.uuid4()\n",
+ "wr.neptune.execute_gremlin(client, f\"g.addV().property(T.id, '{str(idval)}')\")\n",
+ "query = f\"MATCH (s) WHERE id(s)='{idval}' RETURN s\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "print(\"Before\")\n",
+ "display(df)\n",
+ "data = [{\"~id\": idval, \"age\": 50}]\n",
+ "df = pd.DataFrame(data)\n",
+ "res = wr.neptune.to_property_graph(client, df)\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "print(\"After\")\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bff6a1fc",
+ "metadata": {},
+ "source": [
+ "#### Setting cardinality based on the header\n",
+ "\n",
+ " If you would like to save data using `single` cardinality then you can postfix (single) to the column header and\n",
+    " set `use_header_cardinality=True` (default). e.g. A column named `name(single)` will save the `name` property as single cardinality. You can disable this by setting `use_header_cardinality=False`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1010c2f5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = [_create_dummy_vertex()]\n",
+ "df = pd.DataFrame(data)\n",
+ "# Adding (single) to the column name in the DataFrame will cause it to write that property as `single` cardinality\n",
+ "df.rename(columns={\"int\": \"int(single)\"}, inplace=True)\n",
+ "res = wr.neptune.to_property_graph(client, df, use_header_cardinality=True)\n",
+ "\n",
+ "\n",
+ "# This can be disabled by setting `use_header_cardinality = False`\n",
+ "df.rename(columns={\"int\": \"int(single)\"}, inplace=True)\n",
+ "res = wr.neptune.to_property_graph(client, df, use_header_cardinality=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "beca9dab",
+ "metadata": {},
+ "source": [
+ "### RDF\n",
+ "\n",
+    "The DataFrame must consist of triples with column names for the subject, predicate, and object specified. If none are provided then `s`, `p`, and `o` are the defaults.\n",
+ "\n",
+ "If you want to add data into a named graph then you will also need the graph column, default is `g`.\n",
+ "\n",
+ "#### Write Triples"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f8427b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _create_dummy_triple():\n",
+ " data = dict()\n",
+ " data[\"s\"] = \"foo\"\n",
+ " data[\"p\"] = uuid.uuid4()\n",
+ " data[\"o\"] = random.randint(0, 1000)\n",
+ " return data\n",
+ "\n",
+ "data = [_create_dummy_triple(), _create_dummy_triple(), _create_dummy_triple()]\n",
+ "df = pd.DataFrame(data)\n",
+ "res = wr.neptune.to_rdf_graph(client, df)\n",
+ "query = \"SELECT ?o WHERE { <\" + str(data[0]['p']) + \"> ?o .}\"\n",
+ "df = wr.neptune.execute_sparql(client, query)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7a45c6a",
+ "metadata": {},
+ "source": [
+ "#### Write Quads"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "819f6a04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _create_dummy_quad():\n",
+ " data = _create_dummy_triple()\n",
+ " data[\"g\"] = \"bar\"\n",
+ " return data\n",
+ " \n",
+ "data = [_create_dummy_quad(), _create_dummy_quad(), _create_dummy_quad()]\n",
+ "df = pd.DataFrame(data)\n",
+ "res = wr.neptune.to_rdf_graph(client, df)\n",
+ "query = \"SELECT ?o WHERE { <\" + str(data[0]['p']) + \"> ?o .}\"\n",
+ "df = wr.neptune.execute_sparql(client, query)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8370b377",
+ "metadata": {},
+ "source": [
+ "## Flatten DataFrames\n",
+ "\n",
+    "One of the complexities of working with a row/column paradigm, such as Pandas, with graph result sets is that it is very common for graph queries to return complex and nested objects. To help simplify using the results returned from a graph within a more tabular format we have added a method to flatten the returned Pandas DataFrame.\n",
+ "\n",
+ "### Flattening the DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4488e185",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "client = wr.neptune.connect(url, 8182, iam_enabled=False)\n",
+ "query = \"MATCH (n) RETURN n LIMIT 1\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "print(\"Original\")\n",
+ "display(df)\n",
+ "df_new=wr.neptune.flatten_nested_df(df)\n",
+ "print(\"Flattened\")\n",
+ "display(df_new)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9324bff7",
+ "metadata": {},
+ "source": [
+ "### Removing the prefixing of the parent column name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e95099c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_new=wr.neptune.flatten_nested_df(df, include_prefix=False)\n",
+ "display(df_new)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "21738d39",
+ "metadata": {},
+ "source": [
+ "### Specifying the column header seperator"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f4bcbe3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_new=wr.neptune.flatten_nested_df(df, seperator='|')\n",
+ "display(df_new)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1bded05b",
+ "metadata": {},
+ "source": [
+ "## Putting it into a workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9129f173",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pip install igraph networkx"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd49d635",
+ "metadata": {},
+ "source": [
+ "### Running PageRank using NetworkX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ecd88fe2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import networkx as nx\n",
+ "\n",
+ "# Retrieve Data from neptune\n",
+ "client = wr.neptune.connect(url, 8182, iam_enabled=False)\n",
+ "query = \"MATCH (n)-[r]->(d) RETURN id(n) as source, id(d) as target LIMIT 100\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "\n",
+ "# Run PageRank\n",
+ "G=nx.from_pandas_edgelist(df, edge_attr=True)\n",
+ "pg = nx.pagerank(G)\n",
+ "\n",
+ "# Save values back into Neptune\n",
+ "rows=[]\n",
+ "for k in pg.keys():\n",
+ " rows.append({'~id': k, 'pageRank_nx(single)': pg[k]})\n",
+ "pg_df=pd.DataFrame(rows, columns=['~id','pageRank_nx(single)'])\n",
+ "res = wr.neptune.to_property_graph(client, pg_df, use_header_cardinality=True)\n",
+ "\n",
+ "# Retrieve newly saved data\n",
+ "query = \"MATCH (n:airport) WHERE n.pageRank_nx IS NOT NULL RETURN n.code, n.pageRank_nx ORDER BY n.pageRank_nx DESC LIMIT 5\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "783a599e",
+ "metadata": {},
+ "source": [
+ "### Running PageRank using iGraph"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abb8c7ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import igraph as ig\n",
+ "\n",
+ "# Retrieve Data from neptune\n",
+ "client = wr.neptune.connect(url, 8182, iam_enabled=False)\n",
+ "query = \"MATCH (n)-[r]->(d) RETURN id(n) as source, id(d) as target LIMIT 100\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "\n",
+ "# Run PageRank\n",
+ "g = ig.Graph.TupleList(df.itertuples(index=False), directed=True, weights=False)\n",
+ "pg = g.pagerank()\n",
+ "\n",
+ "# Save values back into Neptune\n",
+ "rows=[]\n",
+ "for idx, v in enumerate(g.vs):\n",
+ " rows.append({'~id': v['name'], 'pageRank_ig(single)': pg[idx]}) \n",
+ "pg_df=pd.DataFrame(rows, columns=['~id','pageRank_ig(single)'])\n",
+ "res = wr.neptune.to_property_graph(client, pg_df, use_header_cardinality=True)\n",
+ "\n",
+ "# Retrieve newly saved data\n",
+ "query = \"MATCH (n:airport) WHERE n.pageRank_ig IS NOT NULL RETURN n.code, n.pageRank_ig ORDER BY n.pageRank_ig DESC LIMIT 5\"\n",
+ "df = wr.neptune.execute_opencypher(client, query)\n",
+ "display(df)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}