Author: Kevin ALBERT  

Created: May 2021 (Updated: 11 Aug 2021)

TestRun: 12 Aug 2021

# Graph Data Science Algorithms
_**How to enrich data features with graph algorithms in Neo4j to improve machine learning model predictions**_  

1. [UNDERSTAND](#Understand-Algorithms)  
1. [DEMONSTRATE](#Demonstrate-Algorithms)  
1. [SOURCES](#Source-Documentation)  

In [None]:
# install python modules (only needed once per environment)
# conda-forge packages: graph client, data handling, system information, Azure blob storage
!conda install -y -c conda-forge py2neo pandas pyarrow fastparquet psutil azure-storage-blob
# conda-forge packages: plotting (seaborn) and xlrd
!conda install -y -c conda-forge seaborn xlrd
# pip packages: plotting helpers, explanation dashboards and the Azure ML SDK
!pip install monotonic packaging scikit-plot raiwidgets azureml-core azureml-interpret interpret
# networkx-neo4j (imported below as nxneo4j) is installed straight from its GitHub repository
!pip install git+https://github.com/ybaktir/networkx-neo4j

In [None]:
# delete database and restart (~2min)
# WARNING: destructive. The command chain stops the containers, removes the
# neo4j database and transaction folders and rebuilds/starts the containers.
# All paths are relative to the directory two levels above this notebook.
# The trailing "&" sends the chain to the background; the status returned by
# os.system is not checked, so failures are not reported here.
import os
os.system(" cd ../.. && \
            sudo docker-compose down && \
            sudo rm -Rf neo4j/data/databases/neo4j && \
            sudo rm -Rf neo4j/data/transactions/neo4j && \
            sudo docker-compose up --build &")

In [1]:
# data manipulation packages
import numpy as np
import pandas as pd
# pd.describe_option('display')            # show all pandas options, parameters can slow down notebook
pd.set_option('display.max_colwidth', 100) # default 50, the maximum width in characters of a column
pd.set_option('display.max_columns', 40)   # default 20, the maximum amount of columns in view 
pd.set_option('display.max_rows', 60)      # default 60, the maximum amount of rows in view

# Javascript
from IPython.display import Javascript

# data visualisation packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scikitplot as skplt

# explainer packages
from interpret.ext.blackbox import TabularExplainer
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.blackbox import PFIExplainer 
from interpret.ext.glassbox import LGBMExplainableModel
from interpret.ext.glassbox import LinearExplainableModel
from interpret.ext.glassbox import SGDExplainableModel
from interpret.ext.glassbox import DecisionTreeExplainableModel

# azureml packages
from azureml.core import Workspace, Experiment, Run
from azureml.interpret import ExplanationClient
from azureml.core import Workspace, Dataset, Datastore, Run

# enable dashboard rendering on self-hosted JupyterHub
from interpret import show, preserve
from interpret.provider import InlineProvider
from interpret import set_visualize_provider
set_visualize_provider(InlineProvider())

# interpretml packages
from interpret.glassbox import ExplainableBoostingClassifier

# interpret-community packages
from raiwidgets import ExplanationDashboard
from raiwidgets import ErrorAnalysisDashboard

# machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# graph neo4j packages
from py2neo import Graph, Node, Relationship
import nxneo4j as nx
from neo4j import GraphDatabase

# operating system packages
import platform
import psutil
import os

In [2]:
# environment: CPU, memory, swap file, first disk partition and OS release
bytes_per_gb = 1024**3

cpu_threads = psutil.cpu_count(logical=True)
cpu_ghz = psutil.cpu_freq().current / 1000
print(f"Cores : {cpu_threads} ({cpu_ghz:.0f}GHz)")

memory = psutil.virtual_memory()
print(f"Memory: {memory.total/bytes_per_gb:.2f} GB ({memory.percent}%)")

# swap size is read from the size of the /swapfile file
swap_bytes = os.path.getsize('/swapfile')
print(f"Swap  : {swap_bytes/bytes_per_gb:.0f} GB")

# only the first partition reported by psutil is summarised
first_partition = psutil.disk_partitions()[0]
first_usage = psutil.disk_usage(first_partition.mountpoint)
disk_size = first_usage.total
disk_used = first_usage.percent
disk_fs   = first_partition.fstype
print(f"Disk  : {disk_size/bytes_per_gb:.0f} GB ({disk_used}% {disk_fs})")

# the release is the first word after the '~' in the kernel version string
kernel_version = platform.uname().version
print(f"System: {kernel_version.split('~')[1].split()[0]}")

Cores : 2 (3GHz)
Memory: 7.78 GB (34.0%)
Swap  : 8 GB
Disk  : 145 GB (68.5% ext4)
System: 18.04.1-Ubuntu


In [3]:
# installed python modules
conda_version = ! conda -V
print(f"conda       : {conda_version[0].split()[1]}")
pip_version = ! pip -V
print(f"pip         : {pip_version[0].split()[1]}")
python_version = ! python -V
print(f"python      : {python_version[0].split()[1]}")
pandas_version = ! pip list |grep -i pandas
print(f"pandas      : {pandas_version[0].split()[1]}")
numpy_version = ! pip list |grep -ie "^numpy "
print(f"numpy       : {numpy_version[0].split()[1]}")
matplotlib_version = ! pip list |grep -ie "^matplotlib "
print(f"matplotlib  : {matplotlib_version[0].split()[1]}")
seaborn_version = ! pip list |grep -ie "^seaborn "
print(f"seaborn     : {seaborn_version[0].split()[1]}")
scikit_plot_version = ! pip list |grep -ie "^scikit-plot "
print(f"scikit-plot : {scikit_plot_version[0].split()[1]}")
sklearn_version = ! pip list |grep -ie "^scikit-learn "
print(f"sklearn     : {sklearn_version[0].split()[1]}")
py2neo_version = ! pip list |grep -i py2neo
print(f"py2neo      : {py2neo_version[0].split()[1]}")
raiwidgets_version = ! pip list |grep -ie "^raiwidgets "
print(f"raiwidgets  : {raiwidgets_version[0].split()[1]}")
interpret_version = ! pip list |grep -ie "^interpret "
print(f"interpret   : {interpret_version[0].split()[1]}")
azuremlcore_version = ! pip list |grep -ie "^azureml-core "
print(f"azureml-core: {azuremlcore_version[0].split()[1]}")
networkx_neo4j_version = ! pip list |grep -ie "^networkx-neo4j "
print(f"nxneo4j     : {networkx_neo4j_version[0].split()[1]}")

conda       : 4.10.1
pip         : 21.1.2
python      : 3.8.10
pandas      : 1.2.4
numpy       : 1.20.3
matplotlib  : 3.4.2
seaborn     : 0.11.1
scikit-plot : 0.3.7
sklearn     : 0.24.2
py2neo      : 2021.1.1
raiwidgets  : 0.4.0
interpret   : 0.2.4
azureml-core: 1.30.0
nxneo4j     : 0.0.3


In [4]:
# load graph connection instance
# Every setting can be overridden with an environment variable so credentials
# do not have to be edited in (or committed with) the notebook. The defaults
# keep the previous behaviour.
# NOTE(review): the default password is still stored in clear text here;
# consider rotating it and removing the default.
server  = os.environ.get("NEO4J_HOST", "20.86.118.190")
port    = os.environ.get("NEO4J_PORT", "7687")
user    = os.environ.get("NEO4J_USER", "neo4j")
passw   = os.environ.get("NEO4J_PASSWORD", "digityser")
db_name = os.environ.get("NEO4J_DATABASE", "neo4j")  # default name in v4.x
uri     = "bolt://"+server+":"+port

# py2neo instance; built from the same URI as the driver below so that a
# non-default port is honoured (previously only the host was passed)
graph = Graph(uri, auth=(user, passw), name=db_name)

driver = GraphDatabase.driver(uri=uri, auth=(user,passw))    # nxneo4j instance
G = nx.DiGraph(driver) # directed graph

In [5]:
# neo4j database version
graph.call.dbms.components()

 name         | versions  | edition    
--------------|-----------|------------
 Neo4j Kernel | ['4.2.6'] | enterprise 

In [6]:
# neo4j plugins
! sudo ls -l ../../neo4j/plugins

total 94288
-rw-r--r-- 1 ubuntu root 21208168 May  5 22:16 apoc-4.2.0.4-all.jar
-rw-r--r-- 1 ubuntu root 13429472 May  5 22:17 apoc-couchbase-dependencies-4.2.0.4.jar
-rw-r--r-- 1 ubuntu root   709134 May  5 22:17 apoc-email-dependencies-4.2.0.4.jar
-rw-r--r-- 1 ubuntu root  1483696 May  5 22:17 apoc-mongodb-dependencies-4.2.0.4.jar
-rw-r--r-- 1 ubuntu root 10848419 May  5 22:17 apoc-nlp-dependencies-4.2.0.4.jar
-rw-r--r-- 1 ubuntu root 13956780 May  5 22:17 apoc-xls-dependencies-4.2.0.4.jar
-rw-r--r-- 1 ubuntu root 11162611 Apr 28 16:06 bloom-plugin-4.x-1.6.1.jar
-rw-r--r-- 1 ubuntu root       84 May 26 16:25 bloom-plugin.license
-rw-r--r-- 1 ubuntu root  9229517 Apr 28 16:06 neo4j-bloom-1.6.1-assets.zip
-rw-r--r-- 1 ubuntu root 14498309 May  6 07:43 neo4j-graph-data-science-1.5.2.jar


In [7]:
# neo4j database configuration
! sudo cat ../../neo4j/conf/neo4j.conf


dbms.default_listen_address=0.0.0.0


neo4j.bloom.license_file=/plugins/bloom-plugin.license
neo4j.bloom.authorization_role=admin,architect
dbms.unmanaged_extension_classes=com.neo4j.bloom.server=/browser/bloom
dbms.tx_log.rotation.retention_policy=100M size
dbms.security.procedures.whitelist=apoc.*,gds.*
dbms.security.procedures.unrestricted=apoc.*,gds.*,bloom.*
dbms.memory.pagecache.size=2G
dbms.memory.heap.max_size=2G
dbms.directories.plugins=/plugins
dbms.directories.logs=/logs
dbms.directories.import=/import
causal_clustering.transaction_advertised_address=1c3465c44514:6000
causal_clustering.raft_advertised_address=1c3465c44514:7000
causal_clustering.discovery_advertised_address=1c3465c44514:5000
apoc.import.file.enabled=true


## Data

In [41]:
# https://data.world/nrippner/titanic-disaster-dataset
df = pd.read_csv('https://query.data.world/s/d5rym7e5oz4zztlmwq6motdu2rcy2t')


# drop all columns with all values NA
df = df.dropna(axis=1, how='all')
# drop all records with all values NA
df = df.dropna(axis=0, how='all')
# remove duplicate rows
df = df.drop_duplicates()

# use the row index as passenger identifier
df['passengerId'] = df.index

# Calculate total size of family including the passenger
df['family_size'] = df['sibsp'] + df['parch'] + 1

# Add family_name column (text before the first comma of the name) to easily identify relatives
df['family_name'] = df['name'].str.split(',', expand=True)[0]

# Extract deck (first character of the cabin) BEFORE the cabin placeholder is
# filled in: otherwise passengers without a cabin get deck 'N' (from 'No Cabin')
# and the 'No Deck' fill never applies
df['deck'] = df['cabin'].str[:1].fillna('No Deck')

# Fill NaN values for Cabins
df['cabin'] = df['cabin'].fillna('No Cabin')

# Set the embarked value for the passengers on ticket 113572
df.loc[df['ticket'] == '113572', 'embarked'] = 'S'

# Replace embarked with location name
embarked = {"S": "Southampton", "C": "Cherbourg", "Q": "Queenstown"}
df['embarked'] = df['embarked'].map(embarked)

# replace nan with median value
df['age'] = df['age'].fillna(df['age'].median())
df['fare'] = df['fare'].fillna(df['fare'].median())

# remove irrelevant columns
df = df.drop(['boat', 'body', 'home.dest'], axis=1)

# replace feature names with interpretable naming (others are left as-is)
df = df.rename(columns={'name':'fullname',
                        'sibsp':'nrSiblings', 'parch':'nrParents', 'ticket':'ticketId',
                        'fare':'ticketPrice', 'embarked':'port',
                        'family_size':'familySize', 'family_name':'familyName'})

# keep only numeric ID values
df['ticketId'] = df['ticketId'].apply(lambda x: ''.join(c for c in x if c.isnumeric()))

# tickets without any digit become the placeholder id '0'
df['ticketId'] = df['ticketId'].replace('', '0')

# Save final dataset to data/silver and to neo4j/import
for target_path in ("../../data/silver/titanicgraph/titanic.csv",
                    "../../neo4j/import/titanic.csv"):
    df.to_csv(target_path, sep=',', index=False)

In [None]:
# load cleaned dataset
df = pd.read_csv('../../data/silver/titanicgraph/titanic.csv')

## Passenger

In [8]:
# neo4j/import dataset "filename"
file = "titanic.csv"

In [9]:
# every passenger is identified by a unique passengerId
graph.run("CREATE CONSTRAINT ON (p:Passenger) ASSERT p.passengerId IS UNIQUE")

# MERGE on the key only and SET the remaining properties. Merging on the full
# property map only matches a node that is identical in every property, so a
# re-import with any changed value would try to create a second node with the
# same passengerId (constraint violation), and a null field would make the
# MERGE fail.
query = """
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///"""+file+"""' AS line FIELDTERMINATOR ','
MERGE (p:Passenger {passengerId:toInteger(line.passengerId)})
SET p.survived = toInteger(line.survived),
    p.fullname = line.fullname,
    p.sex = line.sex,
    p.age = toInteger(line.age),
    p.nrSiblings = toInteger(line.nrSiblings),
    p.nrParents = toInteger(line.nrParents),
    p.familySize = toInteger(line.familySize),
    p.familyName = line.familyName,
    p.port = toString(line.port),
    p.pclass = toInteger(line.pclass),
    p.ticketId = line.ticketId,
    p.cabinId = line.cabin,
    p.ticketPrice = toFloat(line.ticketPrice)
"""
graph.run(query).stats()

{'labels_added': 1309, 'nodes_created': 1309, 'properties_set': 18326}

## Cabin

In [11]:
# each cabin value may exist only once as a Cabin node
graph.run("CREATE CONSTRAINT ON (c:Cabin) ASSERT c.cabin IS UNIQUE")

# step 1: create one Cabin node per distinct cabin value in the CSV
query = """
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///"""+file+"""' AS line FIELDTERMINATOR ','
MERGE (c:Cabin {cabin:(line.cabin)})
"""
display(graph.run(query).stats())

# step 2: link every passenger to the Cabin node that matches its cabin value
query = """
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///"""+file+"""' AS line FIELDTERMINATOR ','
MATCH (p:Passenger {passengerId:toInteger(line.passengerId)})
MATCH (c:Cabin {cabin:(line.cabin)})
MERGE (p)-[:SAME_CABIN]->(c)
"""
display(graph.run(query).stats())

# step 3: remove the 'No Cabin' placeholder node together with its relationships,
# so passengers without a known cabin are not connected through it
query = """
MATCH (c:Cabin {cabin:'No Cabin'})
DETACH DELETE c
"""
display(graph.run(query).stats())

{'labels_added': 187, 'nodes_created': 187, 'properties_set': 187}

{'relationships_created': 1309}

{'nodes_deleted': 1, 'relationships_deleted': 1014}

In [12]:
%%time
# Create the named in-memory GDS graph 'myCabinGraph' from Passenger and Cabin nodes.
# The stored SAME_CABIN relationships are projected under the key SLEPT_IN and keep
# their stored direction (orientation 'NATURAL').
query = """
CALL gds.graph.create('myCabinGraph',
    ['Passenger', 'Cabin'],
    {
        SLEPT_IN: {
            type: 'SAME_CABIN',
            orientation: 'NATURAL'
        }
    }
)
"""
display(graph.run(query).stats())

{}

CPU times: user 2.7 ms, sys: 352 µs, total: 3.05 ms
Wall time: 2.81 s


In [13]:
%%time
# Run node similarity on 'myCabinGraph' in mutate mode: the results are added to the
# in-memory graph as relationships of type 'SAME_CABIN' carrying a 'score' property.
# NOTE(review): the very low similarityCutoff and large topK look intended to keep
# every passenger pair that shares a cabin - confirm.
query = """
CALL gds.nodeSimilarity.mutate('myCabinGraph', {
    similarityCutoff: 0.00001,
    degreeCutoff: 1,
    topK: 10000,
    mutateProperty: 'score',
    mutateRelationshipType: 'SAME_CABIN'
})
"""
display(graph.run(query).stats())

{}

CPU times: user 2.83 ms, sys: 368 µs, total: 3.19 ms
Wall time: 743 ms


In [14]:
%%time
# Run a series of GDS algorithms on 'myCabinGraph', restricted to Passenger nodes
# and SAME_CABIN relationships, and write each result to a node property with the
# suffix "_cabin". Only the stats of the last statement are shown as cell output.

# degree centrality -> degree_cabin
query = """
CALL gds.alpha.degree.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    writeProperty: 'degree_cabin'})
"""
graph.run(query).stats()

# triangle count -> trianglesCount_cabin
query = """
CALL gds.triangleCount.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    writeProperty: 'trianglesCount_cabin'})
"""
graph.run(query).stats()

# local clustering coefficient -> clusteringCoefficient_cabin
query = """
CALL gds.localClusteringCoefficient.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    writeProperty: 'clusteringCoefficient_cabin'})
"""
graph.run(query).stats()

# weakly connected components -> weaklyconnectedcomponents_cabin
query = """
CALL gds.wcc.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    writeProperty: 'weaklyconnectedcomponents_cabin'})
"""
graph.run(query).stats()

# betweenness centrality -> betweenness_cabin
query = """
CALL gds.betweenness.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    writeProperty: 'betweenness_cabin'})
"""
graph.run(query).stats()

# PageRank -> pagerank_cabin
query = """
CALL gds.pageRank.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    dampingFactor: 0.85,
    tolerance: 0.0000001,
    maxIterations: 20,
    writeProperty: 'pagerank_cabin'})
"""
graph.run(query).stats()

# Louvain community detection (final communities only) -> louvain_cabin
query = """
CALL gds.louvain.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    includeIntermediateCommunities: false,
    writeProperty: 'louvain_cabin'})
"""
graph.run(query).stats()

# FastRP node embedding with 4 dimensions -> fastrp_cabin
query = """
CALL gds.fastRP.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    embeddingDimension: 4,
    writeProperty:'fastrp_cabin'})
"""
graph.run(query).stats()

# node2vec node embedding with 4 dimensions (unweighted) -> node2vec_cabin
query = """
CALL gds.alpha.node2vec.write('myCabinGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['SAME_CABIN'],
    embeddingDimension: 4,
    iterations: 10,
    walkLength: 80,
    inOutFactor: 1,
    returnFactor: 1,
    relationshipWeightProperty: null,
    writeProperty: 'node2vec_cabin'})
"""
graph.run(query).stats()

CPU times: user 8.54 ms, sys: 747 µs, total: 9.29 ms
Wall time: 35.8 s


{}

In [15]:
# remove all stored SAME_CABIN relationships from the database
# (the stored SAME_CABIN relationships are the Passenger->Cabin links created by the
# load step; the statement does not touch the in-memory graph 'myCabinGraph')
graph.run("""MATCH p=()-[r:SAME_CABIN]->() DETACH DELETE r""")

# remove all cabin nodes
graph.run("""MATCH (c:Cabin) DETACH DELETE c""")

(No data)

## TicketId

In [16]:
# each ticket id may exist only once as a Ticket node
graph.run("CREATE CONSTRAINT ON (t:Ticket) ASSERT t.ticketId IS UNIQUE")

# step 1: create one Ticket node per distinct ticketId value in the CSV
query = """
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///"""+file+"""' AS line FIELDTERMINATOR ','
MERGE (t:Ticket {ticketId:(line.ticketId)})
"""
display(graph.run(query).stats())

# step 2: link every passenger to the Ticket node that matches its ticketId
query = """
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///"""+file+"""' AS line FIELDTERMINATOR ','
MATCH (p:Passenger {passengerId:toInteger(line.passengerId)})
MATCH (t:Ticket {ticketId:(line.ticketId)})
MERGE (p)-[:HELD_TICKET]->(t)
"""
display(graph.run(query).stats())

# step 3: remove the placeholder ticket '0' (tickets without any digit) together with
# its relationships, so those passengers are not connected through it
query = """
MATCH (t:Ticket {ticketId:'0'})
DETACH DELETE t
"""
display(graph.run(query).stats())

{'labels_added': 924, 'nodes_created': 924, 'properties_set': 924}

{'relationships_created': 1309}

{'nodes_deleted': 1, 'relationships_deleted': 4}

In [17]:
%%time
# Create the named in-memory GDS graph 'myTicketGraph' from Passenger and Ticket nodes.
# The stored HELD_TICKET relationships are projected under the key HOLD_ON and keep
# their stored direction (orientation 'NATURAL').
query = """
CALL gds.graph.create('myTicketGraph',
    ['Passenger', 'Ticket'],
    {
        HOLD_ON: {
            type: 'HELD_TICKET',
            orientation: 'NATURAL'
        }
    }
)
"""
display(graph.run(query).stats())

{}

CPU times: user 2.63 ms, sys: 0 ns, total: 2.63 ms
Wall time: 136 ms


In [18]:
%%time
# Run node similarity on 'myTicketGraph' in mutate mode: the results are added to the
# in-memory graph as relationships of type 'HELD_TICKET' carrying a 'score' property.
# NOTE(review): the very low similarityCutoff and large topK look intended to keep
# every passenger pair that shares a ticket - confirm.
query = """
CALL gds.nodeSimilarity.mutate('myTicketGraph', {
    similarityCutoff: 0.00001,
    degreeCutoff: 1,
    topK: 10000,
    mutateProperty: 'score',
    mutateRelationshipType: 'HELD_TICKET'
})
"""
display(graph.run(query).stats())

{}

CPU times: user 2.78 ms, sys: 0 ns, total: 2.78 ms
Wall time: 397 ms


In [19]:
%%time
# Run a series of GDS algorithms on 'myTicketGraph', restricted to Passenger nodes
# and HELD_TICKET relationships, and write each result to a node property with the
# suffix "_ticket". Only the stats of the last statement are shown as cell output.

# degree centrality -> degree_ticket
query = """
CALL gds.alpha.degree.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    writeProperty: 'degree_ticket'})
"""
graph.run(query).stats()

# triangle count -> trianglesCount_ticket
query = """
CALL gds.triangleCount.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    writeProperty: 'trianglesCount_ticket'})
"""
graph.run(query).stats()

# local clustering coefficient -> clusteringCoefficient_ticket
query = """
CALL gds.localClusteringCoefficient.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    writeProperty: 'clusteringCoefficient_ticket'})
"""
graph.run(query).stats()

# weakly connected components -> weaklyconnectedcomponents_ticket
query = """
CALL gds.wcc.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    writeProperty: 'weaklyconnectedcomponents_ticket'})
"""
graph.run(query).stats()

# betweenness centrality -> betweenness_ticket
query = """
CALL gds.betweenness.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    writeProperty: 'betweenness_ticket'})
"""
graph.run(query).stats()

# PageRank -> pagerank_ticket
query = """
CALL gds.pageRank.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    dampingFactor: 0.85,
    tolerance: 0.0000001,
    maxIterations: 20,
    writeProperty: 'pagerank_ticket'})
"""
graph.run(query).stats()

# Louvain community detection (final communities only) -> louvain_ticket
query = """
CALL gds.louvain.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    includeIntermediateCommunities: false,
    writeProperty: 'louvain_ticket'})
"""
graph.run(query).stats()

# FastRP node embedding with 4 dimensions -> fastrp_ticket
query = """
CALL gds.fastRP.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    embeddingDimension: 4,
    writeProperty:'fastrp_ticket'})
"""
graph.run(query).stats()

# node2vec node embedding with 4 dimensions (unweighted) -> node2vec_ticket
query = """
CALL gds.alpha.node2vec.write('myTicketGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['HELD_TICKET'],
    embeddingDimension: 4,
    iterations: 10,
    walkLength: 80,
    inOutFactor: 1,
    returnFactor: 1,
    relationshipWeightProperty: null,
    writeProperty: 'node2vec_ticket'})
"""
graph.run(query).stats()

CPU times: user 9.55 ms, sys: 0 ns, total: 9.55 ms
Wall time: 1min 3s


{}

In [20]:
# remove all stored HELD_TICKET relationships from the database
# (the stored HELD_TICKET relationships are the Passenger->Ticket links created by the
# load step; the statement does not touch the in-memory graph 'myTicketGraph')
graph.run("""MATCH p=()-[r:HELD_TICKET]->() DETACH DELETE r""")

# remove all ticket nodes
graph.run("""MATCH (t:Ticket) DETACH DELETE t""")

(No data)

## Family

In [21]:
# family = pd.read_csv('https://raw.githubusercontent.com/ybaktir/neo4ds/master/titanic/data/relationships.csv')
# fixed some "quote typos":
family = pd.read_csv('../../data/silver/titanicgraph/titanicrelationships.csv')

In [22]:
%%time
# RELATIONSHIPS
# passengers are matched on their 'fullname' property
G.node_label = 'Passenger'
G.identifier_property = 'fullname'

# one edge per row of the relationships file; rows without a type become 'CONNECTED'
for raw_type, source_name, target_name in zip(family['type'], family['u_name'], family['v_name']):
    type_text = str(raw_type)
    G.relationship_type = 'CONNECTED' if type_text == 'nan' else type_text
    G.add_edge(str(source_name), str(target_name))

CPU times: user 3.34 s, sys: 78.2 ms, total: 3.42 s
Wall time: 57.3 s


In [23]:
%%time
# Create the named in-memory GDS graph 'myFamilyGraph' from Passenger nodes only.
# All relationship types ('*') are projected under the key R and treated as undirected.
# NOTE(review): presumably only the relationships added from the family file remain
# between passengers at this point - confirm.
query = """
CALL gds.graph.create('myFamilyGraph',
    ['Passenger'],
    {
        R: {
            type: '*',
            orientation: 'UNDIRECTED'
        }
    }
)
"""
display(graph.run(query).stats())

{}

CPU times: user 2.9 ms, sys: 0 ns, total: 2.9 ms
Wall time: 132 ms


In [24]:
%%time
# Run a series of GDS algorithms on 'myFamilyGraph', restricted to Passenger nodes
# and using all projected relationship types ('*'), and write each result to a node
# property with the suffix "_family". Only the stats of the last statement are shown
# as cell output.

# degree centrality -> degree_family
query = """
CALL gds.alpha.degree.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    writeProperty: 'degree_family'})
"""
graph.run(query).stats()

# triangle count -> trianglesCount_family
query = """
CALL gds.triangleCount.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    writeProperty: 'trianglesCount_family'})
"""
graph.run(query).stats()

# local clustering coefficient -> clusteringCoefficient_family
query = """
CALL gds.localClusteringCoefficient.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    writeProperty: 'clusteringCoefficient_family'})
"""
graph.run(query).stats()

# weakly connected components -> weaklyconnectedcomponents_family
query = """
CALL gds.wcc.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    writeProperty: 'weaklyconnectedcomponents_family'})
"""
graph.run(query).stats()

# betweenness centrality -> betweenness_family
query = """
CALL gds.betweenness.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    writeProperty: 'betweenness_family'})
"""
graph.run(query).stats()

# PageRank -> pagerank_family
query = """
CALL gds.pageRank.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    dampingFactor: 0.85,
    tolerance: 0.0000001,
    maxIterations: 20,
    writeProperty: 'pagerank_family'})
"""
graph.run(query).stats()

# Louvain community detection (final communities only) -> louvain_family
query = """
CALL gds.louvain.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    includeIntermediateCommunities: false,
    writeProperty: 'louvain_family'})
"""
graph.run(query).stats()

# FastRP node embedding with 4 dimensions -> fastrp_family
query = """
CALL gds.fastRP.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    embeddingDimension: 4,
    writeProperty:'fastrp_family'})
"""
graph.run(query).stats()

# node2vec node embedding with 4 dimensions (unweighted) -> node2vec_family
query = """
CALL gds.alpha.node2vec.write('myFamilyGraph', {
    nodeLabels: ['Passenger'],
    relationshipTypes: ['*'],
    embeddingDimension: 4,
    iterations: 10,
    walkLength: 80,
    inOutFactor: 1,
    returnFactor: 1,
    relationshipWeightProperty: null,
    writeProperty: 'node2vec_family'})
"""
graph.run(query).stats()

CPU times: user 10.8 ms, sys: 0 ns, total: 10.8 ms
Wall time: 1min 9s


{}

## Pipeline

In [25]:
def ml_pipeline(df):
    """Cross-validate, fit and explain a random-forest classifier on ``df``.

    The frame must contain a ``Survived`` column, which is used as the target;
    every other column is used as a feature. Columns with dtype ``object`` are
    treated as categorical (constant imputation + one-hot encoding), all
    remaining columns as numerical (median imputation + standard scaling).

    The function prints the mean and median ROC AUC over a repeated stratified
    5-fold cross-validation (25 repeats, i.e. 125 scores), then fits the
    pipeline on the full data and prints the global feature importances of a
    LightGBM surrogate model (``MimicExplainer``) of the fitted classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``Survived`` target column.

    Returns
    -------
    None
        Results are printed, nothing is returned.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import RepeatedStratifiedKFold

    # split features + target variable
    X = df.drop('Survived', axis='columns')
    y = df["Survived"]

    # categorical features are the columns stored with dtype 'object'
    # (DataFrame.items replaces the deprecated DataFrame.iteritems)
    categorical = [col for col, value in X.items() if value.dtype == 'object']
    # every other column is handled as a numerical feature
    numerical = list(X.columns.difference(categorical))

    # create the preprocessing pipelines for both numeric and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    transformations = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical),
            ('cat', categorical_transformer, categorical)])

    # append classifier to preprocessing pipeline
    pipeline = Pipeline(steps=[('preprocessor', transformations),
                               ('classifier', RandomForestClassifier(random_state=0))])

    # fixed random_state so that runs with different feature sets are comparable
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=25, random_state=11)

    # cross-validated AUC over the complete dataset: one score per fold and
    # repeat (5 x 25 = 125 scores), summarised by mean and median
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc')
    print("mean   : {:2.1%}\nmedian : {:2.1%}".format(np.mean(scores), np.median(scores)))

    # the explained model is trained on all rows (no hold-out set is kept)
    model = pipeline.fit(X, y)

    mimic_explainer = MimicExplainer(model=model.steps[-1][1],              # trained classifier (last pipeline step)
                                     initialization_examples=X,
                                     explainable_model=LGBMExplainableModel,   # glassbox model
                                     augment_data=True,
                                     max_num_of_augmentations=10,
                                     features=X.columns,                       # ex: ['Pclass', 'Sex', 'Embarked']
                                     classes=['not_survived', 'survived'],     # '1' = Survived
                                     transformations=transformations)

    global_mimic_explanation = mimic_explainer.explain_global(X)
    print(global_mimic_explanation.get_feature_importance_dict())

## Experiments

### nothing

In [26]:
# baseline experiment: only the plain passenger attributes, no graph features
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked
"""
df = graph.run(query).to_data_frame()

# string columns get dtype 'object', which ml_pipeline treats as categorical
df = df.astype({'Survived':'int', 'Sex':'str', 'Embarked':'str', 'Pclass':'str'})

ml_pipeline(df)

mean   : 84.5%
median : 84.5%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.9169174738302364, 'Pclass': 0.8424180504263347, 'TicketPrice': 0.5725878562394551, 'Age': 0.5053594556216413, 'Embarked': 0.2865208048814677, 'NrSiblings': 0.12506287555167436, 'NrParents': 0.08718427448102094}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### cabin

In [35]:
# baseline passenger attributes plus the raw cabin identifier as one extra feature
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       p.cabinId AS Cabin
"""
# string columns get dtype 'object', which ml_pipeline treats as categorical
column_types = {'Survived':'int', 'Sex':'str', 'Embarked':'str', 'Pclass':'str', 'Cabin':'str'}
df = graph.run(query).to_data_frame().astype(column_types)

ml_pipeline(df)

mean   : 84.8%
median : 84.9%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.8511358812187682, 'Pclass': 0.7035332616740584, 'TicketPrice': 0.5304023686263541, 'Age': 0.46645042971780576, 'Cabin': 0.36863472665305164, 'Embarked': 0.25312864125176016, 'NrSiblings': 0.152023621940328, 'NrParents': 0.14232638714017226}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### cabin graph features

In [28]:
# baseline passenger attributes plus every "_cabin" graph feature written by the algorithms
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       
       p.trianglesCount_cabin AS trianglesCount_cabin,
       p.clusteringCoefficient_cabin AS clusteringCoefficient_cabin,
       p.betweenness_cabin AS betweenness_cabin,
       p.degree_cabin AS degree_cabin,
       p.weaklyconnectedcomponents_cabin AS wcc_cabin,
       p.louvain_cabin AS louvain_cabin,
       p.pagerank_cabin AS pagerank_cabin,
       p.node2vec_cabin AS node2vec_cabin,
       p.fastrp_cabin AS fastrp_cabin
"""
# columns cast to 'str'/'object' are treated as categorical by ml_pipeline
column_types = {'Survived':'int', 'Sex':'str', 'Embarked':'str', 'Pclass':'str',
                'trianglesCount_cabin':'int',
                'clusteringCoefficient_cabin':'object',
                'betweenness_cabin':'object',
                'degree_cabin':'int',
                'wcc_cabin':'int',
                'pagerank_cabin':'float',
                'louvain_cabin':'int'}
df = graph.run(query).to_data_frame().astype(column_types)

# replace each embedding column (one list per passenger) by one numbered column per
# dimension, e.g. node2vec_cabin -> node2vec_cabin1 .. node2vec_cabinN
for embedding_name in ('node2vec_cabin', 'fastrp_cabin'):
    # the dimension is taken from the embedding of the first row
    dimension = len(df[embedding_name][0])
    dimension_names = [embedding_name + str(position) for position in range(1, dimension + 1)]
    expanded = pd.DataFrame(df[embedding_name].tolist(), columns=dimension_names)
    df = pd.concat([df, expanded], axis=1)
    df = df.drop([embedding_name], axis=1)

ml_pipeline(df)

mean   : 85.4%
median : 85.3%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.6013906418217314, 'louvain_cabin': 0.6620582967382125, 'wcc_cabin': 0.31138969434156, 'Age': 0.24893390225544446, 'node2vec_cabin2': 0.22302918633325158, 'node2vec_cabin4': 0.20628191915549066, 'TicketPrice': 0.1944638929700958, 'Embarked': 0.15457065760423086, 'node2vec_cabin3': 0.150366373401217, 'node2vec_cabin1': 0.11012944305426647, 'NrSiblings': 0.09991533133680242, 'pagerank_cabin': 0.08433893500068965, 'NrParents': 0.05807174696653961, 'fastrp_cabin2': 0.05278415779338961, 'fastrp_cabin3': 0.017880722187921444, 'fastrp_cabin4': 0.016040447093941312, 'Pclass': 0.010827995253345532, 'fastrp_cabin1': 0.0050386820051153865, 'degree_cabin': 0.004368819185353827, 'trianglesCount_cabin': 0.0, 'clusteringCoefficient_cabin': 0.0, 'betweenness_cabin': 0.0}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### passengerId

In [29]:
# Cypher: baseline attributes of every Passenger node plus its passengerId
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       p.passengerId AS PassengerId
"""

# target dtype per returned column, applied in a single astype pass
column_dtypes = {
    'Survived': 'int',
    'Sex': 'str',
    'Embarked': 'str',
    'Pclass': 'str',
    'PassengerId': 'int',
}
df = graph.run(query).to_data_frame().astype(column_dtypes)

# hand the frame to ml_pipeline (defined elsewhere in the notebook)
ml_pipeline(df)

mean   : 85.2%
median : 85.2%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.7971078955264312, 'PassengerId': 1.0116676771873503, 'Age': 0.4085999074557338, 'TicketPrice': 0.39135708940006125, 'Embarked': 0.25586586434448255, 'NrSiblings': 0.15499009479908615, 'NrParents': 0.06330438668632773, 'Pclass': 0.007474404541754354}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### ticketId

In [30]:
# Cypher: baseline attributes of every Passenger node plus its ticketId
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       p.ticketId AS TicketId
"""

# target dtype per returned column, applied in a single astype pass
column_dtypes = {
    'Survived': 'int',
    'Sex': 'str',
    'Embarked': 'str',
    'Pclass': 'str',
    'TicketId': 'int',
}
df = graph.run(query).to_data_frame().astype(column_dtypes)

# hand the frame to ml_pipeline (defined elsewhere in the notebook)
ml_pipeline(df)

mean   : 85.9%
median : 85.9%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.8668677894022927, 'Pclass': 0.8894699299894134, 'TicketId': 0.46913801003380295, 'Age': 0.4283001240353304, 'TicketPrice': 0.4147843900956752, 'Embarked': 0.16173419794883986, 'NrSiblings': 0.1410678327955804, 'NrParents': 0.05947214568017308}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### ticketId graph features

In [31]:
# Cypher: baseline attributes of every Passenger node together with the
# *_ticket properties stored on the node.
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       
       p.trianglesCount_ticket AS trianglesCount_ticket,
       p.clusteringCoefficient_ticket AS clusteringCoefficient_ticket,
       p.betweenness_ticket AS betweenness_ticket,
       p.degree_ticket AS degree_ticket,
       p.weaklyconnectedcomponents_ticket AS wcc_ticket,
       p.louvain_ticket AS louvain_ticket,
       p.pagerank_ticket AS pagerank_ticket,
       p.node2vec_ticket AS node2vec_ticket,
       p.fastrp_ticket AS fastrp_ticket
"""
df = graph.run(query).to_data_frame()

# Set the dtype of every scalar column in one pass.
# NOTE(review): clusteringCoefficient_* and betweenness_* are cast to 'object'
# rather than a numeric dtype - presumably deliberate for ml_pipeline; confirm.
df = df.astype({
    'Survived': 'int',
    'Sex': 'str',
    'Embarked': 'str',
    'Pclass': 'str',
    'trianglesCount_ticket': 'int',
    'clusteringCoefficient_ticket': 'object',
    'betweenness_ticket': 'object',
    'degree_ticket': 'int',
    'wcc_ticket': 'int',
    'pagerank_ticket': 'float',
    'louvain_ticket': 'int',
})

# Expand each embedding column (one sequence per row) into scalar feature
# columns named <embedding>1 .. <embedding>N and drop the original column.
# A single loop keeps the handling of every embedding identical.
for embedding in ('node2vec_ticket', 'fastrp_ticket'):
    vectors = df[embedding]
    # embedding dimension, read from the first row
    dimension = len(vectors.iloc[0])
    feature_names = [embedding + str(i) for i in range(1, dimension + 1)]
    # reuse df's index so concat aligns rows even if df is not indexed 0..n-1
    features = pd.DataFrame(vectors.tolist(), columns=feature_names, index=df.index)
    df = pd.concat([df.drop(columns=[embedding]), features], axis=1)

# hand the finished feature frame to ml_pipeline (defined elsewhere in the notebook)
ml_pipeline(df)

mean   : 87.8%
median : 87.7%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.6457364366782608, 'louvain_ticket': 0.7850741916600213, 'node2vec_ticket2': 0.25118025339740785, 'Age': 0.24468497581379503, 'TicketPrice': 0.20239069489463504, 'fastrp_ticket2': 0.17452394406825314, 'node2vec_ticket3': 0.17272376044912754, 'node2vec_ticket1': 0.16192841327901494, 'node2vec_ticket4': 0.1539259923210886, 'degree_ticket': 0.15057952291571094, 'wcc_ticket': 0.14995024897890605, 'Embarked': 0.11178066873697431, 'NrSiblings': 0.05469522435490206, 'fastrp_ticket1': 0.043447598360805334, 'fastrp_ticket4': 0.03755625059792137, 'pagerank_ticket': 0.03481824821858193, 'NrParents': 0.033015628477077875, 'fastrp_ticket3': 0.032742065314221386, 'trianglesCount_ticket': 0.027381730707593876, 'Pclass': 0.003632838485830826, 'betweenness_ticket': 0.0, 'clusteringCoefficient_ticket': 0.0}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### family graph features

In [32]:
# Cypher: baseline attributes of every Passenger node together with the
# *_family properties stored on the node.
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,

       p.trianglesCount_family AS trianglesCount_family,
       p.clusteringCoefficient_family AS clusteringCoefficient_family,
       p.betweenness_family AS betweenness_family,
       p.degree_family AS degree_family,
       p.weaklyconnectedcomponents_family AS wcc_family,
       p.louvain_family AS louvain_family,
       p.pagerank_family AS pagerank_family,
       p.node2vec_family AS node2vec_family,
       p.fastrp_family AS fastrp_family
"""
df = graph.run(query).to_data_frame()

# Set the dtype of every scalar column in one pass.
# NOTE(review): clusteringCoefficient_* and betweenness_* are cast to 'object'
# rather than a numeric dtype - presumably deliberate for ml_pipeline; confirm.
df = df.astype({
    'Survived': 'int',
    'Sex': 'str',
    'Embarked': 'str',
    'Pclass': 'str',
    'trianglesCount_family': 'int',
    'clusteringCoefficient_family': 'object',
    'betweenness_family': 'object',
    'degree_family': 'int',
    'wcc_family': 'int',
    'pagerank_family': 'float',
    'louvain_family': 'int',
})

# Expand each embedding column (one sequence per row) into scalar feature
# columns named <embedding>1 .. <embedding>N and drop the original column.
# A single loop keeps the handling of every embedding identical.
for embedding in ('node2vec_family', 'fastrp_family'):
    vectors = df[embedding]
    # embedding dimension, read from the first row
    dimension = len(vectors.iloc[0])
    feature_names = [embedding + str(i) for i in range(1, dimension + 1)]
    # reuse df's index so concat aligns rows even if df is not indexed 0..n-1
    features = pd.DataFrame(vectors.tolist(), columns=feature_names, index=df.index)
    df = pd.concat([df.drop(columns=[embedding]), features], axis=1)

# hand the finished feature frame to ml_pipeline (defined elsewhere in the notebook)
ml_pipeline(df)

mean   : 86.8%
median : 86.9%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.5156115743128287, 'Pclass': 0.57576182121642, 'louvain_family': 0.24919321566000802, 'Age': 0.2362573010675135, 'node2vec_family1': 0.20969972414447124, 'TicketPrice': 0.16815105825082258, 'Embarked': 0.16272852873701726, 'node2vec_family4': 0.14221825116391199, 'wcc_family': 0.13864632982746178, 'node2vec_family3': 0.12269939204738986, 'node2vec_family2': 0.11171886026429037, 'degree_family': 0.08484397731355833, 'fastrp_family2': 0.08182241039138195, 'fastrp_family1': 0.07528459445522521, 'fastrp_family4': 0.07015141057532945, 'fastrp_family3': 0.06348500033830333, 'trianglesCount_family': 0.06153589315061519, 'NrSiblings': 0.05761960254156879, 'pagerank_family': 0.04927931745479048, 'clusteringCoefficient_family': 0.04569363522822957, 'NrParents': 0.03409140569377808, 'betweenness_family': 0.0007758760756744551}


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


## COMBI cabin + ticket + family

In [39]:
# Cypher: baseline attributes of every Passenger node together with the
# *_cabin, *_ticket and *_family properties stored on the node.
# The '//' lines inside the query are Cypher comments that switch off the
# raw identifier columns.
query = """
MATCH (p:Passenger)
RETURN p.sex AS Sex,
       p.age AS Age,
       p.nrSiblings AS NrSiblings,
       p.nrParents AS NrParents,
       p.ticketPrice AS TicketPrice,
       p.survived AS Survived,
       p.pclass AS Pclass,
       p.port AS Embarked,
       
       //p.cabinId AS Cabin,
       //p.ticketId AS TicketId,
       //p.passengerId AS PassengerId,
       
       p.trianglesCount_cabin AS trianglesCount_cabin,
       p.clusteringCoefficient_cabin AS clusteringCoefficient_cabin,
       p.betweenness_cabin AS betweenness_cabin,
       p.degree_cabin AS degree_cabin,
       p.weaklyconnectedcomponents_cabin AS wcc_cabin,
       p.louvain_cabin AS louvain_cabin,
       p.pagerank_cabin AS pagerank_cabin,
       p.node2vec_cabin AS node2vec_cabin,
       p.fastrp_cabin AS fastrp_cabin,
       
       p.trianglesCount_ticket AS trianglesCount_ticket,
       p.clusteringCoefficient_ticket AS clusteringCoefficient_ticket,
       p.betweenness_ticket AS betweenness_ticket,
       p.degree_ticket AS degree_ticket,
       p.weaklyconnectedcomponents_ticket AS wcc_ticket,
       p.louvain_ticket AS louvain_ticket,
       p.pagerank_ticket AS pagerank_ticket,
       p.node2vec_ticket AS node2vec_ticket,
       p.fastrp_ticket AS fastrp_ticket,
       
       p.trianglesCount_family AS trianglesCount_family,
       p.clusteringCoefficient_family AS clusteringCoefficient_family,
       p.betweenness_family AS betweenness_family,
       p.degree_family AS degree_family,
       p.weaklyconnectedcomponents_family AS wcc_family,
       p.louvain_family AS louvain_family,
       p.pagerank_family AS pagerank_family,
       p.node2vec_family AS node2vec_family,
       p.fastrp_family AS fastrp_family
"""
df = graph.run(query).to_data_frame()
df = df.astype({'Survived':'int', 'Sex':'str', 'Embarked':'str', 'Pclass':'str'})
# Casts for the identifier columns; re-enable together with the matching
# '//' lines in the query above.
# df = df.astype({'Cabin':'str'})
# df = df.astype({'TicketId':'int'})
# df = df.astype({'PassengerId':'int'})

# The three relationship scopes share the same set of scalar graph features,
# so the dtype mapping is generated per scope instead of being spelled out
# three times.
# NOTE(review): clusteringCoefficient_* and betweenness_* are cast to 'object'
# rather than a numeric dtype - presumably deliberate for ml_pipeline; confirm.
scopes = ('cabin', 'ticket', 'family')
for scope in scopes:
    df = df.astype({
        'trianglesCount_' + scope: 'int',
        'clusteringCoefficient_' + scope: 'object',
        'betweenness_' + scope: 'object',
        'degree_' + scope: 'int',
        'wcc_' + scope: 'int',
        'pagerank_' + scope: 'float',
        'louvain_' + scope: 'int',
    })

# Expand each embedding column (one sequence per row) into scalar feature
# columns named <embedding>1 .. <embedding>N and drop the original column.
# Iteration order (node2vec then fastrp, per scope) fixes the order in which
# the new columns are appended to df.
for scope in scopes:
    for algorithm in ('node2vec', 'fastrp'):
        embedding = algorithm + '_' + scope
        vectors = df[embedding]
        # embedding dimension, read from the first row
        dimension = len(vectors.iloc[0])
        feature_names = [embedding + str(i) for i in range(1, dimension + 1)]
        # reuse df's index so concat aligns rows even if df is not indexed 0..n-1
        features = pd.DataFrame(vectors.tolist(), columns=feature_names, index=df.index)
        df = pd.concat([df.drop(columns=[embedding]), features], axis=1)

# hand the finished feature frame to ml_pipeline (defined elsewhere in the notebook)
ml_pipeline(df)

mean   : 87.5%
median : 87.5%


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
divide by zero encountered in true_divide
divide by zero encountered in log
categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


{'Sex': 1.4122783145284568, 'louvain_cabin': 0.3735955069637859, 'wcc_cabin': 0.1735441600962264, 'Age': 0.14968979718908246, 'louvain_ticket': 0.14604380173480347, 'fastrp_ticket2': 0.1421121871105653, 'node2vec_cabin2': 0.12964586469303832, 'node2vec_family1': 0.1256783678489946, 'node2vec_ticket2': 0.11906966929822344, 'node2vec_cabin4': 0.11065421536942242, 'TicketPrice': 0.10504840677052411, 'Embarked': 0.1012290724433035, 'node2vec_ticket3': 0.09847012896848022, 'degree_ticket': 0.09662193738120256, 'node2vec_family4': 0.08673261212990731, 'node2vec_ticket1': 0.08613459870081709, 'node2vec_family2': 0.07974999138300334, 'pagerank_ticket': 0.07660499804264821, 'node2vec_ticket4': 0.06925698688001902, 'pagerank_cabin': 0.0663027208857047, 'node2vec_cabin1': 0.0659701976101762, 'louvain_family': 0.06514675415874896, 'node2vec_cabin3': 0.06179031282202072, 'node2vec_family3': 0.06140173155578417, 'trianglesCount_family': 0.05885812171309304, 'wcc_family': 0.057366055013599856, 'fastr

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


nothing      = 84.5%  
  
cabin        = 84.8%  
cabin graph  = 85.4%  
  
ticket       = 85.9%  
ticket graph = 87.8%  
  
passenger    = 85.2%  
  
family graph = 86.8%  
  
combi        = 87.5%  