In [1]:
# Define Neo4j connections
import pandas as pd
from neo4j import GraphDatabase
host = 'neo4j://localhost:7687'
user = 'neo4j'
password = 'letmein'
driver = GraphDatabase.driver(host,auth=(user, password))

In [2]:
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [5]:
import_queries = """

CALL apoc.schema.assert({Character:['name']},{Comic:['id'], Character:['id'], Event:['id'], Group:['id']});

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroes.csv" as row
CREATE (c:Character)
SET c += row;

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/groups.csv" as row
CREATE (c:Group)
SET c += row;

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/events.csv" as row
CREATE (c:Event)
SET c += row;

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/comics.csv" as row
CREATE (c:Comic)
SET c += apoc.map.clean(row,[],["null"]);

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroToComics.csv" as row
MATCH (c:Character{id:row.hero})
MATCH (co:Comic{id:row.comic})
MERGE (c)-[:APPEARED_IN]->(co);

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroToEvent.csv" as row
MATCH (c:Character{id:row.hero})
MATCH (e:Event{id:row.event})
MERGE (c)-[:PART_OF_EVENT]->(e);

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroToGroup.csv" as row
MATCH (c:Character{id:row.hero})
MATCH (g:Group{id:row.group})
MERGE (c)-[:PART_OF_GROUP]->(g);

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroToHero.csv" as row
MATCH (s:Character{id:row.source})
MATCH (t:Character{id:row.target})
CALL apoc.create.relationship(s,row.type, {}, t) YIELD rel
RETURN distinct 'done';

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroStats.csv" as row
MATCH (s:Character{id:row.hero})
CREATE (s)-[:HAS_STATS]->(stats:Stats)
SET stats += apoc.map.clean(row,['hero'],[]);

LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/Marvel/heroFlight.csv" as row
MATCH (s:Character{id:row.hero})
SET s.flight = row.flight;

MATCH (s:Stats)
WITH keys(s) as keys LIMIT 1
MATCH (s:Stats)
UNWIND keys as key
CALL apoc.create.setProperty(s, key, toInteger(s[key]))
YIELD node
RETURN distinct 'done';
"""

## Graph import

In [6]:
with driver.session() as session:
    for statement in import_queries.split(';'):
        try:
            session.run(statement.strip())
        except:
            pass

## Graph schema
In the center of the graph, there are characters, also known as heroes. They can appear in multiple comics, are part of an event, and can belong to a group. For some of the characters, we also know their stats like speed and fighting skills. Finally, we have social ties between characters that represent relative, ally, or enemy relationships.

There are 1105 characters that have appeared in 38875 comics.
We have stats for 470 of the characters. There are also 92 groups and 74 events stored in the graph.
## Exploratory graph analysis
To get to know our graph, we will begin with a basic graph data exploration process. First, we will take a look at the characters that have most frequently appeared in comics.

In [26]:
run_query("""
MATCH (c:Character)
RETURN c.name as character, 
       size((c)-[:APPEARED_IN]->()) as comics
ORDER BY comics DESC
LIMIT 5
""")

Unnamed: 0,character,comics
0,Spider-Man (1602),3357
1,Tony Stark,2354
2,Logan,2098
3,Steve Rogers,2019
4,Thor (Marvel: Avengers Alliance),1547


The top five most frequent characters come as no surprise. Spiderman is the most frequent or popular character. It is no wonder that they created a younger version of Spiderman just recently, given his popularity. Tony Stark, also known as Iron Man, is in second place. It seems that Logan, also known as Wolverine, was quite popular throughout history, but I think that his popularity slowly faded away in recent times. Steve Rogers, who goes by the more popular name Captain America, is also quite famous. It would seem that the recent Marvel movies showcased the more popular characters from the comics.

Next, we will look at how many comics were released throughout the decades. The year of the comic is stored as a string in our graph, so we can use the substringfunction to extract the decade.

In [4]:
run_query("""
MATCH (c:Comic)
RETURN substring(c.year, 0, 3) + "0" as decade, 
       count(*) as count
ORDER BY decade ASC
""")

Unnamed: 0,decade,count
0,1930.0,95
1,1940.0,584
2,1950.0,756
3,1960.0,4114
4,1970.0,1956
5,1980.0,2428
6,1990.0,3738
7,2000.0,8309
8,2010.0,11139
9,2020.0,19


Interesting to see that the first comics were produced in the 1930s. Some of the heroes are relatively senior by now. There was a spike in the 1960s and then gradual progression over the decades with 11.139 comics in the 2010s. The last column represents comics with a null date, so for around 6000 comics out of 38000, we don’t have the date available. And we haven’t scraped all the comics in the 2020s either.

Next, we will take a look at the most popular characters in the comics throughout the decades. We will iterate over comics and extract the top three most frequent heroes by the decade.

In [5]:
run_query("""
MATCH (c:Comic)<-[:APPEARED_IN]-(c1:Character)
WHERE NOT c.year = "null"
WITH substring(c.year,0,3) + "0" as decade, 
     c1.name as character, 
     count(*) as count
ORDER BY count DESC
RETURN decade, collect(character)[..3] as top_3_characters
ORDER BY decade
""")

Unnamed: 0,decade,top_3_characters
0,1930,"[Johnny Storm, Sub-Mariner, Archangel]"
1,1940,"[Johnny Storm, Two-Gun Kid, Steve Rogers]"
2,1950,"[Rawhide Kid, Tony Stark, Stephen Strange]"
3,1960,"[Thor (Marvel: Avengers Alliance), Spider-Man ..."
4,1970,"[Spider-Man (1602), Stephen Strange, Shang-Chi..."
5,1980,"[Logan, Tony Stark, Spider-Man (1602)]"
6,1990,"[Spider-Man (1602), Tony Stark, Steve Rogers]"
7,2000,"[Spider-Man (1602), Tony Stark, Logan]"
8,2010,"[Spider-Man (1602), Steve Rogers, Logan]"


It seems it all started with Johnny Storm, also known as the Human Torch. Iron Man (Tony Stark) was already popular in the 1950s, and Spiderman and Captain America (Steve Rogers) have risen in popularity in the 1960s. From then on, it seems that Spiderman, Wolverine, Iron Man, and Captain America win the popularity contest.
You might be wondering what the events are in our graph, so let’s take a look. We will examine the events with the highest count of participating heroes.

In [6]:
run_query("""
MATCH (e:Event)
RETURN e.title as event, 
       size((e)<-[:PART_OF_EVENT]-()) as count_of_heroes,
       e.start as start,
       e.end as end,
       e.description as description 
ORDER BY count_of_heroes DESC 
LIMIT 5
""")

Unnamed: 0,event,count_of_heroes,start,end,description
0,Fear Itself,132,2011-04-16 00:00:00,2011-10-18 00:00:00,"The Serpent, God of Fear and brother to the Al..."
1,Dark Reign,128,2008-12-01 00:00:00,2009-12-31 12:59:00,Norman Osborn came out the hero of Secret Inva...
2,Acts of Vengeance!,93,1989-12-10 00:00:00,2008-01-04 00:00:00,Loki sets about convincing the super-villains ...
3,Secret Invasion,89,2008-06-02 00:00:00,2009-01-25 00:00:00,The shape-shifting Skrulls have been infiltrat...
4,Civil War,86,2006-07-01 00:00:00,2007-01-29 00:00:00,After a horrific tragedy raises questions on w...


I have little to no idea what these events represent, but it is interesting to see that many characters participate. Most of the events span over less than a year, while the Acts of Vengeance spans over two decades. And judging by the description, Loki had something to do with it along with 92! other characters. Unfortunately, we don’t have the connection between comics and events stored in our graph to allow further analysis. If someone will scrape the Marvel API, I will gladly add it to the dataset.

Let’s also take a look at the biggest groups of characters.

In [8]:
run_query("""
MATCH (g:Group)
RETURN g.name as group, 
       size((g)<-[:PART_OF_GROUP]-()) as members
ORDER BY members DESC LIMIT 5
""")

Unnamed: 0,group,members
0,X-Men,41
1,Avengers,31
2,Defenders,26
3,Next Avengers,14
4,Guardians of the Galaxy,12


There are 41 characters in X-Men, which makes sense as they had a whole academy. You might be surprised by 31 members of Avengers, but in the comics, there were many members of Avengers, although most are former members.

Just because we can, let’s inspect if some members of the same group are also enemies.

In [9]:
run_query("""
MATCH (c1:Character)-[:PART_OF_GROUP]->(g:Group)<-[:PART_OF_GROUP]-(c2:Character)
WHERE (c1)-[:ENEMY]-(c2) and id(c1) < id(c2)
RETURN c1.name as character1, c2.name as character2, g.name as group
""")

Unnamed: 0,character1,character2,group
0,Logan,Sabretooth (House of M),X-Men
1,Logan,Mystique (House of M),X-Men
2,CAIN MARKO JUGGERNAUT,Logan,X-Men
3,CAIN MARKO JUGGERNAUT,Storm (Marvel Heroes),X-Men
4,Rogue (X-Men: Battle of the Atom),Warren Worthington III,X-Men


It seems that Logan does not get along with some of the other X-Men. For some of the characters, we also have the place of origin and education available, so let’s quickly look at that. During the scraping, I noticed a hero originated from Yugoslavia, so I wonder if there are more characters from Yugoslavia.

In [10]:
run_query("""
MATCH (c:Character)
WHERE c.place_of_origin contains "Yugoslavia"
RETURN c.name as character, 
       c.place_of_origin as place_of_origin,
       c.aliases as aliases
""")

Unnamed: 0,character,place_of_origin,aliases
0,Purple Man,"Rijeka, Yugoslavia","Killgrave the Purple Man, Killy"
1,Abomination (Ultimate),"Zagreb, Yugoslavia","Agent R-7, the Ravager of Worlds"


Two characters originated from today’s Croatia, which is less than two hours drive from where I live. Let’s also check out all the characters that completed their Ph.D. degree.

In [11]:
run_query("""
MATCH (c:Character)
WHERE c.education contains "Ph.D"
RETURN c.name as character, c.education as education
LIMIT 10
""")

Unnamed: 0,character,education
0,UNKNOWN ACHEBE,"Ph.D. in Law (Yale), degrees in Psychology, Po..."
1,PROFESSOR MENDEL STROMM MENDEL STROMM,Ph.D. in robotics
2,FRANKLIN HALL GRAVITON,Ph.D. in physics
3,Morbius,Ph.D in Biochemistry
4,Tony Stark,Ph.Ds in physics and electrical engineering
5,Hulk-dok,Ph.D in nuclear physics and two other fields
6,Sasquatch (Walter Langkowski),Ph.D. in physics from the Massachusetts Instit...
7,Professor X (Ultimate),"Ph.Ds in genetics, biophysics, psychology, and..."
8,Klaw,"Ph.D. in physics, bachelor’s degree in geology"
9,High Evolutionary,Uncompleted Ph.D at Oxford University


It looks like a lot of these heroes are quite employable. Only Nightshade seems a bit dodgy. It feels like something one would put on their LinkedIn profile to get noticed when searching for Ph.D. profiles. By the way, did you know that Professor X has four Ph.D.s and is also MD in psychiatry? Quite the educated men.
## Analyzing communities of allies and relatives
We have examined basic graph statistics, and now we will focus more on network analysis. We will investigate the social ties between characters.
To start, we will calculate the degree values for each relationship type between characters and display the heroes with the highest overall degree.

In [12]:
run_query("""
MATCH (c:Character)
RETURN c.name as name,
       size((c)-[:ALLY]->()) as allies,
       size((c)-[:ENEMY]->()) as enemies,
       size((c)-[:RELATIVE]->()) as relative
ORDER BY allies + enemies + relative DESC 
LIMIT 5
""")

Unnamed: 0,name,allies,enemies,relative
0,Scarlet Witch (Marvel Heroes),16,14,8
1,Thor (Marvel: Avengers Alliance),9,14,10
2,Invisible Woman (Marvel: Avengers Alliance),13,10,7
3,Logan,14,10,5
4,Karnak,6,2,17


Scarlet Witch and Thor seem to have the most direct enemies. Wolverine has the most allies but also many enemies. It looks like Triton has a big family with 17 direct relative relationships. We can use the `apoc.path.subgraphAll` procedure to examine the relatives' community of Triton.

In [13]:
run_query("""
MATCH p=(c:Character{name:"Triton"})
CALL apoc.path.subgraphAll(id(c), {relationshipFilter:"RELATIVE"})
YIELD nodes, relationships
RETURN nodes, relationships
""")

Unnamed: 0,nodes,relationships
0,"[(aliases, education, identity, name, id, plac...","[(), (), (), (), (), (), (), (), (), (), (), (..."


I never knew that some of the Marvel heroes have quite a big happy family. It wouldn’t be accurate if there weren’t a black sheep of the family present. Maximus looks like the family’s black sheep here as he has four enemies within the family. You might wonder why ally and enemy relationships are shown when we only traversed the relative ties. Neo4j Browser has a feature that displays all connections between nodes on the screen.

## Weakly Connected Components algorithm
The Weakly Connected Components is a part of almost every graph analysis workflow. It is used to find disconnected components or islands within the network. In this example, the graph consists of two components. Michael, Mark, and Doug belong to the first component, while Bridget, Alice, and Charles belong to the second component. We will apply the Weakly Connected Components algorithm to find the largest component of allied characters. As we don’t plan to run any other algorithms on this network, we will use the anonymous graph projection.

In [28]:
run_query("""
CALL gds.wcc.stream({
  nodeProjection:'Character',
  relationshipProjection:'ALLY'})
YIELD nodeId, componentId
WITH componentId, count(*) as members
WHERE members > 1
RETURN componentId, members
ORDER BY members DESC
LIMIT 5
""")

Unnamed: 0,componentId,members
0,0,195
1,26,4
2,245,3
3,6,2
4,2,2


The largest component of allies has 195 members. Then we have a couple of tiny allies islands with only a few members. If we visualize the largest component of allies in the Neo4j Browser and have the connect results nodes option selected, we get the following visualization.

Although we have found the largest allies component, we can observe that many of the characters in the component are actually enemies (red relationships). To better understand why this occurs, let’s look at the following example.

## Custom ally component algorithm
Suppose we wanted to find communities of allies where there are no enemies within the given component. The algorithm implementation is relatively straightforward, and you could use Neo4j custom procedures, for example. Still, if you are like me and don’t speak Java, you can always resort to your favorite scripting language. I have developed the custom Ally component algorithm in Python. First, we define some helper functions for fetching allies and enemies of a single node.

In [48]:
def get_allies(node_id):
    data = session.run("""MATCH (c:Character)-[:ALLY]-(ally) 
                          WHERE c.id = $node_id 
                          RETURN collect(ally.id) as allies""",
                      {'node_id':node_id})
    return data.single()['allies']

def get_enemies(node_id):
    data = session.run("""MATCH (c:Character)-[:ENEMY]-(enemy) 
                          WHERE c.id = $node_id 
                          RETURN collect(enemy.id) as allies""",
                      {'node_id':node_id})
    return data.single()['allies']

def get_members():
    return session.run("""
    CALL gds.wcc.stream({
        nodeProjection:'Character',
        relationshipProjection:'ALLY'})
    YIELD nodeId, componentId
    WITH componentId, collect(gds.util.asNode(nodeId).id) as members
    WHERE size(members) > 10
    RETURN componentId, members
    """).single()['members']

My implementation is relatively simple. The input to the algorithm is the list of all node ids in the largest allied components. Start from a single node, load its enemies into the enemies list and load its allies into a queue that will be processed later. Then we iterate over the allied queue. If a node is not an enemy with any of the existing nodes in the component, add them to the community list and add their enemies to the community’s enemies list. I’ve added some minor performance tweaks like if we have traversed the node already in the allies queue, we can remove that node from the global list of starting nodes.

In [49]:
from collections import deque

def get_largest_stable_allies(node_list):
    final_communities = list()
    while node_list:
        community = set()
        enemies_list = set()
        visited = set()
        
        allies_list = deque()
        allies_list.appendleft(node_list[0])
        
        while allies_list:
            # Get the node from the queue
            start_node = allies_list.pop()
            
            # Skip if current node is enemy with anyone
            if start_node in enemies_list:
                continue
            
            # Get allies and enemies
            allies = get_allies(start_node)
            enemies = get_enemies(start_node)
            
            visited.add(start_node)
            # Add enemies
            enemies_list.update(enemies)
            # Add allies to the list of next visits
            allies_list.extendleft([x for x in allies if (x not in enemies_list) and (x not in visited)])
            # Add current node to community
            community.add(start_node)
            
            # Remove visited nodes from global node list
            try:
                node_list.remove(start_node)
            except:
                pass
        final_communities.append(list(community))
    return max(final_communities, key=len)

In [50]:
members = get_members()
get_largest_stable_allies(members)

AttributeError: 'function' object has no attribute 'run'

In this code, the algorithm only returns the ids of nodes that belong to the largest allied component where there are no enemies within. It shouldn’t be a problem to mark these nodes in Neo4j, as you can match them by their ids. The largest component of allies, where there are no enemies within, has 142 members. If we visualize it in Neo4j Browser, we can see that there are no enemy relationships visible.

## Analyzing characters’ stats
In the last part of our analysis, we will examine the stats of the characters. We have the stats available for a total of 470 heroes. This information was scraped from Marvel’s website. The scale for stats ranges from zero to seven, and Iron Man does not have a single seven. Probably not the strongest of the heroes, even though he is one of the more popular ones. Now we will explore the characters with the highest stats average. Whenever I need some help with my cypher queries, I turn to Neo4j Slack. Luckily for us, Andrew Bowman is always around with great advice on optimizing and prettifying our cypher queries. This time he showed me the `apoc.map.values` procedure. It can be used to fetch all properties of a single node without explicitly writing the property keys.

In [35]:
run_query("""
MATCH (c:Character)-[:HAS_STATS]->(stats)
RETURN c.name as character, 
       apoc.coll.avg(apoc.map.values(stats, keys(stats))) as average_stats
ORDER BY average_stats DESC
LIMIT 10
""")

Unnamed: 0,character,average_stats
0,Asylum,7.0
1,CHTHON,7.0
2,Bloodscream,7.0
3,Dracula,7.0
4,Eternity,7.0
5,Reaper,7.0
6,Living Tribunal,7.0
7,Hyperion (Earth-712),7.0
8,Juggernaut,7.0
9,SET,7.0


It seems many characters have their stats maxed out. I am not sure exactly how this data collection process works, but I found a fascinating heroine by the name of Squirrel Girl that could probably kick Iron Man’s ass with one hand while making sourdough bread with the other. Or polish her nails, not exactly sure what type of girl she is. The only thing certain is that she is a badass.
## k-Nearest Neighbours algorithm
The k-Nearest Neighbour is one of the more standard graph algorithms and was already implemented in the Graph Data Science library before in the form of Cosine, Euclidian, and Pearson similarity algorithms. Those were basic implementation where the algorithms compared a given vector for all node pairs in the network. Because comparing all node pairs does not scale well, another implementation of the kNN algorithm was added to the library. It is based on the Efficient k-nearest neighbor graph construction for generic similarity measures article. Instead of comparing every node pair, the algorithm selects possible neighbors based on the assumption that the neighbors-of-neighbors of a node are most likely already the nearest one. The algorithm scales quasi-linear with respect to the node count instead of being quadratic. The implementation uses the Cosine similarity to compare two vectors.
First, we need to create a vector (array of numbers) that will be compared between the pairs of heroes. We will use the characters’ stats as well as their ability to fly to populate the vector. Because all stats have the same range between zero and seven, there is no need for normalization. We only need to encode the flight feature to span between zero and seven as well. Those characters that can fly will have the value of flight feature seven, while those who can’t fly will have the value zero.

In [51]:
run_query("""
MATCH (c:Character)-[:HAS_STATS]->(s)
WITH c, [s.durability, s.energy, s.fighting_skills, 
         s.intelligence, s.speed, s.strength,
         CASE WHEN c.flight = 'true' THEN 7 ELSE 0 END] as stats_vector
SET c.stats_vector = stats_vector
""")

We will also tag the characters that have the stats vector with a second label. This way, we can easily filter heroes with a stats vector in our native projection of the named graph.

In [52]:
run_query("""
MATCH (c:Character)
WHERE exists (c.stats_vector)
SET c:CharacterStats
""")

Now that everything is ready, we can go ahead and load our named graph. We will project all nodes with the CharacterStats label and their stats_vector properties in a named graph. If you need a quick refresher or introduction to how the GDS library works, I would suggest taking the Introduction to Graph Algorithms course.

In [53]:
run_query("""
CALL gds.graph.create('marvel', 'CharacterStats',
  '*', {nodeProperties:'stats_vector'})
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
0,{'CharacterStats': {'properties': {'stats_vect...,"{'__ALL__': {'orientation': 'NATURAL', 'aggreg...",marvel,470,515,43


Now, we can go ahead and infer the similarity network with the new kNN algorithm. We will use the mutate mode of the algorithm. The mutate mode stores the results back to the projected graph instead of the Neo4j stored graph. This way, we can use the kNN algorithm results as the input for the community detection algorithms later in the workflow. The kNN algorithm has some parameters we can use to fine-tune the results:
* topK: The number of neighbors to find for each node. The K-nearest neighbors are returned.
* sampleRate: Sample rate to limit the number of comparisons per node.
* deltaThreshold: Value as a percentage to determine when to stop early. If fewer updates than the configured value happen, the algorithm stops.
* randomJoins: Between every iteration, how many attempts are being made to connect new node neighbors based on random selection.

We will define the topK value of 15 and sampleRate of 0.8, and leave the other parameters at default values.

In [54]:
run_query("""
CALL gds.beta.knn.mutate('marvel', {nodeWeightProperty:'stats_vector', 
  sampleRate:0.8, topK:15, mutateProperty:'score', mutateRelationshipType:'SIMILAR'})
""")

Unnamed: 0,createMillis,computeMillis,mutateMillis,postProcessingMillis,nodesCompared,relationshipsWritten,similarityDistribution,configuration
0,3,396,38,-1,470,7050,"{'p1': 0.2500009536743164, 'max': 1.0000066757...","{'topK': 15, 'maxIterations': 100, 'randomJoin..."


## Louvain Modularity algorithm
The similarity network is inferred and stored in the named graph. We can examine the community structure of this new similarity network with the Louvain Modularity algorithm. As the similarity scores of relationships are available as their properties, we will use the weighted variant of the Louvain Modularity algorithm. Using the `relationshipWeightProperty` parameter, we let the algorithm know it should consider the relationships’ weight when calculating the network’s community structure. This time we will use the `write` mode of the algorithm to store the results back to the Neo4j stored graph.

In [55]:
run_query("""
CALL gds.louvain.write('marvel',
  {relationshipTypes:['SIMILAR'],  
   relationshipWeightProperty:'score', 
   writeProperty:'louvain'});
""")

Unnamed: 0,writeMillis,nodePropertiesWritten,modularity,modularities,ranLevels,communityCount,communityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,91,470,0.627539,"[0.564955989452921, 0.6275389748398155]",2,8,"{'p99': 93, 'min': 30, 'max': 93, 'mean': 58.7...",18,2,276,"{'maxIterations': 10, 'writeConcurrency': 4, '..."


We can examine the community structure results with the following cypher query.

In [41]:
run_query("""
MATCH (c:Character)-[:HAS_STATS]->(stats)
RETURN c.louvain as community, count(*) as members, 
       avg(stats.fighting_skills) as fighting_skills,
       avg(stats.durability) as durability,
       avg(stats.energy) as energy,
       avg(stats.intelligence) as intelligence,
       avg(stats.speed) as speed,
       avg(stats.strength) as strength,
       avg(CASE WHEN c.flight = 'true' THEN 7.0 ELSE 0.0 END) as flight
""")

Unnamed: 0,community,members,fighting_skills,durability,energy,intelligence,speed,strength,flight
0,105,100,3.69,4.11,2.92,3.27,3.15,3.74,0.84
1,9,44,6.068182,6.863636,6.636364,6.340909,6.909091,6.840909,1.75
2,372,94,4.404255,5.638298,5.244681,4.319149,5.138298,4.957447,1.712766
3,152,60,4.133333,3.266667,2.316667,3.083333,2.966667,3.233333,0.35
4,151,32,4.625,5.40625,4.46875,4.5,4.15625,4.9375,1.09375
5,36,43,2.883721,2.488372,0.813953,2.953488,1.930233,2.069767,0.162791
6,302,82,3.109756,2.560976,2.04878,2.841463,2.317073,2.243902,0.597561
7,44,15,4.6,3.533333,2.266667,3.4,3.133333,4.133333,0.0


It would make sense to add the standard deviation for each stat, but it wouldn’t be presentable for a blog post. The community with an id 68 has the most powerful members. The average for most stats is 6.5, which means that they are almost entirely maxed out. The average value of flight at 2 indicates that around 30% (2/7) of the members can fly. The largest community with 106 members has their stats averaged between 2 and 3, which would indicate that they might be support characters with lesser abilities. The characters with stronger abilities are usually the lead characters.

## Label Propagation algorithm
Label Propagation algorithm can also be used to determine the community structure of a network. We will apply it to the inferred similarity network and compare the results with the Louvain Modularity algorithm results.

In [56]:
run_query("""
CALL gds.labelPropagation.write('marvel',
  {relationshipTypes:['SIMILAR'],
   relationshipWeightProperty:'score', 
   writeProperty:'labelPropagation'})
""")

Unnamed: 0,writeMillis,nodePropertiesWritten,ranIterations,didConverge,communityCount,communityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,66,470,6,True,18,"{'p99': 105, 'min': 3, 'max': 105, 'mean': 26....",13,1,35,"{'maxIterations': 10, 'writeConcurrency': 4, '..."


We investigate the results of the Label Propagation algorithm.


In [57]:
run_query("""
MATCH (c:Character)-[:HAS_STATS]->(stats)
RETURN c.labelPropagation as community, count(*) as members, 
       avg(stats.fighting_skills) as fighting_skills,
       avg(stats.durability) as durability,
       avg(stats.energy) as energy,
       avg(stats.intelligence) as intelligence,
       avg(stats.speed) as speed,
       avg(stats.strength) as strength,
       avg(CASE WHEN c.flight = 'true' THEN 7.0 ELSE 0.0 END) as flight
""")

Unnamed: 0,community,members,fighting_skills,durability,energy,intelligence,speed,strength,flight
0,444,9,4.333333,5.555556,4.666667,4.444444,5.333333,4.777778,0.0
1,602,12,1.833333,2.333333,0.5,2.666667,2.0,2.166667,0.0
2,118,68,4.058824,3.558824,2.25,3.029412,3.0,3.279412,0.617647
3,270,19,4.631579,5.315789,4.421053,4.526316,4.315789,4.789474,0.0
4,189,105,3.714286,4.07619,2.685714,3.342857,2.980952,3.695238,0.866667
5,215,17,4.470588,3.647059,2.588235,3.352941,3.117647,4.117647,0.0
6,218,14,4.142857,4.5,4.928571,4.142857,3.857143,4.285714,0.0
7,675,7,2.0,1.0,1.857143,3.0,0.285714,0.857143,0.0
8,343,14,2.785714,2.428571,2.0,2.785714,2.071429,2.0,0.0
9,722,22,2.590909,2.090909,0.727273,3.136364,2.181818,1.681818,0.318182


We can notice that the Label Propagation algorithm found twice as many communities as the Louvain Modularity algorithm. Some of them are relatively tiny. For example, the community with an id 693 has only three members, and all their average stats are at 1.0 value. They are the heroes that go by the name of Maggott, Deathbird, and Slayback. Funky names. The most powerful community has an id of 137 and only 23 members. Remember, the most powerful community found by the Louvain Modularity algorithm had 46 members and a slightly lower value of average stats.

## Conclusion
I hope you have learned some tricks on performing network analysis in Neo4j with the help of APOC and GDS libraries. There are still many things we could do with this graph, so expect a new post shortly.

#### ML with GDS ... continued from https://github.com/AliciaFrame/ML_with_GDS

## Data Prep for Marvel

### 1. Load data - from https://gist.github.com/tomasonjo/fbc6d617c3f6476a3a825b5dd22fd29a
#### Already done above, so skip this step


### 2. Move character traits to character nodes

In [61]:
run_query("""
MATCH (c:Character)-[:HAS_STATS]->(s)
WITH c, s.strength as strength, s.fighting_skills as fighting_skills, s.durability as durability, s.speed as speed, s.intelligence as intelligence, s.energy as energy
SET c.strength=strength,
    c.fighting_skills=fighting_skills,
    c.durability=durability,
    c.speed=speed,
    c.intelligence=intelligence,
    c.energy=energy
RETURN count(c)
""")

Unnamed: 0,count(c)
0,470


### 3. Create an appeared together relationship

In [64]:
run_query("""
MATCH (c1:Character)-[:APPEARED_IN]->(c:Comic)<-[:APPEARED_IN]-(c2:Character) 
WITH c1, c2, count(c) as weight
MERGE (c1)-[:APPEARED_WITH{times:weight}]->(c2)
MERGE (c2)-[:APPEARED_WITH{times:weight}]->(c1)
""")

### 4. one hot encode group membership <-- I ended up not using this, but useful to know how to 

In [65]:
run_query("""
MATCH (group:Group)
WITH group
   ORDER BY group.name
WITH collect(group) AS groups
MATCH (c:Character)
WITH c, gds.alpha.ml.oneHotEncoding(groups, [(c)-[:PART_OF_GROUP]->(group) | group]) as group_membership
SET c.group_membership=group_membership
""")

## Feature Engineering

### 1. load graph with features

In [75]:
run_query("""
CALL gds.graph.list()
YIELD graphName AS namedGraph
WITH namedGraph
CALL gds.graph.drop(namedGraph)
YIELD graphName
RETURN graphName;
""")

Unnamed: 0,graphName


In [77]:
run_query("""
CALL gds.graph.create(
  'marvel-character-graph',
  {
    Person: {
      label: 'Character',
      properties: { 
      strength:{property:'strength',defaultValue:0},
      fighting_skills:{property:'fighting_skills', defaultValue:0},
      durability:{property:'durability', defaultValue:0},
      speed:{property:'speed', defaultValue:0},
      intelligence:{property:'intelligence', defaultValue:0},
      group_membership:{property:'group_membership',defaultValue:[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
      }
    }
  }, {
    APPEARS_WITH_UNDIRECTED: {
      type: 'APPEARED_WITH',
      orientation: 'UNDIRECTED',
      aggregation: 'SINGLE',
      properties: ['times']
    },
    APPEARS_WITH_DIRECTED: {
      type: 'APPEARED_WITH',
      orientation: 'NATURAL',
      properties: ['times'],
      aggregation: 'SINGLE'
    },
    ALLY_UNDIRECTED: {
      type: 'ALLY',
      orientation: 'UNDIRECTED',
      aggregation: 'SINGLE'
    },
    ALLY_DIRECTED: {
      type: 'ALLY',
      orientation: 'NATURAL',
      aggregation: 'SINGLE'
    },    
    ENEMY_UNDIRECTED: {
      type: 'ENEMY',
      orientation: 'UNDIRECTED',
      aggregation: 'SINGLE'
    },
    ENEMY_DIRECTED: {
      type: 'ENEMY',
      orientation: 'NATURAL',
      aggregation: 'SINGLE'
    }
   
});
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
0,{'Person': {'properties': {'group_membership':...,{'ENEMY_UNDIRECTED': {'orientation': 'UNDIRECT...,marvel-character-graph,1105,132845,110


In [78]:
# run_query("""
# CALL gds.graph.drop('marvel-character-graph')
# """)

In [79]:
# run_query("""
# CALL gds.graph.list()
# """)

### 2. run centrality algos to add more features 
### pageRank

In [89]:
run_query("""

// pageRank
CALL gds.pageRank.write('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_DIRECTED'],
     writeProperty: 'appeared_with_pageRank'
});


""")

Unnamed: 0,writeMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,77,1105,20,False,"{'p99': 4.413420677185059, 'min': 0.1499996185...",50,1,128,"{'maxIterations': 20, 'writeConcurrency': 4, '..."


In [90]:
run_query("""

// pageRank
CALL gds.pageRank.write('marvel-character-graph',{
     relationshipTypes: ['ALLY_DIRECTED'],
     writeProperty: 'ally_pageRank'
});


""")

Unnamed: 0,writeMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,62,1105,19,True,"{'p99': 0.36827564239501953, 'min': 0.14999961...",23,0,59,"{'maxIterations': 20, 'writeConcurrency': 4, '..."


In [91]:
run_query("""

// pageRank
CALL gds.pageRank.write('marvel-character-graph',{
     relationshipTypes: ['ENEMY_DIRECTED'],
     writeProperty: 'enemy_pageRank'
});


""")

Unnamed: 0,writeMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,62,1105,16,True,"{'p99': 0.3315916061401367, 'min': 0.149999618...",15,0,49,"{'maxIterations': 20, 'writeConcurrency': 4, '..."


### betweenness

In [92]:
run_query("""

// betweenness
CALL gds.betweenness.write('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_UNDIRECTED'],
     writeProperty: 'appeared_with_betweenness'
});


""")

Unnamed: 0,nodePropertiesWritten,writeMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,59,"{'p99': 6057.406249880791, 'min': 0.0, 'max': ...",110,1,271,"{'writeConcurrency': 4, 'writeProperty': 'appe..."


In [93]:
run_query("""

// betweenness
CALL gds.betweenness.write('marvel-character-graph',{
     relationshipTypes: ['ALLY_UNDIRECTED'],
     writeProperty: 'ally_betweenness'
});


""")

Unnamed: 0,nodePropertiesWritten,writeMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,69,"{'p99': 1849.0078048706055, 'min': 0.0, 'max':...",28,0,10,"{'writeConcurrency': 4, 'writeProperty': 'ally..."


In [95]:
run_query("""
// betweenness
CALL gds.betweenness.write('marvel-character-graph',{
     relationshipTypes: ['ENEMY_UNDIRECTED'],
     writeProperty: 'enemy_betweenness'
});

""")

Unnamed: 0,nodePropertiesWritten,writeMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,64,"{'p99': 2607.9687480926514, 'min': 0.0, 'max':...",35,0,6,"{'writeConcurrency': 4, 'writeProperty': 'enem..."


### HITS

In [96]:
run_query("""

//HITS
CALL gds.alpha.hits.write('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});

""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,writeMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,86,0,4,346,"{'writeConcurrency': 0, 'writeProperty': '', '..."


In [97]:
run_query("""

//HITS
CALL gds.alpha.hits.write('marvel-character-graph',{
     relationshipTypes: ['ALLY_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});

""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,writeMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,38,0,1,22,"{'writeConcurrency': 0, 'writeProperty': '', '..."


In [98]:
run_query("""

//HITS
CALL gds.alpha.hits.write('marvel-character-graph',{
     relationshipTypes: ['ENEMY_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});

""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,writeMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,30,0,1,22,"{'writeConcurrency': 0, 'writeProperty': '', '..."


### 3. mutate the in-memory graph rather than reload

In [103]:
run_query("""

// pageRank

CALL gds.pageRank.mutate('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_DIRECTED'],
     mutateProperty: 'appeared_with_pageRank'
});

""")

Unnamed: 0,mutateMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,0,1105,20,False,"{'p99': 4.413420677185059, 'min': 0.1499996185...",9,0,123,"{'maxIterations': 20, 'sourceNodes': [], 'rela..."


In [104]:
run_query("""

// pageRank

CALL gds.pageRank.mutate('marvel-character-graph',{
     relationshipTypes: ['ALLY_DIRECTED'],
     mutateProperty: 'ally_pageRank'
});
""")

Unnamed: 0,mutateMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,0,1105,19,True,"{'p99': 0.36827564239501953, 'min': 0.14999961...",1,0,63,"{'maxIterations': 20, 'sourceNodes': [], 'rela..."


In [105]:
run_query("""

// pageRank

CALL gds.pageRank.mutate('marvel-character-graph',{
     relationshipTypes: ['ENEMY_DIRECTED'],
     mutateProperty: 'enemy_pageRank'
});
""")

Unnamed: 0,mutateMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,0,1105,16,True,"{'p99': 0.3315916061401367, 'min': 0.149999618...",1,0,49,"{'maxIterations': 20, 'sourceNodes': [], 'rela..."


In [109]:
run_query("""

// betweenness

CALL gds.betweenness.mutate('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_UNDIRECTED'],
     mutateProperty: 'appeared_with_betweenness'
});
""")

Unnamed: 0,nodePropertiesWritten,mutateMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,0,"{'p99': 6057.406249880791, 'min': 0.0, 'max': ...",110,0,252,"{'nodeLabels': ['*'], 'sudo': False, 'relation..."


In [110]:
run_query("""

// betweenness

CALL gds.betweenness.mutate('marvel-character-graph',{
     relationshipTypes: ['ALLY_UNDIRECTED'],
     mutateProperty: 'ally_betweenness'
});
""")

Unnamed: 0,nodePropertiesWritten,mutateMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,0,"{'p99': 1849.0078048706055, 'min': 0.0, 'max':...",41,0,2,"{'nodeLabels': ['*'], 'sudo': False, 'relation..."


In [111]:
run_query("""

// betweenness

CALL gds.betweenness.mutate('marvel-character-graph',{
     relationshipTypes: ['ENEMY_UNDIRECTED'],
     mutateProperty: 'enemy_betweenness'
});
""")

Unnamed: 0,nodePropertiesWritten,mutateMillis,centralityDistribution,postProcessingMillis,createMillis,computeMillis,configuration
0,1105,0,"{'p99': 2607.9687480926514, 'min': 0.0, 'max':...",48,0,2,"{'nodeLabels': ['*'], 'sudo': False, 'relation..."


In [115]:
run_query("""

//HITS

CALL gds.alpha.hits.mutate('marvel-character-graph',{
     relationshipTypes: ['APPEARS_WITH_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});
""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,mutateMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,0,0,1,289,"{'writeConcurrency': 0, 'writeProperty': '', '..."


In [116]:
run_query("""

//HITS

CALL gds.alpha.hits.mutate('marvel-character-graph',{
     relationshipTypes: ['ALLY_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});
""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,mutateMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,0,0,1,17,"{'writeConcurrency': 0, 'writeProperty': '', '..."


In [117]:
run_query("""

//HITS

CALL gds.alpha.hits.mutate('marvel-character-graph',{
     relationshipTypes: ['ENEMY_DIRECTED'],
     hitsIterations: 50,
     authProperty: 'appeared_with_auth',
     hubProperty: 'appeared_with_hub'
});
""")

Unnamed: 0,nodePropertiesWritten,ranIterations,didConverge,mutateMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2210,201,False,0,0,1,15,"{'writeConcurrency': 0, 'writeProperty': '', '..."


### 4. compute fastRP extended embedding (Fast Random Projection (FastRP) node embedding algorithm)
#### I'm writing this back because I'm only going to train my model on known characters, but I want the embedding for the full graph

In [119]:
run_query("""
CALL gds.beta.fastRPExtended.write('marvel-character-graph',{
    relationshipTypes:['APPEARS_WITH_UNDIRECTED'],
    featureProperties: ['strength','fighting_skills','durability','speed','intelligence','appeared_with_pageRank','ally_pageRank','enemy_pageRank','appeared_with_betweenness','ally_betweenness','enemy_betweenness','appeared_with_hub','appeared_with_auth'], //14 node features
    relationshipWeightProperty: 'times',
    propertyDimension: 45,
    embeddingDimension: 250,
    iterationWeights: [0, 0, 1.0, 1.0],
    normalizationStrength:0.05,
    writeProperty: 'fastRP_Extended_Embedding'
});
""")

Unnamed: 0,nodeCount,nodePropertiesWritten,createMillis,computeMillis,writeMillis,configuration
0,1105,1105,6,150,114,"{'writeConcurrency': 4, 'relationshipWeightPro..."


### 5. drop extra graphs

In [121]:
run_query("""
call gds.graph.drop('marvel-character-graph');
""")

Unnamed: 0,graphName,database,memoryUsage,sizeInBytes,detailSizeInBytes,nodeProjection,relationshipProjection,nodeQuery,relationshipQuery,nodeCount,relationshipCount,density,creationTime,modificationTime,schema
0,marvel-character-graph,neo4j,6536 KiB,6693824,"{'relationships': {'total': 2704160, 'everythi...",{'Person': {'properties': {'group_membership':...,{'ENEMY_UNDIRECTED': {'orientation': 'UNDIRECT...,,,1105,132845,0.108896,2021-02-19T12:02:57.265508000-05:00,2021-02-19T12:33:21.560312000-05:00,"{'relationships': {'ENEMY_UNDIRECTED': {}, 'EN..."


## Node Classification

### select-label-the-data-for-the-model.cypher

In [122]:
## 1. Select & label the data for the model -find the x-men and tag them, then flag to use in model

run_query("""
MATCH (c:Character)-[:PART_OF_GROUP]-> (g:Group{name:'X-Men'})
SET c.is_xman=1, c:Model_Data;
""")

In [123]:
## 2. find and include some unaffiliated individuals that are very far from x-men (but not orphan nodes) 
## there are way more not x-men (133 with other affiliations, 936 with no known group) 
## so we need to downsample for training

run_query("""
MATCH (c:Character)
WHERE NOT (c)-[:PART_OF_GROUP]->(:Group) WITH c
WHERE NOT (c)-[:APPEARED_WITH*2..3]-(:Character{is_xman:1}) 
AND apoc.node.degree(c)>0 WITH c
WHERE rand() < 0.2
SET c:Model_Data, c.is_xman=0;
""")


In [124]:
## 3. label the holdout data too (to predict on)

run_query("""
MATCH (c:Character)
WHERE NOT (c:Model_Data)
SET c:Holdout_Data;
""")

### load-graph-for-class-prediction

In [None]:
## drop graph for class prediction, if exists

run_query("""
call gds.graph.drop('marvel_model_data');
""")

In [None]:
## Drop Model ( for community limitations)

run_query("""
CALL gds.beta.model.drop("xmen-model-fastRP");
  """)

In [149]:
## 2. load graph for class prediction

run_query("""
CALL gds.graph.create(
  'marvel_model_data',
  {
    Character: {
      label: 'Model_Data',
      properties: { 
        fastRP_embedding:{property:'fastRP_Extended_Embedding', defaultValue:0},
        //graphSAGE_embedding:{property:'graphSAGE_embedding', defaultValue:0},
        strength:{property:'strength', defaultValue:0},
        durability:{property:'durability', defaultValue:0},
        intelligence:{property:'intelligence', defaultValue:0},
        energy:{property:'energy', defaultValue:0},
        speed:{property:'speed', defaultValue:0},
        is_xman:{property:'is_xman', defaultValue:0}
      }
    },
    Holdout_Character: {
      label: 'Holdout_Data',
      properties: { 
        fastRP_embedding:{property:'fastRP_Extended_Embedding', defaultValue:0},
        //graphSAGE_embedding:{property:'graphSAGE_embedding', defaultValue:0},
        strength:{property:'strength', defaultValue:0},
        durability:{property:'durability', defaultValue:0},
        intelligence:{property:'intelligence', defaultValue:0},
        energy:{property:'energy', defaultValue:0},
        speed:{property:'speed', defaultValue:0},
        is_xman:{property:'is_xman', defaultValue:0}
      }
    }
  }, {
    APPEARED_WITH: { //I don't actually need this for node classification
      type: 'APPEARED_WITH',
      orientation: 'UNDIRECTED',
      properties: ['times'],
      aggregation: 'SINGLE'
    }
});
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
0,{'Holdout_Character': {'properties': {'strengt...,{'APPEARED_WITH': {'orientation': 'UNDIRECTED'...,marvel_model_data,1105,65612,30


### train-node-classifier-to-find-x-men-fast-rp

In [153]:
## 3. train node classifier to find x-men: fastRP

run_query("""
CALL gds.alpha.ml.nodeClassification.train('marvel_model_data', {
   nodeLabels: ['Character'],
   modelName: 'xmen-model-fastRP',
   featureProperties: ['fastRP_embedding'], 
   targetProperty: 'is_xman', 
   metrics: ['F1_WEIGHTED','ACCURACY'], 
   holdoutFraction: 0.2, 
   validationFolds: 5, 
   randomSeed: 2,
   params: [
       {penalty: 0.0625, maxIterations: 1000},
       {penalty: 0.125, maxIterations: 1000}, 
       {penalty: 0.25, maxIterations: 1000}, 
       {penalty: 0.5, maxIterations: 1000},
       {penalty: 1.0, maxIterations: 1000},
       {penalty: 2.0, maxIterations: 1000}, 
       {penalty: 4.0, maxIterations: 1000}
       ]
    }) YIELD modelInfo
  RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.F1_WEIGHTED.outerTrain AS trainGraphScore,
  modelInfo.metrics.F1_WEIGHTED.test AS testGraphScore
  """)

Unnamed: 0,winningModel,trainGraphScore,testGraphScore
0,"{'maxIterations': 1000, 'penalty': 0.0625}",0.928319,1.0


In [152]:
## Drop Model ( for community limitations)

run_query("""
CALL gds.beta.model.drop("xmen-model-fastRP");
  """)

Unnamed: 0,modelInfo,trainConfig,graphSchema,loaded,stored,creationTime,shared
0,"{'modelName': 'xmen-model-fastRP', 'modelType'...","{'holdoutFraction': 0.2, 'params': [{'maxItera...","{'relationships': {'APPEARED_WITH': {}}, 'node...",True,False,2021-02-19T20:30:42.072490000-05:00,False


In [158]:
### Run the next two cells only if not using fastRP

In [135]:
## 4. compare to tabular properties
## if not using fastRP


run_query("""
CALL gds.alpha.ml.nodeClassification.train('marvel_model_data', {
   nodeLabels: ['Character'],
   modelName: 'xmen-model-properties',
   featureProperties: ['energy','speed','strength','durability','intelligence'], 
   targetProperty: 'is_xman', 
   metrics: ['F1_WEIGHTED','ACCURACY'], 
   holdoutFraction: 0.2, 
   validationFolds: 5, 
   randomSeed: 2,
   params: [
       {penalty: 0.0625, maxIterations: 1000},
       {penalty: 0.125, maxIterations: 1000}, 
       {penalty: 0.25, maxIterations: 1000}, 
       {penalty: 0.5, maxIterations: 1000},
       {penalty: 1.0, maxIterations: 1000},
       {penalty: 2.0, maxIterations: 1000}, 
       {penalty: 4.0, maxIterations: 1000}
       ]
    }) YIELD modelInfo
  RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.F1_WEIGHTED.outerTrain AS trainGraphScore,
  modelInfo.metrics.F1_WEIGHTED.test AS testGraphScore
  """)

Unnamed: 0,winningModel,trainGraphScore,testGraphScore
0,"{'maxIterations': 1000, 'penalty': 0.0625}",0.448286,0.239766


In [None]:
## Drop Model ( for community limitations)

run_query("""
CALL gds.beta.model.drop("xmen-model-properties");
  """)

## 5. Make some predictions!

In [155]:
##  1. lets predict node classes (aka: can we find more x-men?)
## Add the predictions to the in-memory graph

run_query("""
CALL gds.alpha.ml.nodeClassification.predict.mutate('marvel_model_data', {
  nodeLabels: ['Holdout_Character'], //filter our the character nodes
  modelName: 'xmen-model-fastRP',
  mutateProperty: 'predicted_xman',
  predictedProbabilityProperty: 'predicted_xman_probability'
});

  """)

Unnamed: 0,nodePropertiesWritten,mutateMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,2066,0,0,0,2,"{'modelName': 'xmen-model-fastRP', 'predictedP..."


In [156]:
##  1. lets predict node classes (aka: can we find more x-men?)
## Add the predictions to the in-memory graph

run_query("""
CALL gds.graph.writeNodeProperties(
  'marvel_model_data',
  ['predicted_xman', 'predicted_xman_probability'],
  ['Holdout_Character']
);
  """)

Unnamed: 0,writeMillis,graphName,nodeProperties,propertiesWritten
0,100,marvel_model_data,"[predicted_xman, predicted_xman_probability]",2066


In [157]:
## 3. check our predicted node classes

run_query("""
MATCH (c:Character) 
WHERE c.predicted_xman = 1 AND NOT c:Model_Data
RETURN c.name, c.aliases, c.predicted_xman, c.predicted_xman_probability 
  """)

## (some of the results are unlabeled x men, like Beast, others are agents of SHIELD (frequent antagonists) or allies (avengers))

Unnamed: 0,c.name,c.aliases,c.predicted_xman,c.predicted_xman_probability
0,Steve Rogers,"Steven Rogers, Brett Hendrick, Buck Jones, Yeo...",1,"[0.1387353529305271, 0.8612646470694726]"
1,James Buchanan Barnes,"James Buchanan Barnes, Captain America",1,"[0.1387253187111901, 0.8612746812888097]"
2,Nick Fury (LEGO Marvel Super Heroes),,1,"[0.13878208115148838, 0.8612179188485113]"
3,Sharon Carter,"Agent 13, Irma Kruhl, Fraulein Rogers, others",1,"[0.13871332327426625, 0.8612866767257333]"
4,Kate Bishop,"Hawkingbird, Mockingbird, Taskmistress, Weapon...",1,"[0.1386900695302335, 0.8613099304697661]"
...,...,...,...,...
849,Whizzer (Stanley Stewart),,1,"[0.13866771041539983, 0.8613322895845997]"
850,Talon (Fraternity of Raptors),"Lord Talon; impersonated Araki, and Smasher",1,"[0.13864395710056085, 0.8613560428994388]"
851,Lava-Man,,1,"[0.13858830271278944, 0.8614116972872103]"
852,Blue Blade,,1,"[0.1386942381917725, 0.8613057618082272]"


## Link Prediction

In [4]:
run_query("""
call gds.graph.list ();
""")

Unnamed: 0,degreeDistribution,graphName,database,memoryUsage,sizeInBytes,detailSizeInBytes,nodeProjection,relationshipProjection,nodeQuery,relationshipQuery,nodeCount,relationshipCount,density,creationTime,modificationTime,schema


In [None]:
## drop graph for class prediction, if exists

run_query("""
call gds.graph.drop('marvel_model_data');
""")

In [None]:
## List Models ( for community limitations)

run_query("""
CALL gds.beta.model.list();
  """)

In [None]:
## Drop Model if it exist ( for community limitations)

run_query("""
CALL gds.beta.model.drop("xmen-model-fastRP");
  """)

In [7]:
## Drop Model ( for community limitations)

run_query("""
CALL gds.beta.model.list();
  """)

Unnamed: 0,modelInfo,trainConfig,graphSchema,loaded,stored,creationTime,shared


### 1. Split the graph into the data we want to use for the model, and data to hold out to test afterwards

In [10]:
## 1. Split the graph into the data we want to use for the model, and data to hold out to test afterwards
run_query("""
MATCH (c1:Character)-[:APPEARED_IN]->(c:Comic)<-[:APPEARED_IN]-(c2:Character) 
WHERE c.year <> "2020" AND c.year <> "2019" AND c.year <> "2018" AND c.year <> "2017" AND c.year <> "2016" 
WITH c1, c2, count(c) as weight
MERGE (c1)-[:APPEARED_WITH_MODEL{times:weight}]->(c2)
MERGE (c2)-[:APPEARED_WITH_MODEL{times:weight}]->(c1);
  """)

### And label the data that's been held out

In [11]:
## And label the data that's been held out
run_query("""
MATCH (c1:Character)-[:APPEARED_IN]->(c:Comic)<-[:APPEARED_IN]-(c2:Character) 
WHERE c.year="2020" OR c.year="2019" OR c.year="2018" OR c.year="2017" OR c.year="2016" 
WITH c1, c2, count(c) as weight
MERGE (c1)-[:APPEARED_WITH_HOLDOUT{times:weight}]->(c2)
MERGE (c2)-[:APPEARED_WITH_HOLDOUT{times:weight}]->(c1);
  """)

### 2. load graph for class prediction

In [13]:
## 2. load graph for class prediction

run_query("""
CALL gds.graph.create(
  'marvel_linkpred_data',
  {
    Character: {
      label: 'Character',
      properties: { 
        fastRP_embedding:{property:'fastRP_Extended_Embedding', defaultValue:0},
        //graphSAGE_embedding:{property:'graphSAGE_embedding', defaultValue:0},
        strength:{property:'strength', defaultValue:0},
        durability:{property:'durability', defaultValue:0},
        intelligence:{property:'intelligence', defaultValue:0},
        energy:{property:'energy', defaultValue:0},
        speed:{property:'speed', defaultValue:0},
        is_xman:{property:'is_xman', defaultValue:0}
      }
    }
  }, {
    APPEARED_WITH: { //I don't actually need this for node classification
      type: 'APPEARED_WITH_MODEL',
      orientation: 'UNDIRECTED',
      properties: ['times'],
      aggregation: 'SINGLE'
    },
    APPEARED_WITH_HOLDOUT: { //I don't actually need this for node classification
      type: 'APPEARED_WITH_HOLDOUT',
      orientation: 'UNDIRECTED',
      properties: ['times'],
      aggregation: 'SINGLE'
    }
});
  """)

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
0,{'Character': {'properties': {'strength': {'pr...,{'APPEARED_WITH_HOLDOUT': {'orientation': 'UND...,marvel_linkpred_data,1105,54268,260


### 3. Add test train splits to in-memory graph

In [14]:
## 3. Add test train splits to in-memory graph
run_query("""
CALL gds.alpha.ml.splitRelationships.mutate('marvel_linkpred_data', {
  relationshipTypes: ['APPEARED_WITH'],
  remainingRelationshipType: 'APPEARED_WITH_REMAINING',
  holdoutRelationshipType: 'APPEARED_WITH_TESTGRAPH',
  holdoutFraction: 0.2
}) YIELD relationshipsWritten;
  """)


Unnamed: 0,relationshipsWritten
0,51438


In [15]:
run_query("""
CALL gds.alpha.ml.splitRelationships.mutate('marvel_linkpred_data', {
  relationshipTypes: ['APPEARED_WITH_REMAINING'],
  remainingRelationshipType: 'APPEARED_WITH_IGNORED_FOR_TRAINING',
  holdoutRelationshipType: 'APPEARED_WITH_TRAINGRAPH',
  holdoutFraction: 0.2
}) YIELD relationshipsWritten;
  """)

Unnamed: 0,relationshipsWritten
0,41152


### 4.  train a link prediction model

In [17]:
## 4.  train a link prediction model

run_query("""
CALL gds.alpha.ml.linkPrediction.train
('marvel_linkpred_data', {
  trainRelationshipType: 'APPEARED_WITH_TRAINGRAPH',
  testRelationshipType: 'APPEARED_WITH_TESTGRAPH',
  modelName: 'lp-appearance-model',
  featureProperties: ['fastRP_embedding'],
  validationFolds: 5,
  classRatio: 1.33,
  randomSeed: 2,
  params: [
    {penalty: 0.24, maxIterations: 1000},
    {penalty: 0.5, maxIterations: 1000},
    {penalty: 1.0, maxIterations: 1000},
    {penalty: 0.0, maxIterations: 1000}
  ]
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore
  """)

Unnamed: 0,winningModel,trainGraphScore,testGraphScore
0,"{'maxIterations': 1000, 'penalty': 1.0}",0.649209,0.657659


### 5.  train a link prediction model - without an embedding

In [None]:
## 5.  train a link prediction model - without an embedding

run_query("""
CALL gds.alpha.ml.linkPrediction.train('marvel_linkpred_data', {
  trainRelationshipType: 'APPEARED_WITH_TRAINGRAPH',
  testRelationshipType: 'APPEARED_WITH_TESTGRAPH',
  modelName: 'lp-appearance-model-noEmbedding',
  featureProperties: ['strength','speed','intelligence','durability'],
  validationFolds: 5,
  classRatio: 1.33,
  randomSeed: 2,
  params: [
    {penalty: 0.24, maxIterations: 1000},
    {penalty: 0.5, maxIterations: 1000},
    {penalty: 1.0, maxIterations: 1000},
    {penalty: 0.0, maxIterations: 1000}
  ]
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore
  """)

### 2. lets predict some new links (aka: can we find more x-men?)
#### Add the predictions to the in-memory graph

In [20]:
## 2. lets predict some new links (aka: can we find more x-men?)
## Add the predictions to the in-memory graph

run_query("""
CALL gds.alpha.ml.linkPrediction.predict.mutate('marvel_linkpred_data', {
  relationshipTypes: ['APPEARED_WITH'], //filter out the known relationship type
  modelName: 'lp-appearance-model',
  mutateRelationshipType: 'APPEARED_WITH_PREDICTED',
  topN: 500,
  threshold: 0.49
});

  """)

Unnamed: 0,relationshipsWritten,mutateMillis,postProcessingMillis,createMillis,computeMillis,configuration
0,1000,0,0,0,720,"{'modelName': 'lp-appearance-model', 'threshol..."


In [21]:
run_query("""
CALL gds.graph.writeRelationship(
  'marvel_linkpred_data',
  'APPEARED_WITH_PREDICTED'
);
  """)

Unnamed: 0,writeMillis,graphName,relationshipType,relationshipProperty,relationshipsWritten,propertiesWritten
0,221,marvel_linkpred_data,APPEARED_WITH_PREDICTED,,1000,0


### 4. check predicted links 

In [22]:
### 4. check predicted links 
run_query("""
MATCH (c1:Character)-[r:APPEARED_WITH_PREDICTED]->(c2:Character) 
RETURN c1.name, c2.name
  """)

Unnamed: 0,c1.name,c2.name
0,UNREVEALED BLACKOUT,"MAXWELL ""MAX"" DILLON ELECTRO"
1,DAVID ALLEYNE PRODIGY,"MAXWELL ""MAX"" DILLON ELECTRO"
2,UNREVEALED UNSPOKEN,"MAXWELL ""MAX"" DILLON ELECTRO"
3,SILAS KING SOLARR,"MAXWELL ""MAX"" DILLON ELECTRO"
4,BALDER,"MAXWELL ""MAX"" DILLON ELECTRO"
...,...,...
995,"MAXWELL ""MAX"" DILLON ELECTRO",Sprite
996,"MAXWELL ""MAX"" DILLON ELECTRO",The Spike
997,SIGYN,The Spike
998,"MAXWELL ""MAX"" DILLON ELECTRO",X-Ray (James Darnell)
