In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [3]:
session = driver.session(database="neo4j")

In [4]:
def my_get_node_list():
    "get a list of nodes in the current graph"
    
    query = "match (n) return n.name as name"
    
    result = session.run(query)
    
    node_list = []
    
    for r in result:
        node_list.append(r["name"])
        
    node_list = sorted(node_list)
    
    return node_list

In [5]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)

In [6]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [7]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [8]:
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [9]:
def my_neo4j_create_node(station_name):
    "create a node with label Station"
    
    query = """
    
    CREATE (:Station {name: $station_name})
    
    """
    
    session.run(query, station_name=station_name)
    

In [10]:
def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    "create a relationship one way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)
    

In [11]:
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    "create relationships two way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)
    

In [12]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [13]:
cursor = connection.cursor()

In [14]:
my_neo4j_wipe_out_database()

# Constructing Graph

In [15]:
connection.rollback()

query = """

select station
from stations
order by station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    
    my_neo4j_create_node('depart ' + station)
    my_neo4j_create_node('arrive ' + station)
    

In [16]:
connection.rollback()

query = """

select station, line
from lines
order by station, line

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    line = row[1]
    
    depart = 'depart ' + station
    arrive = 'arrive ' + station
    line_station = line + ' ' + station
    
    my_neo4j_create_node(line_station)
    my_neo4j_create_relationship_one_way(depart, line_station, 0)
    my_neo4j_create_relationship_one_way(line_station, arrive, 0)
    

In [17]:
connection.rollback()

query = """

select a.station, a.line as from_line, b.line as to_line, s.transfer_time
from lines a
     join lines b
       on a.station = b.station and a.line <> b.line 
     join stations s
       on a.station = s.station
order by 1, 2, 3

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    from_line = row[1]
    to_line = row[2]
    transfer_time = int(row[3])
    
    from_station = from_line + ' ' + station
    to_station = to_line + ' ' + station
    
    my_neo4j_create_relationship_one_way(from_station, to_station, transfer_time)
    

In [18]:
connection.rollback()

query = """

select a.line, a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
order by line, from_station, to_station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    line = row[0]
    from_station = line + ' ' + row[1]
    to_station = line + ' ' + row[2]
    travel_time = int(row[3])
    
    my_neo4j_create_relationship_two_way(from_station, to_station, travel_time)
    

In [19]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 214
  Relationships: 652
-------------------------


To see graph in neo4j GUI:

- Use this link in private browser, replacing X's with server IP adress https://xxxx:7473
- log in with username neo4j password ucb_mids_w205
- use command "match (n) return n"


# Prepping table for features

In [20]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = """

CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})
"""

session.run(query)

<neo4j._sync.work.result.Result at 0x7f1a768d9940>

In [21]:
connection.rollback()

query = """

drop table if exists graphy_features
;

create table graphy_features(
    node varchar(32),
    community numeric(5),
    pagerank float(16)
)
;

"""

cursor.execute(query)

connection.commit()

In [22]:
connection.rollback()

query = """

insert into graphy_features
values
(%s, 0, 0)
;

"""

node_list = my_get_node_list()

for node in node_list:
    cursor.execute(query, (node,))

connection.commit()

In [23]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,node,community,pagerank
0,arrive 12th Street,0,0
1,arrive 16th Street Mission,0,0
2,arrive 19th Street,0,0
3,arrive 24th Street Mission,0,0
4,arrive Antioch,0,0
...,...,...,...
209,yellow San Bruno,0,0
210,yellow SFO,0,0
211,yellow South San Francisco,0,0
212,yellow Walnut Creek,0,0


# Louvain Community Detection

In [24]:
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

result = session.run(query)

for r in result:
    
    query = "update graphy_features set community = %s where node = %s"
    
    cursor.execute(query, (r["community"], r["name"]))

connection.commit()

# Page Rank

In [25]:
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as pageRank
ORDER BY pageRank DESC, name ASC

"""

max_iterations = 20 # default 20
damping_factor = 0.85 # default 0.85

#my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)

result = session.run(query, max_iterations=max_iterations, damping_factor=damping_factor)

for r in result:
    
    query = "update graphy_features set pageRank = %s where node = %s"
    
    cursor.execute(query, (r["pageRank"], r["name"]))

connection.commit()

In [26]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from graphy_features
order by node

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [27]:
df.sort_values(['community', 'pagerank'])

Unnamed: 0,node,community,pagerank
68,depart 12th Street,72,0.150000
70,depart 19th Street,72,0.150000
94,depart MacArthur,72,0.150000
0,arrive 12th Street,72,0.452021
2,arrive 19th Street,72,0.462088
...,...,...,...
197,yellow Embarcadero,166,0.628645
141,green West Oakland,166,0.643969
67,blue West Oakland,166,0.643972
186,red West Oakland,166,0.644766


In [28]:
len(df['community'].unique())

11

# Checking logic with multiple station nodes

In [29]:
df[df['node'].str.contains('16th Street Mission')]

Unnamed: 0,node,community,pagerank
1,arrive 16th Street Mission,74,0.502965
50,blue 16th Street Mission,74,0.623158
69,depart 16th Street Mission,74,0.15
120,green 16th Street Mission,74,0.623158
164,red 16th Street Mission,74,0.623159
188,yellow 16th Street Mission,74,0.623159


- we can see that community is all the same for each station
- page rank is also the same except arrive and depart nodes
- following suggestions doc, we will just ignore those nodes

In [30]:
df = df[df['node'].str.contains('arrive') == False]
df = df[df['node'].str.contains('depart') == False]
df[df['node'].str.contains('16th Street Mission')]

Unnamed: 0,node,community,pagerank
50,blue 16th Street Mission,74,0.623158
120,green 16th Street Mission,74,0.623158
164,red 16th Street Mission,74,0.623159
188,yellow 16th Street Mission,74,0.623159


In [31]:
df[df['node'].str.contains('SFO')]

Unnamed: 0,node,community,pagerank
184,red SFO,148,0.508938
210,yellow SFO,148,0.487344


- Looks like some pagerank values for different lines are still different
- Suggestions doc suggests to just average these, so we will do that in the future

# Consolidating nodes

In [32]:
new_df_dict = {'node': [], 'community': [], 'pagerank': []}
current_station = ' '
current_community = 0
current_pagerank = []
for i, row in df.iterrows():
    station = row['node']
    this_station = ' '.join(station.split(' ')[1:])
    if this_station != current_station: # we are on a new station, need to set values for the last one
        if current_station != ' ': # this if is so the 0 ghost community is not added
            new_df_dict['node'].append(current_station)
            new_df_dict['community'].append(current_community)
            new_df_dict['pagerank'].append(np.mean(current_pagerank))
        current_station = this_station
        current_community = row['community'] # we are assuming the community will be the same for all lines in one station based off quick visual inspection
        current_pagerank = [row['pagerank']]
    else:
        current_pagerank.append(row['pagerank'])
new_df = pd.DataFrame().from_dict(new_df_dict)
new_df

Unnamed: 0,node,community,pagerank
0,16th Street Mission,74,0.623158
1,24th Street Mission,74,0.623798
2,Balboa Park,80,0.635425
3,Bay Fair,100,0.675695
4,Castro Valley,100,0.571916
...,...,...,...
108,Rockridge,160,0.567313
109,San Bruno,148,0.608985
110,SFO,148,0.487344
111,South San Francisco,148,0.585070


# Finding delivery hubs

In [44]:
delivery_hubs = new_df.groupby(['community', 'node']).mean()
delivery_hubs = delivery_hubs.loc[delivery_hubs.groupby('community')['pagerank'].idxmax()].reset_index()['node']
delivery_hubs

0                MacArthur
1      24th Street Mission
2              Balboa Park
3                 Milpitas
4                 Bay Fair
5                 Coliseum
6        Montgomery Street
7     El Cerrito del Norte
8                San Bruno
9         Pittsburg Center
10            West Oakland
Name: node, dtype: object

# Shortest path to delivery hubs from "kitchen"

In [34]:
def shortest_path(from_station, to_station):
    "given two stations (from_station and to_station), find the the shortest route between both stations"

    query = "CALL gds.graph.drop('ds_graph', false)"
    session.run(query)

    query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    session.run(query)

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)
    
    for r in result:
        
        total_cost = int(r['totalCost'])
        
        print("\n--------------------------------")
        print("   Total Cost: ", total_cost)
        print("   Minutes: ", round(total_cost / 60.0,1))
        print("--------------------------------")
        
        nodes = r['nodes']
        costs = r['costs']
        
        i = 0
        previous = 0
        
        for n in nodes:
            
            print(n + ", " + str(int(costs[i]) - previous)  + ", " + str(int(costs[i])))
            
            previous = int(costs[i])
            i += 1

In [35]:
shortest_path('depart Downtown Berkeley', 'arrive 24th Street Mission')


--------------------------------
   Total Cost:  1980
   Minutes:  33.0
--------------------------------
depart Downtown Berkeley, 0, 0
red Downtown Berkeley, 0, 0
red Ashby, 180, 180
red MacArthur, 240, 420
red 19th Street, 180, 600
red 12th Street, 120, 720
red West Oakland, 300, 1020
red Embarcadero, 420, 1440
red Montgomery Street, 60, 1500
red Powell Street, 120, 1620
red Civic Center, 60, 1680
red 16th Street Mission, 180, 1860
red 24th Street Mission, 120, 1980
arrive 24th Street Mission, 0, 1980


In [36]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f1a89a11fd0>

In [37]:
query = """

MATCH (source:Station {name: $source}), (target:Station {name: $target})
CALL gds.shortestPath.yens.stream(
    'ds_graph', 
    { sourceNode: source,
      targetNode: target,
      k: $k,
      relationshipWeightProperty: 'weight'
    }
)
YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
RETURN
    index,
    gds.util.asNode(sourceNode).name AS from,
    gds.util.asNode(targetNode).name AS to,
    totalCost,
    [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
    costs
ORDER BY index
"""

source = 'depart Downtown Berkeley'
target = 'arrive Castro Valley'
k = 5

my_neo4j_run_query_pandas(query, source=source, target=target, k=k)

Unnamed: 0,index,from,to,totalCost,nodes,costs
0,0,depart Downtown Berkeley,arrive Castro Valley,2214.0,"[depart Downtown Berkeley, orange Downtown Ber...","[0.0, 0.0, 180.0, 420.0, 600.0, 720.0, 900.0, ..."
1,1,depart Downtown Berkeley,arrive Castro Valley,2223.0,"[depart Downtown Berkeley, orange Downtown Ber...","[0.0, 0.0, 180.0, 420.0, 600.0, 720.0, 900.0, ..."
2,2,depart Downtown Berkeley,arrive Castro Valley,2268.0,"[depart Downtown Berkeley, orange Downtown Ber...","[0.0, 0.0, 180.0, 420.0, 600.0, 720.0, 900.0, ..."
3,3,depart Downtown Berkeley,arrive Castro Valley,2268.0,"[depart Downtown Berkeley, orange Downtown Ber...","[0.0, 0.0, 180.0, 420.0, 600.0, 720.0, 900.0, ..."
4,4,depart Downtown Berkeley,arrive Castro Valley,2273.0,"[depart Downtown Berkeley, red Downtown Berkel...","[0.0, 0.0, 180.0, 420.0, 479.0, 659.0, 779.0, ..."


In [39]:
kitchen = 'Downtown Berkeley'
for station in delivery_hubs:
    print(('depart '+ kitchen, 'arrive ' + station))
    shortest_path('depart '+ kitchen, 'arrive ' + station)

('depart Downtown Berkeley', 'arrive MacArthur')

--------------------------------
   Total Cost:  420
   Minutes:  7.0
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange Ashby, 180, 180
orange MacArthur, 240, 420
arrive MacArthur, 0, 420
('depart Downtown Berkeley', 'arrive 24th Street Mission')

--------------------------------
   Total Cost:  1980
   Minutes:  33.0
--------------------------------
depart Downtown Berkeley, 0, 0
red Downtown Berkeley, 0, 0
red Ashby, 180, 180
red MacArthur, 240, 420
red 19th Street, 180, 600
red 12th Street, 120, 720
red West Oakland, 300, 1020
red Embarcadero, 420, 1440
red Montgomery Street, 60, 1500
red Powell Street, 120, 1620
red Civic Center, 60, 1680
red 16th Street Mission, 180, 1860
red 24th Street Mission, 120, 1980
arrive 24th Street Mission, 0, 1980
('depart Downtown Berkeley', 'arrive Balboa Park')

--------------------------------
   Total Cost:  2280
   Minutes:  38.0
-----------------