# Create a graph database in Neo4j for the BART system

# Included Modules and Packages

In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

# Neo4j DB Startup

In [2]:
# Driver for the Neo4j server; the session used by the graph helpers is opened from it.
driver = neo4j.GraphDatabase.driver(
    uri="neo4j://neo4j:7687",
    auth=("neo4j", "ucb_mids_w205"),
)

In [3]:
# Module-level session on the "neo4j" database; every my_neo4j_* helper runs its queries through it.
session = driver.session(database="neo4j")

# Postgres DB Startup

In [4]:
# Postgres connection and cursor shared by the SQL helpers and graph-building functions.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)
cursor = connection.cursor()

# Function Declarations

In [5]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    A single ``DETACH DELETE`` removes every node together with all of the
    relationships attached to it, so no separate relationship-deleting pass
    is needed.
    """

    query = "MATCH (node) DETACH DELETE node"

    # consume() waits for the statement to finish, so a failure is raised
    # here rather than being left pending on an unread result
    session.run(query).consume()

In [6]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a Cypher query and hand its records back as a pandas DataFrame.

    Any keyword arguments are forwarded to ``session.run`` (e.g. query
    parameters). The DataFrame columns are the keys reported by the result.
    """

    result = session.run(query, **kwargs)

    # read all records first, then ask the result for its column names
    rows = [record.values() for record in result]
    column_names = result.keys()

    return pd.DataFrame(rows, columns=column_names)

In [7]:
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships currently in the graph.

    The totals are computed by Neo4j with ``count()``, so each query returns
    a single row instead of fetching every node and relationship only to
    count the rows on the client.
    """

    node_query = "MATCH (n) RETURN count(n) AS number_nodes"
    number_nodes = int(my_neo4j_run_query_pandas(node_query).iloc[0, 0])

    # a directed pattern matches each relationship exactly once
    relationship_query = "MATCH ()-[r]->() RETURN count(r) AS number_relationships"
    number_relationships = int(my_neo4j_run_query_pandas(relationship_query).iloc[0, 0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

def my_neo4j_nodes_relationships():
    """Display all nodes and relationships, print the graph density, and
    return the relationships DataFrame.

    Returns:
        DataFrame with one row per relationship: the names and labels of both
        end nodes and the relationship type.

    With fewer than two nodes the density is undefined; it is reported as
    ``nan`` instead of raising ``ZeroDivisionError``.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    df = my_neo4j_run_query_pandas(query)

    number_nodes = df.shape[0]

    display(df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    df = my_neo4j_run_query_pandas(query)

    number_relationships = df.shape[0]

    display(df)

    # NOTE(review): the factor 2 is the undirected-graph density formula, while
    # the query above counts directed relationships — confirm this is intended.
    possible_pairs = number_nodes * (number_nodes - 1)
    if possible_pairs:
        density = (2 * number_relationships) / possible_pairs
    else:
        # 0 or 1 nodes: no node pairs exist, so density is undefined
        density = float("nan")

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")

    return df

In [8]:
def my_neo4j_create_node(station_name):
    """Add one node labelled Station whose ``name`` property is ``station_name``."""

    cypher = """
    
    CREATE (:Station {name: $station_name})
    
    """

    # the name is passed as a query parameter, not spliced into the Cypher text
    params = {"station_name": station_name}
    session.run(cypher, **params)
    

In [9]:
def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    """Link ``from_station`` -> ``to_station`` with a single weighted LINK relationship.

    Both stations are looked up by their ``name`` property among the Station
    nodes; the relationship carries ``weight`` as its ``weight`` property.
    """

    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """

    params = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }
    session.run(cypher, **params)
    

In [10]:
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    """Link two Station nodes in both directions with equally weighted LINK relationships.

    Both stations are looked up by their ``name`` property; one LINK is
    created from each station to the other, each carrying ``weight``.
    """

    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """

    params = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }
    session.run(cypher, **params)
    

In [11]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas DataFrame.

    Args:
        query: SQL text to execute on the module-level ``connection``.
        rollback_before_flag: roll the connection back before running the query.
        rollback_after_flag: roll the connection back after the query, even
            when the query itself raises.

    Returns:
        DataFrame of the result rows. ``float64`` columns whose non-null
        values are all finite whole numbers are converted to the nullable
        ``Int64`` dtype; nulls in such columns become ``<NA>``.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # roll back on failure too, so the connection is not left inside
        # an open or aborted transaction
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        present = df[column].dropna()

        # infinities are not integers; checking finiteness first also keeps
        # them from ever reaching an integer conversion
        is_integral = np.isfinite(present).all() and (present == np.floor(present)).all()

        if is_integral:
            df[column] = df[column].astype('Int64')

    return df
    

# BART Station, Station Line, and Travel Time EDA

In [12]:
# Preview the first five rows of the stations table.
query = """
select * from p3_stations
"""

my_select_query_pandas(query, True, True).head(5)

Unnamed: 0,station,latitude,longitude,transfer_time
0,12th Street,37.803608,-122.272006,282
1,16th Street Mission,37.764847,-122.420042,287
2,19th Street,37.807869,-122.26898,67
3,24th Street Mission,37.752,-122.4187,277
4,Antioch,37.996281,-121.783404,0


In [13]:
# For every line, pair each station with the next station in sequence on that
# line and attach the travel time between the two. p3_travel_times is matched
# in either orientation (station_1/station_2 or station_2/station_1).
query = """

    select a.line, a.station as from_station, b.station as to_station, t.travel_time
    from p3_lines a
      join p3_lines b
        on a.line = b.line and b.sequence = (a.sequence + 1)
      join p3_travel_times t
        on (a.station = t.station_1 and b.station = t.station_2)
            or (a.station = t.station_2 and b.station = t.station_1)
    order by line, from_station, to_station

    """
    
# Roll back both before and after the query (both flags True).
lines_stations = my_select_query_pandas(query, True, True)

# Customer Table Creation

In [14]:
def make_customers_table():
    """Build the customer table enriched with each customer's closest station.

    Customers whose ``closest_store_id`` is 1 are loaded together with the
    latitude/longitude of their zip code. For every distinct zip location the
    geodesic distance (WGS84, in miles) to each station is computed and the
    nearest station is kept.

    Returns:
        DataFrame with the customer columns, the zip ``latitude`` and
        ``longitude``, plus ``closest_station`` and
        ``closest_station_distance`` (miles, rounded to 2 decimals). The
        customers' ``distance`` column is renamed to
        ``closest_store_distance``. If there are no stations,
        ``closest_station`` is None and the distance is NaN.
    """
    from geographiclib.geodesic import Geodesic

    rollback_before_flag = True
    rollback_after_flag = True

    # customers and zip locale
    query = """

    select c.*, z.latitude, z.longitude
    from p3_customers c
    join p3_zip_codes z on c.zip=z.zip
    where closest_store_id = 1

    """
    cust_zips = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

    # copy so the columns added below go onto an independent frame
    zip_locations = cust_zips[['zip', 'latitude', 'longitude']].drop_duplicates().copy()

    # stations
    query = """
    select * 
    from p3_stations
    """
    stations = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)[['station', 'latitude', 'longitude']]

    # materialise (station, latitude, longitude) tuples once instead of
    # re-iterating the stations frame for every zip code
    station_points = list(stations.itertuples(index=False, name=None))

    geod = Geodesic.WGS84

    def my_calculate_distance(point_1, point_2):
        "Given two points in (latitude, longitude) format, calculate the distance between them in miles"

        g = geod.Inverse(point_1[0], point_1[1], point_2[0], point_2[1])
        # s12 is divided by 1000 and scaled by the kilometre-to-mile factor
        return g['s12'] / 1000 * 0.621371

    def get_closest_station(zip_locale):
        "Return (station, distance in miles) of the station nearest to zip_locale"

        best_station = None
        best_dist = math.inf

        for station, latitude, longitude in station_points:
            curr_dist = my_calculate_distance(zip_locale, (latitude, longitude))
            # strict comparison keeps the first station on ties
            if curr_dist < best_dist:
                best_dist = curr_dist
                best_station = station

        if best_station is None:
            return None, math.nan

        return best_station, round(best_dist, 2)

    closest = [
        get_closest_station((latitude, longitude))
        for latitude, longitude in zip(zip_locations['latitude'], zip_locations['longitude'])
    ]
    zip_locations['closest_station'] = [station for station, _ in closest]
    zip_locations['closest_station_distance'] = [dist for _, dist in closest]

    # join on zip explicitly rather than relying on the shared-column default
    customers = pd.merge(
        cust_zips,
        zip_locations[['zip', 'closest_station', 'closest_station_distance']],
        on='zip',
    )

    return customers.rename(columns={'distance': 'closest_store_distance'})

In [15]:
# Build the customer table once and preview its first five rows.
customers = make_customers_table()
customers.head(5)

Unnamed: 0,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,closest_store_distance,latitude,longitude,closest_station,closest_station_distance
0,1,Robb,Weaving,5 Ramsey Place,Oakland,CA,94609,1,1,37.8343,-122.2643,MacArthur,0.45
1,2,Robby,Belliard,6 Londonderry Plaza,Oakland,CA,94609,1,1,37.8343,-122.2643,MacArthur,0.45
2,3,Sadella,Caudrelier,548 Mcguire Parkway,Oakland,CA,94609,1,1,37.8343,-122.2643,MacArthur,0.45
3,4,Holmes,Shimmings,99 Kennedy Court,Oakland,CA,94609,1,1,37.8343,-122.2643,MacArthur,0.45
4,5,Beverley,Gubbin,51 Mcbride Drive,Oakland,CA,94609,1,1,37.8343,-122.2643,MacArthur,0.45


# Graph Creation

In [16]:
def create_station_nodes():
    """Create a 'depart <station>' and an 'arrive <station>' Station node
    for every station in ``p3_stations``.
    """

    query = """

    select station
    from p3_stations
    order by station

    """

    connection.rollback()

    cursor.execute(query)

    # fetch before rolling back, so reading the rows does not depend on the
    # cursor still holding its results after the transaction has ended
    rows = cursor.fetchall()

    connection.rollback()

    for (station,) in rows:

        my_neo4j_create_node('depart ' + station)
        my_neo4j_create_node('arrive ' + station)


In [17]:
def create_line_nodes_to_stations():
    """Create one '<line> <station>' node per row of ``p3_lines`` and wire it
    to the station's depart/arrive nodes.

    Each line-station node gets a zero-weight link from 'depart <station>'
    and a zero-weight link to 'arrive <station>'.
    """

    query = """

    select station, line
    from p3_lines
    order by station, line

    """

    connection.rollback()

    cursor.execute(query)

    # fetch before rolling back, so reading the rows does not depend on the
    # cursor still holding its results after the transaction has ended
    rows = cursor.fetchall()

    connection.rollback()

    for station, line in rows:

        depart = 'depart ' + station
        arrive = 'arrive ' + station
        line_station = line + ' ' + station

        my_neo4j_create_node(line_station)
        my_neo4j_create_relationship_one_way(depart, line_station, 0)
        my_neo4j_create_relationship_one_way(line_station, arrive, 0)

In [18]:
def create_line_transfers():
    """Create transfer links between the different lines serving a station.

    For every ordered pair of distinct lines at the same station, a one-way
    link '<from_line> <station>' -> '<to_line> <station>' is created, weighted
    with the station's ``transfer_time``. The query returns both orderings of
    each pair, so transfers exist in both directions.
    """

    query = """

    select a.station, a.line as from_line, b.line as to_line, s.transfer_time
    from p3_lines a
         join p3_lines b
           on a.station = b.station and a.line <> b.line 
         join p3_stations s
           on a.station = s.station
    order by 1, 2, 3

    """

    connection.rollback()

    cursor.execute(query)

    # fetch before rolling back, so reading the rows does not depend on the
    # cursor still holding its results after the transaction has ended
    rows = cursor.fetchall()

    connection.rollback()

    for station, from_line, to_line, transfer_time in rows:

        from_station = from_line + ' ' + station
        to_station = to_line + ' ' + station

        my_neo4j_create_relationship_one_way(from_station, to_station, int(transfer_time))

In [19]:
def connect_stations():
    """Link consecutive stations on each line in both directions.

    Each station is paired with the next one in sequence on the same line,
    and the two '<line> <station>' nodes are connected two-way with the
    travel time between them as the weight.
    """

    query = """

    select a.line, a.station as from_station, b.station as to_station, t.travel_time
    from p3_lines a
      join p3_lines b
        on a.line = b.line and b.sequence = (a.sequence + 1)
      join p3_travel_times t
        on (a.station = t.station_1 and b.station = t.station_2)
            or (a.station = t.station_2 and b.station = t.station_1)
    order by line, from_station, to_station

    """

    connection.rollback()

    cursor.execute(query)

    # fetch before rolling back, so reading the rows does not depend on the
    # cursor still holding its results after the transaction has ended
    rows = cursor.fetchall()

    connection.rollback()

    for line, from_name, to_name, travel_time in rows:

        from_station = line + ' ' + from_name
        to_station = line + ' ' + to_name

        my_neo4j_create_relationship_two_way(from_station, to_station, int(travel_time))

In [20]:
def connect_customers(customers):
    """Add every customer to the graph and link each one to its closest station.

    For each row of ``customers`` a Customer node named
    "<customer_id> - <first_name> <last_name>" is created and connected in
    both directions to the "arrive <closest_station>" Station node, with the
    row's ``closest_station_distance`` as the weight.
    """

    def my_neo4j_create_cust_node(customer):
        "create a Customer-labelled node named after the customer"

        cypher = """

        CREATE (:Customer {name: $customer})

        """

        session.run(cypher, customer=customer)

    def my_neo4j_create_relationship_cust_to_station(from_customer, closest_station, weight):
        "create weighted LINK relationships both ways between a customer and a station"

        cypher = """

        MATCH (from:Customer), 
              (to:Station)
        WHERE from.name = $from_customer and to.name = $closest_station
        CREATE (from)-[:LINK {weight: $weight}]->(to),
               (to)-[:LINK {weight: $weight}]->(from)

        """

        session.run(cypher, from_customer=from_customer, closest_station=closest_station, weight=weight)

    for _, row in customers.iterrows():
        station_node = f"arrive {row['closest_station']}"
        customer_node = f"{row['customer_id']} - {row['first_name']} {row['last_name']}"
        distance = row['closest_station_distance']

        # the node must exist before the MATCH in the relationship query can find it
        my_neo4j_create_cust_node(customer_node)
        my_neo4j_create_relationship_cust_to_station(customer_node, station_node, distance)


In [21]:
def recreate_stations_and_lines():
    """Rebuild the station/line graph from scratch and print a summary.

    The database is wiped, station nodes, line nodes, transfers and
    station-to-station links are recreated in that order, and then the
    node/relationship counts and full listings are shown.
    Customers are not connected here; connect_customers is not called.
    """
    build_steps = (
        my_neo4j_wipe_out_database,
        create_station_nodes,
        create_line_nodes_to_stations,
        create_line_transfers,
        connect_stations,
        my_neo4j_number_nodes_relationships,
        my_neo4j_nodes_relationships,
    )

    for step in build_steps:
        step()

    print("Done!")

In [22]:
# Wipe the Neo4j database and rebuild the station/line graph; prints counts, listings and "Done!".
recreate_stations_and_lines()

-------------------------
  Nodes: 214
  Relationships: 652
-------------------------
-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,arrive 12th Street,[Station]
1,arrive 16th Street Mission,[Station]
2,arrive 19th Street,[Station]
3,arrive 24th Street Mission,[Station]
4,arrive Antioch,[Station]
...,...,...
209,yellow SFO,[Station]
210,yellow San Bruno,[Station]
211,yellow South San Francisco,[Station]
212,yellow Walnut Creek,[Station]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,blue 16th Street Mission,[Station],LINK,arrive 16th Street Mission,[Station]
1,blue 16th Street Mission,[Station],LINK,blue 24th Street Mission,[Station]
2,blue 16th Street Mission,[Station],LINK,blue Civic Center,[Station]
3,blue 16th Street Mission,[Station],LINK,green 16th Street Mission,[Station]
4,blue 16th Street Mission,[Station],LINK,red 16th Street Mission,[Station]
...,...,...,...,...,...
647,yellow West Oakland,[Station],LINK,blue West Oakland,[Station]
648,yellow West Oakland,[Station],LINK,green West Oakland,[Station]
649,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
650,yellow West Oakland,[Station],LINK,yellow 12th Street,[Station]


-------------------------
  Density: 0.0
-------------------------
Done!


# Centrality Algorithms Table Creation

In [23]:
# Drop any existing projection named 'ds_graph' before re-creating it.
# NOTE(review): the second argument (false) is presumably failIfMissing, so a
# missing graph is not an error — confirm against the GDS documentation.
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)
# Project the Station nodes and LINK relationships, carrying the 'weight'
# relationship property, as 'ds_graph' for the centrality algorithms.
query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fe3963ef9d0>

In [24]:
def get_degree_centrality():
    """Stream degree centrality for 'ds_graph' as a DataFrame.

    Columns are ``name`` and ``degree_centrality``, ordered by score
    descending and then by name.
    """
    cypher = """

    CALL gds.degree.stream('ds_graph')
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).name AS name, score as degree_centrality
    ORDER BY degree_centrality DESC, name

    """

    scores = my_neo4j_run_query_pandas(cypher)
    return scores

degree_centrality = get_degree_centrality()

In [25]:
def get_closeness_centrality():
    """Stream closeness centrality for 'ds_graph' as a DataFrame.

    Columns are ``name`` and ``closeness_centrality``, ordered by score
    descending.
    """
    cypher = """

    CALL gds.beta.closeness.stream('ds_graph')
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).name AS name, score as closeness_centrality
    ORDER BY score DESC

    """

    scores = my_neo4j_run_query_pandas(cypher)
    return scores

closeness_centrality = get_closeness_centrality()

In [26]:
def get_wasserman():
    """Stream closeness centrality with ``useWassermanFaust: true`` for 'ds_graph'.

    Returns a DataFrame with columns ``name`` and ``wasserman``, ordered by
    score descending.
    """
    cypher = """

    CALL gds.beta.closeness.stream('ds_graph',
                                   {useWassermanFaust: true}
                                  )
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).name AS name, score as wasserman
    ORDER BY score DESC

    """

    scores = my_neo4j_run_query_pandas(cypher)
    return scores

wasserman = get_wasserman()

In [27]:
def get_harmonic():
    """Stream harmonic closeness centrality for 'ds_graph' as a DataFrame.

    Columns are ``name`` and ``harmonic``, ordered by centrality descending.
    """
    cypher = """

    CALL gds.alpha.closeness.harmonic.stream('ds_graph', {})
    YIELD nodeId, centrality
    RETURN gds.util.asNode(nodeId).name AS name, centrality as harmonic
    ORDER BY centrality DESC

    """

    scores = my_neo4j_run_query_pandas(cypher)
    return scores

harmonic = get_harmonic()

In [28]:
def get_betweenness():
    """Stream betweenness centrality for 'ds_graph', using 'weight' as the
    relationship weight property.

    Returns a DataFrame with columns ``name`` and ``betweenness``, ordered by
    betweenness descending.
    """
    cypher = """

    CALL gds.betweenness.stream('ds_graph', {relationshipWeightProperty: 'weight'})
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
    ORDER BY betweenness DESC

    """

    scores = my_neo4j_run_query_pandas(cypher)
    return scores

betweenness = get_betweenness()

In [29]:
def merge_station_scores(algo_scores):
    """Combine per-algorithm score tables into one table keyed by node name.

    Args:
        algo_scores: non-empty sequence of DataFrames, each with a ``name``
            column plus its score column(s).

    Returns:
        A new DataFrame: the outer merge of all inputs on ``name``, with two
        added columns — ``line`` (first whitespace-separated word of the
        name) and ``station_name`` (the rest of the name, NaN if the name is
        a single word).

    Raises:
        IndexError: if ``algo_scores`` is empty.
    """
    # copy so the columns added below never land on the caller's frame
    # (with a single-element list no merge happens to create a new one)
    station_scores = algo_scores[0].copy()
    for scores in algo_scores[1:]:
        station_scores = pd.merge(station_scores, scores, on='name', how='outer')

    # split once and reuse the parts for both derived columns
    name_parts = station_scores['name'].str.split(n=1)
    station_scores['line'] = name_parts.str[0]
    station_scores['station_name'] = name_parts.str[1]

    return station_scores

In [30]:
# Merge the five centrality tables into one frame keyed by node name.
algo_scores = [degree_centrality, closeness_centrality, wasserman, harmonic, betweenness]
station_scores = merge_station_scores(algo_scores)

# Keep only nodes whose first word is a line colour.
valid_lines = ['blue', 'green', 'orange', 'red', 'yellow', 'gray']
on_valid_line = station_scores['line'].isin(valid_lines)
station_scores = station_scores[on_valid_line]

station_scores.head(5)

Unnamed: 0,name,degree_centrality,closeness_centrality,wasserman,harmonic,betweenness,line,station_name
0,blue 16th Street Mission,6.0,0.107237,0.082064,0.139981,1914.333333,blue,16th Street Mission
1,blue 24th Street Mission,6.0,0.100866,0.077189,0.136221,1952.833333,blue,24th Street Mission
2,blue Balboa Park,6.0,0.088683,0.067866,0.125865,1538.25,blue,Balboa Park
3,blue Civic Center,6.0,0.113668,0.086985,0.143392,2122.25,blue,Civic Center
4,blue Coliseum,6.0,0.118202,0.090455,0.144704,3490.0,blue,Coliseum


In [31]:
score_types = ['degree_centrality','closeness_centrality', 'wasserman', 'harmonic', 'betweenness']

# One row per station, one column per (score, line) combination.
pivoted_scores = station_scores.pivot_table(
    index='station_name',
    columns='line',
    values=score_types,
)

# Flatten the two-level (score, line) column labels into "<line>_<score>".
pivoted_scores.columns = [f"{line}_{score}" for score, line in pivoted_scores.columns]

ordered_columns = sorted(pivoted_scores.columns)
final_station_scores = pivoted_scores[ordered_columns]
final_station_scores.head(5)

Unnamed: 0_level_0,blue_betweenness,blue_closeness_centrality,blue_degree_centrality,blue_harmonic,blue_wasserman,gray_betweenness,gray_closeness_centrality,gray_degree_centrality,gray_harmonic,gray_wasserman,...,red_betweenness,red_closeness_centrality,red_degree_centrality,red_harmonic,red_wasserman,yellow_betweenness,yellow_closeness_centrality,yellow_degree_centrality,yellow_harmonic,yellow_wasserman
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12th Street,,,,,,,,,,,...,2724.833333,0.134711,5.0,0.153544,0.103089,3545.333333,0.135607,5.0,0.153839,0.103775
16th Street Mission,1914.333333,0.107237,6.0,0.139981,0.082064,,,,,,...,1986.333333,0.107591,6.0,0.140628,0.082335,2526.166667,0.108162,6.0,0.140753,0.082772
19th Street,,,,,,,,,,,...,2568.0,0.127145,5.0,0.145552,0.097299,3405.0,0.127943,5.0,0.145909,0.09791
24th Street Mission,1952.833333,0.100866,6.0,0.136221,0.077189,,,,,,...,2028.333333,0.101179,6.0,0.137178,0.077428,2553.833333,0.101684,6.0,0.137289,0.077815
Antioch,,,,,,,,,,,...,,,,,,325.0,0.05876,2.0,0.065522,0.044967


In [32]:
# Attach the closest station's scores to every customer; customers without a match keep NaN scores.
cust_station = customers.merge(
    final_station_scores,
    how='left',
    left_on='closest_station',
    right_index=True,
)
cust_station.head(5)

Unnamed: 0,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,closest_store_distance,latitude,...,red_betweenness,red_closeness_centrality,red_degree_centrality,red_harmonic,red_wasserman,yellow_betweenness,yellow_closeness_centrality,yellow_degree_centrality,yellow_harmonic,yellow_wasserman
0,1,Robb,Weaving,5 Ramsey Place,Oakland,CA,94609,1,1,37.8343,...,3596.333333,0.119677,5.0,0.140327,0.091584,6535.833333,0.120384,5.0,0.140766,0.092125
1,2,Robby,Belliard,6 Londonderry Plaza,Oakland,CA,94609,1,1,37.8343,...,3596.333333,0.119677,5.0,0.140327,0.091584,6535.833333,0.120384,5.0,0.140766,0.092125
2,3,Sadella,Caudrelier,548 Mcguire Parkway,Oakland,CA,94609,1,1,37.8343,...,3596.333333,0.119677,5.0,0.140327,0.091584,6535.833333,0.120384,5.0,0.140766,0.092125
3,4,Holmes,Shimmings,99 Kennedy Court,Oakland,CA,94609,1,1,37.8343,...,3596.333333,0.119677,5.0,0.140327,0.091584,6535.833333,0.120384,5.0,0.140766,0.092125
4,5,Beverley,Gubbin,51 Mcbride Drive,Oakland,CA,94609,1,1,37.8343,...,3596.333333,0.119677,5.0,0.140327,0.091584,6535.833333,0.120384,5.0,0.140766,0.092125


In [33]:
# Write the customer/station score table to CSV without the row index.
cust_station.to_csv('customers_station_scores.csv',index=False)

In [34]:
# Write the per-station score table to CSV; the index is kept (no index=False).
final_station_scores.to_csv('station_scores.csv')