In [1]:
import pandas as pd
pd.set_option('max_colwidth', -1)
pd.set_option('display.max_rows', 500)

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
spark = SparkSession.builder.appName('abc').getOrCreate()

from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [2]:
def generate_series(start, stop, interval, alias):
    """
    :param start  - lower bound, inclusive
    :param stop   - upper bound, exclusive
    :interval int - increment interval in seconds
    """
    spark = SparkSession.builder.getOrCreate()
    # Determine start and stops in epoch seconds
    start, stop = spark.createDataFrame(
        [(start, stop)], ("start", "stop")
    ).select(
        [col(c).cast("timestamp").cast("long") for c in ("start", "stop")
    ]).first()
    
    # Create range with increments and cast to timestamp
    return spark.range(start, stop, interval).select(
        col("id").cast("timestamp").alias(alias)
    )

In [3]:
%reload_ext cypher

In [4]:
%env NEO4J_URL=http://neo4j:h4ck3r@172.19.0.2:7474/db/data 

env: NEO4J_URL=http://neo4j:h4ck3r@172.19.0.2:7474/db/data


#### ONIBUS DE DIFERENTES LINHAS QUE SE ENCONTRAM EM UM MESMO PONTO AO LONGO DO DIA

In [5]:
results = %cypher \
MATCH (y:Year)-[:CONTAINS]->(m:Month)-[:CONTAINS]->(d:Day)-[:HAS_LINE]->(l:Line)-[:HAS_TRIP]->(t:Trip)-[:HAS_BUS_STOP]->(bss:BusStop) \
with y,m,d,l,t,bss \
MATCH (t)-[:HAS_EVENT]->(ev:Event)<-[:HAS_EVENT]-(h:Hour)<-[:CONTAINS]-(d) \
MATCH (ev)-[:IS_NEAR_BY]->(bss)  \
return  y.value as year, m.value as month, d.value as day, bss.number as busstop_number,bss.name as name,bss.latitude as latitude, bss.longitude as longitude, l.line_code as line_code,t.line_way as line_way, ev.vehicle as vehicle, ev.event_timestamp as event_timestamp

434082 rows affected.


In [8]:
df = spark.createDataFrame(results.get_dataframe())

In [9]:
# start date is min date
date_min=df.withColumn("event_timestamp",F.date_trunc('hour', df.event_timestamp)).agg({'event_timestamp': 'min'}).first()[0]
date_max=df.withColumn("event_timestamp",F.date_trunc('hour', df.event_timestamp)).agg({'event_timestamp': 'max'}).first()[0]


time_series5_df = generate_series(date_min, date_max, 60 * 5   , "time_span5min")
time_series10_df = generate_series(date_min, date_max, 60 * 10 , "time_span10min")
time_series15_df = generate_series(date_min, date_max, 60 * 15 , "time_span15min")
time_series20_df = generate_series(date_min, date_max, 60 * 20 , "time_span20min")
time_series25_df = generate_series(date_min, date_max, 60 * 25 , "time_span25min")
time_series30_df = generate_series(date_min, date_max, 60 * 30 , "time_span30min")
# time_series35_df = generate_series(date_min, date_max, 60 * 35 , "time_span35min")
# time_series40_df = generate_series(date_min, date_max, 60 * 40 , "time_span40min")
# time_series45_df = generate_series(date_min, date_max, 60 * 45 , "time_span45min")
# time_series50_df = generate_series(date_min, date_max, 60 * 50 , "time_span50min")
# time_series55_df = generate_series(date_min, date_max, 60 * 55 , "time_span55min")
# time_series59_df = generate_series(date_min, date_max, 60 * 59 , "time_span59min")

In [10]:
time_series5_df.toPandas().head()

Unnamed: 0,time_span5min
0,2019-05-01 01:00:00
1,2019-05-01 01:05:00
2,2019-05-01 01:10:00
3,2019-05-01 01:15:00
4,2019-05-01 01:20:00


In [11]:
time_span5_df = time_series5_df.withColumn("next_time_span5min",F.lead("time_span5min").over(Window.orderBy(time_series5_df.time_span5min)))
time_span10_df = time_series10_df.withColumn("next_time_span10min",F.lead("time_span10min").over(Window.orderBy(time_series10_df.time_span10min)))
time_span15_df = time_series15_df.withColumn("next_time_span15min",F.lead("time_span15min").over(Window.orderBy(time_series15_df.time_span15min)))
time_span20_df = time_series20_df.withColumn("next_time_span20min",F.lead("time_span20min").over(Window.orderBy(time_series20_df.time_span20min)))

time_span25_df = time_series25_df.withColumn("next_time_span25min",F.lead("time_span25min").over(Window.orderBy(time_series25_df.time_span25min)))
time_span30_df = time_series30_df.withColumn("next_time_span30min",F.lead("time_span30min").over(Window.orderBy(time_series30_df.time_span30min)))

# time_span35_df = time_series35_df.withColumn("next_time_span35min",F.lead("time_span35min").over(Window.orderBy(time_series35_df.time_span35min)))
# time_span40_df = time_series40_df.withColumn("next_time_span40min",F.lead("time_span40min").over(Window.orderBy(time_series40_df.time_span40min)))
# time_span45_df = time_series45_df.withColumn("next_time_span45min",F.lead("time_span45min").over(Window.orderBy(time_series45_df.time_span45min)))
# time_span50_df = time_series50_df.withColumn("next_time_span50min",F.lead("time_span50min").over(Window.orderBy(time_series50_df.time_span50min)))
# time_span55_df = time_series55_df.withColumn("next_time_span55min",F.lead("time_span55min").over(Window.orderBy(time_series55_df.time_span55min)))
# time_span59_df = time_series59_df.withColumn("next_time_span59min",F.lead("time_span59min").over(Window.orderBy(time_series59_df.time_span59min)))

In [12]:
time_span5_df.toPandas().head()

Unnamed: 0,time_span5min,next_time_span5min
0,2019-05-01 01:00:00,2019-05-01 01:05:00
1,2019-05-01 01:05:00,2019-05-01 01:10:00
2,2019-05-01 01:10:00,2019-05-01 01:15:00
3,2019-05-01 01:15:00,2019-05-01 01:20:00
4,2019-05-01 01:20:00,2019-05-01 01:25:00


In [13]:
df.printSchema()

root
 |-- year: long (nullable = true)
 |-- month: long (nullable = true)
 |-- day: long (nullable = true)
 |-- busstop_number: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- line_way: string (nullable = true)
 |-- vehicle: string (nullable = true)
 |-- event_timestamp: string (nullable = true)



In [14]:
df.toPandas().head()

Unnamed: 0,year,month,day,busstop_number,name,latitude,longitude,line_code,line_way,vehicle,event_timestamp
0,2019,5,2,120208,"Rua Lodovico Geronazzo, 1945 - Boa Vista",-25.381516215735,-49.252794612343,225,Terminal Boa Vista,BA004,2019-05-02 05:50:47
1,2019,5,2,120210,"Rua Lodovico Geronazzo, 1731 - Boa Vista",-25.382707644463,-49.251159913336,225,Terminal Boa Vista,BA130,2019-05-02 05:36:46
2,2019,5,2,104105,Terminal Barreirinha - 225 - Boa Vista / Barreirinha,-25.377196086551,-49.262522831912,225,Terminal Boa Vista,BA004,2019-05-02 05:54:24
3,2019,5,2,120206,"Rua Lodovico Geronazzo, 2175 - Boa Vista",-25.379797124821,-49.25504221448,225,Terminal Boa Vista,BA130,2019-05-02 05:37:51
4,2019,5,2,120204,"Rua Lodovico Geronazzo, 2388 - Boa Vista",-25.378772576366,-49.256422684983,225,Terminal Boa Vista,BA130,2019-05-02 05:38:23


In [15]:
#filter("busstop_number == 150751")
events = (df.join(time_span5_df, (df.event_timestamp >= time_span5_df.time_span5min)
               & (df.event_timestamp < time_span5_df.next_time_span5min ))
         )

events = (events.join(time_span10_df, (events.event_timestamp >= time_span10_df.time_span10min)
               & (events.event_timestamp < time_span10_df.next_time_span10min ))
         )

events = (events.join(time_span15_df, (events.event_timestamp >= time_span15_df.time_span15min)
               & (events.event_timestamp < time_span15_df.next_time_span15min ))
         )

events = (events.join(time_span20_df, (events.event_timestamp >= time_span20_df.time_span20min)
               & (events.event_timestamp < time_span20_df.next_time_span20min ))
         )

events = (events.join(time_span25_df, (events.event_timestamp >= time_span25_df.time_span25min)
               & (events.event_timestamp < time_span25_df.next_time_span25min ))
         )

events = (events.join(time_span30_df, (events.event_timestamp >= time_span30_df.time_span30min)
               & (events.event_timestamp < time_span30_df.next_time_span30min ))
         )


# events = (events.join(time_span35_df, (events.event_timestamp >= time_span35_df.time_span35min)
#                & (events.event_timestamp < time_span35_df.next_time_span35min ))
#          )

# events = (events.join(time_span40_df, (events.event_timestamp >= time_span40_df.time_span40min)
#                & (events.event_timestamp < time_span40_df.next_time_span40min ))
#          )

# events = (events.join(time_span45_df, (events.event_timestamp >= time_span45_df.time_span45min)
#                & (events.event_timestamp < time_span45_df.next_time_span45min ))
#          )

# events = (events.join(time_span50_df, (events.event_timestamp >= time_span50_df.time_span50min)
#                & (events.event_timestamp < time_span50_df.next_time_span50min ))
#          )


# events = (events.join(time_span55_df, (events.event_timestamp >= time_span55_df.time_span55min)
#                & (events.event_timestamp < time_span55_df.next_time_span55min ))
#          )


# events = (events.join(time_span59_df, (events.event_timestamp >= time_span59_df.time_span59min)
#                & (events.event_timestamp < time_span59_df.next_time_span59min ))
#          )

events = events.orderBy(F.desc("busstop_number"),F.asc("event_timestamp")).drop("event_timestamp")

In [16]:
events.toPandas().head()

Unnamed: 0,year,month,day,busstop_number,name,latitude,longitude,line_code,line_way,vehicle,...,time_span10min,next_time_span10min,time_span15min,next_time_span15min,time_span20min,next_time_span20min,time_span25min,next_time_span25min,time_span30min,next_time_span30min
0,2019,5,1,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,928,Terminal Santa Felicidade,MN400,...,2019-05-01 06:00:00,2019-05-01 06:10:00,2019-05-01 06:00:00,2019-05-01 06:15:00,2019-05-01 06:00:00,2019-05-01 06:20:00,2019-05-01 06:00:00,2019-05-01 06:25:00,2019-05-01 06:00:00,2019-05-01 06:30:00
1,2019,5,1,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,928,Terminal Santa Felicidade,MN400,...,2019-05-01 07:00:00,2019-05-01 07:10:00,2019-05-01 07:00:00,2019-05-01 07:15:00,2019-05-01 07:00:00,2019-05-01 07:20:00,2019-05-01 06:50:00,2019-05-01 07:15:00,2019-05-01 07:00:00,2019-05-01 07:30:00
2,2019,5,1,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,928,Passauna,MN400,...,2019-05-01 08:00:00,2019-05-01 08:10:00,2019-05-01 08:00:00,2019-05-01 08:15:00,2019-05-01 08:00:00,2019-05-01 08:20:00,2019-05-01 07:40:00,2019-05-01 08:05:00,2019-05-01 08:00:00,2019-05-01 08:30:00
3,2019,5,1,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,928,Passauna,MN400,...,2019-05-01 09:00:00,2019-05-01 09:10:00,2019-05-01 09:00:00,2019-05-01 09:15:00,2019-05-01 09:00:00,2019-05-01 09:20:00,2019-05-01 08:55:00,2019-05-01 09:20:00,2019-05-01 09:00:00,2019-05-01 09:30:00
4,2019,5,1,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,928,Terminal Santa Felicidade,MN400,...,2019-05-01 10:00:00,2019-05-01 10:10:00,2019-05-01 10:00:00,2019-05-01 10:15:00,2019-05-01 10:00:00,2019-05-01 10:20:00,2019-05-01 09:45:00,2019-05-01 10:10:00,2019-05-01 10:00:00,2019-05-01 10:30:00


In [25]:
window5 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span5min,events.next_time_span5min)
)

window10 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span10min,events.next_time_span10min)
)

window15 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span15min,events.next_time_span15min)
)

window20 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span20min,events.next_time_span20min)
)

window25 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span25min,events.next_time_span25min)
)

window30 = (
    Window.partitionBy(events.year, events.month, events.day, events.busstop_number,events.name, events.latitude, events.longitude, events.time_span30min,events.next_time_span30min)
)


events2 = (events
          .withColumn("count_lines5min", F.size(F.collect_set('line_code').over(window5)))
          .withColumn("count_vehicles5min", F.size(F.collect_set('vehicle').over(window5)))
          .withColumn("set_vehicles5min", F.collect_set('vehicle').over(window5))
          .withColumn("set_lines5min", F.collect_set('line_code').over(window5))
           
           .withColumn("count_lines10min", F.size(F.collect_set('line_code').over(window10)))
          .withColumn("count_vehicles10min", F.size(F.collect_set('vehicle').over(window10)))
          .withColumn("set_vehicles10min", F.collect_set('vehicle').over(window10))
          .withColumn("set_lines10min", F.collect_set('line_code').over(window10))
           
           .withColumn("count_lines15min", F.size(F.collect_set('line_code').over(window15)))
          .withColumn("count_vehicles15min", F.size(F.collect_set('vehicle').over(window15)))
          .withColumn("set_vehicles15min", F.collect_set('vehicle').over(window15))
          .withColumn("set_lines15min", F.collect_set('line_code').over(window15))
           
           .withColumn("count_lines20min", F.size(F.collect_set('line_code').over(window20)))
          .withColumn("count_vehicles20min", F.size(F.collect_set('vehicle').over(window20)))
          .withColumn("set_vehicles20min", F.collect_set('vehicle').over(window20))
          .withColumn("set_lines20min", F.collect_set('line_code').over(window20))
           
           .withColumn("count_lines25min", F.size(F.collect_set('line_code').over(window25)))
          .withColumn("count_vehicles25min", F.size(F.collect_set('vehicle').over(window25)))
          .withColumn("set_vehicles25min", F.collect_set('vehicle').over(window25))
          .withColumn("set_lines25min", F.collect_set('line_code').over(window25))
           
          .withColumn("count_lines30min", F.size(F.collect_set('line_code').over(window30)))
          .withColumn("count_vehicles30min", F.size(F.collect_set('vehicle').over(window30)))
          .withColumn("set_vehicles30min", F.collect_set('vehicle').over(window30))
          .withColumn("set_lines30min", F.collect_set('line_code').over(window30))
          
          .orderBy(F.desc("busstop_number"),F.asc("time_span5min"))).drop("vehicle","line_code","line_way").distinct()

In [26]:
df_frequencies = events2.toPandas()

KeyboardInterrupt: 

In [None]:
df_frequencies.to_csv("frequencies.csv", index=False)

In [18]:
events2.filter("day ==2").toPandas().head()

Unnamed: 0,year,month,day,busstop_number,name,latitude,longitude,time_span,next_time_span,count_lines,count_vehicles,set_vehicles,set_lines
0,2019,5,2,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,2019-05-02 05:15:00,2019-05-02 05:20:00,1,1,[MN403],[911]
1,2019,5,2,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,2019-05-02 05:55:00,2019-05-02 06:00:00,1,1,[MN403],[911]
2,2019,5,2,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,2019-05-02 06:10:00,2019-05-02 06:15:00,1,1,[MN405],[911]
3,2019,5,2,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,2019-05-02 06:35:00,2019-05-02 06:40:00,1,1,[MN403],[911]
4,2019,5,2,660104,"Rua Amazonas, 197 - Jd. Cecilia - Campo Magro - PR",-25.38146,-49.36393,2019-05-02 06:55:00,2019-05-02 07:00:00,1,1,[MN405],[911]


In [19]:
events2.groupBy("busstop_number","name","latitude","longitude").agg(F.mean("count_lines").alias("nr_medio_linhas")).orderBy(F.desc("nr_medio_linhas")).limit(50).toPandas()

Unnamed: 0,busstop_number,name,latitude,longitude,nr_medio_linhas
0,150332,"Rua Leon Nicolas, 2081 - Cap?o Raso",-25.515159644727,-49.294443608469,1.821637
1,150751,"Av. Winston Churchill, 2677 - Cap?o Raso",-25.520731874323,-49.295383384725,1.814696
2,150331,"Av. Winston Churchill, 2472 - Cap?o Raso",-25.518348864079,-49.29566769888,1.810526
3,110022,"Rua Vinte e Quatro de Maio, 280-350 - Centro",-25.439756734365,-49.273240017786,1.769531
4,110026,"Rua Alferes Poli, 400 - Reboucas",-25.440705808683,-49.271456482887,1.6621
5,110208,"Av. Iguacu, 1184 - Reboucas",-25.443296676303,-49.272462905592,1.640845
6,150634,"Av. Iguacu, 2612 - Agua Verde",-25.448942375096,-49.287531555838,1.637931
7,150631,"Av. Iguacu, 1788 - Agua Verde",-25.445710238874,-49.278719158119,1.632653
8,160244,"Rua Emanoel Voluz, 284 - Pinheirinho",-25.524017075104,-49.292992119479,1.611888
9,110024,"Rua Alferes Poli, 787 - Reboucas",-25.443478176732,-49.270140950921,1.604348


In [20]:
events2.count()

279689

> Alterar o grafo para contemplar o tempo médio de deslocamento entre dois pontos de todas as linhas

```sql
    (bs)-[:NEXT_BS {year,month,day,hour, time_avg, time_std, n_veic}]->(bs)  # um dia pode ter 24 arestas, realizar buscas através do tempo
```

1. Identificar pontos com alta centralidade (demanda)
2. Fixar um raio baseado no tempo médio de caminhada
3. Identificar o número de pontos a área contempla e o número de linhas ( maximizar o número de linhas)
4. Eleger top X áreas e criar arestas virtuais entre estes pontos com tempo médio de caminhada
5. Realizar testes de deslocamento no grafo
6. Identificar mudança topográfica no grafo

In [None]:
https://pbpython.com/styling-pandas.html
https://pbpython.com/monte-carlo.html
https://github.com/chris1610/pbpython/blob/master/notebooks/Monte_Carlo_Simulation.ipynb
http://www.flawofaverages.com/