In [1]:
import pandas as pd
# Show full cell contents instead of truncating long values. `None` is the supported
# way to disable the width limit; the legacy value -1 is deprecated and rejected by
# recent pandas releases.
pd.set_option('display.max_colwidth', None)
# Allow up to 500 rows to be rendered before pandas truncates the display.
pd.set_option('display.max_rows', 500)

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
# Create the Spark session for this notebook under the application name 'abc',
# or reuse an already running one.
spark = SparkSession.builder.appName('abc').getOrCreate()

from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [2]:
def generate_series(start, stop, interval, alias):
    """
    Build a single-column Spark DataFrame of evenly spaced timestamps.

    :param start: lower bound, inclusive
    :param stop: upper bound, exclusive
    :param interval: increment between consecutive values, in seconds
    :param alias: name given to the resulting timestamp column
    :return: DataFrame with one timestamp column named ``alias``
    """
    session = SparkSession.builder.getOrCreate()

    # Let Spark convert both bounds to epoch seconds
    bounds_df = session.createDataFrame([(start, stop)], ("start", "stop"))
    epoch_columns = [
        col(bound).cast("timestamp").cast("long") for bound in ("start", "stop")
    ]
    first_epoch, last_epoch = bounds_df.select(epoch_columns).first()

    # Step through the epoch range and turn every value back into a timestamp
    epoch_range = session.range(first_epoch, last_epoch, interval)
    return epoch_range.select(col("id").cast("timestamp").alias(alias))

In [3]:
# Load (or reload) the `cypher` IPython extension; the `%cypher` magic is used further down.
%reload_ext cypher

In [4]:
# NOTE(review): this connection URL embeds a username and password in clear text, and the
# cell output echoes them back. Consider supplying NEO4J_URL from outside the notebook
# and clearing the output before sharing.
%env NEO4J_URL=http://neo4j:h4ck3r@172.19.0.2:7474/db/data 

env: NEO4J_URL=http://neo4j:h4ck3r@172.19.0.2:7474/db/data


#### ÔNIBUS DE DIFERENTES LINHAS QUE SE ENCONTRAM EM UM MESMO PONTO AO LONGO DO DIA

In [5]:
# Fetch one row per (bus stop, event) pair: walk Year -> Month -> Day -> Line -> Trip -> BusStop,
# then keep the trip's events that hang off an Hour of the same Day and are IS_NEAR_BY that stop.
# Returned columns: year, month, day, busstop_number, name, latitude, longitude,
# line_code, line_way, vehicle, event_timestamp.
results = %cypher \
MATCH (y:Year)-[:CONTAINS]->(m:Month)-[:CONTAINS]->(d:Day)-[:HAS_LINE]->(l:Line)-[:HAS_TRIP]->(t:Trip)-[:HAS_BUS_STOP]->(bss:BusStop) \
with y,m,d,l,t,bss \
MATCH (t)-[:HAS_EVENT]->(ev:Event)<-[:HAS_EVENT]-(h:Hour)<-[:CONTAINS]-(d) \
MATCH (ev)-[:IS_NEAR_BY]->(bss)  \
return  y.value as year, m.value as month, d.value as day, bss.number as busstop_number,bss.name as name,bss.latitude as latitude, bss.longitude as longitude, l.line_code as line_code,t.line_way as line_way, ev.vehicle as vehicle, ev.event_timestamp as event_timestamp

434082 rows affected.


In [6]:
# Turn the Cypher result set into a Spark DataFrame
# (get_dataframe() presumably returns a pandas DataFrame - TODO confirm).
df = spark.createDataFrame(results.get_dataframe())

In [7]:
# Build one regular time grid per window width, anchored at the first observed hour.
from datetime import timedelta

# First and last observed hours (event timestamps truncated to the hour).
hourly_df = df.withColumn("event_timestamp", F.date_trunc('hour', df.event_timestamp))
date_min = hourly_df.agg({'event_timestamp': 'min'}).first()[0]
date_max = hourly_df.agg({'event_timestamp': 'max'}).first()[0]

# date_max is the *start* of the last observed hour, and generate_series treats its
# stop as exclusive. Using date_max directly as the stop leaves every event in the
# final hour outside the grid, so the inner range joins downstream silently drop them.
last_hour_end = date_max + timedelta(hours=1)


def _time_grid(minutes):
    """Return the grid of `minutes`-wide steps covering [date_min, last_hour_end)."""
    step_seconds = 60 * minutes
    # Extend the exclusive stop by one extra step: downstream, each grid point is paired
    # with its successor via lead(), so the last grid point has no upper bound and must
    # lie at or beyond the end of the data.
    grid_stop = last_hour_end + timedelta(seconds=step_seconds)
    return generate_series(date_min, grid_stop, step_seconds, "time_span{}min".format(minutes))


time_series5_df = _time_grid(5)
time_series10_df = _time_grid(10)
time_series15_df = _time_grid(15)
time_series20_df = _time_grid(20)
time_series25_df = _time_grid(25)
time_series30_df = _time_grid(30)
# Wider grids, currently disabled:
# time_series35_df = _time_grid(35)
# time_series40_df = _time_grid(40)
# time_series45_df = _time_grid(45)
# time_series50_df = _time_grid(50)
# time_series55_df = _time_grid(55)
# time_series59_df = _time_grid(59)

In [149]:
# Preview the first five points of the 10-minute grid
time_series10_df.toPandas().head(5)

Unnamed: 0,time_span10min
0,2019-05-01 01:00:00
1,2019-05-01 01:10:00
2,2019-05-01 01:20:00
3,2019-05-01 01:30:00
4,2019-05-01 01:40:00


In [9]:
def _with_next_span(series_df, minutes):
    """Add a `next_time_span<minutes>min` column holding the following grid point."""
    span_name = "time_span{}min".format(minutes)
    by_time = Window.orderBy(series_df[span_name])
    return series_df.withColumn("next_" + span_name, F.lead(span_name).over(by_time))


# Each row now describes one span: [time_spanNmin, next_time_spanNmin)
time_span5_df = _with_next_span(time_series5_df, 5)
time_span10_df = _with_next_span(time_series10_df, 10)
time_span15_df = _with_next_span(time_series15_df, 15)
time_span20_df = _with_next_span(time_series20_df, 20)

time_span25_df = _with_next_span(time_series25_df, 25)
time_span30_df = _with_next_span(time_series30_df, 30)

# Wider spans, currently disabled:
# time_span35_df = _with_next_span(time_series35_df, 35)
# time_span40_df = _with_next_span(time_series40_df, 40)
# time_span45_df = _with_next_span(time_series45_df, 45)
# time_span50_df = _with_next_span(time_series50_df, 50)
# time_span55_df = _with_next_span(time_series55_df, 55)
# time_span59_df = _with_next_span(time_series59_df, 59)

In [150]:
# Preview the first five 10-minute spans together with their successors
time_span10_df.toPandas().head(5)

Unnamed: 0,time_span10min,next_time_span10min
0,2019-05-01 01:00:00,2019-05-01 01:10:00
1,2019-05-01 01:10:00,2019-05-01 01:20:00
2,2019-05-01 01:20:00,2019-05-01 01:30:00
3,2019-05-01 01:30:00,2019-05-01 01:40:00
4,2019-05-01 01:40:00,2019-05-01 01:50:00


In [11]:
#df.printSchema()

In [None]:
#df.toPandas().head()

In [12]:
# Debug filter used during development: filter("busstop_number == 150751")

def _join_time_span(events_df, span_df, minutes):
    """Inner-join every event to the span row whose [start, next) range holds its timestamp."""
    span_start = span_df["time_span{}min".format(minutes)]
    span_end = span_df["next_time_span{}min".format(minutes)]
    in_span = (events_df.event_timestamp >= span_start) & (events_df.event_timestamp < span_end)
    return events_df.join(span_df, in_span)


# Attach the enclosing span of every window width to each event, narrowest first
events = df
for minutes, span_df in (
    (5, time_span5_df),
    (10, time_span10_df),
    (15, time_span15_df),
    (20, time_span20_df),
    (25, time_span25_df),
    (30, time_span30_df),
    # Wider spans, currently disabled:
    # (35, time_span35_df),
    # (40, time_span40_df),
    # (45, time_span45_df),
    # (50, time_span50_df),
    # (55, time_span55_df),
    # (59, time_span59_df),
):
    events = _join_time_span(events, span_df, minutes)

# Sort by stop and time, then discard the raw timestamp (only the spans are kept)
events = events.orderBy(F.desc("busstop_number"), F.asc("event_timestamp")).drop("event_timestamp")

In [13]:
#events.toPandas().head()

In [14]:
def _span_window(minutes):
    """Window grouping the events of one bus stop on one day within one `minutes`-wide span."""
    suffix = "{}min".format(minutes)
    return Window.partitionBy(
        events.year, events.month, events.day,
        events.busstop_number, events.name, events.latitude, events.longitude,
        events["time_span" + suffix], events["next_time_span" + suffix],
    )


window5 = _span_window(5)
window10 = _span_window(10)
window15 = _span_window(15)
window20 = _span_window(20)
window25 = _span_window(25)
window30 = _span_window(30)

# For every window width, add the distinct lines and vehicles seen at the stop within
# the span, plus their counts. Column order matches the original hand-written chain.
events2 = events
for minutes, span_window in (
    (5, window5),
    (10, window10),
    (15, window15),
    (20, window20),
    (25, window25),
    (30, window30),
):
    suffix = "{}min".format(minutes)
    lines_in_span = F.collect_set('line_code').over(span_window)
    vehicles_in_span = F.collect_set('vehicle').over(span_window)
    events2 = (events2
               .withColumn("count_lines" + suffix, F.size(lines_in_span))
               .withColumn("count_vehicles" + suffix, F.size(vehicles_in_span))
               .withColumn("set_vehicles" + suffix, vehicles_in_span)
               .withColumn("set_lines" + suffix, lines_in_span))

# Drop the per-event columns and collapse to one row per stop/span combination.
# The sort must come last: distinct() does not preserve row order, so sorting before
# it (as was done previously) had no effect on the final result.
events2 = (events2
           .drop("vehicle", "line_code", "line_way")
           .distinct()
           .orderBy(F.desc("busstop_number"), F.asc("time_span5min")))

In [15]:
# Collect the aggregated Spark result to the driver as a pandas DataFrame
df_frequencies = events2.toPandas()

In [16]:
# Write the result to frequencies.csv in the working directory, without the pandas index
df_frequencies.to_csv("frequencies.csv", index=False)

--- 