In [39]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [42]:
def rename_columns(df, list_of_tuples):
    """Apply a sequence of column renames to a DataFrame.

    Args:
        df: Object exposing ``withColumnRenamed(old, new)`` that returns the
            renamed frame.
        list_of_tuples: Iterable of ``(old_name, new_name)`` pairs, applied in
            the order given (each rename sees the result of the previous one).

    Returns:
        The frame produced after all renames; ``df`` itself when the iterable
        is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_file(filepath, sqlContext):
    """Load a headerless CSV into a DataFrame with named columns plus a date.

    The file is read through the "com.databricks.spark.csv" source with schema
    inference enabled and "-" treated as null. A frame with fewer than 16
    columns is padded with null-literal columns, the positional "_c0".."_c15"
    names are replaced by descriptive ones, and a constant "date" column is
    appended.

    Args:
        filepath: Path of the CSV file. Its second-to-last "/"-separated
            component supplies the date: the first three "_"-separated tokens
            of that component are joined with "-".
        sqlContext: Object whose ``read`` attribute provides the reader used
            to load the file.

    Returns:
        The loaded DataFrame with "_c0".."_c15" renamed and a "date" column.
    """
    # Target names, in positional order: index i replaces column "_c<i>".
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "numPassengers",
    ]
    expected_width = len(column_names)

    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader = reader.option("header", "false")
    reader = reader.option("inferSchema", "true")
    reader = reader.option("nullValue", "-")
    data_frame = reader.load(filepath)

    # Pad short files so every positional name below exists before renaming.
    for position in range(len(data_frame.columns), expected_width):
        data_frame = data_frame.withColumn("_c" + str(position), F.lit(None))

    positional_to_named = [
        ("_c" + str(index), name) for index, name in enumerate(column_names)
    ]
    data_frame = rename_columns(data_frame, positional_to_named)

    # e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11"
    parent_component = filepath.split("/")[-2]
    date = "-".join(parent_component.split("_")[:3])

    return data_frame.withColumn("date", F.lit(date))

### Read Input Data

In [3]:
# Obtain a SparkContext via getOrCreate() so re-running this cell does not try to
# construct a second context, then wrap it in the SQLContext passed to read_file.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [43]:
# Sample partition file; read_file takes the "date" column from the parent
# directory name of this path (here "2017_05_11_veiculos.csv").
trips_filepath = '/local/tarciso/data/sample-data/bulma-output/2017_05_11_veiculos.csv/part-00000'
trips_data = read_file(trips_filepath, sqlContext)

In [44]:
# Peek at the first three rows to check the renamed columns and the added "date" value.
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [45]:
# Inspect the raw "timestamp" strings before converting them to numbers.
trips_data.select("timestamp").head(5)

[Row(timestamp=u'12:13:43'),
 Row(timestamp=u'12:45:14'),
 Row(timestamp=u'05:41:14'),
 Row(timestamp=u'05:41:16'),
 Row(timestamp=u'05:41:20')]

In [46]:
# Parse the "HH:mm:ss" strings into integer seconds so they can be subtracted.
# NOTE: the result is not seconds-since-midnight; in the recorded output
# '12:13:43' maps to 54823 (44023 + 10800), so only differences between
# values should be relied on.
timestamp_seconds = F.unix_timestamp(F.col("timestamp"), "HH:mm:ss")
trips_data = trips_data.withColumn("new_timestamp", timestamp_seconds)
trips_data.select("new_timestamp").head(5)

[Row(new_timestamp=54823),
 Row(new_timestamp=56714),
 Row(new_timestamp=31274),
 Row(new_timestamp=31276),
 Row(new_timestamp=31280)]

In [47]:
# Print the column types of trips_data, including the derived "new_timestamp" column.
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)
 |-- new_timestamp: long (nullable = true)



In [48]:
# Average parsed timestamp per (route, tripNum).
# DataFrame.show() only prints and returns None, so keep the aggregated frame
# in `test` first and display it as a separate step; chaining .show() onto the
# assignment would leave `test` bound to None.
test = trips_data.groupby(["route","tripNum"]).avg("new_timestamp")
test.show()

+-----+-------+------------------+
|route|tripNum|avg(new_timestamp)|
+-----+-------+------------------+
|  646|      7|  58207.4257221458|
|  650|      9| 72151.01580354972|
|  638|      6|56234.409443725744|
|  561|      7|51830.629664179105|
|  652|     19| 75122.97798742139|
|  671|      4|48615.761233480174|
|  658|      3| 49221.17559808613|
|  644|      5| 41852.36443661972|
|  657|     18| 72637.45272969374|
|  811|     18| 80897.59565522075|
|  610|      2| 36077.42105263158|
|  684|     15| 76872.37625329815|
|  024|      3| 76129.83539094651|
|  513|     15| 79788.97517730496|
|  681|     12|     72234.3328125|
|  913|     15| 63734.27101200686|
|  718|     42| 91960.52525252526|
|  829|   null|              null|
|  467|   null|              null|
|  366|      3|43975.903930131004|
+-----+-------+------------------+
only showing top 20 rows



In [49]:
# Seconds elapsed since the previous record of the same (route, tripNum),
# with records ordered by their parsed timestamp. The first record of each
# group has no predecessor, so its "timediff" is null.
trip_window = Window.partitionBy(["route","tripNum"]).orderBy("new_timestamp")
previous_timestamp = F.lag(trips_data.new_timestamp).over(trip_window)
trips_data = trips_data.withColumn(
    "timediff",
    trips_data.new_timestamp - previous_timestamp,
)

trips_data.select(["route","tripNum","timestamp","timediff"]).show()

+-----+-------+---------+--------+
|route|tripNum|timestamp|timediff|
+-----+-------+---------+--------+
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3|     null|    null|
|  024|      3| 17:32:47|    null|
+-----+-------+---------+--------+
only showing top 20 rows



In [50]:
def find_median(values_list):
    """Return the median of ``values_list`` rounded to two decimal places.

    Args:
        values_list: Sequence of numbers passed to ``np.median``.

    Returns:
        The median as a Python float rounded to 2 decimals, or ``None`` when
        computing it raises (for example on non-numeric input). An empty
        sequence does not raise: ``np.median`` yields NaN, which is returned
        as NaN.
    """
    try:
        middle_value = float(np.median(values_list))
        rounded = round(middle_value, 2)
    except Exception:
        # Best-effort by design: any failure on the given values maps to None.
        rounded = None
    return rounded

# Expose find_median as a Spark UDF whose declared return type is FloatType,
# so it can be applied to a DataFrame column holding lists of values.
median_finder = F.udf(find_median,T.FloatType())

In [51]:
# One row per (route, tripNum): gather that group's "timediff" values into a
# list column, then compute the list's median with the median_finder UDF.
timediffs_list = (
    trips_data
    .groupby(["route","tripNum"])
    .agg(F.collect_list("timediff").alias("timediffs"))
    .withColumn("median", median_finder("timediffs"))
)
timediffs_list.show()

+-----+-------+--------------------+------+
|route|tripNum|           timediffs|median|
+-----+-------+--------------------+------+
|  024|      3|[108, 2, 5, 2, 1,...|   3.0|
|  366|      3|[4, 2, 11, 8, 21,...|   3.0|
|  467|   null|                  []|   NaN|
|  513|     15|[6, 83, 4, 11, 8,...|   4.0|
|  561|      7|[8, 9, 26, 2, 8, ...|   4.0|
|  610|      2|[1, 2, 0, 5, 4, 7...|   2.0|
|  638|      6|[3, 4, 9, 9, 2, 3...|   3.0|
|  644|      5|[15, 0, 2, 1, 3, ...|   2.0|
|  646|      7|[6, 1, 2, 3, 2, 3...|   1.0|
|  650|      9|[7, 23, 5, 2, 3, ...|   1.0|
|  652|     19|[15, 6, 5, 4, 6, ...|   3.0|
|  657|     18|[19, 16, 4, 12, 6...|   2.0|
|  658|      3|[247, 42, 6, 87, ...|   2.0|
|  671|      4|[5, 2, 2, 2, 5, 4...|   3.0|
|  681|     12|[2, 5, 8, 7, 3, 1...|   3.0|
|  684|     15|[6, 4, 4, 2, 1, 1...|   2.0|
|  718|     42|[1, 1, 4, 6, 2, 6...|   4.0|
|  811|     18|[2, 3, 1, 1, 2, 1...|   1.0|
|  829|   null|                  []|   NaN|
|  913|     15|[42, 1, 4, 2, 1, 

In [58]:
# Pull the per-group medians back to the driver; collect() yields one Row per
# (route, tripNum) group, each holding a single "median" field.
medians = timediffs_list.select("median").collect()

In [63]:
# Median of the per-group medians, skipping the NaN entries produced for
# groups whose "timediffs" list was empty.
np.nanmedian(medians)

3.0