In [19]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F

import json
#from pyspark.ml.feature import VectorAssembler
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StringIndexer
#from pyspark.mllib.evaluation import RegressionMetrics

In [4]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)

    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("inferSchema", "true") \
        .option("nullValue", "-") \
        .load(filepath)

    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", lit(date))

    return data_frame

### Read Input Data

In [2]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [5]:
trips_data = read_file('/local/tarciso/sample-data/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [17]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=54823, busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=56714, busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=31274, busStopId=None, problem=u'NO_PROBLEM', num

In [18]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



In [21]:
timestamp_window = Window.partitionBy().orderBy("timestamp")
trips_data = trips_data.withColumn("prev_timestamp", F.lag(trips_data.timestamp).over(timestamp_window))
trips_data = trips_data.withColumn("diff_timestamp", F.when(F.isnull(trips_data.timestamp - trips_data.prev_timestamp), 0)
                              .otherwise(trips_data.timestamp - trips_data.prev_timestamp))

In [1]:
trips_data.select(["timestamp","prev_timestamp","diff_timestamp"]).head(5)

NameError: name 'trips_data' is not defined

In [9]:
filepath = '/local/tarciso/sample-data/ticketing-sample/doc1-2017051115.txt'
ticketing_data = sqlContext.read.json(filepath)

In [10]:
ticketing_data.select("DATAUTILIZACAO").show()

+--------------------+
|      DATAUTILIZACAO|
+--------------------+
|10/05/17 20:15:16...|
|10/05/17 13:10:24...|
|10/05/17 08:23:45...|
|10/05/17 11:54:19...|
|10/05/17 13:30:10...|
|10/05/17 07:52:52...|
|10/05/17 18:34:06...|
|10/05/17 06:15:31...|
|10/05/17 17:57:28...|
|10/05/17 10:03:56...|
|10/05/17 13:35:56...|
|10/05/17 07:50:18...|
|10/05/17 12:26:01...|
|10/05/17 14:34:27...|
|10/05/17 14:34:30...|
|10/05/17 14:34:32...|
|10/05/17 14:02:43...|
|10/05/17 11:04:25...|
|10/05/17 06:44:08...|
|10/05/17 17:38:18...|
+--------------------+
only showing top 20 rows



In [11]:
time_fmt = "HH:mm:ss"
parse_time = unix_timestamp("timestamp", time_fmt)
trips_data = trips_data.withColumn("timestamp", parse_time)

In [12]:
trips_data.select("timestamp").show()

+---------+
|timestamp|
+---------+
|    54823|
|    56714|
|    31274|
|    31276|
|    31280|
|    31287|
|    31291|
|    31293|
|    31297|
|    31305|
|    31307|
|    31312|
|    31320|
|    31323|
|    31328|
|    31332|
|    31337|
|    31341|
|    31343|
|    31345|
+---------+
only showing top 20 rows



In [13]:
time_fmt = "DD/MM/YY HH:mm:ss"
parse_datetime = unix_timestamp("DATAUTILIZACAO", time_fmt)
ticketing_data = ticketing_data.withColumn("DATAUTILIZACAO", regexp_replace(col("DATAUTILIZACAO"), ",000000", ""))
ticketing_data = ticketing_data.withColumn("DATAUTILIZACAO", parse_datetime)

In [14]:
ticketing_data.select("DATAUTILIZACAO").show()

+--------------+
|DATAUTILIZACAO|
+--------------+
|    1483312516|
|    1483287024|
|    1483269825|
|    1483282459|
|    1483288210|
|    1483267972|
|    1483306446|
|    1483262131|
|    1483304248|
|    1483275836|
|    1483288556|
|    1483267818|
|    1483284361|
|    1483292067|
|    1483292070|
|    1483292072|
|    1483290163|
|    1483279465|
|    1483263848|
|    1483303098|
+--------------+
only showing top 20 rows



In [15]:
def get_group_N_min(timestamp,N):
    mins_since_midnight = 60*hour(timestamp) + minute(timestamp)
    return(floor(mins_since_midnight/N))

#ticketing_data = ticketing_data.withColumn("DATAUTILIZACAO",(60*hour(col("DATAUTILIZACAO")) + 
#                                                             minute(col("DATAUTILIZACAO")))/5)
#hour(ticketing_data.select("DATAUTILIZACAO"))