In [3]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np
#from pyspark.ml.feature import VectorAssembler
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StringIndexer
#from pyspark.mllib.evaluation import RegressionMetrics

In [4]:
def rename_columns(df, list_of_tuples):
    """Apply a sequence of column renames to a DataFrame.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` (a Spark DataFrame
            in this notebook).
        list_of_tuples: iterable of ``(old_name, new_name)`` pairs, applied in
            the order given.

    Returns:
        The result of chaining ``withColumnRenamed`` over every pair; ``df``
        itself when the iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_file(filepath, sqlContext):
    """Load a header-less CSV file and give its columns the trip schema names.

    The file is read with the "com.databricks.spark.csv" format, with schema
    inference on and "-" treated as null. If fewer than 16 columns are found,
    null columns are appended until there are 16. The positional columns
    ``_c0`` .. ``_c15`` are then renamed, and a literal ``date`` column is
    added.

    Args:
        filepath: path to the CSV file. The ``date`` value is taken from the
            second-to-last "/"-separated component: its first three
            "_"-separated pieces joined with "-".
        sqlContext: object whose ``read`` attribute provides the DataFrame
            reader.

    Returns:
        The renamed DataFrame with the extra ``date`` column.
    """
    # Target names, in positional order (index i corresponds to "_c<i>").
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "numPassengers",
    ]

    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader = reader.option("header", "false")
    reader = reader.option("inferSchema", "true")
    reader = reader.option("nullValue", "-")
    loaded = reader.load(filepath)

    # Pad short files with null columns so every positional name exists.
    while len(loaded.columns) < len(column_names):
        padding_name = "_c" + str(len(loaded.columns))
        loaded = loaded.withColumn(padding_name, F.lit(None))

    rename_pairs = [
        ("_c" + str(position), name)
        for position, name in enumerate(column_names)
    ]
    loaded = rename_columns(loaded, rename_pairs)

    # e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11"
    parent_component = filepath.split("/")[-2]
    date = "-".join(parent_component.split("_")[:3])

    return loaded.withColumn("date", F.lit(date))

### Read Input Data

In [5]:
# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point built on that context; handed to read_file and used for the JSON read.
sqlContext = pyspark.SQLContext(sc)

In [6]:
# Load the trips CSV part file. read_file takes the "date" column from the
# parent directory name ("2017_05_11_veiculos.csv" -> "2017-05-11").
trips_data = read_file('/local/tarciso/data/sample-data/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [7]:
# Peek at the first three rows to check the renamed columns and the added date.
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [8]:
# Show the column types Spark inferred for the trips data.
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



In [9]:
# Ticketing sample, read as JSON (no explicit schema is supplied here).
filepath = '/local/tarciso/data/sample-data/ticketing-sample/doc1-2017051115.txt'
ticketing_data = sqlContext.read.json(filepath)

In [25]:
# Show the schema obtained for the ticketing records.
ticketing_data.printSchema()

root
 |-- CODLINHA: string (nullable = true)
 |-- CODVEICULO: string (nullable = true)
 |-- DATANASCIMENTO: string (nullable = true)
 |-- DATAUTILIZACAO: string (nullable = true)
 |-- NOMELINHA: string (nullable = true)
 |-- NUMEROCARTAO: string (nullable = true)
 |-- SEXO: string (nullable = true)
 |-- new_DATAUTILIZACAO: long (nullable = true)



In [26]:
# Peek at the first five ticketing records.
# NOTE(review): the rows shown include card numbers, birth dates and sex;
# consider clearing this cell's output before sharing the notebook.
ticketing_data.head(5)

[Row(CODLINHA=u'021', CODVEICULO=u'08046', DATANASCIMENTO=u'26/01/72', DATAUTILIZACAO=u'10/05/17 20:15:16,000000', NOMELINHA=u'INTERB II ANTI H', NUMEROCARTAO=u'0001937533', SEXO=u'F', new_DATAUTILIZACAO=1483312516),
 Row(CODLINHA=u'021', CODVEICULO=u'08027', DATANASCIMENTO=u'26/01/72', DATAUTILIZACAO=u'10/05/17 13:10:24,000000', NOMELINHA=u'INTERB II ANTI H', NUMEROCARTAO=u'0001937533', SEXO=u'F', new_DATAUTILIZACAO=1483287024),
 Row(CODLINHA=u'623', CODVEICULO=u'HA022', DATANASCIMENTO=u'06/03/71', DATAUTILIZACAO=u'10/05/17 08:23:45,000000', NOMELINHA=u'PQ.INDUSTRIAL', NUMEROCARTAO=u'0001311020', SEXO=u'F', new_DATAUTILIZACAO=1483269825),
 Row(CODLINHA=u'000', CODVEICULO=u'03023', DATANASCIMENTO=u'06/03/71', DATAUTILIZACAO=u'10/05/17 11:54:19,000000', NOMELINHA=u'OPER S/LINHA', NUMEROCARTAO=u'0001311020', SEXO=u'F', new_DATAUTILIZACAO=1483282459),
 Row(CODLINHA=u'TPH', CODVEICULO=u'03019', DATANASCIMENTO=u'23/11/79', DATAUTILIZACAO=u'10/05/17 13:30:10,000000', NOMELINHA=u'TERMINAL PIN

## Pre-processing data

In [20]:
# Join "<date> <timestamp>" and parse it to epoch seconds as "gps_timestamp".
# NOTE(review): in Java SimpleDateFormat patterns "YYYY" is the week year and
# "DD" the day of year; calendar year / day of month are "yyyy" / "dd". The
# recorded output shows 2017-05-11 rows parsed to 1483xxxxxx, i.e. early
# January 2017, not May. The card_timestamp and curr_date cells use the same
# letters, so their differences still line up; change all three patterns
# together, never just one.
trips_data = trips_data.withColumn(
    "gps_timestamp",
    F.unix_timestamp(
        F.concat(F.col("date"), F.lit(" "), F.col("timestamp")),
        "YYYY-MM-DD HH:mm:ss"
    )
)

In [22]:
# Compare the source date/time strings with the parsed epoch seconds.
trips_data.select(["date","timestamp","gps_timestamp"]).show()

+----------+---------+-------------+
|      date|timestamp|gps_timestamp|
+----------+---------+-------------+
|2017-05-11| 12:13:43|   1483283623|
|2017-05-11| 12:45:14|   1483285514|
|2017-05-11| 05:41:14|   1483260074|
|2017-05-11| 05:41:16|   1483260076|
|2017-05-11| 05:41:20|   1483260080|
|2017-05-11| 05:41:27|   1483260087|
|2017-05-11| 05:41:31|   1483260091|
|2017-05-11| 05:41:33|   1483260093|
|2017-05-11| 05:41:37|   1483260097|
|2017-05-11| 05:41:45|   1483260105|
|2017-05-11| 05:41:47|   1483260107|
|2017-05-11| 05:41:52|   1483260112|
|2017-05-11| 05:42:00|   1483260120|
|2017-05-11| 05:42:03|   1483260123|
|2017-05-11| 05:42:08|   1483260128|
|2017-05-11| 05:42:12|   1483260132|
|2017-05-11| 05:42:17|   1483260137|
|2017-05-11| 05:42:21|   1483260141|
|2017-05-11| 05:42:23|   1483260143|
|2017-05-11| 05:42:25|   1483260145|
+----------+---------+-------------+
only showing top 20 rows



In [27]:
# Parse the card usage time (e.g. "10/05/17 20:15:16,000000") to epoch seconds.
# The pattern has no part for the trailing ",000000"; the recorded output shows
# the values are still parsed (non-null).
# NOTE(review): "DD" is day of year and "YY" week year in Java SimpleDateFormat
# patterns ("dd" / "yy" are day of month / calendar year). The recorded output
# shows 10/05/17 parsed to 1483xxxxxx, i.e. early January 2017. The
# gps_timestamp and curr_date cells use the same letters, so fix all three
# together. Also note the ticketing rows are dated 10/05/17 while the trips
# file is 2017-05-11; with corrected patterns the two sets fall on different days.
ticketing_data = ticketing_data.withColumn("card_timestamp", F.unix_timestamp(F.col("DATAUTILIZACAO"), "DD/MM/YY HH:mm:ss"))

In [28]:
# Compare the raw usage strings with the parsed epoch seconds.
ticketing_data.select(["DATAUTILIZACAO","card_timestamp"]).show()

+--------------------+--------------+
|      DATAUTILIZACAO|card_timestamp|
+--------------------+--------------+
|10/05/17 20:15:16...|    1483312516|
|10/05/17 13:10:24...|    1483287024|
|10/05/17 08:23:45...|    1483269825|
|10/05/17 11:54:19...|    1483282459|
|10/05/17 13:30:10...|    1483288210|
|10/05/17 07:52:52...|    1483267972|
|10/05/17 18:34:06...|    1483306446|
|10/05/17 06:15:31...|    1483262131|
|10/05/17 17:57:28...|    1483304248|
|10/05/17 10:03:56...|    1483275836|
|10/05/17 13:35:56...|    1483288556|
|10/05/17 07:50:18...|    1483267818|
|10/05/17 12:26:01...|    1483284361|
|10/05/17 14:34:27...|    1483292067|
|10/05/17 14:34:30...|    1483292070|
|10/05/17 14:34:32...|    1483292072|
|10/05/17 14:02:43...|    1483290163|
|10/05/17 11:04:25...|    1483279465|
|10/05/17 06:44:08...|    1483263848|
|10/05/17 17:38:18...|    1483303098|
+--------------------+--------------+
only showing top 20 rows



In [39]:
# Epoch seconds for the trips' "date" column; used as the origin when bucketing
# timestamps into sec_group.
# read_file stamps every row with the same literal date, so the first row is as
# good as any. A random 10% sample is unnecessary and can come back empty on a
# small input, which would make the lookup fail.
# NOTE(review): "YYYY"/"DD" are week year / day of year in Java SimpleDateFormat
# patterns ("yyyy"/"dd" are calendar year / day of month). The pattern is kept
# as-is on purpose so it stays consistent with the gps_timestamp and
# card_timestamp cells; change all three together.
_curr_date_row = trips_data.select(
    F.unix_timestamp(F.col("date"), "YYYY-MM-DD").alias("curr_date")
).first()
if _curr_date_row is None:
    raise ValueError("trips_data is empty; cannot derive curr_date")
curr_date = _curr_date_row["curr_date"]

In [40]:
# Parenthesised so the cell is also valid on Python 3; with a single argument
# Python 2 prints exactly the same text.
print(curr_date)

1483239600


In [42]:
def get_group_N_sec(timestamp,curr_date_timestamp,N):
    """Return the index of the N-second bucket a timestamp falls into.

    Args:
        timestamp: value to bucket (the callers in this notebook pass a
            column of epoch seconds).
        curr_date_timestamp: origin subtracted from ``timestamp`` before
            bucketing.
        N: bucket width, in the same unit as the timestamps.

    Returns:
        ``F.floor((timestamp - curr_date_timestamp) / N)``.
    """
    elapsed = timestamp - curr_date_timestamp
    bucket = elapsed / N
    return F.floor(bucket)

In [43]:
# Bucket each card usage into a 30-second slot counted from curr_date,
# then show the timestamps next to their slot.
ticketing_data = ticketing_data.withColumn(
    "sec_group",
    get_group_N_sec(F.col("card_timestamp"), curr_date, 30)
)
ticketing_data.select("card_timestamp", "sec_group").show()

+--------------+---------+
|card_timestamp|sec_group|
+--------------+---------+
|    1483312516|     2430|
|    1483287024|     1580|
|    1483269825|     1007|
|    1483282459|     1428|
|    1483288210|     1620|
|    1483267972|      945|
|    1483306446|     2228|
|    1483262131|      751|
|    1483304248|     2154|
|    1483275836|     1207|
|    1483288556|     1631|
|    1483267818|      940|
|    1483284361|     1492|
|    1483292067|     1748|
|    1483292070|     1749|
|    1483292072|     1749|
|    1483290163|     1685|
|    1483279465|     1328|
|    1483263848|      808|
|    1483303098|     2116|
+--------------+---------+
only showing top 20 rows



In [44]:
# Bucket each GPS record into a 30-second slot counted from curr_date
# (same width as for the ticketing data), then show the result.
trips_data = trips_data.withColumn(
    "sec_group",
    get_group_N_sec(F.col("gps_timestamp"), curr_date, 30)
)
trips_data.select("gps_timestamp", "sec_group").show()

+-------------+---------+
|gps_timestamp|sec_group|
+-------------+---------+
|   1483283623|     1467|
|   1483285514|     1530|
|   1483260074|      682|
|   1483260076|      682|
|   1483260080|      682|
|   1483260087|      682|
|   1483260091|      683|
|   1483260093|      683|
|   1483260097|      683|
|   1483260105|      683|
|   1483260107|      683|
|   1483260112|      683|
|   1483260120|      684|
|   1483260123|      684|
|   1483260128|      684|
|   1483260132|      684|
|   1483260137|      684|
|   1483260141|      684|
|   1483260143|      684|
|   1483260145|      684|
+-------------+---------+
only showing top 20 rows

