In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [3]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in a single call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` (a Spark
            DataFrame in this notebook).
        list_of_tuples: iterable of ``(old_name, new_name)`` pairs; the
            renames are applied one after another, in the given order.

    Returns:
        The result of chaining every rename, or ``df`` itself when the
        iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Read a CSV with a header row through the spark-csv data source.

    The schema is inferred from the data and the literal "-" is read as null.

    Args:
        sqlContext: context whose ``read`` attribute builds the reader.
        filepath: location handed to the reader's ``load``.

    Returns:
        Whatever ``load`` returns (a Spark DataFrame in this notebook).
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    csv_options = [
        ("header", "true"),
        ("inferSchema", "true"),
        ("nullValue", "-"),
    ]
    for option_name, option_value in csv_options:
        reader = reader.option(option_name, option_value)
    return reader.load(filepath)

def read_file(filepath, sqlContext):
    """Load one header-less CSV part file and give it column names and a date.

    The file is read with an inferred schema and "-" treated as null. It is
    padded with null columns until it has at least 16 columns, the positional
    ``_cN`` columns are renamed, and a ``date`` column is added.

    Args:
        filepath: path of the part file. The name of its parent directory
            must start with three "_"-separated tokens (e.g.
            ``2017_05_11_veiculos.csv``); they are joined with "-" to form
            the date.
        sqlContext: context used to read the CSV.

    Returns:
        The renamed DataFrame with an extra ``date`` column holding the
        directory date minus one day.
    """
    trips = sqlContext.read.csv(filepath, header=False,
                                inferSchema=True, nullValue="-")

    # Pad with null columns (named like Spark's defaults) up to 16 columns.
    # NOTE(review): 20 names are listed below but only 16 columns are
    # guaranteed; the last four renames are no-ops on narrower files - confirm
    # this is intended.
    for position in range(len(trips.columns), 16):
        trips = trips.withColumn("_c" + str(position), F.lit(None))

    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]
    rename_pairs = [("_c" + str(position), name)
                    for position, name in enumerate(column_names)]
    trips = rename_columns(trips, rename_pairs)

    # e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11"
    parent_dir = filepath.split("/")[-2]
    date = "-".join(parent_dir.split("_")[:3])

    # NOTE(review): the one-day shift presumably reflects files being named
    # after the day following the recorded trips - confirm.
    trips = trips.withColumn("date", F.lit(date))
    trips = trips.withColumn("date", F.date_sub(F.col("date"), 1))

    return trips

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` as a pandas DataFrame for display.

    Args:
        df: object exposing ``limit(n)`` whose result has ``toPandas()``.
        l: number of rows to keep (default 10).
    """
    preview = df.limit(l)
    return preview.toPandas()

### Read Pre-OD Data

In [4]:
# Reuse the active SparkContext if one exists, otherwise start a new one.
sc = SparkContext.getOrCreate()
# SQL entry point passed to read_data / read_file in the cells below.
sqlContext = pyspark.SQLContext(sc)

In [5]:
# Root folder of the experiment sample data (note the trailing "/").
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# Pre-OD matrix, read as a CSV with a header row via read_data.
pre_od_data = read_data(sqlContext,exp_data_folder_path + 'pre_od_matrix')

In [37]:
# Preview the first rows of the pre-OD matrix (print_df defaults to 10 rows).
print_df(pre_od_data)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_lat,o_shape_lon,o_stop_id,o_boarding_id,cardNum,...,d_date,d_tripNum,d_timestamp,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp,leg_duration
0,30,KB603,2017-05-10,2,18:09:11,-25.388545,-49.21149,32495,1279900254208,3659830,...,2017-05-10,1,06:49:06,-25.480276,-49.268259,32014,1279900254209,76151,35346,-1.0
1,561,EC001,2017-05-10,1,06:49:06,-25.480276,-49.268259,32014,1279900254209,3659830,...,2017-05-10,2,18:09:11,-25.388545,-49.21149,32495,1279900254208,35346,76151,680.083333
2,561,EC006,2017-05-10,12,16:26:23,-25.435352,-49.273778,26152,1279900254210,3659926,...,2017-05-10,1,06:30:23,-25.513591,-49.300009,35745,1279900254211,69983,34223,-1.0
3,40,MB604,2017-05-10,1,06:30:23,-25.513591,-49.300009,35745,1279900254211,3659926,...,2017-05-10,12,16:26:23,-25.435352,-49.273778,26152,1279900254210,34223,69983,596.0
4,471,EC003,2017-05-10,7,13:15:25,-25.435529,-49.271583,26178,1279900254212,3659999,...,2017-05-10,3,07:43:30,-25.524062,-49.234259,31212,1279900254213,58525,38610,-1.0
5,535,EA600,2017-05-10,3,07:43:30,-25.524062,-49.234259,31212,1279900254213,3659999,...,2017-05-10,7,13:15:25,-25.435529,-49.271583,26178,1279900254212,38610,58525,331.916667
6,535,EA600,2017-05-10,3,07:43:32,-25.524062,-49.234259,31212,1279900254214,3660010,...,2017-05-10,12,15:09:48,-25.408467,-49.276251,27290,1279900254215,38612,65388,446.266667
7,176,BC939,2017-05-10,12,15:09:48,-25.408467,-49.276251,27290,1279900254215,3660010,...,2017-05-10,3,07:43:32,-25.524062,-49.234259,31212,1279900254214,65388,38612,-1.0
8,176,BC939,2017-05-10,12,15:09:56,-25.408467,-49.276251,27290,1279900254216,3660122,...,2017-05-10,7,18:42:04,-25.430091,-49.271777,26334,1279900254217,65396,78124,212.133333
9,207,BC033,2017-05-10,7,18:42:04,-25.430091,-49.271777,26334,1279900254217,3660122,...,2017-05-10,12,15:09:56,-25.408467,-49.276251,27290,1279900254216,78124,65396,-1.0


In [6]:
# The folder path already ends with "/"; strip it before appending the
# "/bulma-output/..." suffix so the resulting path does not contain "//".
# read_file derives the `date` column from the parent directory name
# ("2017_05_11_veiculos.csv").
user_boardings = read_file(exp_data_folder_path.rstrip('/') + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

### Building stops-distance table

In [9]:
# One row per distinct busStopId, with a shape-point latitude/longitude.
# NOTE(review): duplicates are dropped on busStopId only, so when a stop has
# several shape points the surviving coordinate pair is presumably arbitrary;
# rows with a null busStopId are not filtered out either - confirm both are OK.
busStops = user_boardings.select(['busStopId','shapeLat','shapeLon']).dropDuplicates(['busStopId'])

In [10]:
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python-2-only print statement.
print(busStops.count())
print_df(busStops)

5326


Unnamed: 0,busStopId,shapeLat,shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [13]:
# Origin-side copy of the stop table: every column gets an "o_" prefix.
o_busStops = (
    busStops
    .withColumnRenamed("busStopId", "o_busStopId")
    .withColumnRenamed("shapeLat", "o_shapeLat")
    .withColumnRenamed("shapeLon", "o_shapeLon")
)

# Destination-side copy of the same table, prefixed with "d_", so the two
# can be combined without column-name clashes.
d_busStops = (
    busStops
    .withColumnRenamed("busStopId", "d_busStopId")
    .withColumnRenamed("shapeLat", "d_shapeLat")
    .withColumnRenamed("shapeLon", "d_shapeLon")
)

In [14]:
# Preview the origin-side stop table.
print_df(o_busStops)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [15]:
# Preview the destination-side stop table.
print_df(d_busStops)

Unnamed: 0,d_busStopId,d_shapeLat,d_shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [16]:
# Pair every origin stop with every destination stop (a stop is also paired
# with itself); the row count is the square of the number of stops.
stops_dist_table = o_busStops.crossJoin(d_busStops)

In [17]:
# Preview the stop pairs produced by the cross join.
print_df(stops_dist_table)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon,d_busStopId,d_shapeLat,d_shapeLon
0,31261,-25.545885,-49.263491,31261,-25.545885,-49.263491
1,31261,-25.545885,-49.263491,32396,-25.468642,-49.299404
2,31261,-25.545885,-49.263491,31951,-25.494953,-49.20858
3,31261,-25.545885,-49.263491,29993,-25.541054,-49.294021
4,31261,-25.545885,-49.263491,33868,-25.517789,-49.324665
5,31261,-25.545885,-49.263491,33722,-25.407783,-49.339019
6,31261,-25.545885,-49.263491,26583,-25.439577,-49.269716
7,31261,-25.545885,-49.263491,30903,-25.467922,-49.230765
8,31261,-25.545885,-49.263491,33717,-25.424029,-49.279715
9,31261,-25.545885,-49.263491,7554,-25.504891,-49.356074


In [18]:
# Number of (origin, destination) stop pairs; this triggers the full cross join.
stops_dist_table.count()

28366276

In [19]:
def dist(long_x, lat_x, long_y, lat_y):
    """Great-circle distance between two points as a Spark Column expression.

    Uses the spherical law of cosines scaled by 6371.0 (Earth's mean radius
    in kilometres), so the result is expressed in kilometres.

    Args:
        long_x: longitude of the first point in degrees (Column or column name).
        lat_x: latitude of the first point in degrees (Column or column name).
        long_y: longitude of the second point in degrees (Column or column name).
        lat_y: latitude of the second point in degrees (Column or column name).

    Returns:
        A Column with the distance; null inputs yield null.
    """
    lat_x_rad = F.radians(lat_x)
    lat_y_rad = F.radians(lat_y)
    long_delta = F.radians(long_x) - F.radians(long_y)

    cos_angle = (
        F.sin(lat_x_rad) * F.sin(lat_y_rad) +
        F.cos(lat_x_rad) * F.cos(lat_y_rad) * F.cos(long_delta)
    )

    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # (typically for identical or nearly identical points), which would make
    # acos return NaN. Snap such values to +/-1. Null and NaN inputs fail the
    # comparison and go through signum, which leaves them unchanged.
    cos_angle = F.when(F.abs(cos_angle) <= 1.0, cos_angle) \
        .otherwise(F.signum(cos_angle))

    return F.acos(cos_angle) * F.lit(6371.0)

In [20]:
# Add a "dist" column with the distance between the origin and destination
# coordinates of each stop pair (note dist takes longitude before latitude).
stops_dist_table = stops_dist_table.withColumn("dist", dist('o_shapeLon', 'o_shapeLat','d_shapeLon','d_shapeLat'))

In [21]:
# Preview the stop pairs together with their computed distance.
print_df(stops_dist_table)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon,d_busStopId,d_shapeLat,d_shapeLon,dist
0,31261,-25.545885,-49.263491,31261,-25.545885,-49.263491,0.0
1,31261,-25.545885,-49.263491,32396,-25.468642,-49.299404,9.314479
2,31261,-25.545885,-49.263491,31951,-25.494953,-49.20858,7.901535
3,31261,-25.545885,-49.263491,29993,-25.541054,-49.294021,3.109689
4,31261,-25.545885,-49.263491,33868,-25.517789,-49.324665,6.887185
5,31261,-25.545885,-49.263491,33722,-25.407783,-49.339019,17.125818
6,31261,-25.545885,-49.263491,26583,-25.439577,-49.269716,11.837379
7,31261,-25.545885,-49.263491,30903,-25.467922,-49.230765,9.270331
8,31261,-25.545885,-49.263491,33717,-25.424029,-49.279715,13.647174
9,31261,-25.545885,-49.263491,7554,-25.504891,-49.356074,10.347941


In [35]:
def get_stops_dist(stops_dist_df,stopA,stopB):
    """Look up the precomputed distance from stop ``stopA`` to stop ``stopB``.

    Filters the table on the driver and fetches a single value, so every call
    runs a Spark job over ``stops_dist_df``.

    Args:
        stops_dist_df: DataFrame with ``o_busStopId``, ``d_busStopId`` and
            ``dist`` columns.
        stopA: origin stop id.
        stopB: destination stop id.

    Returns:
        The ``dist`` value of the first matching row.

    Raises:
        IndexError: if the table has no row for the requested pair.
    """
    matches = stops_dist_df.where(
        (F.col('o_busStopId') == stopA) &
        (F.col('d_busStopId') == stopB)).select('dist').take(1)

    if not matches:
        # Same exception type the bare ``collect()[0]`` raised, with context.
        raise IndexError(
            "no distance entry for stop pair (%s, %s)" % (stopA, stopB))

    return matches[0][0]

# NOTE: this lookup cannot be exposed as a Spark UDF. The earlier
# `F.udf(get_stops_dist())` called the function with no arguments (TypeError),
# and a UDF body runs on the executors, where another DataFrame cannot be
# queried. For per-row distances, join the rows against the distance table on
# (o_busStopId, d_busStopId) instead.

In [36]:
# Example lookup: distance from stop 31261 to stop 33717.
get_stops_dist(stops_dist_table,31261,33717)

13.647173507372901

### Finding the closest alighting stop to next trip boarding

In [None]:
def find_closest_alighting_stop(pre_od_df,stops_dist_df,route,bus,trip_num,o_stop_id,next_trip_o_stop_id):
    """Placeholder for the closest-alighting-stop lookup (not implemented).

    Args:
        pre_od_df: pre-OD DataFrame.
        stops_dist_df: stop-to-stop distance table.
        route: route of the current trip leg.
        bus: bus code of the current trip leg.
        trip_num: trip number of the current trip leg.
        o_stop_id: boarding stop id of the current trip leg.
        next_trip_o_stop_id: boarding stop id of the passenger's next leg.

    Raises:
        NotImplementedError: always; the lookup logic is still to be written.
    """
    # TODO: implement the lookup. The earlier draft called
    # `stops_dist_df.where()` without a condition and returned `...`, which is
    # not even valid syntax on Python 2.
    raise NotImplementedError(
        "find_closest_alighting_stop is not implemented yet")

# NOTE: the draft lines that followed (`udf(find_closest_alighting_stop)` and
# `df.withColumn("foo", iplookup("c0"))`) referenced the undefined names
# `udf`, `df` and `iplookup` and were removed. A function that takes
# DataFrames cannot be wrapped in a Spark UDF; express the lookup as a join.



In [41]:
def df_loop(df, max_rows=11):
    """Print the first rows of ``df``, one row per line.

    Only the rows that are printed are fetched, instead of collecting the
    whole DataFrame to the driver first.

    Args:
        df: object exposing ``take(n)`` (a Spark DataFrame in this notebook).
        max_rows: maximum number of rows to print. The default of 11 matches
            the earlier loop, which stopped once its counter exceeded 10.
    """
    for row in df.take(max_rows):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(row)

In [44]:
# Print the first rows of the destination-side stop table.
df_loop(d_busStops)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36121)
Traceback (most recent call last):
  File "/local/tarciso/programs/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
  File "/local/tarciso/programs/anaconda2/lib/python2.7/socket.py", line 228, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:36121)