# Config

In [None]:
import pandas as pd
from hdfs3 import HDFileSystem
from inference import run_search
import numpy as np
from tqdm import tqdm
import numpy as np

In [None]:
%load_ext sparkmagic.magics

In [None]:
%spark cleanup

In [None]:
%spark add -l python -s groupAD -u http://iccluster044.iccluster.epfl.ch:8998 -k

In [None]:
%%spark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import numpy as np

# Execute the tests

Get the names of the stations that appear most often in the schedule network.

In [None]:
# Load the schedule network from HDFS: every ORC part file found under the
# `hour=*` directories is read and the pieces are stacked into one DataFrame.
hdfs = HDFileSystem(user="ebouille")
# Collect the parts first and concatenate once: calling pd.concat inside the
# loop copies the accumulated frame again at every iteration (quadratic cost).
schedule_parts = []
for path in hdfs.glob("/user/anmaier/schedule_network.orc/hour=*/*.orc"):
    with hdfs.open(path) as f:
        schedule_parts.append(pd.read_orc(f))
# pd.concat raises on an empty list; fall back to an empty frame when no file matched
if schedule_parts:
    df_schedule_network = pd.concat(schedule_parts, ignore_index=True)
else:
    df_schedule_network = pd.DataFrame()

In [None]:
# Select the `top_n` most frequent source stations of the schedule network
top_n = 50
# Per-station count of non-null values in every column
counts_per_station = df_schedule_network.groupby('src_stop_name').count()
# Order the stations by their number of `src_timestamp` entries, largest first
ranked_stations = counts_per_station.sort_values('src_timestamp', ascending=False)
top_stations = ranked_stations.iloc[:top_n, :].index.tolist()
print(len(top_stations))
# Histogram of the number of rows per station
df_schedule_network['src_stop_name'].value_counts().hist(bins=100)

In [None]:
# Get paths
def get_random_time():
    """Draw a random weekday timestamp in 2022 with a time of day in [08:00, 20:00).

    A calendar day is sampled through a random Unix timestamp, then an hour in
    8..19 and a minute in 0..59 are drawn independently; seconds are always 0.
    Draws that fall on a Saturday or a Sunday are rejected and retried.

    Returns:
        pd.Timestamp: the sampled timestamp, with the year fixed to 2022.
    """
    weekend = (5, 6)  # weekday() values for Saturday and Sunday
    candidate = None
    while candidate is None or candidate.weekday() in weekend:
        # Random instant between the two Unix timestamps; only its month and
        # day are kept
        sampled_day = pd.Timestamp.fromtimestamp(np.random.randint(1640991600, 1672527600)).date()
        # Time of day: hour in 8..19, minute in 0..59
        hour = int(8 + 12 * np.random.rand())
        minute = int(60 * np.random.rand())
        candidate = pd.Timestamp(year=2022,
                                 month=sampled_day.month,
                                 day=sampled_day.day,
                                 hour=hour,
                                 minute=minute,
                                 second=0)
    return candidate

# Query the route planner with random (departure, arrival, time) triples and
# record, for every returned path, the time that was requested for the query.
responses = []  # non-empty responses given by the route planner, one entry per query
time_limit_records = []  # one {'time_limit', 'path_id'} record per returned path
n_queries = 300#len(df_departures)
path_counter = 0
for _ in tqdm(range(n_queries)):
    # Draw two distinct stations in one call. Sampling without replacement raises
    # a ValueError when fewer than two stations are available, instead of looping
    # forever while waiting for an arrival different from the departure.
    departure, arrival = np.random.choice(top_stations, size=2, replace=False)
    random_time = get_random_time()
    arrival_hour = random_time.hour
    arrival_minute = random_time.minute
    print(departure, arrival, random_time)
    response = run_search(departure, arrival, arrival_hour, arrival_minute, 0.95, top_n=1, verbose=False, max_transfers=3, max_duration=60)
    print(len(response))
    # Queries for which the planner found no path are ignored
    if len(response) == 0:
        continue
    responses.append(response)
    for path in response:
        # path_id numbers the paths in the order in which they are stored in `responses`
        path_counter += 1
        time_limit_records.append({'time_limit': random_time, 'path_id': path_counter})
# Build the frame once (appending with pd.concat inside the loop is quadratic);
# naming the columns keeps the header even when no path was found.
time_limits = pd.DataFrame(time_limit_records, columns=['time_limit', 'path_id'])

In [None]:
# Store paths (currently disabled by `if False`): writes the `path_list` of every
# path of every response as CSV text to a per-path location on HDFS.
# NOTE(review): the target ends with "/" and carries no file name, whereas the
# loader below globs for "*.csv" files inside such directories -- confirm the
# intended layout before re-enabling this branch.
if False:
    for i, response in enumerate(responses):
        for j, path in enumerate(response):
            file_path = "/user/anmaier/validation/response=" + str(i) + "_path=" + str(j) + "/"
            with hdfs.open(file_path, 'wb') as f:
                f.write(path.path_list.to_csv(index=False))
# Load paths (enabled): opens and parses every stored CSV file.
# NOTE(review): the result of `pd.read_csv(f).head()` is discarded, so this loop
# only exercises the read; it does not rebuild `responses`.
if True:
    for file_path in hdfs.glob("/user/anmaier/validation/response=*_path=*/*.csv"):
        with hdfs.open(file_path) as f:
            pd.read_csv(f).head()

In [None]:
# Build the validation inputs from the planner responses:
#   * last_trips: for each path, the trip id and scheduled arrival read from the
#     second-to-last edge of the path
#   * transfers: for each walking edge inside a path, the trip before the walk,
#     the trip after it and the walking duration
# Both tables carry a `path_id` so that they can be matched with `time_limits`.
path_counter = 0
# Each list starts with an empty frame that only declares the columns, so the
# result keeps its header even when no path contributes a row. The pieces are
# concatenated once after the loop instead of growing a DataFrame at every
# iteration (which copies the accumulated data each time).
transfers_parts = [pd.DataFrame(columns=["src_trip_id", "src_timestamp", "dst_trip_id", "dst_timestamp", "walking_duration", "path_id"])]
last_trips_parts = [pd.DataFrame(columns=["trip_id", "dst_timestamp", "path_id"])]
for response in responses:
    for path in response:
        path = path.path_list.reset_index(drop=True)
        # Incremented for every path, including the skipped ones, so that the ids
        # follow the same numbering as the one used when `time_limits` was filled
        path_counter += 1
        # The last trip is read from the second-to-last edge, so a path needs at
        # least two edges; shorter paths are skipped instead of raising IndexError
        if len(path) < 2:
            continue
        last_edge = path.iloc[-2]
        last_trips_parts.append(pd.DataFrame({'trip_id': [last_edge['trip_id']],
                                              'dst_timestamp': [last_edge['dst_timestamp']],
                                              'path_id': [path_counter]}))

        # Take all situations where the user transfers by walking
        walking_edges_idxs = path[path.route_desc == "walking"].index
        # Keep only walks that have an edge on both sides:
        #   * a walk at index 0 is assumed to be on time with probability 1
        #   * a walk in last position has no following trip to connect to
        is_inner_walk = (walking_edges_idxs > 0) & (walking_edges_idxs < len(path) - 1)
        walking_edges_idxs = walking_edges_idxs[is_inner_walk]
        if len(walking_edges_idxs) == 0:
            continue
        # One row per change: edge before the walk -> edge after the walk
        path_transfers = pd.DataFrame({
            'src_trip_id': path.loc[walking_edges_idxs - 1, 'trip_id'].reset_index(drop=True),
            'src_timestamp': path.loc[walking_edges_idxs - 1, 'dst_timestamp'].reset_index(drop=True),
            'dst_trip_id': path.loc[walking_edges_idxs + 1, 'trip_id'].reset_index(drop=True),
            'dst_timestamp': path.loc[walking_edges_idxs + 1, 'src_timestamp'].reset_index(drop=True),
            'walking_duration': path.loc[walking_edges_idxs, 'walking_duration'].reset_index(drop=True),
        })
        # Unique identifier of the path the transfers belong to
        path_transfers['path_id'] = path_counter
        transfers_parts.append(path_transfers)

transfers = pd.concat(transfers_parts, ignore_index=True)
last_trips = pd.concat(last_trips_parts, ignore_index=True)

# We store the transfers, last_trips and time_limits csv files on hdfs
transfers_csv = transfers.to_csv(index=False)
last_trips_csv = last_trips.to_csv(index=False)
time_limits_csv = time_limits.to_csv(index=False)
# The files are opened in binary mode, so the CSV text is encoded explicitly
with hdfs.open('/user/anmaier/validation/transfers.csv', 'wb') as f:
    f.write(transfers_csv.encode('utf-8'))
with hdfs.open('/user/anmaier/validation/last_trips.csv', 'wb') as f:
    f.write(last_trips_csv.encode('utf-8'))
with hdfs.open('/user/anmaier/validation/time_limits.csv', 'wb') as f:
    f.write(time_limits_csv.encode('utf-8'))


In [None]:
%%spark
# Load the three validation CSV files and restore the column types, since every
# column of a CSV read without a schema comes back as a string.

# Transfers: two timestamps, a numeric walking duration and the path identifier
transfers_raw = spark.read.csv("/user/anmaier/validation/transfers.csv", header=True)
df_transfers = (transfers_raw
                .withColumn("src_timestamp", F.to_timestamp(F.col("src_timestamp")))
                .withColumn("dst_timestamp", F.to_timestamp(F.col("dst_timestamp")))
                .withColumn("walking_duration", F.col("walking_duration").cast("double"))
                .withColumn("path_id", F.col("path_id").cast("int")))

# Last trips: arrival timestamp and path identifier
last_trips_raw = spark.read.csv("/user/anmaier/validation/last_trips.csv", header=True)
df_last_trips = (last_trips_raw
                 .withColumn("dst_timestamp", F.to_timestamp(F.col("dst_timestamp")))
                 .withColumn("path_id", F.col("path_id").cast("int")))

# Time limits: requested time of each query and path identifier
time_limits_raw = spark.read.csv("/user/anmaier/validation/time_limits.csv", header=True)
df_time_limits = (time_limits_raw
                  .withColumn("time_limit", F.to_timestamp(F.col("time_limit")))
                  .withColumn("path_id", F.col("path_id").cast("int")))

In [None]:
%%spark
# Test dataset: the 2022 istdaten records reduced to the scheduled and actual
# arrival/departure times of every trip. Set `is_stored` to False to rebuild
# the ORC copy from the raw data instead of reading it back.
is_stored = True
if not is_stored:
    # (source column, new name) pairs, in output order
    renamed_columns = [("ANKUNFTSZEIT", "arrival_time"),
                       ("ABFAHRTSZEIT", "departure_time"),
                       ("AN_PROGNOSE", "true_arrival_time"),
                       ("AB_PROGNOSE", "true_departure_time"),
                       ("FAHRT_BEZEICHNER", "trip_id"),
                       ("BETRIEBSTAG", "date")]
    df_test = (spark.read.orc("/data/sbb/part_orc/istdaten/year=2022")
               .select([F.col(source).alias(target) for source, target in renamed_columns]))
    # Scheduled times are parsed to the minute, actual times to the second
    timestamp_patterns = [("arrival_time", "dd.MM.yyyy HH:mm"),
                          ("departure_time", "dd.MM.yyyy HH:mm"),
                          ("true_arrival_time", "dd.MM.yyyy HH:mm:ss"),
                          ("true_departure_time", "dd.MM.yyyy HH:mm:ss")]
    for column, pattern in timestamp_patterns:
        df_test = df_test.withColumn(column, F.to_timestamp(column, pattern))
    df_test = df_test.withColumn("date", F.to_date('date', "dd.MM.yyyy")).cache()

    df_test.write.orc("/user/anmaier/validation/df_test.orc", mode="overwrite")
else:
    df_test = spark.read.orc("/user/anmaier/validation/df_test.orc").cache()
    # Run an action so that the cached data is materialized right away
    df_test.count()

In [None]:
%%spark
# Add to every generated time limit a `date` column holding its calendar day
dates = df_time_limits.withColumn('date', F.to_date(F.col('time_limit')))

In [None]:
%%spark
# Move the timestamps of transfers and last_trips onto the day of the generated
# time limit of their path: the time of day is kept, the date is replaced.
def _on_path_date(time_str_column):
    """Return a timestamp column made of the `date` column and the given "HH:mm:ss" string column."""
    return F.to_timestamp(F.concat(F.date_format('date', "yyyy-MM-dd"), F.lit(" "), F.col(time_str_column)))

df_transfers = (df_transfers
                .withColumn('src_timestamp_str', F.date_format('src_timestamp', "HH:mm:ss"))
                .withColumn('dst_timestamp_str', F.date_format('dst_timestamp', "HH:mm:ss"))
                .join(dates, "path_id")
                .withColumn('src_timestamp', _on_path_date('src_timestamp_str'))
                .withColumn('dst_timestamp', _on_path_date('dst_timestamp_str')))
# Workaround for an undiagnosed error: the dataframe is saved to an ORC file here
# and loaded again from that file by the next step.
df_transfers.write.orc("/user/anmaier/validation/temp_transfers.orc", mode="overwrite")

df_last_trips = (df_last_trips
                 .withColumn('dst_timestamp_str', F.date_format('dst_timestamp', "HH:mm:ss"))
                 .join(dates, "path_id")
                 .withColumn('dst_timestamp', _on_path_date('dst_timestamp_str')))
df_last_trips.write.orc("/user/anmaier/validation/temp_last_trips.orc", mode="overwrite")

In [None]:
%%spark
# Check every transfer against the recorded data.
# Step 1: reload the re-dated transfers and attach the actual arrival time of the
# trip the traveller gets off (matched on scheduled arrival time and trip id).
temp = (spark.read.orc("/user/anmaier/validation/temp_transfers.orc")
        # Format data
        .withColumn("src_timestamp", F.to_timestamp("src_timestamp"))
        .withColumn("dst_timestamp", F.to_timestamp("dst_timestamp")))
(temp
 .join(df_test.select('arrival_time', 'trip_id', 'true_arrival_time'),
       (temp.src_timestamp == df_test.arrival_time)
       & (temp.src_trip_id == df_test.trip_id))
 .write.orc("/user/anmaier/validation/temp_transfers2.orc", mode="overwrite"))
# Same workaround as before: the intermediate result is written to ORC and read back
# before the second join.
# Step 2: attach the actual departure time of the trip the traveller boards
# (matched on scheduled departure time and trip id).
temp = (spark.read.orc("/user/anmaier/validation/temp_transfers2.orc")
        .withColumn("src_timestamp", F.to_timestamp("src_timestamp"))
        .withColumn("dst_timestamp", F.to_timestamp("dst_timestamp")))
transfers_test = (temp
                .join(df_test.select('departure_time', 'trip_id', 'true_departure_time'),
                      (temp.dst_timestamp == df_test.departure_time)
                      & (temp.dst_trip_id == df_test.trip_id))
                  .select('true_departure_time', 'true_arrival_time', 'departure_time', 'arrival_time', 'walking_duration', 'path_id')
                  # Rows with a missing value in any selected column are discarded
                  .na.drop())

# Step 3: one row per path; result_transfers is 1 when no transfer of the path was missed, else 0
transfers_yes = (transfers_test
                # True if we missed our transfer: the time between the actual arrival and
                # the actual departure (in seconds) is shorter than walking_duration * 60,
                # i.e. walking_duration is treated as minutes here
                 .withColumn('test', 
                             (F.unix_timestamp('true_departure_time') - F.unix_timestamp('true_arrival_time')).cast(T.DoubleType())
                             < F.col('walking_duration') * 60.)
                 .withColumn('test', F.col('test').cast(T.IntegerType()))
                 .withColumn('path_id', F.col('path_id').cast(T.IntegerType()))
                 .groupBy('path_id')
                 # Number of missed transfers per path, in a column named 'sum(test)'
                 .sum('test')
                 .withColumn('result_transfers', (F.col('sum(test)') == 0).cast(T.IntegerType())))


In [None]:
%%spark
# Check, for every path, whether its last trip actually arrived before the time limit.
temp = (spark.read.orc("/user/anmaier/validation/temp_last_trips.orc")
        .withColumn("dst_timestamp", F.to_timestamp("dst_timestamp")))
test_temp = df_test.select('arrival_time', 'trip_id', 'true_arrival_time')

# A last trip is matched with the recorded data on scheduled arrival time and trip id
same_arrival = temp.dst_timestamp == test_temp.arrival_time
same_trip = temp.trip_id == test_temp.trip_id
last_trips_test = (temp
                   .join(test_temp, same_arrival & same_trip)
                   .select('true_arrival_time', 'arrival_time', 'time_limit', 'path_id')
                   .na.drop())

# 1 when the actual arrival is later than the time limit, 0 otherwise
arrived_late = (F.unix_timestamp('time_limit') < F.unix_timestamp('true_arrival_time')).cast(T.IntegerType())
# result_last_trips is 1 for a path without any late arrival, 0 otherwise
no_late_arrival = (F.col('sum(test)') == 0).cast(T.IntegerType())
last_trips_yes = (last_trips_test
                  .withColumn('test', arrived_late)
                  .withColumn('path_id', F.col('path_id').cast(T.IntegerType()))
                  .groupBy('path_id')
                  .sum('test')
                  .withColumn('result_last_trips', no_late_arrival))


In [None]:
%%spark
# Combine both checks per path. The left join keeps every path that has a
# last-trip result; a path without any checked transfer gets null transfer
# columns, which fillna replaces by -1.
joined_yes = last_trips_yes.join(transfers_yes, 'path_id', 'left').fillna(-1)
# A path is successful when neither check failed (a failed check is flagged with 0).
# The whole disjunction must be parenthesised before negating it: `~` binds tighter
# than `|`, so `~(a) | (b)` would keep every path whose last trip arrived late.
results = (joined_yes
           .filter(~((F.col('result_transfers') == F.lit(0)) | (F.col('result_last_trips') == F.lit(0)))))

In [None]:
%%spark
# Success rate: number of paths kept in `results` over the number of tested paths
n_path_yesyesssss = results.count()
n_path = joined_yes.count()
print(n_path_yesyesssss, n_path)
# The ratio is only defined when at least one path was tested
if n_path:
    pourcentage = n_path_yesyesssss / n_path
    print(pourcentage)


In [None]:
# Result of a run with 50 queries: 15 paths tested, 15 paths counted as successful