# Assignment 1

In [13]:
import datetime
import json
import os
from enum import Enum
from functools import reduce
from itertools import count
from operator import add

import pandas as pd
import shapefile
from tqdm.notebook import tqdm

from scripts.helpers import *

## Raw JSON to CSV

Converts raw JSON files containing the vehiclePositions from STIB to a single CSV file
**Reads from**: raw JSON files in `data/raw/vehicleTimestamp` folder (`data/raw/vehicleTimestamp/vehiclePosition*.json`)
**Writes to**: Single CSV file containing all the vehicle positions in `data` folder (`data/processed/assignment1/vehiclePositions.csv`)

In [14]:
# Paths of the 13 raw dumps, vehiclePosition01.json through vehiclePosition13.json.
raw_json_files = [
    f'../data/raw/vehicleTimestamp/vehiclePosition{file_number:02d}.json'
    for file_number in range(1, 14)
]
# Destination of the flattened positions and the column names written as its first row.
vehicle_positions_csv = '../data/processed/assignment1/vehiclePositions.csv'
csv_header = [
    'Timestamp',
    'LineId',
    'DirectionId',
    'DistanceFromPoint',
    'PointId',
]

In [15]:
# Flatten every raw dump into one CSV row per reported vehicle position.
with write_csv(vehicle_positions_csv) as writer:
    writer.writerow(csv_header)
    for raw_json_path in tqdm(raw_json_files):
        # The context manager closes the dump even when it turns out not to be valid JSON.
        with open(raw_json_path, 'r', encoding='utf8') as raw_json_file:
            data = json.load(raw_json_file)['data']
        for time in data:
            timestamp = time['time']
            for response in time['Responses']:
                if response is None:
                    # Skip if response is empty
                    continue
                for line in response['lines']:
                    line_id = line['lineId']
                    for vehiclePosition in line['vehiclePositions']:
                        writer.writerow([
                            timestamp,
                            line_id,
                            vehiclePosition['directionId'],
                            vehiclePosition['distanceFromPoint'],
                            vehiclePosition['pointId'],
                        ])

  0%|          | 0/13 [00:00<?, ?it/s]

In [16]:
# Read back the CSV that was just written and preview its first rows.
vehicle_positions_df = pd.read_csv(filepath_or_buffer=vehicle_positions_csv)
vehicle_positions_df.head(n=5)

Unnamed: 0,Timestamp,LineId,DirectionId,DistanceFromPoint,PointId
0,1630914886924,1,8161,1,8012
1,1630914886924,1,8162,0,8142
2,1630914886924,1,8162,0,8282
3,1630914886924,1,8731,0,8111
4,1630914886924,1,8162,1,8062


## Shapefile to routes

Converts raw Stops Shapefiles from STIB to a single CSV file containing line routes
**Reads from**:
 - Shapefiles in `data/raw/shapefiles` folder (`data/raw/shapefiles/ACTU_STOPS.*`)
 - GTFS stops file in `data/raw/gtfs` folder (`data/raw/gtfs/stops.csv`)

**Writes to**: Single CSV file containing all the line routes in `data/processed/assignment1` folder (`data/processed/assignment1/line_stops.csv`)

In [17]:
# Column layout of the merged line/stop table.
csv_header = [
    'lineId', 'direction',
    'stop_id', 'stop_id_int',
    'name', 'name_ascii',
    'lat', 'long',
    'lambert_x', 'lambert_y',
    'order',
]
# Inputs (stop shapefile and GTFS stops) and the output location of the merged table.
stops_shapefile_path = '../data/raw/' + 'shapefiles/ACTU_STOPS.shp'
stops_gtfs_path = '../data/raw/' + 'gtfs/stops.csv'
merged_stops_csv_path = '../data/processed/assignment1/' + 'line_stops.csv'

In [18]:
stops_shapefile = shapefile.Reader(stops_shapefile_path)
# Every entry of `fields` starts with the field's name; the first entry (the DeletionFlag
# field, not relevant here) is skipped.
stop_fields = [field_name for field_name, *_ in stops_shapefile.fields[1:]]
shapefile_df = pd.DataFrame(data=stops_shapefile.records(), columns=stop_fields)
shapefile_df.head()

Unnamed: 0,Code_Ligne,Variante,succession,stop_id,descr_fr,descr_nl,alpha_fr,alpha_nl,coord_x,coord_y,mode,numero_lig,terminus
0,012b,1,1,9600B,BRUSSELS AIRPORT,BRUSSELS AIRPORT,Brussels Airport,Brussels Airport,157950.0,176429.0,B,12,BRUSSELS CITY
1,012b,1,2,3017,BOURGET,BOURGET,Bourget,Bourget,154334.0,174200.0,B,12,BRUSSELS CITY
2,012b,1,3,5048,DA VINCI,DA VINCI,Da Vinci,Da Vinci,152934.0,173976.0,B,12,BRUSSELS CITY
3,012b,1,4,2695,GENEVE,GENEVE,Genève,Genève,152428.0,172606.0,B,12,BRUSSELS CITY
4,012b,1,5,2250,MEISER,MEISER,Meiser,Meiser,152045.0,171508.0,B,12,BRUSSELS CITY


In [19]:
# Shapefile column name -> name used in the rest of the notebook.
renames = {
    'Code_Ligne': 'lineId',
    'Variante': 'direction',
    'succession': 'order',
    'descr_fr': 'name_ascii',
    'alpha_fr': 'name',
    'coord_x': 'lambert_x',
    'coord_y': 'lambert_y',
}
# Drop the unused columns, rename the rest, order the stops along each line variant and add
# the integer built from the first four characters of stop_id.
shapefile_df = (
    shapefile_df
    .drop(columns=['descr_nl', 'alpha_nl', 'mode', 'numero_lig', 'terminus'])
    .rename(columns=renames)
    .sort_values(['lineId', 'direction', 'order'])
    .assign(stop_id_int=lambda frame: frame['stop_id'].apply(lambda stop_id: int(stop_id[:4])))
)
shapefile_df.head()

Unnamed: 0,lineId,direction,order,stop_id,name_ascii,name,lambert_x,lambert_y,stop_id_int
2987,001m,1,1,8733,GARE DE L'OUEST,Gare de l'Ouest,146633.5,170956.4,8733
2988,001m,1,2,8742,BEEKKANT,Beekkant,146776.5,171444.3,8742
2989,001m,1,3,8292,ETANGS NOIRS,Étangs Noirs,147492.7,171859.9,8292
2990,001m,1,4,8282,COMTE DE FLANDRE,Comte de Flandre,148013.6,171590.4,8282
2991,001m,1,5,8272,SAINTE-CATHERINE,Sainte-Catherine,148539.5,171278.2,8272


In [20]:
gtfs_stops_df = pd.read_csv(stops_gtfs_path)
# Remove every column that contains at least one missing value.
gtfs_stops_df = gtfs_stops_df.dropna(axis=1)
# stop_id_int has to be derived BEFORE stop_id is dropped; the previous order raised a KeyError
# on a fresh kernel. str() keeps the slice valid even if pandas parsed the ids as numbers.
gtfs_stops_df['stop_id_int'] = gtfs_stops_df['stop_id'].apply(lambda stop_id: int(str(stop_id)[:4]))
gtfs_stops_df = gtfs_stops_df.drop(columns=['stop_id', 'location_type', 'stop_name'])
gtfs_stops_df.head()

Unnamed: 0,stop_lat,stop_lon,stop_id_int
0,50.838006,4.40897,89
1,50.863666,4.329612,470
2,50.863732,4.329236,471
3,50.863543,4.329023,472
4,50.863418,4.330031,473


In [21]:
# Attach the GTFS coordinates to each shapefile stop through the shared integer stop id.
gtfs_coordinates_by_stop = gtfs_stops_df.set_index('stop_id_int')
joined_stops_df = shapefile_df.join(gtfs_coordinates_by_stop, on='stop_id_int')
joined_stops_df.head()

Unnamed: 0,lineId,direction,order,stop_id,name_ascii,name,lambert_x,lambert_y,stop_id_int,stop_lat,stop_lon
2987,001m,1,1,8733,GARE DE L'OUEST,Gare de l'Ouest,146633.5,170956.4,8733,50.848999,4.320948
2988,001m,1,2,8742,BEEKKANT,Beekkant,146776.5,171444.3,8742,50.853386,4.322974
2989,001m,1,3,8292,ETANGS NOIRS,Étangs Noirs,147492.7,171859.9,8292,50.857125,4.333143
2990,001m,1,4,8282,COMTE DE FLANDRE,Comte de Flandre,148013.6,171590.4,8282,50.854705,4.340542
2991,001m,1,5,8272,SAINTE-CATHERINE,Sainte-Catherine,148539.5,171278.2,8272,50.8519,4.348012


In [22]:
# Stops for which the GTFS file provided no coordinates.
joined_stops_df.loc[joined_stops_df['stop_lat'].isna()]

Unnamed: 0,lineId,direction,order,stop_id,name_ascii,name,lambert_x,lambert_y,stop_id_int,stop_lat,stop_lon
3424,019t,1,1,5104F,GROOT-BIJGAARDEN,Groot-Bijgaarden,143429.4,172979.7,5104,,
3467,019t,2,22,5169F,GROOT-BIJGAARDEN,Groot-Bijgaarden,143385.5,172978.7,5169,,


In [23]:
# Later cells read line_stops.csv positionally (row[0] as the line id, row[3] as the integer stop
# id, row[-1] as the order) and by name ('lat', 'long'), which is exactly the layout declared in
# csv_header. Write the columns in that order, under those names, and without the dataframe index.
# NOTE(review): the positional readers go through helpers from scripts.helpers - confirm they do
# not reshape the rows.
line_stops_output_df = joined_stops_df.rename(columns={'stop_lat': 'lat', 'stop_lon': 'long'})
line_stops_output_df.to_csv(merged_stops_csv_path, columns=csv_header, index=False, encoding='utf8')

## Drop incomplete data from CSV

Cleans `vehiclePositions.csv` file created in previous section
**Reads from**: CSV file containing all the vehicle positions in `data` folder (`data/processed/assignment1/vehiclePositions.csv`)
**Writes to**: CSV file containing filtered vehicle positions in `data` folder (`data/processed/assignment1/vehiclePositionsClean.csv`)

In [84]:
def drop_positions_with_unknown_stop_or_direction():
    """Copy vehiclePositions.csv to vehiclePositionsClean.csv, keeping only the usable rows.

    A row is kept when both its DirectionId (column 2) and its PointId (column 4) appear among the
    stops listed for the row's line in line_stops.csv. The number of rows deleted is printed per
    line and in total. An input with a header but no data rows is reported as 0 deletions
    instead of raising.
    """
    total_count = {}
    deletion_count = {}
    # Known "<line number>-<stop id>" pairs. The line id loses its last character and its leading
    # zeros through int(); presumably read_csv_stream skips the header row by default - TODO confirm.
    stops = {f'{int(stop[0][:-1])}-{stop[3]}' for stop in
             read_csv_stream('../data/processed/assignment1/line_stops.csv')}
    positions = read_csv_stream('../data/processed/assignment1/vehiclePositions.csv', skip_first=False)
    with write_csv('../data/processed/assignment1/vehiclePositionsClean.csv') as writer:
        # The first row is the header and is copied unchanged.
        writer.writerow(next(positions))
        for position in tqdm(positions):
            line_id = position[1]
            direction_id = position[2]
            stop_id = position[4]
            total_count[line_id] = total_count.get(line_id, 0) + 1
            deletion_count.setdefault(line_id, 0)
            if f'{line_id}-{direction_id}' not in stops or f'{line_id}-{stop_id}' not in stops:
                deletion_count[line_id] += 1
            else:
                writer.writerow(position)
    for line in sorted(total_count):
        deleted = deletion_count[line]
        total = total_count[line]
        print(f'\tLine {line}: {deleted} rows deleted out of {total} ({(deleted / total) * 100:.2f}%)')
    # sum() copes with an empty input, unlike reduce() without an initial value.
    total_rows = sum(total_count.values())
    total_deletions = sum(deletion_count.values())
    total_percentage = (total_deletions / total_rows) * 100 if total_rows else 0.0
    print(f'\tTotal: {total_deletions} rows deleted out of {total_rows} ({total_percentage:.2f}%)')

In [85]:
# Run the cleaning step; it prints the per-line and overall deletion statistics.
drop_positions_with_unknown_stop_or_direction()

0it [00:00, ?it/s]

	Line 1: 63100 rows deleted out of 305454 (20.66%)
	Line 12: 2091 rows deleted out of 238655 (0.88%)
	Line 13: 12387 rows deleted out of 231422 (5.35%)
	Line 14: 26554 rows deleted out of 272554 (9.74%)
	Line 17: 393 rows deleted out of 93979 (0.42%)
	Line 19: 9177 rows deleted out of 374100 (2.45%)
	Line 2: 4129 rows deleted out of 215114 (1.92%)
	Line 20: 12947 rows deleted out of 240316 (5.39%)
	Line 21: 5420 rows deleted out of 196146 (2.76%)
	Line 25: 349123 rows deleted out of 387235 (90.16%)
	Line 27: 34005 rows deleted out of 222168 (15.31%)
	Line 28: 13419 rows deleted out of 188596 (7.12%)
	Line 29: 37447 rows deleted out of 352595 (10.62%)
	Line 3: 162263 rows deleted out of 383543 (42.31%)
	Line 33: 0 rows deleted out of 69363 (0.00%)
	Line 34: 77757 rows deleted out of 271519 (28.64%)
	Line 36: 8227 rows deleted out of 243214 (3.38%)
	Line 37: 33673 rows deleted out of 163857 (20.55%)
	Line 38: 8697 rows deleted out of 346695 (2.51%)
	Line 39: 8136 rows deleted out of 2112

## Add direction to CSV

Adds direction to `vehiclePositionsClean.csv` file created in previous section
**Reads from**: CSV file containing filtered vehicle positions in `data` folder (`data/processed/assignment1/vehiclePositionsClean.csv`)
**Writes to**: CSV file containing filtered vehicle positions with direction in `data` folder (`data/processed/assignment1/vehiclePositionsCleanDirected.csv`)

In [86]:
def get_direction_from_line_stop_and_destination(line: Tuple[List[List[str]], List[List[str]]], stop_id: str,
                                                 destination_id: str) -> int:
    """Decide which of the two directions of a line (0 or 1) a vehicle is travelling in.

    :param line: pair of stop-row lists, one list per direction; row[3] is compared with the ids
        and row[8] is read as the stop's position along the direction (NOTE(review): other code in
        this file reads the order from row[-1] - confirm column 8 is the intended one).
    :param stop_id: id of the stop the vehicle was reported at.
    :param destination_id: id of the stop the vehicle is heading to.
    :return: 0 or 1, the index into ``line`` of the chosen direction.
    :raises StopIteration: when the last two rules apply and an id is missing from a direction.
    """
    first_direction, second_direction = line[0], line[1]
    first_ids = {row[3] for row in first_direction}
    second_ids = {row[3] for row in second_direction}

    def order_of(direction_stops, wanted_id):
        # Position of the first row of the direction whose id matches.
        return next(int(row[8]) for row in direction_stops if row[3] == wanted_id)

    # Rule 1: the destination belongs to exactly one direction.
    destination_in_first = destination_id in first_ids
    destination_in_second = destination_id in second_ids
    if destination_in_first != destination_in_second:
        return 0 if destination_in_first else 1

    # Rule 2: the destination is the terminus of a direction.
    if destination_id == first_direction[-1][3]:
        return 0
    if destination_id == second_direction[-1][3]:
        return 1

    # Rule 3: the current stop belongs to exactly one direction.
    stop_in_first = stop_id in first_ids
    stop_in_second = stop_id in second_ids
    if stop_in_first != stop_in_second:
        return 0 if stop_in_first else 1

    if stop_id != destination_id:
        # Rule 4: direction 0 is chosen when, along direction 0, the destination comes after the stop.
        destination_order = order_of(first_direction, destination_id)
        stop_order = order_of(first_direction, stop_id)
        return 0 if stop_order < destination_order else 1

    # Rule 5: stop and destination coincide; pick the direction in which that stop lies further along.
    order_in_first = order_of(first_direction, destination_id)
    order_in_second = order_of(second_direction, destination_id)
    return 0 if order_in_first > order_in_second else 1

In [87]:
def add_direction_to_csv():
    """Copy vehiclePositionsClean.csv to vehiclePositionsCleanDirected.csv with a 'Direction' column.

    The direction is computed by get_direction_from_line_stop_and_destination from the row's line
    (column 1), stop (column 4) and destination (column 2), and cached per distinct combination.
    The output file is closed even if a row fails to process.
    """
    positions = read_csv_stream('../data/processed/assignment1/vehiclePositionsClean.csv', skip_first=False)
    directed_positions, output_file = get_csv_writer('../data/processed/assignment1/vehiclePositionsCleanDirected.csv')
    try:
        # The first input row is the header; extend it with the new column name.
        directed_positions.writerow([*next(positions), 'Direction'])
        grouped_lines = group_line_stops(read_csv_stream('../data/processed/assignment1/line_stops.csv'))
        # Cache of directions already computed, keyed by "<line>-<stop>-<destination>".
        memory = {}
        for position in tqdm(positions):
            line_id = position[1]
            stop_id = position[4]
            destination_id = position[2]
            tuple_id = f'{line_id}-{stop_id}-{destination_id}'
            if tuple_id not in memory:
                memory[tuple_id] = get_direction_from_line_stop_and_destination(grouped_lines[line_id], stop_id,
                                                                                destination_id)
            directed_positions.writerow([*position, memory[tuple_id]])
    finally:
        output_file.close()

In [88]:
# Write vehiclePositionsCleanDirected.csv.
add_direction_to_csv()

0it [00:00, ?it/s]

## Split CSV into lines

Splits `vehiclePositionsCleanDirected.csv` file created in previous section into separate CSV (one per line)
**Reads from**: CSV file containing filtered vehicle positions with direction in `data` folder (`data/processed/assignment1/vehiclePositionsCleanDirected.csv`)
**Writes to**: CSV file per line containing filtered vehicle positions with direction in `data/processed/assignment1/vehiclePositionsPerLine` folder (`data/processed/assignment1/vehiclePositionsPerLine/vehiclePositions*.csv`)

In [89]:
def split_csv_by_lines():
    """Split vehiclePositionsCleanDirected.csv into one CSV per line id (column 1).

    Every output file starts with the input header. All files opened so far are closed even when
    a row fails to process.
    """
    # line id -> (csv writer, file handle), as returned by get_csv_writer.
    files = {}
    positions = read_csv_stream('../data/processed/assignment1/vehiclePositionsCleanDirected.csv', skip_first=False)
    output_dir = '../data/processed/assignment1/vehiclePositionsPerLine'
    os.makedirs(output_dir, exist_ok=True)
    header = next(positions)
    try:
        for line in tqdm(positions):
            line_id = line[1]
            if line_id not in files:
                # First row seen for this line: open its file and write the header.
                files[line_id] = get_csv_writer(
                    f'{output_dir}/vehiclePositions{line_id}.csv')
                files[line_id][0].writerow(header)
            files[line_id][0].writerow(line)
    finally:
        for _, file in files.values():
            file.close()

In [90]:
# Write one vehiclePositions<line id>.csv file per line.
split_csv_by_lines()

0it [00:00, ?it/s]

## Vehicle Matching

Tries to link the successive vehicle positions that belong to the same physical vehicle
**Reads from**: CSV file per line containing filtered vehicle positions with direction in `data/processed/assignment1/vehiclePositionsPerLine` folder (`data/processed/assignment1/vehiclePositionsPerLine/vehiclePositions*.csv`)
**Writes to**: CSV file per line containing vehicle positions with `bus_id` in `data/processed/assignment1/csv_lines_linked` folder (`data/processed/assignment1/csv_lines_linked/vehiclePositions*.csv`)

In [91]:
def split_positions_by_direction(positions: Iterable[List[str]]) -> Tuple[List[List[str]], List[List[str]]]:
    """Partition position rows into two lists according to their last column.

    :param positions: rows whose last element converts to the integer index of the target list.
    :return: tuple ``(rows of direction 0, rows of direction 1)``, input order preserved.
    """
    direction_0_rows, direction_1_rows = [], []
    buckets = (direction_0_rows, direction_1_rows)
    for row in positions:
        buckets[int(row[-1])].append(row)
    return buckets

In [103]:
def get_index_of_stop_in_line(line, direction, stop_id):
    """Look up a stop's index within one direction of a line.

    :param line: indexable collection of per-direction stop lists.
    :param direction: key selecting the direction inside ``line``.
    :param stop_id: id of the stop to look for.
    :return: whatever get_index_of_stop_in_line_direction returns for that direction.
    """
    direction_stops = line[direction]
    return get_index_of_stop_in_line_direction(direction_stops, stop_id)


def get_index_of_stop_in_line_direction(line, stop_id):
    """Return the order of a stop within one direction of a line.

    :param line: stop rows of a single direction; row[3] is compared with ``stop_id`` and the
        last column holds the order.
    :param stop_id: id of the stop to look for.
    :return: integer value of the last column of the first matching row, or -1 if none matches.
    """
    for stop in line:
        if stop[3] == stop_id:
            return int(stop[-1])
    return -1


class Match(Enum):
    """Result of comparing a position from the previous timestamp with a current one."""

    # The current position cannot belong to the previous vehicle.
    WRONG = 1
    # The current position is a plausible continuation of the previous vehicle.
    OK = 2
    # The current position is more than the allowed number of stops ahead.
    TOO_FAR = 3

In [100]:
def group_positions_by_timestamp(positions: Iterable[List[str]]) -> List[Tuple[int, List[List[str]]]]:
    """Group consecutive rows that share the same timestamp (column 0).

    :param positions: rows ordered by strictly increasing timestamp between groups.
    :return: list of ``(timestamp, rows)`` tuples in input order; empty for empty input.
    :raises AssertionError: when a timestamp is lower than the one of the previous group.
    """
    groups = []
    bucket_timestamp = -1  # sentinel group, removed before returning
    bucket = []
    for row in positions:
        row_timestamp = int(row[0])
        if row_timestamp == bucket_timestamp:
            bucket.append(row)
            continue
        assert row_timestamp > bucket_timestamp
        # A new timestamp starts: store the finished group and open a new one.
        groups.append((bucket_timestamp, bucket))
        bucket_timestamp, bucket = row_timestamp, [row]
    groups.append((bucket_timestamp, bucket))
    # The first stored entry is always the sentinel group opened before the loop.
    del groups[0]
    return groups

In [101]:
def possible_match(first_position: List[str], second_position: List[str],
                   line: List[List[str]]) -> Match:
    """Tell whether ``second_position`` can be the same vehicle as ``first_position``, seen later.

    :param first_position: earlier row; column 3 is the distance from the stop, column 4 the stop id.
    :param second_position: later row with the same layout.
    :param line: stop rows of the direction both positions are expected to belong to.
    :return: Match.OK, Match.WRONG or Match.TOO_FAR.
    """
    first_stop_id, second_stop_id = first_position[4], second_position[4]

    if first_stop_id != second_stop_id:
        # Different stops: compare their order along the direction.
        first_stop_index = get_index_of_stop_in_line_direction(line, first_stop_id)
        second_stop_index = get_index_of_stop_in_line_direction(line, second_stop_id)
        stops_ahead = second_stop_index - first_stop_index
        if stops_ahead > 3:
            return Match.TOO_FAR
        if first_stop_index < second_stop_index:
            return Match.OK
        return Match.WRONG

    # Same stop: the vehicle must not have moved back towards the stop.
    if int(first_position[3]) <= int(second_position[3]):
        return Match.OK
    return Match.WRONG

In [102]:
def find_bus_matched_of_line_direction(positions: Iterable[List[str]], line: List[List[str]], line_id: str,
                                       direction: int, writer):
    """Assign a bus id to every position of one direction of a line and write the rows out.

    Positions are processed timestamp by timestamp. Within a timestamp they are ordered by the
    index of their stop, then walked in parallel with the rows written for the previous timestamp:
    a row judged Match.OK by possible_match inherits the previous row's bus id, every other row
    receives a new id. Each row is extended in place with its bus id before being written.

    :param positions: rows of a single direction, grouped through group_positions_by_timestamp.
    :param line: stop rows of that direction.
    :param line_id: line identifier, used as prefix of generated bus ids.
    :param direction: direction number, used as second part of generated bus ids.
    :param writer: object whose ``writerow`` receives every kept row.
    """
    grouped_positions = group_positions_by_timestamp(positions)
    # Rows written for the previous timestamp (already carrying their bus id as last element).
    previous_positions = []
    # Endless supply of new ids of the form "<line_id>-<direction>-<6-digit counter>".
    bus_id = (f'{line_id}-{direction}-{i:06d}' for i in count())
    # Sort key: stop index of the row's second-to-last column
    # (assumes rows arrive with the stop id in that position - TODO confirm).
    sorting_key = lambda vehicle_position: get_index_of_stop_in_line_direction(line, vehicle_position[-2])

    for timestamp, current_positions in grouped_positions:
        sorted_positions = sorted(current_positions, key=sorting_key)
        # Remove leading rows whose stop is not part of this direction (stop index -1).
        while len(sorted_positions) > 0 and get_index_of_stop_in_line_direction(line, sorted_positions[0][-2]) == -1:
            sorted_positions.pop(0)
        # Two cursors: one over the previous timestamp's rows, one over the current rows.
        current_previous_position_index = 0
        current_position_index = 0
        while current_position_index < len(sorted_positions) and current_previous_position_index < len(
                previous_positions):
            previous_position = previous_positions[current_previous_position_index]
            current_position = sorted_positions[current_position_index]
            result = possible_match(previous_position, current_position, line)
            if result == Match.OK:
                # Same vehicle: reuse the id stored at the end of the previous row.
                current_position.append(previous_position[-1])
                current_previous_position_index += 1
                current_position_index += 1
            elif result == Match.WRONG:
                # No plausible predecessor: the current row starts a new vehicle.
                current_position.append(next(bus_id))
                current_position_index += 1
            elif result == Match.TOO_FAR:
                # Try the next previous row against the same current row.
                current_previous_position_index += 1
        for position in sorted_positions:
            # A row of length 6 has not received an id in the loop above
            # (assumes input rows have exactly 6 columns - TODO confirm).
            if len(position) == 6:
                position.append(next(bus_id))
            writer.writerow(position)
        previous_positions = sorted_positions

In [96]:
def find_bus_matches_of_line(file_path: str, output_path: str, line_id: str,
                             line: Tuple[List[List[str]], List[List[str]]]) -> None:
    """Link the positions of one line to vehicles and write them, with a 'BusId' column.

    :param file_path: per-line CSV to read (header included).
    :param output_path: CSV to write.
    :param line_id: identifier of the line, forwarded to the matching step.
    :param line: pair of stop-row lists, one per direction.
    """
    positions = read_csv_stream(file_path, skip_first=False)
    with write_csv(output_path) as linked_positions:
        header = next(positions)
        linked_positions.writerow([*header, 'BusId'])
        first_direction, second_direction = split_positions_by_direction(positions)
        # Direction 0 is written completely before direction 1.
        for direction, direction_positions in enumerate((first_direction, second_direction)):
            find_bus_matched_of_line_direction(direction_positions, line[direction], line_id, direction,
                                               linked_positions)


In [97]:
def find_bus_matches_of_lines():
    """Run the vehicle matching for every file of vehiclePositionsPerLine.

    Results go to csv_lines_linked, under the same file names. The line id is taken from the file
    name by removing its first 16 and last 4 characters.
    """
    source_dir = '../data/processed/assignment1/vehiclePositionsPerLine'
    linked_dir = '../data/processed/assignment1/csv_lines_linked'
    if not os.path.exists(linked_dir):
        os.mkdir(linked_dir)
    stops_by_line = group_line_stops(read_csv_stream('../data/processed/assignment1/line_stops.csv'))
    for file_name in tqdm(os.listdir(source_dir)):
        line_id = file_name[16:-4]
        line_stops_rows = stops_by_line[line_id]
        find_bus_matches_of_line(f'{source_dir}/{file_name}', f'{linked_dir}/{file_name}', line_id,
                                 line_stops_rows)

In [104]:
# Write the linked per-line files into csv_lines_linked.
find_bus_matches_of_lines()

  0%|          | 0/74 [00:00<?, ?it/s]

# Calculate average time between stops

In [105]:
def group_positions_by_vehicle(positions: List[List[str]]) -> Dict[str, List[List[List[str]]]]:
    """Group position rows by the bus id stored in their last column.

    :param positions: rows whose last element is the bus id.
    :return: dictionary mapping each bus id to its rows, both in order of first appearance.
    """
    rows_by_bus = {}
    for row in positions:
        rows_by_bus.setdefault(row[-1], []).append(row)
    return rows_by_bus

In [106]:
def get_hour_from_timestamp(timestamp: int) -> int:
    """Return the hour of day (0-23, local time of the machine) of a millisecond timestamp.

    :param timestamp: number of milliseconds since the Unix epoch.
    """
    whole_seconds = timestamp // 1000
    local_moment = datetime.datetime.fromtimestamp(whole_seconds)
    return local_moment.hour

In [107]:
def get_vehicle_times_between_stops(vehicle_positions: List[List[str]],
                                    line: Tuple[List[List[str]], List[List[str]]]) -> List:
    """Measure how long one vehicle took between consecutively reported stops.

    :param vehicle_positions: non-empty list of rows of one vehicle; column 0 is the timestamp in
        milliseconds and column 4 the stop id.
    :param line: not used by this function.
    :return: list of ``[hour, from_stop, to_stop, seconds]`` entries. A transition that starts and
        ends in different hours is reported once for each of the two hours.
    """
    first_row = vehicle_positions[0]
    segment_start_timestamp = int(first_row[0])
    segment_start_stop = first_row[4]
    measurements = []
    for row in vehicle_positions:
        row_timestamp = int(row[0])
        row_stop = row[4]
        if row_stop == segment_start_stop:
            # Still reported at the same stop: the segment has not ended yet.
            continue
        elapsed_seconds = (row_timestamp - segment_start_timestamp) // 1000
        start_hour = get_hour_from_timestamp(segment_start_timestamp)
        end_hour = get_hour_from_timestamp(row_timestamp)
        hours = [start_hour] if start_hour == end_hour else [start_hour, end_hour]
        measurements.extend([hour, segment_start_stop, row_stop, elapsed_seconds] for hour in hours)
        segment_start_stop, segment_start_timestamp = row_stop, row_timestamp
    return measurements

In [108]:
def calculate_average_time_between_stops_of_line(positions: List[List[str]], line_id: str,
                                                 line: Tuple[List[List[str]], List[List[str]]], output,
                                                 max_valid_seconds: int = 600):
    """Write the hourly average travel time of every consecutive stop pair of a line.

    :param positions: linked position rows of the line (last column is the bus id).
    :param line_id: value written in the first column of every output row.
    :param line: pair of stop-row lists, one per direction; row[3] is the stop id.
    :param output: four objects with a ``writerow`` method, receiving in this order the averages,
        the sample counts, the filtered averages and the filtered sample counts.
    :param max_valid_seconds: travel times of at least this many seconds are excluded from the
        filtered outputs. The measurements are in seconds, so the default of 600 is the
        10 minutes the filter was meant to apply; the previous hard-coded 6000 was 100 minutes.

    Each row is ``[line_id, from_stop, to_stop, value for hour 0, ..., value for hour 23]``.
    Averages are formatted with two decimals; hours without samples hold 0.
    """
    times = []
    for vehicle_positions in group_positions_by_vehicle(positions).values():
        times += get_vehicle_times_between_stops(vehicle_positions, line)

    # Index the measurements by (from stop, to stop) once instead of scanning all of them
    # again for every stop pair.
    times_by_segment = {}
    for hour, from_stop_id, to_stop_id, seconds in times:
        times_by_segment.setdefault((from_stop_id, to_stop_id), []).append((hour, seconds))

    def hourly_averages(sums, counts):
        return [f'{total / amount:.2f}' if amount > 0 else 0 for total, amount in zip(sums, counts)]

    for direction in [0, 1]:
        for fromStop, toStop in zip(line[direction][:-1], line[direction][1:]):
            time_sum = [0] * 24
            time_sum_filtered = [0] * 24
            time_count = [0] * 24
            time_count_filtered = [0] * 24
            for hour, seconds in times_by_segment.get((fromStop[3], toStop[3]), ()):
                time_sum[hour] += seconds
                time_count[hour] += 1
                if seconds < max_valid_seconds:  # longer times are likely anomalies
                    time_sum_filtered[hour] += seconds
                    time_count_filtered[hour] += 1
            segment = [line_id, fromStop[3], toStop[3]]
            output[0].writerow([*segment, *hourly_averages(time_sum, time_count)])
            output[1].writerow([*segment, *time_count])
            output[2].writerow([*segment, *hourly_averages(time_sum_filtered, time_count_filtered)])
            output[3].writerow([*segment, *time_count_filtered])

In [109]:
def calculate_average_time_between_stops():
    """Compute the hourly average travel time between stops for every linked line file.

    Reads every file of csv_lines_linked and writes four CSV files: the averages, the filtered
    averages and the sample counts of each.
    NOTE(review): the header ends with a 'Day' column, while the rows written by
    calculate_average_time_between_stops_of_line stop at hour 23 - confirm whether it is needed.
    """
    source_path = '../data/processed/assignment1/csv_lines_linked'
    base_path = '../data/processed/assignment1/average_time_between_stops'
    output_path = f'{base_path}.csv'
    output_path_filtered = f'{base_path}_filtered.csv'
    output_path_count = f'{base_path}_count.csv'
    output_path_filtered_count = f'{base_path}_filtered_count.csv'
    lines = group_line_stops(read_csv_stream('../data/processed/assignment1/line_stops.csv'))
    header = ['LineId', 'FromStop', 'ToStop', *[str(hour) for hour in range(24)], 'Day']
    with write_csv(output_path) as output:
        with write_csv(output_path_filtered) as output_filtered:
            with write_csv(output_path_count) as output_count:
                with write_csv(output_path_filtered_count) as output_filtered_count:
                    for target in (output, output_filtered, output_count, output_filtered_count):
                        target.writerow(header)
                    # Order expected by the per-line function: averages, counts,
                    # filtered averages, filtered counts.
                    outputs = (output, output_count, output_filtered, output_filtered_count)
                    for file in tqdm(os.listdir(source_path)):
                        line_id = file[16:-4]
                        line = lines[line_id]
                        # The first row of each linked file is its header.
                        positions = read_csv_list(f'{source_path}/{file}')[1:]
                        calculate_average_time_between_stops_of_line(positions, line_id, line, outputs)

In [110]:
# Write the four average_time_between_stops*.csv files.
calculate_average_time_between_stops()

  0%|          | 0/74 [00:00<?, ?it/s]

In [111]:
def calculate_daily_average_time_between_stops():
    """Write one CSV per calendar day with the filtered hourly average time between stops.

    Positions of every linked line file are grouped by local calendar day; for each day the
    filtered averages of calculate_average_time_between_stops_of_line are appended to
    ``<output folder>/<YYYY-MM-DD>.csv``. The output folder is created when missing and every
    day file is closed even if processing fails.
    """
    source_path = '../data/processed/assignment1/csv_lines_linked'
    output_path = '../data/processed/assignment1/daily_average_time_between_stops_filtered'
    lines = group_line_stops(read_csv_stream('../data/processed/assignment1/line_stops.csv'))
    header = ['LineId', 'FromStop', 'ToStop', *[f'{i}' for i in range(24)]]

    class Dummy:
        """Writer that discards rows, used for the three outputs that are not needed here."""

        def writerow(self, *args):
            pass

    dummy = Dummy()
    os.makedirs(output_path, exist_ok=True)
    # day -> (csv writer, file handle), as returned by get_csv_writer.
    day_files = {}
    try:
        for file in tqdm(os.listdir(source_path)):
            line_id = file[16:-4]
            line = lines[line_id]
            # The first row of each linked file is its header.
            positions = read_csv_list(f'{source_path}/{file}')[1:]
            day_positions = {}
            for position in positions:
                day = datetime.datetime.fromtimestamp(int(position[0]) / 1000).date().strftime('%Y-%m-%d')
                day_positions.setdefault(day, []).append(position)
            for day, positions_of_day in day_positions.items():
                if day not in day_files:
                    day_files[day] = get_csv_writer(f'{output_path}/{day}.csv')
                    day_files[day][0].writerow(header)
                # Only the third output (filtered averages) is kept.
                calculate_average_time_between_stops_of_line(positions_of_day, line_id, line,
                                                             (dummy, dummy, day_files[day][0], dummy))
    finally:
        for _, day_file in day_files.values():
            day_file.close()

In [113]:
# Write one filtered-averages CSV per day.
calculate_daily_average_time_between_stops()

  0%|          | 0/74 [00:00<?, ?it/s]

# Import Libraries

**Shapefile:** This is the pyshp library that is used to manipulate shapefiles.
**Math:** This library is used for basic math functions to calculate distance between points

In [114]:
# shapeRecords() pairs every shape with its record, so the coordinates of a polyline and the
# record information (such as the line id) can be read together.
sf_actu_lines = shapefile.Reader('../data/raw/shapefiles/ACTU_LINES.shp')
shape_records = sf_actu_lines.shapeRecords()
# Stops of every line, as produced earlier in this notebook.
line_stops = pd.read_csv(filepath_or_buffer='../data/processed/assignment1/line_stops.csv')

# Shapefile Distance Calculation Function

Now that we have our libraries loaded and files imported, we will create a function that calculates the distance between two points on a polyline. `start_point` and `stop_point` are indexes that tell us where to start and stop measuring along the polyline, and `line_segment` is one of the shape elements pulled from the shapefile. This function is called later, inside a loop over every shape element in `shape_records`.

We will calculate the distance between consecutive points in the shapefile using Pythagoras' theorem, since the coordinates in both `line_stops` and the shapefile are already provided in Belgian Lambert 1972 format, which projects the points onto a flat surface.

In [115]:
def calculate_distance_between_polyline_points(start_point: int, stop_point: int,
                                               line_segment: 'shapefile.Shape') -> float:
    """Return the length of a polyline between two of its points.

    :param start_point: index of the polyline point where the measurement starts.
    :param stop_point: index of the polyline point where the measurement ends.
    :param line_segment: object whose ``points`` attribute is a sequence of coordinate pairs.
    :return: sum of the straight-line distances between every consecutive pair of points from
        ``start_point`` to ``stop_point``; 0 when ``stop_point`` is not after ``start_point``.

    The segment that ends at ``stop_point`` is included; the previous ``range(start, stop - 1)``
    left it out.
    """
    import math

    points = line_segment.points
    total_distance = 0
    for index in range(start_point, stop_point):
        current_x, current_y = points[index][0], points[index][1]
        following_x, following_y = points[index + 1][0], points[index + 1][1]
        total_distance += math.hypot(current_x - following_x, current_y - following_y)
    return total_distance


# Calculate Distance Between Stops

Time to get to work! Here we use several nested for loops to match each line in the shapefile with the same line in `line_stops`. For each matching line, we cycle through its stops and project every stop location ONTO the polyline. This is required because a bus stop can be imagined to be on a sidewalk, while the polyline runs along the road.

Once we have matched up our bus stop with the nearest polyline point, we move to the next stop and do the same. Having 2 stop locations projected, we can call the previously defined calculate_distance_between_polyline_points function to find the distance between these 2 stops.

The first iteration of the loop will result in a dummy value, as it does not have a real stop to pair with. All of these dummy values are dropped once the dictionary that stores all values is transformed into a dataframe.

The data frame will be accessed later with the unique combination of [LineID + fromStopID + toStopID]. This combination will be different depending on which direction a vehicle is moving, as the stop ids are not the same on each side of a street.

We will also hold onto the index value for the polyline location in case we need it later on for future predictions.

**integration**: This is how things were integrated
parameters: Here we dropped all stops before 4am because...

In [116]:
# Column-oriented dictionary, later turned into a dataframe / CSV with one row per pair of
# consecutive stops:
# |    LineId    |   FromStop  |   ToStop  |   distance    |    fromIndex   |    toIndex   |
stop_distance = {'LineId': [], 'LineIdFormatted': [], 'LineId_GeoMerge': [], 'Type': [], 'Direction': [],
                 'FromStop': [], 'ToStop': [], 'distance': [], 'fromIndex': [], 'toIndex': [],
                 'FromStop_lat': [], 'FromStop_lon': [], 'ToStop_lat': [], 'ToStop_lon': []}

# State carried over from one stop to the next. It is NOT reset between lines, so the first row
# produced for each line pairs its first stop with whatever was processed before it (a dummy row).
last_pointID = 0
last_stop_id = 0
last_stop_lat = 0
last_stop_lon = 0
adjusted_stop_lat_GPS = 0
adjusted_stop_lon_GPS = 0

# Each element of shape_records represents a single line variant (metro, bus or tram).
for shape_record in shape_records:
    record = shape_record.record
    shape = shape_record.shape
    # Only the stops of the matching line and direction are visited, in route order.
    for index, stop in line_stops[
        (line_stops['lineId'] == record['LIGNE']) & (line_stops['direction'] == record['VARIANTE'])].sort_values(
        by=['order']).iterrows():
        # NOTE(review): a polyline point is only accepted when it is closer than 50 (Lambert
        # units) to the stop; otherwise current_pointID stays 0 - confirm this threshold is intended.
        min_distance = 50
        adjusted_stop_lat = 50
        adjusted_stop_lon = 50
        current_pointID = 0
        current_stop_id = stop['stop_id_int']
        # Despite their names these two hold the Lambert x / y coordinates of the stop.
        stop_lat = stop['lambert_x']
        stop_lon = stop['lambert_y']
        # The GPS coordinates recorded for the stop are always its own. They used to be refreshed
        # only when a polyline point was accepted, which left the previous stop's values in place
        # for stops without a nearby point.
        adjusted_stop_lat_GPS = stop['lat']
        adjusted_stop_lon_GPS = stop['long']
        # Project the stop onto the polyline: keep the polyline point with the smallest euclidean
        # distance to the stop (valid because the coordinates are in Lambert notation).
        for pointID in range(len(shape.points)):
            point_lat = shape.points[pointID][0]
            point_lon = shape.points[pointID][1]
            distance = sqrt(pow((point_lat - stop_lat), 2) + pow((point_lon - stop_lon), 2))
            if distance < min_distance:
                min_distance = distance
                adjusted_stop_lat = point_lat
                adjusted_stop_lon = point_lon
                current_pointID = pointID
        # Length of the polyline between the previous stop's projection and this one.
        distance_between_stops = calculate_distance_between_polyline_points(last_pointID, current_pointID, shape)
        # Remove the trailing mode letter (b, t, m for bus, tram and metro) and the LEADING zeros.
        # lstrip is required: strip("0") also removed trailing zeros, turning '010b' into '1'.
        stripped_line_id = stop['lineId'][:-1].lstrip("0")
        stop_distance['LineId'].append(stripped_line_id)
        stop_distance['LineIdFormatted'].append(f"line{stripped_line_id}")
        stop_distance['LineId_GeoMerge'].append(f"{stop['lineId']}-{stop['direction']}")
        stop_distance['Type'].append(str(stop['lineId'][-1]))
        stop_distance['Direction'].append(stop['direction'])
        stop_distance['FromStop'].append(last_stop_id)
        stop_distance['ToStop'].append(current_stop_id)
        stop_distance['distance'].append(distance_between_stops)
        stop_distance['fromIndex'].append(last_pointID)
        stop_distance['toIndex'].append(current_pointID)
        stop_distance['FromStop_lat'].append(last_stop_lat)
        stop_distance['FromStop_lon'].append(last_stop_lon)
        stop_distance['ToStop_lat'].append(adjusted_stop_lat_GPS)
        stop_distance['ToStop_lon'].append(adjusted_stop_lon_GPS)
        # The current stop becomes the "from" stop of the next iteration.
        last_stop_id = current_stop_id
        last_pointID = current_pointID
        last_stop_lat = adjusted_stop_lat_GPS
        last_stop_lon = adjusted_stop_lon_GPS


In [122]:

# stop_distance (built above) is a dictionary of stop-to-stop segments that we can merge with the GeoJSON shapes.

# Load the line shapes; the top-level keys are compared against LineId_GeoMerge below,
# so they are presumably of the form "<lineId>-<direction>" (TODO confirm against the raw file).
json_shapes_path = '../data/raw/shapes_lat_long.json'
with open(json_shapes_path, 'r', encoding='utf8') as file:
    line_shapes_geojson = json.load(file)

df_line_shapes_geojson = pd.json_normalize(line_shapes_geojson)
df_stop_distance = pd.DataFrame.from_dict(stop_distance)

# For every shape column, add its polyline once per stop segment that carries the same merge key.
# DataFrame.iteritems() was removed in pandas 2.0, so the columns are iterated with items().
# Counting the matching keys in one vectorised comparison replaces the row-by-row iterrows() scan.
segment_merge_keys = df_stop_distance['LineId_GeoMerge']
sorted_geo_json = []
for line_id, polyline in df_line_shapes_geojson.items():
    segment_count = int((segment_merge_keys == line_id).sum())
    sorted_geo_json.extend([polyline.to_numpy()] * segment_count)

#
# add_geojson_to_distance_dictionary(stop_distance, line_shapes)
# polyline_list = []
# WKT_list = []
#
#
# for stop_segment in df_stop_distance:
#
# for pointID in range(last_pointID, current_pointID+1):
#     point_lat = shape.points[pointID][0]
#     point_lon = shape.points[pointID][1]
#     polyline_list.append([point_lat, point_lon])
#     WKT_list.append(f"{point_lat} {point_lon}")
#
# geo_json_test["geojson"].append(f'{{"type": "FeatureCollection", "features": [{{"type": "Feature", properties: {{}}, "geometry": {{"type": "LineString", "coordinates": {polyline_list}}}}}]}}')

In [132]:
def _as_feature_collection(coordinates):
    """Wrap one coordinate list in a single-feature GeoJSON FeatureCollection (LineString geometry)."""
    line_string = {"type": "LineString", "coordinates": coordinates}
    feature = {"type": "Feature", "properties": {}, "geometry": line_string}
    return {"type": "FeatureCollection", "features": [feature]}


# After transposing, the shape keys form the index and column 0 holds each key's coordinates.
df_line_shapes_geojson_transformed = df_line_shapes_geojson.T[0].apply(_as_feature_collection)

# Attach the GeoJSON of the matching shape to every stop segment. The transformed Series is
# named 0, so the merged column is renamed to 'geojson' afterwards.
df_stop_distance_merged = df_stop_distance.merge(
    df_line_shapes_geojson_transformed,
    left_on='LineId_GeoMerge',
    right_index=True,
)
df_stop_distance_merged = df_stop_distance_merged.rename(columns={0: 'geojson'})
print(df_stop_distance_merged)

     LineId LineIdFormatted LineId_GeoMerge Type  Direction  FromStop  ToStop  \
0         1           line1          001m-1    m          1         0    8733   
1         1           line1          001m-1    m          1      8733    8742   
2         1           line1          001m-1    m          1      8742    8292   
3         1           line1          001m-1    m          1      8292    8282   
4         1           line1          001m-1    m          1      8282    8272   
...     ...             ...             ...  ...        ...       ...     ...   
4171    218         line218          218b-2    b          2      9025    2209   
4172    218         line218          218b-2    b          2      2209    2835   
4173    218         line218          218b-2    b          2      2835    1901   
4174    218         line218          218b-2    b          2      1901    2221   
4175    218         line218          218b-2    b          2      2221    3347   

        distance  fromIndex

In [133]:
# Remove the rows whose toIndex is 0 from the merged stop distances
# (presumably the dummy first row created for each line - verify against the loop that builds stop_distance).
is_dummy_row = df_stop_distance_merged['toIndex'].eq(0)
dummy_row_labels = df_stop_distance_merged.loc[is_dummy_row].index
df_stop_distance_merged.drop(index=dummy_row_labels, inplace=True)

# Export to CSV

We can now export the dataframe to a CSV file for use in other parts of the cleaning and predictions.

In [134]:
# Write the merged stop distances to stop_distance.csv: header row included, DataFrame index left out.
df_stop_distance_merged.to_csv(
    path_or_buf=r'../data/processed/assignment1/stop_distance.csv',
    index=False,
    header=True,
)

## Calculate Speed

In [135]:
stop_distance_path = '../data/processed/assignment1/stop_distance.csv'
stop_time_path = '../data/processed/assignment1/average_time_between_stops_filtered.csv'

# Time between stops (per the file name, an average), one column per hour of the day:
# |  LineID      | FromStop      |   ToStop   |   0    |   1   | ... |  23   |
stop_time = pd.read_csv(stop_time_path)

# Distance between stops:
# |    LineID    |   FromStop    |   ToStop   |   distance    |    fromIndex   |    toIndex   |
# astype returns a new object instead of converting in place, so its result must be assigned;
# the previous bare `stop_distance[...].astype(...)` calls left the columns unchanged.
# 'float64' replaces the 'float_' alias, whose NumPy counterpart np.float_ was removed in NumPy 2.0.
stop_distance = pd.read_csv(stop_distance_path).astype({'FromStop': 'float64', 'ToStop': 'float64'})


def calculate_speed(time: pd.DataFrame, distance: pd.DataFrame) -> pd.DataFrame:
    """Compute one speed column per hour of the day for every stop-to-stop segment.

    The two frames are left-joined on LineId, FromStop and ToStop, so every row of
    ``time`` is kept and segments without a distance get NaN speeds.

    :param time: frame with the join keys and the hourly columns '0' ... '23'
    :param distance: frame with the join keys and a 'distance' column
    :return: the joined frame with columns 'speed0' ... 'speed23' appended and the
        'distance' and hourly input columns removed
    """
    hour_columns = [str(hour) for hour in range(24)]
    joined = time.merge(distance, how='left', on=['LineId', 'FromStop', 'ToStop'])
    # The 3.6 factor presumably converts m/s to km/h - assumes metres and seconds, TODO confirm.
    speed_columns = {
        f"speed{hour_column}": joined['distance'] / joined[hour_column] * 3.6
        for hour_column in hour_columns
    }
    return joined.assign(**speed_columns).drop(columns=['distance', *hour_columns])


# Combine the hourly times with the segment distances into hourly speeds.
df_speed = calculate_speed(time=stop_time, distance=stop_distance)

In [138]:
# Write the hourly speeds to vehicleSpeed.csv: header row included, DataFrame index left out.
df_speed.to_csv(
    path_or_buf=r'../data/processed/assignment1/vehicleSpeed.csv',
    index=False,
    header=True,
)

# Reformat data for use in visualization

In [142]:
# Turn the wide speed file into a long one: every input row is split into 24 output rows,
# one per hour. The first 15 values of a row are copied unchanged (they line up with the first
# 15 names in new_header); the values after them are read as the hourly speeds.

new_header = [
    'LineId', 'FromStop', 'ToStop', 'Day', 'LineIdFormatted', 'LineId_GeoMerge', 'Type', 'Direction',
    'fromIndex', 'toIndex', 'FromStop_lat', 'FromStop_lon', 'ToStop_lat', 'ToStop_lon', 'geojson',
    'hour', 'speed',
]

with write_csv('../data/processed/assignment1/vehicleSpeedReformatted.csv') as output:
    output.writerow(new_header)
    # skip_first=True presumably skips the header line of the input file - see scripts.helpers.
    speeds = read_csv_stream('../data/processed/assignment1/vehicleSpeed.csv', skip_first=True)
    for speed_line in speeds:
        descriptive_values = speed_line[:15]
        hourly_rows = []
        for hour in range(24):
            hourly_rows.append([*descriptive_values, f'{hour:02d}:00', speed_line[15 + hour]])
        output.writerows(hourly_rows)

## Merge Delays with Speed

In [143]:
# Long-format hourly speeds produced by the reformatting step above.
vehicle_speed = '../data/processed/assignment1/vehicleSpeedReformatted.csv'
df_vehicle_speed = pd.read_csv(vehicle_speed)

# Stop delay data from the assignment2 folder.
stop_delays = '../data/processed/assignment2/grouped_visual_data_ass2.csv'
df_stop_delays = pd.read_csv(stop_delays)

In [144]:
# Preview the first rows of both frames, one after the other.
print(df_vehicle_speed.head(), df_stop_delays.head(), sep='\n')

   LineId  FromStop  ToStop  Day LineIdFormatted LineId_GeoMerge Type  \
0      37      2957    5810  NaN          line37          037b-1    b   
1      37      2957    5810  NaN          line37          037b-1    b   
2      37      2957    5810  NaN          line37          037b-1    b   
3      37      2957    5810  NaN          line37          037b-1    b   
4      37      2957    5810  NaN          line37          037b-1    b   

   Direction  fromIndex  toIndex  FromStop_lat  FromStop_lon  ToStop_lat  \
0        1.0       49.0     57.0     50.821413      4.341859   50.818572   
1        1.0       49.0     57.0     50.821413      4.341859   50.818572   
2        1.0       49.0     57.0     50.821413      4.341859   50.818572   
3        1.0       49.0     57.0     50.821413      4.341859   50.818572   
4        1.0       49.0     57.0     50.821413      4.341859   50.818572   

   ToStop_lon                                            geojson   hour  \
0    4.340952  {'type': 'Feat