# Assignment 4

## Convert Line shapes to EPSG:4326

In [1]:
import json

import shapefile
from numpy import mean, median
from pyproj import Proj, transform
from tqdm.notebook import tqdm

In [2]:
# already run if you have shapes_lat_long.json
lines = shapefile.Reader('../data/raw/shapefiles/ACTU_LINES')
lambert = Proj('EPSG:31370')
latlong = Proj('EPSG:4326')
shapes = {f'{line.record["LIGNE"]}-{line.record["VARIANTE"]}': [transform(lambert, latlong, x, y) for x, y in
                                                                line.shape.points] for line in
          tqdm(lines.shapeRecords())}
with open('../data/raw/shapes_lat_long.json', 'w', encoding='utf8') as json_file:
    json.dump(shapes, json_file)

  0%|          | 0/174 [00:00<?, ?it/s]

  shapes = {f'{line.record["LIGNE"]}-{line.record["VARIANTE"]}': [transform(lambert, latlong, x, y) for x, y in


## Create plots for each track and line

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from tqdm.notebook import tqdm

In [6]:
tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
for track_id in tqdm(tracks['TrackId'].unique()):
    track = tracks[tracks['TrackId'] == track_id].sort_values(by='time')
    if not os.path.exists(f'../data/line_plots/Track{track_id}'):
        os.mkdir(f'../data/line_plots/Track{track_id}')
    for index, (line_id, line) in tqdm(enumerate(shapes.items())):
        plt.scatter(x=track['lon'], y=track['lat'], label=f'Track {track_id}')
        plt.scatter(x=[lon for _, lon in line], y=[lat for lat, _ in line], label=line_id)
        plt.title(line_id)
        plt.savefig(f'../data/line_plots/Track{track_id}/{line_id}.png')
        plt.close()

  0%|          | 0/9 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

### Track ⟶ Line:

 - Track 1: Other
 - Track 3: Bus 50 or Tram 82 or Bus 49
 - Track 4: Bus 50
 - Track 5: Tram 82 or Tram 97
 - Track 6: Tram 8
 - Track 7: Tram 7
 - Track 8: Other
 - Track 10: Tram 8 or Tram 93
 - Track 11: Tram 25

## Create CSV of labeled points

In [7]:
import json
from scripts.helpers import write_csv

In [8]:
# creating the dataset.

In [10]:
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
with write_csv('../data/processed/assignment4/labeled_line_points.csv') as csv_out:
    csv_out.writerow(['Lat', 'Long', 'LineId'])
    for line_id, points in shapes.items():
        if line_id[:-2] in ['032t']:  # Remove lines that are not active anymore
            continue
        for lat, long in points:
            csv_out.writerow([lat, long, line_id[:-2]])

## Create Classification Model

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from scripts.helpers import read_csv_list
import pandas as pd
from typing import Tuple, List

In [15]:
def get_datasets() -> Tuple[List[List[float]], List[str], pd.DataFrame]:
    csv_lines = read_csv_list('../data/processed/assignment4/labeled_line_points.csv')
    data_set = [[float(point[0]), float(point[1])] for point in csv_lines[1:]]
    data_labels = [point[-1] for point in csv_lines[1:]]
    tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
    return data_set, data_labels, tracks

### Create "Ensemble" with KNN using average probability

In [16]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=10)
model.fit(data_set, data_labels)
print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    final_predictions = [(sum(predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    sorted_predictions = sorted(final_predictions, key=lambda x: x[0], reverse=True)

    print(track_id, '->', sorted_predictions[:4])

Training model
Model trained
1 -> [(28.314606741573037, '046b'), (17.07865168539325, '218b'), (11.96629213483146, '051t'), (9.438202247191013, '088b')]
3 -> [(33.304347826086946, '082t'), (23.826086956521745, '049b'), (23.043478260869566, '050b'), (10.782608695652176, '048b')]
4 -> [(32.365145228215795, '050b'), (13.81742738589212, '074b'), (12.987551867219914, '212b'), (10.248962655601662, '097t')]
5 -> [(33.669724770642205, '212b'), (24.67889908256885, '097t'), (24.12844036697251, '082t'), (11.559633027522937, '050b')]
6 -> [(40.72202166064981, '008t'), (26.498194945848372, '093t'), (7.725631768953068, '007t'), (4.007220216606497, '209b')]
7 -> [(41.487179487179475, '007t'), (38.56410256410256, '025t'), (12.256410256410266, '209b'), (4.153846153846154, '008t')]
8 -> [(14.840764331210197, '007t'), (12.261146496815282, '008t'), (11.910828025477711, '038b'), (8.980891719745237, '004t')]
10 -> [(42.258064516129025, '008t'), (42.258064516129025, '093t'), (4.838709677419354, '038b'), (4.35



### Create "Ensemble" with KNN using "presence"

In [17]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=15)
model.fit(data_set, data_labels)

track_predictions = {}

print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    converted_predictions = [
        [1 if probability > 0 else 0 for probability in prediction]
        for prediction in predictions
    ]
    final_predictions = [(sum(converted_predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    filtered_predictions = [
        (percentage, line) for percentage, line in final_predictions if percentage > 75
    ]

    filtered_predictions = sorted(filtered_predictions, key=lambda x: x[0], reverse=True)

    if len(filtered_predictions):
        filtered_predictions = [
            (prob, line) for prob, line in filtered_predictions if filtered_predictions[0][0] - prob < 5
        ]
    track_predictions[track_id] = filtered_predictions[:4]
    print(track_id, '->', filtered_predictions[:4])


Training model
Model trained
1 -> []
3 -> [(97.3913043478261, '049b'), (96.52173913043478, '050b'), (95.65217391304348, '082t')]
4 -> [(100.0, '050b')]
5 -> [(98.1651376146789, '097t'), (97.70642201834862, '082t'), (96.3302752293578, '212b')]
6 -> [(96.75090252707581, '008t')]
7 -> [(100.0, '007t')]
8 -> []
10 -> [(93.54838709677419, '008t'), (93.54838709677419, '093t')]
11 -> [(98.08612440191388, '025t')]




In [18]:
import datetime
from scripts.helpers import distance
import pandas as pd
from numpy import mean, median

In [19]:
def calculate_average_speed_of_track(track: pd.DataFrame) -> float:
    total_time = 0.0
    total_distance = 0.0
    previous_time = datetime.datetime.fromisoformat(track.iloc[0]['time'][:-1])
    previous_position = (track.iloc[0]['lat'], track.iloc[0]['lon'])
    for _, _, lat, long, time in track.sort_values(by='time').itertuples():
        timestamp = datetime.datetime.fromisoformat(time[:-1])
        seconds = (timestamp - previous_time).total_seconds()
        if seconds > 0:
            dist = abs(distance(*previous_position, lat, long)) * 1000
            total_distance += dist
            total_time += seconds
        previous_time = timestamp
        previous_position = (lat, long)
    return total_distance / total_time

In [21]:
tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
tracks_speed = {}
for track_id in tracks['TrackId'].unique():
    track = tracks[tracks['TrackId'] == track_id]
    speed = calculate_average_speed_of_track(track)
    tracks_speed[track_id] = speed * 3.6
    print(track_id, '->', speed * 3.6, 'km/h')

1 -> 7.116623587308064 km/h
3 -> 23.126448391942652 km/h
4 -> 13.695407924105305 km/h
5 -> 14.871617291497595 km/h
6 -> 14.471340208418587 km/h
7 -> 16.346108370386162 km/h
8 -> 22.46110292777329 km/h
10 -> 51.63218769316401 km/h
11 -> 25.019532962404657 km/h


In [22]:
stops = pd.read_csv('../data/processed/assignment1/line_stops.csv')


def get_closest_stop_in_direction(position, line_direction_stops):
    distances = line_direction_stops.apply(
        lambda row: distance(row['stop_lat'], row['stop_lon'], position['lat'], position['lon']), axis=1)
    min_distance = distances.min()
    return line_direction_stops[distances == min_distance].iloc[0]


def get_closest_stops_in_line(position, line_stops):
    return (get_closest_stop_in_direction(position, line_stops[line_stops['direction'] == 1]),
            get_closest_stop_in_direction(position, line_stops[line_stops['direction'] == 2]))


In [23]:
def select_line_by_speed(track, track_speed, possible_lines, speeds):
    selected_line = None
    min_dif = 99999
    for _, line_id in possible_lines:
        first_stop, last_stop = get_first_and_last_stops(line_id, track)
        median_speed = get_speed_of_line_between_stops(first_stop, last_stop, line_id, speeds)
        if abs(median_speed - track_speed) < min_dif:
            min_dif = abs(median_speed - track_speed)
            selected_line = line_id
    return selected_line


def get_speed_of_line_between_stops(first_stop, last_stop, line_id, speeds):
    line_speeds = speeds[speeds['LineId'] == int(line_id[:-1])]
    total_speed = []
    current_stop = int(first_stop['stop_id'].strip('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'))
    while current_stop != int(last_stop['stop_id'].strip('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')):
        row = line_speeds[line_speeds['FromStop'] == current_stop].iloc[0]
        total_speed.append(row['speed9'])
        current_stop = row['ToStop']
    median_speed = median(total_speed)
    return median_speed


def get_first_and_last_stops(line_id, track):
    line_stops = stops[stops['lineId'] == line_id]
    first_stops = get_closest_stops_in_line(track.iloc[0], line_stops)
    last_stops = get_closest_stops_in_line(track.iloc[-1], line_stops)
    if first_stops[0]['order'] < last_stops[0]['order']:
        first_stop = first_stops[0]
        last_stop = last_stops[0]
    else:
        first_stop = first_stops[1]
        last_stop = last_stops[1]
    return first_stop, last_stop

In [24]:
def get_type_of_line(line):
    return {'m': 'Metro', 't': 'Tram', 'b': 'Bus'}[line[-1]]


speeds = pd.read_csv('../data/processed/assignment1/vehicleSpeed.csv')

for track_id in tracks['TrackId'].unique():
    track = tracks[tracks['TrackId'] == track_id]
    possible_lines = track_predictions[track_id]
    if len(possible_lines) == 0:
        print(f'{track_id} ==> Other')
    elif len(possible_lines) == 1:
        print(f'{track_id} ==> {get_type_of_line(possible_lines[0][1])} ({possible_lines[0][1]})')
    elif len(possible_lines) > 1 and len({l[1][-1] for l in possible_lines}) == 1:
        selected_line = select_line_by_speed(track, tracks_speed[track_id], possible_lines, speeds)
        print(
            f'{track_id} ==> {get_type_of_line(possible_lines[0][1])} ({" || ".join([l[1] for l in possible_lines])}) [ ==> {get_type_of_line(selected_line)} ({selected_line})]')
    else:
        possible_types = {get_type_of_line(l[1]) for l in possible_lines}
        selected_line = select_line_by_speed(track, tracks_speed[track_id], possible_lines, speeds)
        print(f'{track_id} ==> {" or ".join(possible_types)} ({" || ".join([l[1] for l in possible_lines])}) ==> {get_type_of_line(selected_line)} ({selected_line})')

1 ==> Other
3 ==> Bus or Tram (049b || 050b || 082t) ==> Tram (082t)
4 ==> Bus (050b)


IndexError: single positional indexer is out-of-bounds