# Assignment 4

## Convert Line shapes to EPSG:4326

In [1]:
import json

import shapefile
from pyproj import Proj, transform
from tqdm.notebook import tqdm

In [None]:
lines = shapefile.Reader('../data/raw/shapefiles/ACTU_LINES')
lambert = Proj('EPSG:31370')
latlong = Proj('EPSG:4326')
shapes = {f'{line.record["LIGNE"]}-{line.record["VARIANTE"]}': [transform(lambert, latlong, x, y) for x, y in
                                                                line.shape.points] for line in
          tqdm(lines.shapeRecords())}
with open('../data/raw/shapes_lat_long.json', 'w', encoding='utf8') as json_file:
    json.dump(shapes, json_file)

## Create plots for each track and line

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from tqdm.notebook import tqdm

In [None]:
tracks = pd.read_csv('../data/raw/GPStracks.csv')
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
for track_id in tqdm(tracks['TrackId'].unique()):
    track = tracks[tracks['TrackId'] == track_id].sort_values(by='time')
    if not os.path.exists(f'../data/line_plots/Track{track_id}'):
        os.mkdir(f'../data/line_plots/Track{track_id}')
    for index, (line_id, line) in tqdm(enumerate(shapes.items())):
        plt.scatter(x=track['lon'], y=track['lat'], label=f'Track {track_id}')
        plt.scatter(x=[lon for _, lon in line], y=[lat for lat, _ in line], label=line_id)
        plt.title(line_id)
        plt.savefig(f'../data/line_plots/Track{track_id}/{line_id}.png')
        plt.close()

### Track ⟶ Line:

 - Track 1: Other
 - Track 3: Bus 50 or Tram 82 or Bus 49
 - Track 4: Bus 50
 - Track 5: Tram 82 or Tram 97
 - Track 6: Tram 8
 - Track 7: Tram 7
 - Track 8: Other
 - Track 10: Tram 8 or Tram 93
 - Track 11: Tram 25

## Create CSV of labeled points

In [16]:
import json
from scripts.helpers import write_csv

In [20]:
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
with write_csv('../data/labeled_line_points.csv') as csv_out:
    csv_out.writerow(['Lat', 'Long', 'LineId'])
    for line_id, points in shapes.items():
        if line_id[:-2] in ['032t']:  # Remove lines that are not active anymore
            continue
        for lat, long in points:
            csv_out.writerow([lat, long, line_id[:-2]])

## Create Classification Model

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from scripts.helpers import read_csv_list
import pandas as pd
from typing import Tuple, List

In [51]:
def get_datasets() -> Tuple[List[List[float]], List[str], pd.DataFrame]:
    csv_lines = read_csv_list('../data/labeled_line_points.csv')
    data_set = [[float(point[0]), float(point[1])] for point in csv_lines[1:]]
    data_labels = [point[-1] for point in csv_lines[1:]]
    tracks = pd.read_csv('../data/raw/GPStracks.csv')
    return data_set, data_labels, tracks

### Create "Ensemble" with KNN using average probability

In [52]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=2)
model.fit(data_set, data_labels)
print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    final_predictions = [(sum(predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    sorted_predictions = sorted(final_predictions, key=lambda x: x[0], reverse=True)

    print(track_id, '->', sorted_predictions[:4])

Training model
Model trained
1 -> [(31.179775280898877, '046b'), (14.044943820224718, '088b'), (11.797752808988765, '051t'), (11.797752808988765, '218b')]
3 -> [(34.34782608695652, '082t'), (25.652173913043477, '049b'), (19.565217391304348, '050b'), (11.304347826086957, '048b')]
4 -> [(37.75933609958506, '050b'), (12.655601659751037, '212b'), (10.37344398340249, '074b'), (9.95850622406639, '052b')]
5 -> [(33.944954128440365, '212b'), (26.605504587155963, '082t'), (24.31192660550459, '097t'), (11.46788990825688, '050b')]
6 -> [(41.335740072202164, '008t'), (27.978339350180505, '093t'), (8.12274368231047, '007t'), (3.7906137184115525, '211b')]
7 -> [(44.87179487179487, '007t'), (38.717948717948715, '025t'), (10.512820512820513, '209b'), (3.8461538461538463, '008t')]
8 -> [(12.420382165605096, '007t'), (12.261146496815286, '038b'), (11.94267515923567, '008t'), (10.987261146496815, '004t')]
10 -> [(45.96774193548387, '008t'), (45.96774193548387, '093t'), (4.032258064516129, '038b'), (2.419



### Create "Ensemble" with KNN using "existence"

In [75]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=25)
model.fit(data_set, data_labels)
print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    converted_predictions = [
        [1 if probability > 0 else 0 for probability in prediction]
        for prediction in predictions
    ]
    final_predictions = [(sum(converted_predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    filtered_predictions = [
        (percentage, line) for percentage, line in final_predictions if percentage > 75
    ]
    if len(filtered_predictions):
        filtered_predictions = [
            (prob, line) for prob, line in filtered_predictions if filtered_predictions[0][0] - prob < 5
        ]
    sorted_predictions = sorted(filtered_predictions, key=lambda x: x[0], reverse=True)

    print(track_id, '->', sorted_predictions[:4])

Training model
Model trained
1 -> []
3 -> [(100.0, '049b'), (100.0, '050b'), (99.1304347826087, '082t')]
4 -> [(100.0, '050b')]
5 -> [(100.0, '212b'), (98.62385321100918, '097t'), (98.1651376146789, '082t')]
6 -> [(97.47292418772564, '008t')]
7 -> [(100.0, '007t')]
8 -> []
10 -> [(93.54838709677419, '008t'), (93.54838709677419, '093t')]
11 -> [(100.0, '025t'), (85.16746411483254, '007t')]




In [77]:
import datetime
from scripts.helpers import distance
import pandas as pd

In [95]:
def calculate_average_speed_of_track(track: pd.DataFrame) -> float:
    total_time = 0.0
    total_speed = 0.0
    previous_time = datetime.datetime.fromisoformat(track.iloc[0]['time'][:-1])
    previous_position = (track.iloc[0]['lat'], track.iloc[0]['lon'])
    for _, _, lat, long, time in track.sort_values(by='time').itertuples():
        timestamp = datetime.datetime.fromisoformat(time[:-1])
        seconds = (timestamp - previous_time).total_seconds()
        if seconds > 0:
            speed = abs(distance(*previous_position, lat, long)) * 1000 / seconds
            total_speed += speed
            total_time += seconds
        previous_time = timestamp
        previous_position = (lat, long)
    return total_speed / total_time

In [96]:
tracks = pd.read_csv('../data/raw/GPStracks.csv')
for track_id in tracks['TrackId'].unique():
    track = tracks[tracks['TrackId'] == track_id]
    speed = calculate_average_speed_of_track(track)
    print(track_id, '->', speed, 'm/s')

1 -> 0.7291839252917509 m/s
3 -> 3.633952140433144 m/s
4 -> 1.732230824629012 m/s
5 -> 2.06724017703918 m/s
6 -> 1.9623354220439553 m/s
7 -> 3.2759623317901125 m/s
8 -> 3.694685611022721 m/s
10 -> 10.221985679245405 m/s
11 -> 4.6304892012317085 m/s
