# Assignment 4

## Convert Line shapes to EPSG:4326

In [2]:
import json

import shapefile
from pyproj import Proj, transform
from tqdm.notebook import tqdm

In [3]:
# already run if you have shapes_lat_long.json
lines = shapefile.Reader('../data/raw/shapefiles/ACTU_LINES')
lambert = Proj('EPSG:31370')
latlong = Proj('EPSG:4326')
shapes = {f'{line.record["LIGNE"]}-{line.record["VARIANTE"]}': [transform(lambert, latlong, x, y) for x, y in
                                                                line.shape.points] for line in
          tqdm(lines.shapeRecords())}
with open('../data/raw/shapes_lat_long.json', 'w', encoding='utf8') as json_file:
    json.dump(shapes, json_file)

  0%|          | 0/174 [00:00<?, ?it/s]

  shapes = {f'{line.record["LIGNE"]}-{line.record["VARIANTE"]}': [transform(lambert, latlong, x, y) for x, y in


KeyboardInterrupt: 

## Create plots for each track and line

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from tqdm.notebook import tqdm

In [8]:
tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
for track_id in tqdm(tracks['TrackId'].unique()):
    track = tracks[tracks['TrackId'] == track_id].sort_values(by='time')
    if not os.path.exists(f'../data/line_plots/Track{track_id}'):
        os.mkdir(f'../data/line_plots/Track{track_id}')
    for index, (line_id, line) in tqdm(enumerate(shapes.items())):
        plt.scatter(x=track['lon'], y=track['lat'], label=f'Track {track_id}')
        plt.scatter(x=[lon for _, lon in line], y=[lat for lat, _ in line], label=line_id)
        plt.title(line_id)
        plt.savefig(f'../data/line_plots/Track{track_id}/{line_id}.png')
        plt.close()

  0%|          | 0/9 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

### Track ⟶ Line:

 - Track 1: Other
 - Track 3: Bus 50 or Tram 82 or Bus 49
 - Track 4: Bus 50
 - Track 5: Tram 82 or Tram 97
 - Track 6: Tram 8
 - Track 7: Tram 7
 - Track 8: Other
 - Track 10: Tram 8 or Tram 93
 - Track 11: Tram 25

## Create CSV of labeled points

In [9]:
import json
from scripts.helpers import write_csv

In [None]:
# creating the dataset.

In [11]:
with open('../data/raw/shapes_lat_long.json', 'r', encoding='utf8') as json_file:
    shapes = json.load(json_file)
with write_csv('../data/processed/assignment4/labeled_line_points.csv') as csv_out:
    csv_out.writerow(['Lat', 'Long', 'LineId'])
    for line_id, points in shapes.items():
        if line_id[:-2] in ['032t']:  # Remove lines that are not active anymore
            continue
        for lat, long in points:
            csv_out.writerow([lat, long, line_id[:-2]])

## Create Classification Model

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from scripts.helpers import read_csv_list
import pandas as pd
from typing import Tuple, List

In [15]:
def get_datasets() -> Tuple[List[List[float]], List[str], pd.DataFrame]:
    csv_lines = read_csv_list('../data/processed/assignment4/labeled_line_points.csv')
    data_set = [[float(point[0]), float(point[1])] for point in csv_lines[1:]]
    data_labels = [point[-1] for point in csv_lines[1:]]
    tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
    return data_set, data_labels, tracks

### Create "Ensemble" with KNN using average probability

In [18]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=2)
model.fit(data_set, data_labels)
print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    final_predictions = [(sum(predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    sorted_predictions = sorted(final_predictions, key=lambda x: x[0], reverse=True)

    print(track_id, '->', sorted_predictions[:4])

Training model
Model trained
1 -> [(30.337078651685392, '046b'), (14.044943820224718, '088b'), (12.359550561797754, '218b'), (11.797752808988765, '051t')]
3 -> [(34.34782608695652, '082t'), (23.043478260869566, '049b'), (22.17391304347826, '050b'), (11.304347826086957, '048b')]
4 -> [(33.81742738589212, '050b'), (13.278008298755188, '074b'), (13.278008298755188, '212b'), (9.95850622406639, '082t')]
5 -> [(33.944954128440365, '212b'), (26.146788990825687, '082t'), (24.770642201834864, '097t'), (11.46788990825688, '050b')]
6 -> [(41.15523465703971, '008t'), (28.15884476534296, '093t'), (8.12274368231047, '007t'), (3.7906137184115525, '071b')]
7 -> [(42.30769230769231, '007t'), (41.282051282051285, '025t'), (10.256410256410257, '209b'), (3.8461538461538463, '008t')]
8 -> [(12.738853503184714, '007t'), (12.420382165605096, '051t'), (12.261146496815286, '038b'), (12.101910828025478, '008t')]
10 -> [(45.96774193548387, '008t'), (45.96774193548387, '093t'), (4.032258064516129, '060b'), (4.032



### Create "Ensemble" with KNN using "existence"

In [82]:
data_set, data_labels, tracks = get_datasets()
print('Training model')
model = KNeighborsClassifier(n_neighbors=25)
model.fit(data_set, data_labels)

track_predictions = {}

print('Model trained')
for track_id in tracks['TrackId'].unique():
    track_points = tracks[tracks['TrackId'] == track_id][['lat', 'lon']]
    predictions = model.predict_proba(track_points)
    converted_predictions = [
        [1 if probability > 0 else 0 for probability in prediction]
        for prediction in predictions
    ]
    final_predictions = [(sum(converted_predictions[i][x]
                              for i in range(len(predictions))) * 100 / len(predictions), model.classes_[x])
                         for x in range(len(predictions[0]))]
    filtered_predictions = [
        (percentage, line) for percentage, line in final_predictions if percentage > 75
    ]

    filtered_predictions = sorted(filtered_predictions, key=lambda x: x[0], reverse=True)

    if len(filtered_predictions):
        filtered_predictions = [
            (prob, line) for prob, line in filtered_predictions if filtered_predictions[0][0] - prob < 5
        ]
    track_predictions[track_id] = filtered_predictions[:4]
    print(track_id, '->', filtered_predictions[:4])

Training model
Model trained
1 -> []
3 -> [(100.0, '049b'), (100.0, '050b'), (99.1304347826087, '082t')]
4 -> [(100.0, '050b')]
5 -> [(100.0, '212b'), (98.62385321100918, '097t'), (98.1651376146789, '082t')]
6 -> [(97.47292418772564, '008t')]
7 -> [(100.0, '007t')]
8 -> []
10 -> [(93.54838709677419, '008t'), (93.54838709677419, '093t')]
11 -> [(100.0, '025t')]




In [72]:
import datetime
from scripts.helpers import distance
import pandas as pd

In [73]:
def calculate_average_speed_of_track(track: pd.DataFrame) -> float:
    total_time = 0.0
    total_speed = 0.0
    previous_time = datetime.datetime.fromisoformat(track.iloc[0]['time'][:-1])
    previous_position = (track.iloc[0]['lat'], track.iloc[0]['lon'])
    for _, _, lat, long, time in track.sort_values(by='time').itertuples():
        timestamp = datetime.datetime.fromisoformat(time[:-1])
        seconds = (timestamp - previous_time).total_seconds()
        if seconds > 0:
            speed = abs(distance(*previous_position, lat, long)) * 1000 / seconds
            total_speed += speed
            total_time += seconds
        previous_time = timestamp
        previous_position = (lat, long)
    return total_speed / total_time

In [86]:
tracks = pd.read_csv('../data/raw/GPSTracksAssignment4/GPStracks.csv')
tracks_speed = {}
for track_id in tracks['TrackId'].unique():
    track = tracks[tracks['TrackId'] == track_id]
    speed = calculate_average_speed_of_track(track)
    tracks_speed[track_id] = speed
    print(track_id, '->', speed, 'm/s')

1 -> 0.7291839252917509 m/s
3 -> 3.633952140433144 m/s
4 -> 1.732230824629012 m/s
5 -> 2.06724017703918 m/s
6 -> 1.9623354220439553 m/s
7 -> 3.2759623317901125 m/s
8 -> 3.694685611022721 m/s
10 -> 10.221985679245405 m/s
11 -> 4.6304892012317085 m/s


# Find the right line

In [475]:
import math
import numpy as np

In [474]:
tracks_to_be_done = []

for k, v in track_predictions.items():
    if(len(v) != 0 and len(v) != 1):
        tracks_to_be_done.append(k)
        
tracks_to_be_done

[3, 5, 10]

In [358]:
stop_speed = pd.read_csv('../data/processed/assignment1/average_time_between_stops_filtered.csv')
line_stops = pd.read_csv('../data/processed/assignment1/line_stops.csv')
line_stops = line_stops[['lineId', 'stop_id_int', 'lat', 'long', 'order']]

tracks['hour'] = pd.to_datetime(tracks.time).dt.hour
first_last_tracks = tracks[['TrackId','lat', 'lon', 'hour']].groupby(['TrackId']).nth([0,-1])
first_last_tracks.reset_index(inplace=True)

In [488]:
# def find_the_first_and_last_stop()

closest_stops = {}

# from tracks_to_be_done get the stops for which we should find the closest stop
for TrackId, v in track_predictions.items():
    if (TrackId in tracks_to_be_done): # use only trackids that need to be solved
        
        # get the first and last snapshot:
        first_snapshot = first_last_tracks[first_last_tracks['TrackId'] == TrackId].iloc[0]
        last_snapshot = first_last_tracks[first_last_tracks['TrackId'] == TrackId].iloc[1]
    
        
        list_of_potential_lines = [element[-1] for element in v] # get the lines for that track
        
        line_dic = []
        
        for line in list_of_potential_lines: #line=='049b'
            if(int(line[:-1]) <100): # if its not a night bus

                # now find the stop with the smallest distance
                current_line = line_stops[line_stops.lineId == line] # '049b'stop_id ...
                x2 = current_line['lat']
                y2 = current_line['long']
                
                x1 = first_last_tracks.iloc[0]['lat']
                y1 = first_last_tracks.iloc[0]['lon']
                first = np.sqrt(np.array(((x1 - x2) ** 2 + (y1-y2) ** 2)))
                first_argmin = first.argmin()
                
#                 print(np.sort(first))
                
                x1 = first_last_tracks.iloc[1]['lat']
                y1 = first_last_tracks.iloc[1]['lon']
                second = np.sqrt(np.array(((x1 - x2) ** 2 + (y1-y2) ** 2)))
                second_argmin = second.argmin()
                
                hour = first_last_tracks[first_last_tracks['TrackId'] == TrackId].hour
                line_dic.append({'first': current_line.iloc[first_argmin,:].values.tolist(),
                                  'second': current_line.iloc[second_argmin,:].values.tolist(),
                                    'hour': hour.values.tolist()})

        closest_stops[TrackId] = line_dic
#         break
#     break
        
    # for each stop get all the stops and find the distance to the closest stop
        # then return those stops and calculate the average speed

[0.01644577 0.01660387 0.01875681 0.01907986 0.01907986 0.01928761
 0.01952714 0.02230231 0.02280039 0.02311928 0.02370881 0.02548447
 0.02569878 0.02576971 0.02745115 0.02980606 0.03010199 0.03090258
 0.03113115 0.03242177 0.03355453 0.03437683 0.03489133 0.03638308
 0.03669681 0.03686756 0.03696471 0.03736952 0.03752144 0.03823208
 0.03944844 0.0400432  0.04022684 0.04024963 0.0403693  0.04089335
 0.04196395 0.04297749 0.04308739 0.04334963 0.04398187 0.04431006
 0.04462683 0.04487004 0.04549244 0.04589435 0.04621247 0.04637487
 0.04697506 0.04721127 0.0472664  0.04732452 0.04755743 0.04806814
 0.04886952 0.04942519 0.04950206 0.05013908]
[0.01644577 0.01660387 0.01928761 0.01952714 0.02311928 0.02370881
 0.02569878 0.02745115 0.02980606 0.03010199 0.03242177 0.03355453
 0.03669681 0.03761395 0.04185757 0.04287635 0.04718849 0.04770469
 0.04896809 0.0497319  0.05420348 0.05457491 0.05687314 0.05693524
 0.05718735 0.05749793 0.061715   0.06202167 0.06485724 0.06530189
 0.06836001 0.06

{}

In [472]:
fourtynine = stop_speed[stop_speed['LineId'] == 82]
fourtynine[fourtynine['FromStop']==6608]

Unnamed: 0,LineId,FromStop,ToStop,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,Day
785,82,6608,3228,81.34,0.0,0.0,0.0,0.0,89.93,90.16,...,96.71,117.85,104.8,108.68,90.31,94.68,82.52,85.4,89.62,


In [465]:
# then return those stops and calculate the average speed
for k,v in closest_stops.items():
    for line in v:
        int_line = int(line['first'][0][:-1])
        first_stop = line['first'][1]
        second_stop = line['second'][1]
        first_order = line['first'][-1]
        second_order = line['second'][-1]
        first_hour = line['hour'][0]
        second_hour = line['hour'][1]
        
        print(int_line,first_stop, second_stop, first_order, second_order)
        
#         get_sequence_between_stops
#         break
#     break

49 2500 2531 29 1
50 2500 2531 25 1
82 6608 2401 24 23
97 6361 6361 29 29
82 6608 2401 24 23
8 6361 6361 32 32
93 6353 6311 9 23


In [442]:
line_stops[line_stops['lineId'] == '049b']

Unnamed: 0,lineId,stop_id_int,lat,long,order
1599,049b,2531,50.836335,4.337833,1
1600,049b,2539,50.834096,4.335818,2
1601,049b,2544,50.831389,4.332229,3
1602,049b,2548,50.828952,4.329179,4
1603,049b,2553,50.827135,4.327179,5
1604,049b,2546,50.824159,4.325394,6
1605,049b,2551,50.822171,4.321734,7
1606,049b,2556,50.823497,4.316113,8
1607,049b,3681,50.823973,4.310747,9
1608,049b,2550,50.827044,4.305009,10
