In [None]:
import sys
# add parent directory and its parent to sys.path so that python finds the modules
sys.path.append('..')
sys.path.append('../..')

from datetime import datetime

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from db_utils import get_rect_to_rect_data
from cluster_utils import cluster_by_max_projection_and_distance

In [None]:
# Use a white background for the figure canvas, the axes area and saved images.
plt.rcParams.update(
    dict.fromkeys(
        ("figure.facecolor", "axes.facecolor", "savefig.facecolor"),
        'white',
    )
)

In [None]:
def analyse_df_for_faulty_entries(df_simra, show_faulty_entries = False):
    """Print how many rows carry a distance but no usable velocity.

    A row counts as faulty when ``velo`` is zero or NaN while ``dist`` is
    different from 0.0. The number of faulty rows, the total number of rows
    and the share of faulty rows (in percent, rounded to two decimals) are
    printed. Nothing is returned; the frame is not modified, so this is only
    a diagnostic that can inform later filtering or preprocessing.

    Parameters
    ----------
    df_simra : pandas.DataFrame
        Frame providing at least the columns ``velo`` and ``dist``.
    show_faulty_entries : bool, default False
        If True, the faulty rows are additionally rendered with ``display``
        (the builtin that IPython/Jupyter provides in notebook sessions).
    """
    velocity_missing = (df_simra.velo == 0) | df_simra.velo.isna()
    distance_given = df_simra.dist != 0.0
    faulty_entries = df_simra[velocity_missing & distance_given]

    n_entries = len(df_simra)
    n_faulty_entries = len(faulty_entries)
    # An empty frame (e.g. no rides found for a direction) has no faulty rows;
    # report a share of 0 instead of dividing by zero.
    percentage_faulty = n_faulty_entries / n_entries * 100 if n_entries else 0.0

    print(f'Number of faulty rows (velocity is nan or zero even though distance is given): {n_faulty_entries}')
    print(f'Total rows: {n_entries}')
    print(f'Share of faulty rows: {round(percentage_faulty,2)}%.')

    if show_faulty_entries:
        display(faulty_entries)

In [None]:
def cluster_and_plot_for_intersection(start_end_coords, end_date_str = '2099-01-01 00:00:00', files_to_exclude = None, **kwargs):
    """Load and cluster the rides for every direction of an intersection.

    For each entry of ``start_end_coords`` the direction label and both boxes
    are printed, the matching data is fetched with ``get_rect_to_rect_data``
    and handed to ``cluster_by_max_projection_and_distance``.

    Parameters
    ----------
    start_end_coords : dict
        Maps a direction label (str) to a ``(start_coord, end_coord)`` pair.
        Both coordinates are passed through to ``get_rect_to_rect_data``
        unchanged.
    end_date_str : str, default '2099-01-01 00:00:00'
        Cut-off timestamp in the format ``%Y-%m-%d %H:%M:%S``; it is parsed
        and forwarded as ``end_date``.
    files_to_exclude : list or None, default None
        Forwarded as-is to ``get_rect_to_rect_data``.
    **kwargs
        ``analyse_for_faulty_entries``: when truthy, each fetched frame is
        first passed to ``analyse_df_for_faulty_entries``. Other keys are
        ignored.

    Raises
    ------
    ValueError
        If ``end_date_str`` does not match the expected format.
    """
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d %H:%M:%S')
    # Honour the flag's value: previously the mere presence of the key
    # triggered the analysis, so passing False still ran it.
    run_faulty_analysis = bool(kwargs.get('analyse_for_faulty_entries', False))

    for direction, (start_coord, end_coord) in start_end_coords.items():
        print('######## ' + direction + ' ########')
        print('Start:', start_coord)
        print('End:', end_coord)
        df_simra = get_rect_to_rect_data(start_coord, end_coord, end_date=end_date, files_to_exclude=files_to_exclude)
        if run_faulty_analysis:
            analyse_df_for_faulty_entries(df_simra)
        cluster_by_max_projection_and_distance(df_simra, direction = direction)
        print('\n')

In [None]:
# Cut-off timestamp ("use data until ..."), kept both as text and as a parsed datetime.
end_date_str = '2023-01-01 00:00:00'
end_date = datetime.strptime(
    end_date_str,
    '%Y-%m-%d %H:%M:%S',
)

In [None]:
# Files you want to exclude, e.g. because they use the pedestrian way.
# Each identifier is listed once (a duplicated 'VM2_421371629' was removed).
files_to_exclude = [
    'VM2_-2112701535',
    'VM2_-217686115',
    'VM2_-1247665811',
    'VM2_-104300786',
    'VM2_330973206',
    'VM2_1100569031',
    'VM2_421371629',
    'VM2_1476499235',
    'VM2_-1451152685',
    'VM2_-1523872256',
]

In [None]:
# Direction label -> (start box, end box).
# NOTE(review): each box looks like (lon_min, lat_min, lon_max, lat_max) - confirm against get_rect_to_rect_data.
start_end_coords = {
    'Alexanderstr. onto Alexanderstr. (south to west)': (
        (13.416169, 52.521508, 13.416513, 52.521832),  # start box
        (13.41591, 52.522275, 13.416421, 52.522569),   # end box
    ),
    'Karl-Marx-Allee onto Alexanderstr. (east to south)': (
        (13.41673, 52.5219, 13.417469, 52.522088),     # start box
        (13.416021, 52.521813, 13.416321, 52.52203),   # end box
        # (13.415876,52.521774,13.416321,52.52203) # larger endbox
    ),
}

In [None]:
# Pass the configured cut-off date and exclusion list explicitly; otherwise the
# function falls back to its defaults ('2099-01-01 00:00:00', no exclusions)
# and the configuration cells above have no effect.
cluster_and_plot_for_intersection(
    start_end_coords,
    end_date_str=end_date_str,
    files_to_exclude=files_to_exclude,
)