In [None]:
import glob
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import plotly.express as px
from dtaidistance import dtw_ndim
from scipy.cluster.hierarchy import linkage, fcluster
from simplification.cutil import simplify_coords
from tqdm import tqdm

In [None]:
from env import clean_and_filtered_data_path, grouped_data_path, distance_matrix_dir

# Erstelle einen vollständig gefilterten Datensatz.

Lade den durch `clean_and_filter.py` gefilterten Datensatz, um ihn nach Streckenähnlichkeit zu gruppieren und zu filtern. Hiermit werden alle Fahrten entfernt, die zu stark von der Mehrheit abweichen. Zurück bleibt ein Datensatz mit nach Strecke gruppierten Fahrten, der wenig Rauschen durch Falschinformationen enthält.

In [None]:
# Only these columns are read from the cleaned CSV; the timestamp column is
# parsed into datetimes while loading.
needed_columns = [
    'vehicle_id',
    'status',
    'latitude',
    'longitude',
    'last_modified',
    'pdop',
    'line',
    'ziel',
    'ziel_short',
    'ride_id',
]
everything = pd.read_csv(
    clean_and_filtered_data_path,
    parse_dates=['last_modified'],
    usecols=needed_columns,
)

In [None]:
# Number of position records that were loaded.
print("Row Count: ", len(everything))

Lade die von `create_distance_matrix.py` erstellten Distanzmatrix-Dateien, um die Fahrten zu clustern. Die Distanzmatrix-Dateien enthalten eine durch den DTW-Algorithmus (Dynamic Time Warping) ermittelte Ähnlichkeit aller Fahrten, die denselben Start- und Endpunkt angesteuert haben.

In [None]:
# glob returns paths in arbitrary (filesystem-dependent) order. The position
# of a matrix in this list feeds into the cluster ids assigned further down,
# so the paths are sorted to make every run produce the same numbering.
matrix_paths = sorted(glob.glob(f"{distance_matrix_dir}/distance_matrix-*"))

# One DataFrame per file; the first CSV column becomes the index.
all_distance_matrix = [pd.read_csv(path, index_col=0) for path in matrix_paths]


In [None]:
# How many distance-matrix files were read.
matrix_count = len(all_distance_matrix)
print("Loaded Matrix Data: ", matrix_count)

Erstelle Gruppierungen durch das Clustern der Distanzmatrizen. Dadurch werden die Fahrten nach ihrer Ähnlichkeit gruppiert.

In [None]:
# cluster_ds:        one ride_id -> cluster frame per distance matrix
# filtered_clusters: id of the largest cluster of each distance matrix
cluster_ds = []
filtered_clusters = []

distance_threshold = 0.003

# Running offset that keeps cluster ids unique across matrices. A fixed stride
# (previously i * 1000) collides as soon as one matrix yields 1000 or more
# clusters; advancing by the number of labels actually used cannot collide.
cluster_id_offset = 0

for pairwise_distances in all_distance_matrix:
    matrix_ride_ids = pairwise_distances.index.to_numpy()

    if len(matrix_ride_ids) == 0:
        # Nothing to cluster in an empty matrix.
        continue

    if len(matrix_ride_ids) == 1:
        # linkage() cannot run on a single observation (its condensed matrix
        # is empty); a lone ride simply forms its own cluster.
        local_labels = np.ones(1, dtype=int)
    else:
        # Condensed form = strict upper triangle in row-major order. Only the
        # upper triangle is read, so the lower half of the file is irrelevant.
        square = pairwise_distances.values
        condensed_distance_matrix = square[np.triu_indices_from(square, k=1)]

        # cluster dataset by distance matrix
        linkage_matrix = linkage(condensed_distance_matrix, method='average')
        local_labels = fcluster(linkage_matrix, distance_threshold, criterion='distance')

    # translate to unique namespace (fcluster labels start at 1)
    clusters = local_labels + cluster_id_offset
    cluster_id_offset += int(local_labels.max())

    clustered_datasets = pd.DataFrame({'ride_id': matrix_ride_ids, 'cluster': clusters})
    cluster_ds.append(clustered_datasets)

    # get the biggest cluster and save it for later filtering
    # (value_counts is sorted descending, idxmax returns the first maximum)
    max_cluster = clustered_datasets['cluster'].value_counts().idxmax()
    filtered_clusters.append(max_cluster)

In [None]:
# Stack the per-matrix ride -> cluster tables into one lookup table.
ride_id_to_cluster_df = pd.concat(cluster_ds, axis=0)

In [None]:
# Count distinct cluster ids over all matrices.
unique_cluster_count = ride_id_to_cluster_df['cluster'].nunique()
print("Amount of unique clusters:", unique_cluster_count)

In [None]:
# Attach the cluster id to every position record; the left join keeps rows
# whose ride has no cluster entry (their cluster is NaN).
all_data_with_cluster = pd.merge(
    everything,
    ride_id_to_cluster_df,
    how='left',
    on="ride_id",
)

In [None]:
# Preview the first rows of the joined data.
all_data_with_cluster.head(n=5)

Nun erstellen wir vereinfachte Versionen der Fahrten, um sie besser mit der gleich ausgewählten Basisroute zu vergleichen.

In [None]:
def simpli(x, epsilon):
    """Simplify a coordinate sequence and return it as a DataFrame.

    Parameters:
        x: coordinate array handed to ``simplify_coords`` (callers in this
            notebook pass latitude/longitude pairs).
        epsilon: tolerance value forwarded unchanged to ``simplify_coords``.

    Returns:
        pd.DataFrame built from the simplified coordinates; the columns keep
        the default integer labels.
    """
    return pd.DataFrame(simplify_coords(x, epsilon))

In [None]:
# Simplify the track of every ride separately. The result is indexed by
# ride_id plus the row position inside the simplified track.
simplified_coordinates = (
    all_data_with_cluster
    .groupby('ride_id')
    .apply(
        lambda ride: simpli(ride[['latitude', 'longitude']].to_numpy(), 0.001),
        include_groups=False,
    )
)

In [None]:
# Preview the simplified tracks.
simplified_coordinates.head(n=5)

In [None]:
# Every distinct ride id, in order of first appearance.
ride_ids = all_data_with_cluster.loc[:, 'ride_id'].unique()

In [None]:
# Turn the (ride_id, position) index back into ordinary columns ...
anything_simple = pd.DataFrame(simplified_coordinates).reset_index()

# ... and attach the cluster id of each ride to its simplified points.
simple_with_cluster = pd.merge(
    anything_simple,
    ride_id_to_cluster_df,
    how='left',
    on="ride_id",
)

Zufällige Auswahl der Basisroute als Referenzstrecke für alle Gruppenmitglieder.

In [None]:
# Fixed seed so the same base routes are drawn on every run; without it the
# grouping written at the end of the notebook changes from run to run.
BASE_ROUTE_SEED = 42

# One row per (ride, cluster) pair, then one randomly drawn ride per cluster.
# Rides without a cluster (NaN) are dropped by groupby.
ride_cluster_pairs = simple_with_cluster[['ride_id', 'cluster']].drop_duplicates()
random_sample = (
    ride_cluster_pairs
    .groupby('cluster')
    .sample(n=1, random_state=BASE_ROUTE_SEED)
    .set_index('cluster')
)

# Keep only the representatives of the largest cluster of each distance matrix.
is_largest_cluster = random_sample.index.isin(filtered_clusters)
chosen_routes = random_sample.loc[is_largest_cluster, 'ride_id'].to_numpy()

In [None]:
# Show the ride ids that serve as base routes.
chosen_routes

Darstellen aller Basisrouten

In [None]:
# Position records that belong to one of the base routes.
is_base_route = all_data_with_cluster['ride_id'].isin(chosen_routes)
filtered = all_data_with_cluster[is_base_route]

In [None]:
# Draw every base route on an OpenStreetMap layer, one colour per cluster,
# around a fixed map centre.
map_center = dict(lat=54.781638, lon=9.432534)
fig = px.line_map(
    filtered,
    lat='latitude',
    lon='longitude',
    color='cluster',
    center=map_center,
    zoom=12,
    map_style="open-street-map",
)
fig.show()

Erstellen einer Distanzmatrix von jeder Fahrt zu allen Basisfahrten. Hiermit werden die Fahrten den jeweiligen Gruppen zugeordnet.

In [None]:
# Rides (rows) x base routes (columns), initially all NaN. An explicit float
# dtype keeps the matrix numeric; a frame created without data otherwise gets
# object dtype, which numeric row reductions do not handle reliably.
distance_matrix = pd.DataFrame(index=ride_ids, columns=chosen_routes, dtype=float)

In [None]:
sc = pd.DataFrame(simplified_coordinates).reset_index()

# Slice the simplified coordinates once per ride. Filtering `sc` with a boolean
# mask inside the worker would rescan the whole frame twice for every pair.
coords_by_ride = {
    ride_id: ride_points[[0, 1]].to_numpy()
    for ride_id, ride_points in sc.groupby('ride_id')
}
# Stand-in for ride ids that have no simplified points (same shape an empty
# selection would have produced).
_NO_POINTS = np.empty((0, 2))


def calculate_distance(args):
    """Compute the DTW distance between two rides.

    Parameters:
        args: tuple ``(id1, id2)`` of ride ids whose simplified coordinate
            sequences are compared.

    Returns:
        tuple ``(id1, id2, d)`` where ``d`` is the value returned by
        ``dtw_ndim.distance`` for the two coordinate sequences.
    """
    id1, id2 = args
    series1 = coords_by_ride.get(id1, _NO_POINTS)
    series2 = coords_by_ride.get(id2, _NO_POINTS)
    d = dtw_ndim.distance(series1, series2)
    return (id1, id2, d)


# Ride-major order: all base routes for the first ride, then the second, ...
args = [(id1, id2) for id1 in ride_ids for id2 in chosen_routes]

# NOTE(review): the workers must be able to see `calculate_distance` and
# `coords_by_ride` from the notebook namespace; this presumably relies on the
# fork start method - confirm on platforms that spawn worker processes.
with Pool(cpu_count()) as pool:
    # imap yields results in submission order; chunking cuts the per-task
    # dispatch overhead for the many small jobs.
    results = list(tqdm(pool.imap(calculate_distance, args, chunksize=64), total=len(args)))

# Results arrive in the same ride-major order as `args`, so they can be
# reshaped directly into a numeric rides x base-routes matrix instead of being
# written cell by cell into an object-dtype frame.
distance_values = np.array([d for _, _, d in results], dtype=float)
distance_matrix = pd.DataFrame(
    distance_values.reshape(len(ride_ids), len(chosen_routes)),
    index=ride_ids,
    columns=chosen_routes,
)

In [None]:
# (number of rides, number of base routes)
distance_matrix.shape

In [None]:
# If you want to save the distance_matrix
# distance_matrix.to_csv('ride_distance_to_routes.csv')

Finde pro Fahrt die ähnlichste Basisfahrt

In [None]:
# Work on an independent copy so the raw distance matrix stays untouched.
copy_d = distance_matrix.copy(deep=True)

In [None]:
# Reduce over the base-route distance columns only. Taking min() after the
# 'group' column has been added would include the ride ids stored there in the
# minimum. Dropping both helper columns first also makes the cell safe to
# re-run, and the float cast guarantees a numeric reduction.
route_distances = copy_d.drop(columns=['group', 'smallest_value'], errors='ignore').astype(float)

# Closest base route per ride, and the distance to it.
copy_d['group'] = route_distances.idxmin(axis=1)
copy_d['smallest_value'] = route_distances.min(axis=1)
copy_d

In [None]:
# Rides whose closest base route is nearer than the cut-off, mapped to that
# base route (index: ride id, value: base-route ride id).
within_threshold = copy_d.smallest_value < 0.008
filter_threshold = copy_d.loc[within_threshold, 'group']

# Position records of the rides that passed the cut-off.
ride_is_kept = all_data_with_cluster.ride_id.isin(filter_threshold.index)
filter_sc = all_data_with_cluster[ride_is_kept]

# Attach the assigned base route to every record of those rides.
merged_with_smallest = filter_sc.merge(
    filter_threshold,
    how='left',
    left_on='ride_id',
    right_index=True,
)
group_is_base_route = merged_with_smallest.group.isin(chosen_routes)
filter_2 = merged_with_smallest[group_is_base_route]

In [None]:
# Final data set: the inner join keeps only records of rides that were
# assigned to a base route, reduced to the columns that get exported.
export_columns = ['latitude', 'longitude', 'pdop', 'last_modified', 'ride_id', 'group']
filtered_entries = (
    all_data_with_cluster
    .merge(filter_threshold, left_on='ride_id', right_index=True, how='inner')
    [export_columns]
    .reset_index(drop=True)
)
filtered_entries

Speichere die gefilterte und gruppierte Liste.

In [None]:
# Write the grouped rides to disk without the positional index.
filtered_entries.to_csv(path_or_buf=grouped_data_path, index=False)