## Clustering graph

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Elbow analysis: fit KMeans for k = 1..20 on the stop coordinates and plot the
# sum of squared errors (KMeans.inertia_) against k.
# NOTE: `sorted_data_stops` is created by the data-loading cell further down
# (execution count 1), so that cell has to be run before this one.
cluster_counts = range(1, 21)
stop_coordinates = sorted_data_stops[['current_lat', 'current_lng']]

sse = []
for n_clusters in cluster_counts:
    # Fixed seed (the same value the main clustering cell uses) so the curve is
    # reproducible between runs instead of changing with every random init.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(stop_coordinates)
    # Record the sum of squared errors for this number of clusters
    sse.append(kmeans.inertia_)

# Number of clusters on the x-axis, SSE on the y-axis
plt.plot(cluster_counts, sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Plot')
plt.show()

# Artificial routes

In [None]:
# Precondition: the `routes` DataFrame must already exist in the kernel; it is not created in this cell.

import itertools
# Frame that receives one truncated route per row of `routes` (filled below).
artificial_completed_routes = pd.DataFrame(columns=['routes'])


def common_subsequence(planned, actual):
    """Return the leading elements on which ``planned`` and ``actual`` agree.

    Despite the name this is the common *prefix*: both sequences are walked in
    lockstep and the walk stops at the first position where they differ, or
    when the shorter sequence is exhausted.

    Args:
        planned: iterable of planned stop identifiers.
        actual: iterable of visited stop identifiers.

    Returns:
        list: the shared leading elements, taken from ``planned``.
    """
    shared_prefix = []
    for planned_stop, actual_stop in zip(planned, actual):
        if not (planned_stop == actual_stop):
            break
        shared_prefix.append(planned_stop)
    return shared_prefix

# Truncate every route to the leading stops that were visited exactly as planned.
artificial_completed_routes['routes'] = routes.apply(
    lambda row: common_subsequence(row['planned_route_craft'], row['actual_route_location']),
    axis=1,
)

# Keep a prefix only if it has more than one stop AND is not as long as the
# planned route of the SAME row. The previous check compared each prefix length
# with the number of rows in `routes`, so this second condition never looked at
# the row's own planned route.
prefix_lengths = artificial_completed_routes['routes'].apply(len)
planned_lengths = routes.loc[artificial_completed_routes.index, 'planned_route_craft'].apply(len)
keep_mask = (prefix_lengths > 1) & (prefix_lengths != planned_lengths)

# .copy(): the columns added below must go to an independent frame, not to a
# slice of artificial_completed_routes (avoids SettingWithCopyWarning).
filtered_results = artificial_completed_routes[keep_mask].copy()

filtered_results['driver_id'] = routes.loc[filtered_results.index, 'driver_id']
filtered_results['last_two_weeks_count'] = routes.loc[filtered_results.index, 'last_two_weeks_count']

# Cut each distance list to len(prefix) - 1 entries.
# NOTE(review): presumably 'distance_route' holds one entry per leg between
# consecutive stops, so k stops correspond to k - 1 distances; confirm.
# Built as an explicit object Series so an empty filter result is handled too.
filtered_results['distance_route'] = pd.Series(
    [
        routes.loc[row_label, 'distance_route'][:len(prefix) - 1]
        for row_label, prefix in filtered_results['routes'].items()
    ],
    index=filtered_results.index,
    dtype=object,
)

artificial_planned_routes = pd.DataFrame({
    'common_subsequence': filtered_results['routes'],
    'driver_id': filtered_results['driver_id'],
    'distance_route': filtered_results['distance_route'],
    'last_two_weeks_count': filtered_results['last_two_weeks_count'],
})

artificial_planned_routes

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.cluster import KMeans

# ---- Load the raw stop records ------------------------------------------------
data_stops = pd.read_csv('uni_molde_v2.csv', sep=',')

# Missing arrival / completion timestamps are replaced by the string sentinel "-1".
for timestamp_column in ('stop_completed_at', 'stop_arrived_at'):
    data_stops.loc[data_stops[timestamp_column].isna(), timestamp_column] = "-1"

# Remaining missing values per column
print(data_stops.isnull().sum())


# ---- Order stops by dispatch time and derive calendar features -----------------
sorted_data_stops = (
    data_stops
    .sort_values(by='stop_dispatched_at', ascending=True)
    .reset_index(drop=True)
)
# Parse the dispatch timestamp once and reuse it for both derived columns.
dispatched_at = pd.to_datetime(sorted_data_stops['stop_dispatched_at'])
sorted_data_stops['day_of_week'] = dispatched_at.dt.day_name()
sorted_data_stops['date'] = dispatched_at.dt.date

# ---- Location identifiers -------------------------------------------------------
# 'cluster': 1-based KMeans label of each stop's coordinates (1000 clusters).
locations_df = sorted_data_stops[['current_lat', 'current_lng']]
kmeans = KMeans(n_clusters=1000, random_state=42)
kmeans.fit(locations_df)
sorted_data_stops['cluster'] = kmeans.labels_ + 1

# 'location_id_craft': 1-based id per distinct (lat, lng) pair.
sorted_data_stops['location_id_craft'] = sorted_data_stops.groupby(['current_lat', 'current_lng']).ngroup()+1
# Dump the full table to a text file.
with open('output.txt', 'w') as f:
    print(sorted_data_stops.to_string(), file=f)
print('number of groups', sorted_data_stops['location_id_craft'].nunique())

# data_stops_day= sorted_data_stops[sorted_data_stops['day_of_week'] == "Wednesday"]

# ---- One row per driver workday, every stop attribute collected into a list -----
# Maps source column -> output column. A single mapping keeps the selected
# columns and the produced columns in sync; its order is the output column order.
route_list_columns = {
    'driver_id': 'driver_id',
    'location_type_id': 'location_type_id',
    'address_id': 'planned_route_location',
    'stop_dispatched_at': 'stop_dispatched_at',
    'stop_arrived_at': 'stop_arrived_at',
    'stop_earliest': 'stop_earliest',
    'stop_latest': 'stop_latest',
    'current_lat': 'current_lat',
    'current_lng': 'current_lng',
    'stop_completed_at': 'stop_completed_at',
    'cluster': 'planned_route_cluster',
    'location_id_craft': 'planned_route_craft',
    'day_of_week': 'day_of_week',
    'date': 'date',
}
grouped_df = (
    sorted_data_stops
    .groupby('driver_workday_id')[list(route_list_columns)]
    .apply(lambda workday_stops: pd.Series({
        output_name: workday_stops[source_name].tolist()
        for source_name, output_name in route_list_columns.items()
    }))
    .reset_index()
)

stop_completed_at        0
stop_arrived_at          0
stop_latest              0
stop_earliest            0
stop_dispatched_at       0
location_id              0
location_type_id         0
driver_workday_id        0
organization_id          0
address_id               0
location_is_depot        0
driver_id                0
contact_id               0
current_lat              0
current_lng              0
prev_planned_lat      7450
prev_planned_lng      7450
prev_actual_lat       7450
prev_actual_lng       7450
dtype: int64
number of groups 3110


In [None]:
import folium
import math

# Coordinates of the route stored under index label 0 of grouped_df, printed
# as a quick sanity check.
lat_seq, lon_seq = (
    grouped_df.loc[0, coordinate_column]
    for coordinate_column in ('current_lat', 'current_lng')
)

print(lat_seq, lon_seq)
def build_route(lat_seq, lon_seq):
    """Draw a stop sequence on a folium map and save it as ``route_map.html``.

    Every stop gets a numbered label (1-based). Consecutive stops are joined by
    a dashed blue line, and an arrow-icon marker is placed on the stop each line
    leads to.

    Note: a later cell defines another ``build_route`` with an extended
    signature; once that cell has run it replaces this definition.

    Args:
        lat_seq: indexable sequence of latitudes, one per stop.
        lon_seq: indexable sequence of longitudes, aligned with ``lat_seq``.

    Raises:
        IndexError: if ``lat_seq`` is empty or ``lon_seq`` is shorter than it.
    """
    # Centre the map on the first stop
    m = folium.Map(location=[lat_seq[0], lon_seq[0]], zoom_start=13)

    final_position = len(lat_seq) - 1
    for i, lat in enumerate(lat_seq):
        lon = lon_seq[i]

        # Add an index label
        folium.Marker([lat, lon], icon=folium.DivIcon(html=f'<div style="font-size: 13pt; border: 2px solid white; border-radius: 50%; padding: 2px; background-color: white">{i+1}</div>')).add_to(m)

        if i < final_position:
            next_lat, next_lon = lat_seq[i + 1], lon_seq[i + 1]

            # Add a line segment to the following stop
            folium.PolyLine([[lat, lon], [next_lat, next_lon]], color='blue', dash_array='5, 5').add_to(m)

            # Add an arrow marker at the end of the line segment
            folium.Marker([next_lat, next_lon], icon=folium.Icon(color='blue', icon='arrow-up', prefix='fa')).add_to(m)

    m.save('route_map.html')

In [3]:
import folium
def build_route(lat_seq, lon_seq, color, earliest, latest, arrived, location_type_id, file_name, marker_bool = False):
    """Draw a stop sequence on a folium map and save it to ``file_name``.

    Every stop gets a numbered label (1-based). Stops whose location type equals
    1 additionally get a red square marker. Consecutive stops are joined by a
    dashed line in ``color`` with an arrow-icon marker on the stop it leads to.

    Args:
        lat_seq: indexable sequence of latitudes, one per stop.
        lon_seq: indexable sequence of longitudes, aligned with ``lat_seq``.
        color: colour used for the connecting lines and arrow markers.
        earliest: per-stop lower bound compared against ``arrived``.
        latest: per-stop upper bound compared against ``arrived``.
        arrived: per-stop arrival value.
        location_type_id: per-stop location type; the value 1 is drawn as a depot.
        file_name: path the HTML map is written to.
        marker_bool: when falsy, all labels are white. Otherwise a label is
            green if ``earliest[i] <= arrived[i] <= latest[i]`` and red if not.

    Raises:
        IndexError: if ``lat_seq`` is empty or any per-stop sequence is shorter
            than ``lat_seq``.
    """
    # Centre the map on the first stop
    m = folium.Map(location=[lat_seq[0], lon_seq[0]], zoom_start=13)

    final_position = len(lat_seq) - 1
    for i in range(len(lat_seq)):
        lat, lon = lat_seq[i], lon_seq[i]
        earliest_date = earliest[i]
        latest_date = latest[i]
        arrived_date = arrived[i]
        location_type = location_type_id[i]

        # The i-th stop is shifted by i * 0.000001 degrees in both directions,
        # so stops sharing the same coordinates do not sit exactly on top of
        # each other.
        shifted_lat = lat + i * 0.000001
        shifted_lon = lon + i * 0.000001

        # Label colour: neutral, or based on whether the arrival lies in the window
        if not marker_bool:
            marker_color = 'white'
        elif earliest_date <= arrived_date <= latest_date:
            marker_color = 'green'
        else:
            marker_color = 'red'

        # Add an index label
        # NOTE(review): `text-color` is not a standard CSS property, so it
        # probably has no effect; the string is left exactly as it was.
        folium.Marker([shifted_lat, shifted_lon], icon=folium.DivIcon(html=f'<div style="font-size: 13pt; border: 1px solid white; border-radius: 50%; width: 1.2em; height: 1.2em; background-color: {marker_color}; border-radius: 50%;text-align: center;text-color: white;">{i+1}</div>')).add_to(m)

        # Add a red square marker for depots
        if location_type == 1:
            folium.Marker([shifted_lat, shifted_lon], icon=folium.Icon(color='red', icon='square', prefix='fa')).add_to(m)

        if i < final_position:
            next_lat, next_lon = lat_seq[i + 1], lon_seq[i + 1]

            # Line from the shifted current stop to the unshifted next stop
            folium.PolyLine([[shifted_lat, shifted_lon], [next_lat, next_lon]], color=color, dash_array='5, 5').add_to(m)

            # Add an arrow marker at the end of the line segment
            folium.Marker([next_lat, next_lon], icon=folium.Icon(color=color, icon='arrow-up', prefix='fa')).add_to(m)

    m.save(file_name)

In [4]:
def draw_planned_route(row_id):
    """Render one row of ``uncompleted_routes_df`` with ``build_route``.

    The per-stop columns of the row at position ``row_id`` are passed on in
    their stored order, drawn in blue with the default ``marker_bool``, and
    saved as ``planned_route_<row_id>.html``.

    Args:
        row_id: positional (``.iloc``) index of the route row.
    """
    column_order = (
        "current_lat",
        "current_lng",
        "stop_earliest",
        "stop_latest",
        "stop_arrived_at",
        "location_type_id",
    )
    lats, lngs, earliests, latests, arriveds, location_types = (
        uncompleted_routes_df[column].iloc[row_id] for column in column_order
    )
    output_file = f'planned_route_{row_id}.html'
    build_route(lats, lngs, 'blue', earliests, latests, arriveds, location_types, output_file)

In [None]:
def draw_actual_route(row_id):
    """Render the visited stop order of one row of ``uncompleted_routes_df``.

    The per-stop attributes are stored in planned order. They are keyed by the
    entries of ``planned_route_craft`` and then read back in the order given by
    ``actual_route_location``. The result is drawn in red with coloured labels
    (``marker_bool=True``) and saved as ``actual_route_<row_id>.html``.

    If an identifier occurs more than once in the planned route, the attributes
    of its last occurrence are used.

    NOTE(review): this assumes ``actual_route_location`` holds the same kind of
    identifier as ``planned_route_craft``; verify against the cell that builds
    ``uncompleted_routes_df``.

    Args:
        row_id: positional (``.iloc``) index of the route row.

    Raises:
        KeyError: if a visited identifier does not occur in the planned route.
    """
    route_row = uncompleted_routes_df.iloc[row_id]
    planned_ids = route_row['planned_route_craft']
    visited_ids = route_row['actual_route_location']

    # Field order inside each tuple: lng, lat, earliest, latest, arrived, type
    stop_columns = [
        route_row[column]
        for column in (
            'current_lng',
            'current_lat',
            'stop_earliest',
            'stop_latest',
            'stop_arrived_at',
            'location_type_id',
        )
    ]
    stop_details = {
        planned_ids[position]: tuple(values[position] for values in stop_columns)
        for position in range(len(planned_ids))
    }

    # Re-order the attributes to follow the visited sequence, one list per field
    visited_details = [stop_details[location] for location in visited_ids]
    (
        actual_longs,
        actual_lats,
        actual_earliest,
        actual_latest,
        actual_arrived,
        actual_location_type_id,
    ) = ([details[field] for details in visited_details] for field in range(6))

    build_route(actual_lats, actual_longs, 'red', actual_earliest, actual_latest, actual_arrived, actual_location_type_id, f'actual_route_{row_id}.html', True)

# Position of the route row to visualise; both helpers look it up with .iloc.
# NOTE(review): this rebinds the builtin name `id` for the rest of the session.
id = 13
# Planned route first, then the actual route, each written to its own HTML file.
for draw_route in (draw_planned_route, draw_actual_route):
    draw_route(id)

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# One-hot encode the categorical predictor columns.
# The target column 'label' is dropped together with 'routes': leaving it in the
# feature matrix leaks the answer into the model and makes the coefficients
# (and therefore the odds ratios) meaningless.
encoded_routes = pd.get_dummies(final_routes.drop(columns=['routes', 'label']), drop_first=True)

# Fit a logistic regression of the label on the encoded predictors
model = LogisticRegression()
model.fit(encoded_routes, final_routes['label'])
coefs = model.coef_

# Odds ratio per feature = exp(coefficient)
odds_ratios = np.exp(coefs)

# Display the ratios next to the feature names they belong to
pd.DataFrame(odds_ratios, columns=encoded_routes.columns)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Significance level used to interpret the test below
ALPHA = 0.05

# Convert the driver column to integer category codes
driver_ids = final_routes['driver_id_sorted'].astype('category')
driver_ids_encoded = driver_ids.cat.codes

# Contingency table: counts of each (driver code, label) combination
contingency_table = pd.crosstab(driver_ids_encoded, final_routes['label'])

# Chi-square test of independence between driver and label
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

# Print the contingency table (it holds counts, not correlations)
print("Contingency Table:")
print(contingency_table)
print(f"\nChi-square statistic: {chi2_stat:.4f}")
# General format so that very small p-values are not rounded to 0.0000
print(f"p-value: {p_val:.4g}")

# Interpret the result at the chosen significance level
if p_val < ALPHA:
    print("The driver_id and label columns are significantly associated.")
else:
    print("The driver_id and label columns are not significantly associated.")

In [None]:
# def get_mean_stat(stats, stat_name):
#     if stat_name in ["fpr", "tpr"]:
#         # Calculate mean fpr and tpr
#         values = [item[stat_name] for item in stats]
#         min_len = min(len(value) for value in values)
#         interpolated_values = []
#         for value in values:
#             interpolated = np.interp(np.linspace(0, 1, min_len), np.linspace(0, 1, len(value)), value)
#             interpolated_values.append(interpolated)
#         mean_values = np.array(interpolated_values).mean(axis=0)
#         return mean_values
#     else:
#         # Calculate mean for other stats
#         arr = np.array([item[stat_name] for item in stats])
#         return arr.mean()
#
# print('acc:', get_mean_stat(stats, 'acc'))
# print('precision:', get_mean_stat(stats, 'precision'))
# print('recall:', get_mean_stat(stats, 'recall'))
# print('f1:', get_mean_stat(stats, 'f1'))
# print('roc_auc:', get_mean_stat(stats, 'roc_auc'))
# print('average_precision:', get_mean_stat(stats, 'average_precision'))
# print('quadratic_loss:', get_mean_stat(stats, 'quadratic_loss'))
# print('brier_score:', get_mean_stat(stats, 'brier_score'))
# mean_fpr = get_mean_stat(stats, 'fpr')
# mean_tpr = get_mean_stat(stats, 'tpr')
#
# plt.plot(mean_fpr, mean_tpr)
# plt.plot([0, 1], [0, 1], 'k--')  # diagonal line
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Mean ROC Curve')
# plt.show()

In [9]:
# Average benchmark

# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# train_data, test_data = train_test_split(final_routes, test_size=0.1, random_state=42)
#
#
# average_scores = train_data.groupby(['driver_id_sorted', 'day_of_week'])['score'].mean().reset_index()
# average_scores_dict = average_scores.set_index(['driver_id_sorted', 'day_of_week'])['score'].to_dict()
# train_data['average_score'] = train_data.apply(lambda row: average_scores_dict[(row['driver_id_sorted'], row['day_of_week'])], axis=1)
# test_data['average_score'] = test_data.apply(lambda row: average_scores_dict.get((row['driver_id_sorted'], row['day_of_week']), 0.5), axis=1)
#
# y_pred = test_data['average_score'].values
# y_true = test_data['score'].values
#
# # Calculate metrics
# mse = mean_squared_error(y_true, y_pred)
# rmse = np.sqrt(mse)
# mae = mean_absolute_error(y_true, y_pred)
# r2 = r2_score(y_true, y_pred)
#
# # Print metrics
# print('MSE:', mse)
# print('RMSE:', rmse)
# print('MAE:', mae)
# print('R²:', r2)

In [None]:
# plt.figure(figsize=(10, 6))
# plt.scatter(y_true, y_pred, alpha=0.5)
# plt.plot([0, 1], [0, 1], 'r--')  # Diagonal line representing perfect predictions
# plt.xlabel('True Values')
# plt.ylabel('Predicted Values')
# plt.title('Predicted vs True Values')
# plt.show()

In [10]:
# residuals = y_pred - y_true
#
# plt.figure(figsize=(10, 6))
# plt.scatter(y_true, residuals, alpha=0.5)
# plt.axhline(y=0, color='r', linestyle='--')  # Line for zero residual
# plt.xlabel('True Values')
# plt.ylabel('Residuals')
# plt.title('Residual Plot')
# plt.show()