#### Discussion Section

This notebook analyses clusters of the data to understand how they are affected by external criteria (vehicle mass, power and type of work).

Vehicle characteristics were obtained by automatically querying the Dutch vehicle registration database: https://www.rdwdata.nl/

In [None]:
%%capture
import pandas as pd
# import modin.pandas as pd
import numpy as np

import math
import seaborn as sns
import matplotlib.pyplot as plt
# import vaex # https://vaex.io/docs/index.html
import pathlib
from pathlib import *
import os
import pickle
# import cufflinks as cf
# import chart_studio.plotly as py
import seaborn as sns
import plotly.express as px
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot

init_notebook_mode(connected=True)
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from shapely.geometry import Point, Polygon

In [None]:
##TODO -> Create functions for repetitive tasks

# ,skiprows=range(3, 260000000)
# Input the csv
# Reads at most 20,000,000 rows by default, in chunks of 1,000,000 rows
def load_data(dir_name, base_filename, nrows=20000000, chunksize=1000000, sep=';'):
    """Read ``<dir_name>/<base_filename>.csv`` into a single DataFrame.

    The file is parsed chunk by chunk and the chunks are concatenated, so at
    most ``nrows`` data rows end up in the result.

    Parameters
    ----------
    dir_name : str
        Directory that contains the CSV file.
    base_filename : str
        File name without the ``.csv`` extension.
    nrows : int, optional
        Maximum number of data rows to read (default 20,000,000).
    chunksize : int, optional
        Number of rows parsed per chunk (default 1,000,000).
    sep : str, optional
        Field separator (default ``';'``).

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order.
    """
    complete_path = os.path.join(dir_name, base_filename + ".csv")

    # `infer_datetime_format` is intentionally not passed: it only affects
    # columns listed in `parse_dates` (none here) and is deprecated in pandas 2.
    reader = pd.read_csv(
        complete_path,
        sep=sep,
        nrows=nrows,
        encoding='unicode_escape',
        engine='c',
        chunksize=chunksize,
        low_memory=False,
    )
    return pd.concat(list(reader))

# Dropping the first and last row of csv ("------")
def drop_first_row(df):
    """Return ``df`` without its first and its last row.

    Used to strip the separator lines ("------") that frame the CSV export.
    """
    without_first = df.iloc[1:]
    return without_first[:-1]

def resetIndex(df):
    """Return a new frame with a fresh default index.

    ``reset_index`` is called with its defaults, so the previous index is
    kept as a regular column.
    """
    reindexed = df.reset_index()
    return reindexed


##TODO -> Rewrite this function

def cast_to_correct_dtype(df):
    """Cast every known column that is present in ``df`` to its expected dtype.

    Columns that are not listed in the table below, or that are absent from
    ``df``, are left untouched. The frame is modified in place and also
    returned so the call can be chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be cast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the known columns cast.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``Series.astype`` when a value cannot be converted
        (for example a missing value in an ``'int'`` column).
    """
    # Single source of truth: column name -> target dtype.
    column_dtypes = {
        'Id': 'int',
        'AOS_position_Id': 'int',
        'Acceleration_x': 'float',
        'Acceleration_y': 'float',
        'TNO_Valid': 'int',
        'Latitude': 'float',
        'Longitude': 'float',
        'Event/action_speed': 'int',
        'Event/action_type': 'int',
        'Number_of_lanes': 'int',
        'Road_class': 'int',
        'Road_type': 'int',
        'Crash_position_Id': 'int',
        'Point_speed': 'float',
        'Average_speed_fpp': 'float',
        'Average_Speed': 'float',
        'Maximum_speed': 'float',
        'Meters_travelled': 'int',
        'Road_form': 'int',
        'Speed_restriction': 'int',
        'Crash_speed': 'int',
        'Maximum_acceleration': 'float',
        'Numberplate': 'str',
    }

    for column, dtype in column_dtypes.items():
        if column in df.columns:
            df[column] = df[column].astype(dtype)

    return df

            

## Handling date time related fields

def cast_date_time(df):
    """Parse the known timestamp columns of ``df`` and derive hour-of-day columns.

    Every timestamp column that is present is converted with
    ``pd.to_datetime(..., errors='coerce')``, so unparseable values become
    ``NaT``. For the columns that have an entry in the table below an extra
    ``<column>_hour`` column holding the hour of the timestamp is added.
    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing any subset of the timestamp columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with parsed timestamps and hour columns.
    """
    # timestamp column -> derived hour column (None: no hour column is added)
    hour_column_for = {
        'AOS_event/action_time': 'AOS_event/action_time_hour',
        'TNO_Trip-start': 'TNO_Trip-start_hour',
        'TNO_Trip-end': None,
        'Position_time': None,
        'TNO_Time-stamp': 'TNO_Time-stamp_hour',
    }

    for column, hour_column in hour_column_for.items():
        if column not in df.columns:
            continue
        # The AOS column previously went through `pd.datetime(...)`, which is
        # not a parser and raised; all columns now use `pd.to_datetime`.
        df[column] = pd.to_datetime(df[column], errors='coerce')
        if hour_column is not None:
            df[hour_column] = df[column].dt.hour

    return df

## Splitting parsed time stamps into separate date, time and hour columns

def date_and_time_columns(df):
    """Split the datetime columns of ``df`` into date, time and hour columns.

    The source columns must already hold datetime values (the ``.dt``
    accessor is used). ``df`` is modified in place and returned.
    """
    # source column -> (date column, time column, hour column)
    derived_columns = (
        ('AOS_event/action_time', ('AOS Trip Date', 'AOS Event Time', 'AOS Trip Hour')),
        ('TNO_Time-stamp', ('TNO Trip Date', 'TNO Trip Time', 'TNO Trip Hour')),
    )

    for source, (date_col, time_col, hour_col) in derived_columns:
        if source not in df.columns:
            continue
        stamps = df[source].dt
        df[date_col] = stamps.date
        df[time_col] = stamps.time
        df[hour_col] = stamps.hour

    return df

def Time_of_the_day(x):
    """Map an hour value to a named part of the day.

    Buckets: ``x <= 4`` 'Late Night', ``(4, 8]`` 'Early Morning',
    ``(8, 12)`` 'Morning', ``[12, 16]`` 'Noon', ``(16, 20]`` 'Evening',
    ``(20, 24]`` 'Night'. Any other value (above 24, or NaN) yields ``None``.
    """
    # Ascending guard clauses: each test only runs when all lower buckets failed.
    if x <= 4:
        return'Late Night'
    if x <= 8:
        return 'Early Morning'
    if x < 12:
        return 'Morning'
    if x <= 16:
        return'Noon'
    if x <= 20:
        return 'Evening'
    if x <= 24:
        return'Night'
    return None
    
    
def rename_some_stuff(df):
    """Add human-readable label columns for event-type and road-type codes.

    ``Event/action_type`` is translated into ``Event type Rename`` and
    ``Road_type`` into ``Road_type_Rename``; codes without a label are kept
    as they are. ``df`` is modified in place and returned.
    """
    event_type_labels = {
        0: 'Headway Warning = OFF',
        1: 'Headway Warning (long)',
        2: 'Headway Warning (medium)',
        3: 'Headway Warning (short)',
        10: 'Lane Departure Warning = OFF',
        11: 'Left Lane Departure Warning = ON',
        12: 'Right Lane Departure Warning = ON',
        13: 'Left and Right Lane Departure Warning = ON',
        20: ' Indicators = OFF',
        21: 'Left Indicator = ON',
        22: 'Right Indicator = ON',
        23: 'Left and Right Indicator = ON',
        40: 'Brakes = OFF',
        41: 'Brakes = ON',
    }
    road_type_labels = {0: "Urban", 1: "Motorway", 2: "Extra Urban", 3: "Unavailable"}

    present = df.columns
    if 'Event/action_type' in present:
        df["Event type Rename"] = df["Event/action_type"].replace(event_type_labels)
    if 'Road_type' in present:
        df["Road_type_Rename"] = df["Road_type"].replace(road_type_labels)

    return df

def detect_overspeeding_count(df):
    """Flag rows where the measured speed exceeds the speed restriction.

    When both ``Point_speed`` and ``Speed_restriction`` are present, an
    ``Overspeeding_event`` column is added holding 1 where
    ``Point_speed > Speed_restriction`` and 0 elsewhere. If either column is
    missing the frame is returned unchanged. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Both columns must be checked explicitly: `'a' and 'b' in cols` only
    # tests 'b', which led to a KeyError when 'Point_speed' was absent.
    required_columns = ('Point_speed', 'Speed_restriction')
    if all(column in df.columns for column in required_columns):
        is_overspeeding = df['Point_speed'] > df['Speed_restriction']
        df['Overspeeding_event'] = np.where(is_overspeeding, 1, 0)

    return df

def some_processing(df):
    """Add a ``KmTravelled`` column (``Meters_travelled`` divided by 1000).

    Frames without a ``Meters_travelled`` column are returned unchanged.
    ``df`` is modified in place and returned.
    """
    if 'Meters_travelled' not in df.columns:
        return df

    metres = df['Meters_travelled']
    df['KmTravelled'] = metres / 1000
    return df

In [None]:
def select_city_to_explore(city_name):
    """Return the boundary polygon of a supported city.

    Parameters
    ----------
    city_name : str
        One of ``'Amsterdam'``, ``'Rotterdam'`` or ``'Zwolle'``.

    Returns
    -------
    tuple
        ``(city_name, city_coordinates)`` where ``city_coordinates`` is a list
        of ``(latitude, longitude)`` vertices.

    Raises
    ------
    ValueError
        If ``city_name`` is not one of the supported cities.
    """
    city_polygons = {
        'Amsterdam': [
            (52.430779, 4.737382), (52.422361, 4.811680), (52.418126, 4.850948),
            (52.431506, 4.859627), (52.421959, 4.913905), (52.414543, 4.933522), (52.425437, 4.954927),
            (52.428570, 4.982090), (52.423581, 5.017310), (52.417493, 5.067539), (52.397841, 5.031621),
            (52.373215, 5.014760), (52.352051, 5.030236), (52.324595, 5.019149), (52.304151, 5.024817),
            (52.277836, 4.960200), (52.307825, 4.925153), (52.318055, 4.910773), (52.323740, 4.820309),
            (52.339239, 4.789761), (52.357192, 4.752847), (52.387835, 4.753389), (52.399444, 4.729131),
            (52.430091, 4.737165),
        ],
        # NOTE(review): the vertex (52.397841, 5.031621) that used to sit between
        # (51.943379, 4.576685) and (51.905291, 4.577125) was removed. It is an
        # Amsterdam vertex (it also appears in the Amsterdam list) and pulled the
        # Rotterdam polygon far outside the city. Confirm no replacement vertex is needed.
        'Rotterdam': [
            (51.964720, 4.379695), (51.979081, 4.427803), (51.982006, 4.467663),
            (51.963745, 4.515691), (51.975121, 4.546658), (51.986527, 4.558397), (51.995767, 4.594196),
            (51.968611, 4.599813), (51.955476, 4.566894), (51.943379, 4.576685),
            (51.905291, 4.577125), (51.868921, 4.570220), (51.868888, 4.396928), (51.964029, 4.377486),
        ],
        'Zwolle': [
            (52.549349, 6.004091), (52.568947, 6.063982), (52.554697, 6.089382), (52.568898, 6.101135),
            (52.555875, 6.138225), (52.566288, 6.160296), (52.585973, 6.144575), (52.569103, 6.212615),
            (52.537363, 6.193830), (52.517736, 6.181333), (52.495107, 6.202496), (52.480071, 6.194845),
            (52.467620, 6.150828), (52.447958, 6.155530), (52.440524, 6.103350), (52.508357, 6.024621),
            (52.526347, 6.010919), (52.531352, 6.026051), (52.550606, 5.999967),
        ],
    }

    if city_name not in city_polygons:
        # Previously an unknown name fell through every branch and failed with
        # an UnboundLocalError on the return statement.
        raise ValueError(
            "Unknown city {!r}; expected one of {}".format(city_name, sorted(city_polygons))
        )

    city_coordinates = city_polygons[city_name]
    return city_name, city_coordinates


In [None]:
city_name,city_coords=select_city_to_explore('Amsterdam')
# city_name,city_coords=select_city_to_explore('Rotterdam')
# city_name,city_coords=select_city_to_explore('Zwolle')

In [None]:
def load_files(city_name, base_dir=r"D:\AOS FOT\Octo\CSV Export\Individual Clustering\Summary"):
    """Load the two per-vehicle summary CSV files of a city.

    Files are read from ``<base_dir>/<city_name>/``:

    * ``df_grp_aos_trip_detail_<city>.csv``
    * ``df_grp_aos_trip_detail_complete_<city>_updated2.csv``

    where ``<city>`` is the lower-cased city name.

    Parameters
    ----------
    city_name : str
        ``'Amsterdam'``, ``'Rotterdam'`` or ``'Zwolle'``.
    base_dir : str, optional
        Directory holding one sub-directory per city. The default is the
        original author's local export directory; pass your own location to
        run the notebook elsewhere.

    Returns
    -------
    tuple
        ``(df_intra_city, df_inter_city, city_name, location, location1)``;
        ``location`` equals ``city_name`` and ``location1`` is the fixed
        label used for the second data set in column names and plot labels.

    Raises
    ------
    ValueError
        If ``city_name`` is not a supported city.
    """
    supported_cities = ('Amsterdam', 'Rotterdam', 'Zwolle')
    if city_name not in supported_cities:
        # Previously an unknown city ended in an UnboundLocalError on return.
        raise ValueError(
            "Unknown city {!r}; expected one of {}".format(city_name, list(supported_cities))
        )

    city_dir = os.path.join(base_dir, city_name)
    city_slug = city_name.lower()

    df_intra_city = pd.read_csv(
        os.path.join(city_dir, "df_grp_aos_trip_detail_" + city_slug + ".csv")
    )
    df_inter_city = pd.read_csv(
        os.path.join(city_dir, "df_grp_aos_trip_detail_complete_" + city_slug + "_updated2.csv")
    )

    location = city_name
    location1 = "across cities the NL"

    return df_intra_city, df_inter_city, city_name, location, location1


In [None]:
df_intra_city,df_inter_city,city_name,location,location1=load_files(city_name) # This function should run automatically if you've called `select_city_to_explore` function before

In [None]:
class pre_processing:
    """Combine the intra-city and inter-city summaries and remove outlier rows.

    Parameters
    ----------
    df_intra_city : pandas.DataFrame
        First summary frame.
    df_inter_city : pandas.DataFrame
        Second summary frame; it is stacked below the first one.
    """

    # Outlier thresholds applied by `filtering_df`.
    MAX_NORM_BRAKES = 4
    MAX_NORM_HEADWAY_LONG = 3
    MIN_POINT_SPEED = 20
    MIN_KM_TRAVELLED = 50

    def __init__(self,df_intra_city,df_inter_city):
        self.df_intra_city = df_intra_city
        self.df_inter_city = df_inter_city

    def rename_columns(self):
        """Return ``(df_intra_city, df_inter_city)`` of this instance.

        Currently a pass-through hook: no column is renamed. It returns the
        frames stored on the instance (it used to return the notebook-level
        globals of the same name, ignoring the constructor arguments).
        """
        return self.df_intra_city, self.df_inter_city

    def concat_df(self):
        """Stack the two frames row-wise, keeping their original index labels."""
        self.df_intra_city,self.df_inter_city=self.rename_columns()
        df_grouped_cities=pd.concat([self.df_intra_city, self.df_inter_city])
        return df_grouped_cities

    def filtering_df(self):
        """Return the stacked frame without outlier rows.

        A row is removed when ``norm_brakes = ON`` or
        ``norm_headway_warning(long)`` is above its threshold, or when
        ``Point_speed`` or ``Km_travelled`` is below its threshold. Rows with
        missing values in these columns are kept.
        """
        df_grouped_cities=self.concat_df()

        # The stacked frame has duplicate index labels (each input brings its
        # own 0..n index). Dropping by label therefore also removed unrelated
        # rows of the other frame, so rows are selected with a boolean mask.
        is_outlier = (
            (df_grouped_cities['norm_brakes = ON'] > self.MAX_NORM_BRAKES)
            | (df_grouped_cities['norm_headway_warning(long)'] > self.MAX_NORM_HEADWAY_LONG)
            | (df_grouped_cities['Point_speed'] < self.MIN_POINT_SPEED)
            | (df_grouped_cities['Km_travelled'] < self.MIN_KM_TRAVELLED)
        )
        df_grouped_cities = df_grouped_cities[~is_outlier]

        return df_grouped_cities
    
    
pre_processed_data = pre_processing(df_intra_city,df_inter_city)
df_grouped_cities=pre_processed_data.filtering_df() 
# df_grouped_cities

In [None]:
def select_feature_to_examine(feature):
    """Return the display label belonging to a feature column.

    Parameters
    ----------
    feature : str
        Name of the feature column to analyse.

    Returns
    -------
    tuple
        ``(feature, feature_name)``; ``feature_name`` ends with a space so
        that a location can be appended directly.

    Raises
    ------
    ValueError
        If ``feature`` has no display label.
    """
    feature_labels = {
        'norm_headway_warning(long)': "Norm HW-L(I) ",
        'norm_headway_warning(medium)': "Norm HW-L(II) ",
        'norm_headway_warning(short)': "Norm HW-L(III) ",
        'Point_speed': "Mean Point Speed ",
        'norm_Left Lane Departure Warning = ON': "Norm L-LDW ",
        'norm_Right Lane Departure Warning = ON': "Norm R-LDW ",
        'norm_brakes = ON': "Norm Braking Events ",
    }

    if feature not in feature_labels:
        # Previously an unknown feature ended in an UnboundLocalError on return.
        raise ValueError(
            "Unknown feature {!r}; expected one of {}".format(feature, sorted(feature_labels))
        )

    feature_name = feature_labels[feature]
    return feature,feature_name

"""Uncomment feature to be examined"""

feature,feature_name=select_feature_to_examine('Point_speed')
# feature,feature_name=select_feature_to_examine('norm_brakes = ON')
# feature,feature_name=select_feature_to_examine('norm_headway_warning(long)')
# feature,feature_name=select_feature_to_examine('norm_headway_warning(medium)')
# feature,feature_name=select_feature_to_examine('norm_headway_warning(short)')
# feature,feature_name=select_feature_to_examine('norm_Left Lane Departure Warning = ON')
# feature,feature_name=select_feature_to_examine('norm_Right Lane Departure Warning = ON')

In [None]:
df_outer = pd.merge(df_intra_city, df_inter_city, on='Numberplate', how='inner') #here Numberplate is common column
df_outer

In [None]:
# Discard a vehicle when either its first-frame (_x, intra-city) or its
# second-frame (_y, inter-city) value breaches one of the outlier thresholds.
outlier_rows = (
    (df_outer['norm_brakes = ON_x'] > 4)
    | (df_outer['norm_brakes = ON_y'] > 4)
    | (df_outer['norm_headway_warning(long)_x'] > 3)
    | (df_outer['norm_headway_warning(long)_y'] > 3)
    | (df_outer['Point_speed_x'] < 20)
    | (df_outer['Point_speed_y'] < 20)
)
df_outer = df_outer.loc[~outlier_rows]

In [None]:
feature_name

In [None]:
feature_x= feature_name+'in '
feature_y=feature_name

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:

# Raw event counters and unused normalised columns; only the normalised
# features analysed below are kept. Each name exists once per merge suffix.
raw_event_columns = [
    ' Indicators = OFF',
    'Brakes = OFF', 'Brakes = ON', 'Headway Warning (long)',
    'Headway Warning (medium)', 'Headway Warning (short)',
    'Headway Warning = OFF', 'Lane Departure Warning = OFF',
    'Left Indicator = ON', 'Left Lane Departure Warning = ON',
    'Left and Right Indicator = ON', 'Right Indicator = ON',
    'Right Lane Departure Warning = ON',
    'norm_Indicators = OFF', 'norm_brakes = OFF', 'norm_Headway Warning = OFF',
    'norm_Left Indicator = ON', 'norm_Left and Right Indicator = ON',
    'norm_Right Indicator = ON',
]

for merge_suffix in ('_x', '_y'):
    suffixed_columns = [name + merge_suffix for name in raw_event_columns]
    df_outer.drop(suffixed_columns, axis = 1, inplace = True)
    

In [None]:
df_outer.columns

In [None]:
# Feature column -> display label (same labels as select_feature_to_examine).
display_labels = {
    'Point_speed': 'Mean Point Speed ',
    'norm_brakes = ON': 'Norm Braking Events ',
    'norm_headway_warning(long)': 'Norm HW-L(I) ',
    'norm_headway_warning(medium)': 'Norm HW-L(II) ',
    'norm_headway_warning(short)': 'Norm HW-L(III) ',
    'norm_Right Lane Departure Warning = ON': 'Norm R-LDW ',
    'norm_Left Lane Departure Warning = ON': 'Norm L-LDW ',
}

# `_x` columns come from the left frame of the merge (labelled "in <location>"),
# `_y` columns from the right frame (labelled with `location1`).
renamed_columns = {}
for column, label in display_labels.items():
    renamed_columns[column + '_x'] = label + 'in ' + location
    renamed_columns[column + '_y'] = label + location1

df_outer.rename(columns=renamed_columns, inplace=True, errors='raise')

In [None]:
# sns.regplot(data=df_outer,x=feature_x+location, y=feature_y+location1,color="sandybrown")

In [None]:
# sns.jointplot(x=feature_x+location, y=feature_y+location1, data=df_outer, kind="reg",palette="pastel",color="sandybrown")

In [None]:
from scipy import stats

# Scatter plot with a linear regression line of the selected feature: value
# inside the city (x) against the value in the second data set (y). The
# legend shows the Pearson correlation coefficient r.
x_column = feature_x+location
y_column = feature_y+location1

slope, intercept, r_value, p_value, std_err = stats.linregress(df_outer[x_column],df_outer[y_column])
label_line_1 = r'$y={0:.1f}x+{1:.1f}$'.format(slope,intercept)  # fitted line (not shown in the legend)
label_line_2 = r'$r:{0:.2f}$'.format(r_value)

# The regression line carries the legend text directly, so the legend does
# not have to be patched after it has been drawn.
p = sns.lmplot(x=x_column, y=y_column,
        data=df_outer,scatter_kws={"color": "sandybrown"},
        line_kws={'label':label_line_2,"color":"sandybrown"}, legend=True)

ax = p.axes[0, 0]
ax.legend()

plt.xlabel(x_column, fontsize=14)
plt.ylabel(y_column, fontsize=14)
# NOTE(review): absolute, user-specific output path; replace it with a
# configurable output directory before running on another machine.
plt.savefig(r"C:\Users\ivasu\Desktop\p1.png")

In [None]:
p_value

In [None]:
class Clustering:
    """KMeans clustering of a single feature column of a DataFrame.

    Parameters
    ----------
    feature : str
        Column of ``df`` that is clustered.
    feature_name : str
        Display label of the feature, used in plot titles.
    df : pandas.DataFrame
        Data to cluster.
    increment, decrement : int
        Manual correction applied to the automatically detected number of
        clusters: ``k = knee + increment - decrement``.
    city_name : str, optional
        City shown in plot titles. When omitted, the notebook-level variable
        ``city_name`` is used if it exists.
    random_state : int, optional
        Seed passed to every ``KMeans`` fit so results are reproducible
        (default 1).
    """

    def __init__(self,feature,feature_name,df,increment,decrement,city_name=None,random_state=1):
        self.feature = feature
        self.feature_name = feature_name
        self.df=df
        self.increment =increment
        self.decrement =decrement
        self.city_name = city_name
        self.random_state = random_state

    def preprocess(self,df=None):
        """Return the feature column standardised as an ``(n_rows, 1)`` array.

        ``df`` defaults to the frame passed to the constructor.
        """
        frame = self.df if df is None else df
        data = np.array(frame[self.feature]).reshape(-1, 1)
        return StandardScaler().fit_transform(data)

    def _inertia_by_k(self, data, k_values):
        """Fit KMeans for every k in ``k_values`` and return ``{k: inertia}``."""
        sse = {}
        for k in k_values:
            kmeans = KMeans(n_clusters=k, random_state=self.random_state)
            kmeans.fit(data)
            sse[k] = kmeans.inertia_
        return sse

    def _title_city(self):
        """Return the city for plot titles (instance value, else notebook global)."""
        if self.city_name is not None:
            return self.city_name
        return globals().get('city_name', '')

    @staticmethod
    def _resolve_k(knee, increment, decrement):
        """Apply the manual correction to the detected knee and validate it.

        Raises ``ValueError`` when no knee was found or when the corrected
        number of clusters is smaller than 1.
        """
        if knee is None:
            raise ValueError("No elbow could be located in the SSE curve; choose k manually.")
        k = int(knee + increment - decrement)
        if k < 1:
            raise ValueError(
                "Corrected number of clusters is {} (knee={}, increment={}, decrement={}); "
                "it must be at least 1.".format(k, knee, increment, decrement)
            )
        return k

    def elbow_plot(self):
        """Plot the KMeans SSE for k = 2..10 on the standardised feature."""
        data=self.preprocess(self.df)
        sse = self._inertia_by_k(data, range(2,11))

        # Uses the instance's label rather than the notebook global.
        plt.title('Elbow plot for K selection'+"\n"+self.feature_name+'-'+ self._title_city())
        plt.xlabel('k')
        plt.ylabel('SSE')
        sns.pointplot(x=list(sse.keys()),y=list(sse.values()),color="sandybrown")
        plt.show()

    def silhouette_coeff(self):
        """Print the silhouette score of KMeans for k = 2..9."""
        data=self.preprocess(self.df)

        for n_clusters in range(2,10):
            # Seeded like the other fits so repeated runs print the same scores.
            clusterer = KMeans(n_clusters=n_clusters, random_state=self.random_state)
            preds = clusterer.fit_predict(data)

            score = silhouette_score(data, preds)
            print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

    def find_k(self):
        """Return the number of clusters: elbow of the SSE curve (k = 2..20)
        plus ``increment`` minus ``decrement``.

        Raises
        ------
        ValueError
            If no elbow is found or the corrected k is smaller than 1.
        """
        data=self.preprocess(self.df)
        sse = self._inertia_by_k(data, range(2, 21))

        kn = KneeLocator(x=list(sse.keys()),
                    y=list(sse.values()),
                    curve='convex',
                    direction='decreasing')
        return self._resolve_k(kn.knee, self.increment, self.decrement)

    def run_kmeans(self):
        """Cluster the feature with the k from ``find_k``.

        Prints the chosen k and returns a copy of ``df`` with an added
        ``Clusters`` column holding the cluster label of every row.
        """
        data=self.preprocess(self.df)
        k = self.find_k()
        print(k)
        kmeans = KMeans(n_clusters=k,init='k-means++',random_state=self.random_state)
        labels = kmeans.fit_predict(data)
        return self.df.assign(Clusters=labels)
    

clustering_kmeans = Clustering(feature,feature_name,df_grouped_cities,increment=0, decrement=2) # increase or decrease number of assigned clusters using `increment` or `decrement`
clustering_kmeans.elbow_plot() 
clustering_kmeans.silhouette_coeff()
Clusters=clustering_kmeans.run_kmeans()
# Clusters
# df_grouped_cities

In [None]:
def viz_classes():
    """Plot one filled KDE curve of the selected feature per cluster.

    Reads the notebook-level ``Clusters`` frame and the ``feature``,
    ``feature_name`` and ``city_name`` variables.
    """
    sns.set_palette("CMRmap")

    # Iterate over the labels that actually exist instead of a hard-coded
    # 0..5, so the plot follows whatever k the clustering produced.
    for cluster_id in sorted(Clusters['Clusters'].unique()):
        cluster_values = Clusters.loc[Clusters['Clusters'] == cluster_id, feature]
        sns.kdeplot(cluster_values,label=str(cluster_id),fill=True,alpha=0.5,linewidth=2)

    plt.legend(title="Cluster")
    plt.title(r"Distribution of "+feature_name+" in "+ city_name)
    plt.xlabel(feature_name)
    
viz_classes()

In [None]:
df_intra_city.columns

In [None]:

def processing_data_plots(df_clusters):
    """Split the clustered frame by data set, plot the city part and tidy both parts.

    Rows whose ``City`` equals the notebook-level ``city_name`` form the city
    frame; rows whose ``City`` equals ``city_name + "_complete"`` form the
    second frame. One KDE curve of the selected ``feature`` is drawn per
    cluster of the city frame. The raw event counters are then removed from
    both frames and ``Numberplate`` / ``City`` are cast to ``str``.

    Parameters
    ----------
    df_clusters : pandas.DataFrame
        Frame with ``City``, ``Clusters``, ``Numberplate``, the selected
        feature and the raw event-counter columns. It is not modified.

    Returns
    -------
    tuple
        ``(df_city, df_not_city)``; ``df_city`` has a fresh 0..n-1 index.
    """
    # Columns that are not needed once the normalised features exist.
    raw_event_columns = [
        ' Indicators = OFF',
        'Brakes = OFF', 'Brakes = ON', 'Headway Warning (long)',
        'Headway Warning (medium)', 'Headway Warning (short)',
        'Headway Warning = OFF', 'Lane Departure Warning = OFF',
        'Left Indicator = ON', 'Left Lane Departure Warning = ON',
        'Left and Right Indicator = ON', 'Right Indicator = ON',
        'Right Lane Departure Warning = ON',
        'norm_Indicators = OFF', 'norm_brakes = OFF', 'norm_Headway Warning = OFF',
        'norm_Left Indicator = ON', 'norm_Left and Right Indicator = ON',
        'norm_Right Indicator = ON',
    ]

    # Explicit copies: the frames are modified below, and working on slices of
    # `df_clusters` triggered SettingWithCopyWarning.
    df_city = df_clusters.loc[df_clusters['City'] == city_name].copy()
    df_not_city = df_clusters.loc[df_clusters['City'] == city_name+"_complete"].copy()

    sns.set_palette("CMRmap")
    # One curve per cluster that is actually present (was hard-coded to 0..4).
    for cluster_id in sorted(df_city['Clusters'].unique()):
        cluster_values = df_city.loc[df_city['Clusters'] == cluster_id, feature]
        sns.kdeplot(cluster_values,label=str(cluster_id),fill=True,alpha=0.5,linewidth=2)
    plt.legend(title="Cluster")
    plt.title(r"Distribution of "+ feature_name+" in "+city_name)
    plt.xlabel(feature_name)

    # The same column list is dropped from both frames (' Indicators = OFF'
    # was previously only dropped from the city frame), and the result of
    # reset_index is now kept instead of being discarded.
    df_city = df_city.drop(columns=raw_event_columns).reset_index(drop=True)
    df_not_city = df_not_city.drop(columns=raw_event_columns)

    df_city = df_city.astype({'Numberplate': str, 'City': str})
    df_not_city = df_not_city.astype({'Numberplate': str, 'City': str})

    return df_city,df_not_city

    
df_city,df_not_city=processing_data_plots(Clusters)

In [None]:
import matplotlib as mpl

# Box plot of the selected feature per cluster, with semi-transparent boxes.
sns.set_palette("CMRmap")
ax = sns.boxplot(x=Clusters['Clusters'],y=Clusters[feature],fliersize=5)

# Depending on the Matplotlib/seaborn version the box patches are stored in
# `ax.artists` or in `ax.patches`; covering both keeps the transparency
# effective (with only `ax.artists` the loop can silently do nothing).
for patch in [*ax.artists, *ax.patches]:
    fc = patch.get_facecolor()
    patch.set_facecolor(mpl.colors.to_rgba(fc, 0.7))

plt.ylabel(feature_name+" - "+ city_name)
plt.xlabel("Cluster")


In [None]:
# For every cluster id 0..5: the city rows of that cluster and the list of
# their numberplates (used below to look the same vehicles up in the second frame).
_cluster_of_row = df_city['Clusters']

df_city_cluster0 = df_city.loc[_cluster_of_row == 0]
df_city_cluster0_nums = df_city_cluster0['Numberplate'].tolist()

df_city_cluster1 = df_city.loc[_cluster_of_row == 1]
df_city_cluster1_nums = df_city_cluster1['Numberplate'].tolist()

df_city_cluster2 = df_city.loc[_cluster_of_row == 2]
df_city_cluster2_nums = df_city_cluster2['Numberplate'].tolist()

df_city_cluster3 = df_city.loc[_cluster_of_row == 3]
df_city_cluster3_nums = df_city_cluster3['Numberplate'].tolist()

df_city_cluster4 = df_city.loc[_cluster_of_row == 4]
df_city_cluster4_nums = df_city_cluster4['Numberplate'].tolist()

df_city_cluster5 = df_city.loc[_cluster_of_row == 5]
df_city_cluster5_nums = df_city_cluster5['Numberplate'].tolist()

In [None]:
%%capture
# Rows of the second frame whose vehicle belongs to city cluster N, tagged
# with that cluster id in a new `Cluster_city` column.
# `.assign` returns a new frame, so no column is written onto a filtered
# slice (which triggered SettingWithCopyWarning).
df_not_city_cluster0_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster0_nums)].assign(Cluster_city=0)
df_not_city_cluster1_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster1_nums)].assign(Cluster_city=1)
df_not_city_cluster2_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster2_nums)].assign(Cluster_city=2)
df_not_city_cluster3_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster3_nums)].assign(Cluster_city=3)
df_not_city_cluster4_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster4_nums)].assign(Cluster_city=4)


In [None]:
# Stack the tagged per-cluster frames. Cluster 4 is included as well: it is
# computed above and was previously left out, silently dropping its vehicles
# whenever the clustering produced five clusters.
list_of_dataframes=[df_not_city_cluster0_nums,df_not_city_cluster1_nums,df_not_city_cluster2_nums,
                    df_not_city_cluster3_nums,df_not_city_cluster4_nums]
df_not_city_new_clus = pd.concat(list_of_dataframes)
df_not_city_new_clus

In [None]:
import matplotlib as mpl

# Box plot of the feature in the non-city data, grouped by the cluster each
# vehicle was assigned in the city.
ax = sns.boxplot(x=df_not_city_new_clus['Cluster_city'], y=df_not_city_new_clus[feature], fliersize=5)

# Make the boxes slightly transparent. The box patches are exposed through
# `ax.artists` on older matplotlib/seaborn releases and through `ax.patches` on
# newer ones; iterating only `ax.artists` silently does nothing on newer
# stacks, so walk both containers.
for patch in [*ax.artists, *ax.patches]:
    fc = patch.get_facecolor()
    patch.set_facecolor(mpl.colors.to_rgba(fc, 0.7))

plt.ylabel(feature_name + " across the NL")
plt.xlabel("Cluster")
plt.title(feature_name + " of corresponding " + "\n" + r"vehicles based on cluster assigned in " + city_name)

In [None]:
# Join city and non-city rows of the same vehicle on the shared 'Numberplate'
# column. Despite the variable name this is an inner join; later cells address
# the city-side columns with the `_x` suffix and the non-city side with `_y`.
df_outer = df_city.merge(df_not_city_new_clus, how='inner', on='Numberplate')

In [None]:
# Human-readable label for every merged feature column (without merge suffix).
_readable_labels = {
    'Point_speed': 'Mean Point Speed',
    'norm_brakes = ON': 'Norm Braking Events',
    'norm_headway_warning(long)': 'Norm L(I)-HW',
    'norm_headway_warning(medium)': 'Norm L(II)-HW',
    'norm_headway_warning(short)': 'Norm L(III)-HW',
    'norm_Right Lane Departure Warning = ON': 'Norm R-LDW',
    'norm_Left Lane Departure Warning = ON': 'Norm L-LDW',
}

# `_x` columns become "<label> in <location>", `_y` columns "<label> <location1>".
_column_renames = {}
for _raw_name, _label in _readable_labels.items():
    _column_renames[_raw_name + '_x'] = _label + ' in ' + location
    _column_renames[_raw_name + '_y'] = _label + ' ' + location1

# errors='raise' makes a missing source column fail loudly; note that this also
# means the cell cannot be re-run on an already renamed frame.
df_outer.rename(columns=_column_renames, inplace=True, errors='raise')

In [None]:
# Axis-label fragments used by the scatter plot below: the y label starts with
# the feature name, the x label additionally gets an "in " suffix so that a
# city name can be appended.
# NOTE(review): no space is inserted before "in " - this relies on
# `feature_name` ending with a space; confirm.
feature_y = feature_name
feature_x = feature_y + 'in '

In [None]:
# Display the column labels of the merged frame (after the rename above).
df_outer.columns

In [None]:
# Display the x-axis label prefix.
feature_x

In [None]:
# Scatter of the city value (x) against the non-city value (y) of Mean Point
# Speed, drawn once per city cluster so that each cluster gets its own colour
# and legend entry.
sns.set_palette("CMRmap")
feature_name_x = "Mean Point Speed in "
feature_name_y = "Mean Point Speed "
for cluster_label in (0, 1, 2, 3):
    sns.scatterplot(
        data=df_outer[df_outer['Clusters_x'] == cluster_label],
        x=feature_name_x + location,
        y=feature_name_y + location1,
        alpha=0.8,
        label=cluster_label,
    )
plt.legend(title="Cluster")
plt.xlabel(feature_x + city_name)
plt.ylabel(feature_y + " across the NL" + "\n" + "(urban roads)")
plt.title(feature_y + " of vehicles " + "\n" + "and their corresponding clusters")

In [None]:
# df_outer.to_csv(r"D:\AOS FOT\Octo\CSV Export\Individual Clustering City\Clustering_results\df_outer_hw_norm_l1_amsterdam.csv")

In [None]:
# df_outer=pd.read_csv(r"D:\AOS FOT\Octo\CSV Export\Individual Clustering City\Clustering_results\df_outer_hw_norm_l1_amsterdam.csv")

### Kentekens (vehicle registration data)

In [None]:
# Vehicle-registration ("kentekens") table, one row per number plate.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
kentekens_csv_path = r"C:\Users\ivasu\Desktop\Robotics\2021-2022\Thesis\code\DataAnalysis\CarrierWeb\Data\kentekens_iva.csv"
kentekens = pd.read_csv(kentekens_csv_path)

In [None]:
# Attach the registration data to the clustered vehicles (inner join on the
# shared 'Numberplate' column).
df_outer_kentekens = kentekens.merge(df_outer, how='inner', on='Numberplate')

# The 'Power' column is text; keep its first three characters and cast them to
# an integer.
df_outer_kentekens['Power_int'] = df_outer_kentekens['Power'].str[:3].astype('int')

df_outer_kentekens.columns

In [None]:
# Drop vehicles whose cylinder capacity is recorded as 'onbekend' (Dutch for
# "unknown"), then keep the first five characters of the text value and cast
# them to an integer.
_unknown_capacity = df_outer_kentekens['Cylinder capacity'] == 'onbekend'
df_outer_kentekens = df_outer_kentekens.drop(df_outer_kentekens[_unknown_capacity].index)
df_outer_kentekens['Cylinder capacity'] = df_outer_kentekens['Cylinder capacity'].str[:5]
df_outer_kentekens['Cylinder_capacity_int'] = df_outer_kentekens['Cylinder capacity'].astype('int')

In [None]:
# Semicolon-separated table that is joined on 'Numberplate' in the next cell.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
bic_csv_path = r"D:\AOS FOT\Octo\CSV Export\Other Tables\BCI - Copy.csv"
bic_df = pd.read_csv(bic_csv_path, delimiter=';', encoding='unicode_escape')

In [None]:
# Inner join of the registration-enriched cluster frame with `bic_df` on the
# shared 'Numberplate' column.
df_city_cluster_true_kentekens_more_info = df_outer_kentekens.merge(bic_df, how='inner', on='Numberplate')

In [None]:
"""
Important columns

- Year
- Mass empty vehicle
- Brand
- Transporter

"""

In [None]:
# Keep only the vehicle-characteristic columns needed for the discussion.
_vehicle_info_columns = [
    'Numberplate',
    'Brand_x',
    'Year',
    'Mass empty vehicle',
    'Cylinder capacity',
    'Power_int',
    'Transporter',
]
df = df_city_cluster_true_kentekens_more_info[_vehicle_info_columns]


In [None]:
# Display the y-axis label prefix.
feature_y

In [None]:
# Combine the vehicle characteristics with the per-vehicle clustering results
# (inner join on the shared 'Numberplate' column).
df_discussion = df.merge(df_outer, how='inner', on='Numberplate')

In [None]:
# 'Mass empty vehicle' is text; keep its first five characters and cast them to
# an integer. 'Year' is cast to an integer as well.
df_discussion['Mass_empty_vehicle_int'] = df_discussion['Mass empty vehicle'].str[:5].astype('int')
df_discussion['Year'] = df_discussion['Year'].astype('int')

In [None]:
# Flag (as the strings 'True'/'False') whether the `Clusters_x` and
# `Clusters_y` labels of a row agree.
_same_cluster = df_discussion['Clusters_x'] == df_discussion['Clusters_y']
df_discussion['cluster_comp'] = np.where(_same_cluster, 'True', 'False')

In [None]:
# Rows whose two cluster labels agree, and their share of all rows in percent.
a = df_discussion[df_discussion['cluster_comp'] == 'True']
len(a) / len(df_discussion) * 100

In [None]:
# Per-cluster row counts of the matching rows, as a percentage of all matching rows.
a['Clusters_x'].value_counts().div(len(a)).mul(100)

In [None]:
# Rows whose `Clusters_x` label is 1.
df_discussion_cluster1 = df_discussion.loc[df_discussion['Clusters_x'] == 1]

In [None]:
# Engine power against Norm L(I)-HW for the vehicles in cluster 0, coloured by
# transporter.
_cluster0_rows = df_discussion[df_discussion['Clusters_x'] == 0]
sns.scatterplot(
    data=_cluster0_rows,
    x="Power_int",
    y="Norm L(I)-HW in " + city_name,
    hue='Transporter',
)
plt.xlabel("Power (kW)")
plt.ylabel("Norm L(I)-HW")
plt.legend(loc='best', bbox_to_anchor=(1., 0., 0.0, 0.5))

In [None]:
# Engine power against Norm L(I)-HW for the vehicles in cluster 2, coloured by
# transporter.
_cluster2_rows = df_discussion[df_discussion['Clusters_x'] == 2]
sns.scatterplot(
    data=_cluster2_rows,
    x="Power_int",
    y="Norm L(I)-HW in " + city_name,
    hue='Transporter',
)
plt.xlabel("Power (kW)")
plt.ylabel("Norm L(I)-HW")
plt.legend(loc='best', bbox_to_anchor=(1., 0., 0.0, 0.5))

In [None]:
# One frame per `Clusters_x` label (0-3).
(
    df_discussion_cluster0,
    df_discussion_cluster1,
    df_discussion_cluster2,
    df_discussion_cluster3,
) = (df_discussion[df_discussion['Clusters_x'] == cluster_id] for cluster_id in range(4))


In [None]:
# Number of rows per transporter inside each cluster.
(
    df_discussion_cluster0_transporter,
    df_discussion_cluster1_transporter,
    df_discussion_cluster2_transporter,
    df_discussion_cluster3_transporter,
) = (
    cluster_frame['Transporter'].value_counts()
    for cluster_frame in (
        df_discussion_cluster0,
        df_discussion_cluster1,
        df_discussion_cluster2,
        df_discussion_cluster3,
    )
)

In [None]:

# Count how many rows carry each 'cluster_comp' flag value ('True' / 'False').
df_discussion_transporte_cluster_comp=df_discussion['cluster_comp'].value_counts()

In [None]:
# Number of rows per transporter over the whole discussion frame; used as the
# denominator for the per-cluster percentages below.
df_discussion_transporter=df_discussion['Transporter'].value_counts()
# df_discussion_transporter

In [None]:
import pandas as pd

# Per-cluster transporter counts, listed in the column order of the final table.
_counts_by_cluster = {
    1: df_discussion_cluster1_transporter,
    0: df_discussion_cluster0_transporter,
    2: df_discussion_cluster2_transporter,
    3: df_discussion_cluster3_transporter,
}

# For every cluster add the raw count and the share (in percent) of the
# transporter's rows that fall into that cluster.
data = {'All Transporter': df_discussion_transporter}
for _cluster_id, _counts in _counts_by_cluster.items():
    data[f'Cluster{_cluster_id}'] = _counts
    data[f'Perc_cluster{_cluster_id}'] = (_counts / df_discussion_transporter) * 100

# NOTE(review): this series is indexed by the 'True'/'False' flag rather than by
# transporter, so it contributes its own rows to the table - confirm intended.
data['Cluster_Comp'] = df_discussion_transporte_cluster_comp

# Columns are aligned on the union of the series indexes.
df = pd.DataFrame(data)

In [None]:
# Move the table's index into a regular column; the following cells read it
# back through the 'index' column.
df=df.reset_index()

In [None]:
# Transporters with more than 50 % of their rows in cluster 1 ...
df_50_clustrer1 = df[df['Perc_cluster1'] > 50]
list_of_vehicles_cluster1_50 = list(df_50_clustrer1['index'])

# ... and the rows of those transporters with city cluster 1.
_majority_transporter_rows = df_discussion['Transporter'].isin(list_of_vehicles_cluster1_50)
df_50_clustrer1_complete_info = df_discussion[
    _majority_transporter_rows & (df_discussion['Cluster_city'] == 1)
]

In [None]:
# Empty-vehicle mass against Norm L(I)-HW for the transporters selected above,
# coloured by transporter.
sns.scatterplot(
    data=df_50_clustrer1_complete_info,
    x="Mass_empty_vehicle_int",
    y="Norm L(I)-HW in " + city_name,
    hue='Transporter',
)
plt.xlabel("Mass (kg)")
plt.ylabel("Norm L(I)-HW")
plt.legend(loc='best', bbox_to_anchor=(1., 0.55, 0.0, 0.5))
plt.title(
    "Headway Warnings vs Mass of vehicle (kg)\n"
    "(Transport companies with majority of driver assigned lowest cluster)"
)

In [None]:
# Engine power against Norm L(I)-HW for the same transporters, coloured by
# transporter.
sns.scatterplot(
    data=df_50_clustrer1_complete_info,
    x="Power_int",
    y="Norm L(I)-HW in " + city_name,
    hue='Transporter',
)
plt.xlabel("Power (kW)")
plt.ylabel("Norm L(I)-HW")
plt.legend(loc='best', bbox_to_anchor=(1., 0.1, 0.0, 0.5))

In [None]:
# NOTE(review): despite the `cluster0` in the variable names, this cell selects
# on 'Perc_cluster2' and on city cluster 2 - confirm which cluster is meant.
# Transporters with more than 50 % of their rows in cluster 2 ...
df_50_clustrer0 = df[df['Perc_cluster2'] > 50]
list_of_vehicles_cluster0_50 = list(df_50_clustrer0['index'])

# ... and the rows of those transporters with city cluster 2.
_majority_transporter_rows = df_discussion['Transporter'].isin(list_of_vehicles_cluster0_50)
df_50_clustrer0_complete_info = df_discussion[
    _majority_transporter_rows & (df_discussion['Cluster_city'] == 2)
]

In [None]:
# Engine power against Norm L(I)-HW for the transporters selected above,
# coloured by vehicle brand.
sns.scatterplot(
    data=df_50_clustrer0_complete_info,
    x="Power_int",
    y="Norm L(I)-HW in " + city_name,
    hue='Brand_x',
)
plt.xlabel("Power (kW)")
plt.ylabel("Norm L(I)-HW")
plt.title(
    "Headway Warnings vs Power (kW)\n"
    "(Transport companies with majority of driver assigned lowest cluster)"
)

In [None]:
# Rows whose two cluster labels agree, counted per city cluster.
_labels_agree = df_discussion['cluster_comp'] == 'True'
a = df_discussion[_labels_agree]
a['Cluster_city'].value_counts()

In [None]:
### Exploring individual clusters
# Empty-vehicle mass against Norm L(I)-HW for every row with city cluster 1.
_city_cluster1_rows = df_discussion[df_discussion['Cluster_city'] == 1]
sns.scatterplot(
    data=_city_cluster1_rows,
    x="Mass_empty_vehicle_int",
    y="Norm L(I)-HW in " + city_name,
)
plt.xlabel("Mass (kg)")
plt.ylabel("Norm L(I)-HW")
plt.title(
    "Headway Warnings vs Mass of vehicle (kg)\n"
    "(Transport companies with majority of driver assigned lowest cluster)"
)

In [None]:
# sns.kdeplot(df_discussion['Mass_empty_vehicle_int'],hue=df_discussion['Cluster_city'],linewidth=3)

In [None]:
# Joint distribution of empty-vehicle mass and engine power, coloured by the
# cluster assigned in the city.
sns.set_palette("CMRmap")
sns.jointplot(
    data=df_discussion,
    x="Mass_empty_vehicle_int",
    y="Power_int",
    hue="Cluster_city",
)

In [None]:
import matplotlib as mpl

# Box plot of the clustering feature per cluster for the selected city.
sns.set_palette("CMRmap")
ax = sns.boxplot(x=df_city['Clusters'], y=df_city[feature], fliersize=5)

# Keep each box's RGB colour and set its alpha to 0.82. The colour is sliced
# instead of being unpacked into `r, g, b, a`, because that unpacking overwrote
# the notebook-level DataFrame `a` with a float.
for patch in ax.patches:
    patch.set_facecolor((*patch.get_facecolor()[:3], .82))

plt.ylabel(feature_name + " - " + city_name, fontsize=15)
plt.xlabel("Cluster", fontsize=15)


In [None]:
# For every city cluster, take the rows of the non-city data set that belong to
# the same vehicles (matched on number plate) and tag them with the cluster the
# vehicle was assigned in the city.
# `.assign` returns a new frame, so no column is written onto a filtered slice
# (the previous `slice['Cluster_city'] = k` pattern triggers pandas'
# SettingWithCopyWarning).
df_not_city_cluster0_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster0_nums)].assign(Cluster_city=0)
df_not_city_cluster1_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster1_nums)].assign(Cluster_city=1)
df_not_city_cluster2_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster2_nums)].assign(Cluster_city=2)
df_not_city_cluster3_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster3_nums)].assign(Cluster_city=3)
df_not_city_cluster4_nums = df_not_city[df_not_city['Numberplate'].isin(df_city_cluster4_nums)].assign(Cluster_city=4)

In [None]:
# Stack the tagged frames for clusters 0-3 into a single frame.
# NOTE(review): the cluster-4 frame is not part of this list - confirm that is intended.
list_of_dataframes = [
    df_not_city_cluster0_nums,
    df_not_city_cluster1_nums,
    df_not_city_cluster2_nums,
    df_not_city_cluster3_nums,
]
df_not_city_new_clus = pd.concat(list_of_dataframes)

In [None]:
import matplotlib as mpl

# Box plot of the feature in the non-city data, grouped by the cluster each
# vehicle was assigned in the city.
sns.set_context("notebook")
ax = sns.boxplot(x=df_not_city_new_clus['Cluster_city'], y=df_not_city_new_clus[feature], fliersize=5)

# Keep each box's RGB colour and set its alpha to 0.82. The colour is sliced
# instead of being unpacked into `r, g, b, a`, because that unpacking overwrote
# the notebook-level DataFrame `a` with a float.
for patch in ax.patches:
    patch.set_facecolor((*patch.get_facecolor()[:3], .82))

plt.ylabel(feature_name + " across cities in the NL", fontsize=15)
plt.xlabel("Cluster", fontsize=15)
plt.title(feature_name + " of corresponding " + "\n" + r"vehicles based on cluster assigned in " + city_name, fontsize=15)

In [None]:
# Median of 'Mass_empty_vehicle_int' over the rows with city cluster 0
# (swap the column for 'Power_int' to get the engine-power median instead).
df_discussion.loc[df_discussion['Cluster_city'] == 0, 'Mass_empty_vehicle_int'].median()

In [None]:
# Median of 'Mass_empty_vehicle_int' over the rows with city cluster 1
# (swap the column for 'Power_int' to get the engine-power median instead).
df_discussion.loc[df_discussion['Cluster_city'] == 1, 'Mass_empty_vehicle_int'].median()

In [None]:
# Median of 'Mass_empty_vehicle_int' over the rows with city cluster 2
# (swap the column for 'Power_int' to get the engine-power median instead).
df_discussion.loc[df_discussion['Cluster_city'] == 2, 'Mass_empty_vehicle_int'].median()

In [None]:
# Median of 'Power_int' over the rows with city cluster 3
# (swap the column for 'Mass_empty_vehicle_int' to get the mass median instead).
# NOTE(review): the three preceding cells report mass, this one reports power.
df_discussion.loc[df_discussion['Cluster_city'] == 3, 'Power_int'].median()

In [None]:
# Engine power per city cluster, one fixed colour per cluster.
# NOTE(review): the title hard-codes "Amsterdam" although `city_name` exists.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.boxplot(
    x=df_discussion['Cluster_city'],
    y=df_discussion['Power_int'],
    palette=palette_2,
)
plt.ylabel("Power (kW)", fontsize=15)
plt.xlabel("Clusters", fontsize=15)
plt.title(
    "Engine Power of different clusters in Amsterdam\n(Feature for clustering - " + feature_name + ")",
    fontsize=15,
)

In [None]:
# Display the column labels of the discussion frame.
df_discussion.columns

In [None]:
def viz_classes():
    """Overlay one filled KDE curve of `feature` per city cluster label (0-5).

    Reads the notebook-level `df_city`, `feature`, `feature_name` and
    `city_name`; draws on the current matplotlib axes and returns nothing.
    """
    sns.set_palette("CMRmap")
    for cluster_label in range(6):
        cluster_values = df_city[df_city['Clusters'] == cluster_label][feature]
        sns.kdeplot(cluster_values, label=str(cluster_label), fill=True, alpha=0.5, linewidth=2)
    plt.legend(title="Cluster", fontsize=15)
    plt.title(r"Distribution of " + feature_name + " in " + city_name, fontsize=15)
    plt.xlabel(feature_name, fontsize=15)
    plt.ylabel("Density", fontsize=15)


viz_classes()

In [None]:
# Density of engine power per city cluster, one fixed colour per cluster.
# NOTE(review): the title hard-codes the city and the clustering feature.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.kdeplot(
    data=df_discussion,
    x="Power_int",
    hue="Cluster_city",
    linewidth=3,
    palette=palette_2,
)
plt.xlabel("Power (kW)")
plt.title("Power distribution of different clusters in Amsterdam\n(Feature for Clustering-Mean Point Speed)")

In [None]:
# Empty-vehicle mass per city cluster, one fixed colour per cluster.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.boxplot(
    data=df_discussion,
    x="Cluster_city",
    y="Mass_empty_vehicle_int",
    palette=palette_2,
)
plt.ylabel("Mass empty vehicle (kg)")
plt.xlabel("Clusters")

In [None]:
# Engine power per city cluster, taken from the registration-enriched frame.
# NOTE(review): the title hard-codes "Amsterdam" although `city_name` exists.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.boxplot(
    data=df_outer_kentekens,
    x="Cluster_city",
    y="Power_int",
    palette=palette_2,
)
plt.ylabel("Power (kW)", fontsize=15)
plt.xlabel("Clusters", fontsize=15)
plt.title(
    "Engine Power of different clusters in Amsterdam\n(Feature for clustering - " + feature_name + ")",
    fontsize=15,
)

In [None]:
# Engine (cylinder) capacity per city cluster, from the registration-enriched frame.
# NOTE(review): the title hard-codes "Amsterdam" although `city_name` exists.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.boxplot(
    data=df_outer_kentekens,
    x="Cluster_city",
    y="Cylinder_capacity_int",
    palette=palette_2,
)
plt.ylabel("Engine Capacity (cc)")
plt.xlabel("Clusters")
plt.title("Engine capacity of different clusters in Amsterdam\n(Feature for clustering - " + feature_name + ")")

In [None]:
# Display the column labels of the discussion frame.
df_discussion.columns

In [None]:
# Density of engine (cylinder) capacity per city cluster.
# NOTE(review): the title hard-codes the city and the clustering feature.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.kdeplot(
    data=df_outer_kentekens,
    x="Cylinder_capacity_int",
    hue="Cluster_city",
    linewidth=3,
    palette=palette_2,
)
plt.xlabel("Engine Capacity (cc)")
plt.title("Engine Capacity of different clusters in Amsterdam \n(Feature for Clustering-Mean Point Speed)")

In [None]:
# Density of engine power per city cluster, from the registration-enriched frame.
# NOTE(review): the title hard-codes the city and the clustering feature.
_cluster_colours = ["slateblue", "rebeccapurple", "palevioletred", "coral"]
palette_2 = sns.color_palette(_cluster_colours)
sns.set_palette("CMRmap")
sns.kdeplot(
    data=df_outer_kentekens,
    x="Power_int",
    hue="Cluster_city",
    linewidth=3,
    palette=palette_2,
)
plt.xlabel("Power (kW)")
plt.title("Power distribution of different clusters in Amsterdam \n(Feature for Clustering-Mean Point Speed)")

In [None]:
# Display the column labels of the discussion frame.
df_discussion.columns

In [None]:
# Density of engine power per city cluster, drawn with the default palette.
# The axis label and title both announce "Power (kW)", so the plotted column is
# 'Power_int'; the cell previously plotted 'Cylinder_capacity_int' under these
# power labels, which mislabelled the figure.
sns.kdeplot(data=df_outer_kentekens, x="Power_int", hue="Cluster_city", linewidth=3)
plt.xlabel("Power (kW)")
plt.title("Power (kW) ")

In [None]:
# Workbook plotted in the next cell via its 'Power (kW)', 'Cluster ' and 'City'
# columns.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
city_norml1_hw_path = r"C:\Users\ivasu\Desktop\Robotics\2021-2022\Thesis\code\DataAnalysis\CarrierWeb\Data\Book12.xlsx"
city_norml1_hw = pd.read_excel(city_norml1_hw_path)
city_norml1_hw

In [None]:
# Power per cluster, one bar colour per city. The 'Cluster ' column label
# carries a trailing space.
sns.barplot(
    data=city_norml1_hw,
    x="Cluster ",
    y="Power (kW)",
    hue="City",
)