In [64]:
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd
import numpy as np
from pathlib import Path
import uuid
import os

In [65]:
# Where the input tables live, and a fresh uniquely named output directory for this run.
input_folder = '../data/clustering'
input_folder_path = Path(input_folder)

output_folder = f'../data/output/clustering/{uuid.uuid4()}'
output_folder_path = Path(output_folder)
# Create the run directory (and any missing parents); an existing one is fine.
output_folder_path.mkdir(parents=True, exist_ok=True)

clustering_output_filename = 'clustering_output.csv'

In [66]:
def cluster(model, clusterin_data):
    """Cluster the samples with ``model`` and compute a silhouette value per sample.

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit_predict(data)`` that returns one cluster
        label per sample.
    clusterin_data : sized collection of samples
        The feature matrix (one entry per sample); only ``len()`` is used
        directly, everything else is delegated to ``model`` and
        ``silhouette_samples``.

    Returns
    -------
    (clusters, silhouette)
        Always a 2-tuple of equally long sequences, one entry per sample.
    """
    n_samples = len(clusterin_data)
    if n_samples < 2:
        # With fewer than two samples nothing is fitted: every sample gets the
        # placeholder cluster 1 and silhouette 1. The result has the same
        # arity (2) and per-sample length as the normal path so that callers
        # unpacking ``clusters, silhouette`` keep working.
        return np.ones(n_samples, dtype=int), np.ones(n_samples, dtype=float)
    clusters = model.fit_predict(clusterin_data)
    silhouette = silhouette_samples(clusterin_data, clusters)
    return clusters, silhouette


def score_fun(df, clusters, silhouette, label):
    """Build the per-row and per-cluster result tables and score them against ``label``.

    Returns ``(all_df, clusters_df, score, percent_mean, silhouette_mean)``.
    When ``label`` is None no label statistics are computed and both
    ``score`` and ``percent_mean`` are 0.
    """
    all_df, clusters_df, silhouette_mean = result_fun(df, clusters, silhouette)

    if label is not None:
        # For every cluster keep the most frequent label together with its count.
        label_counts = all_df.groupby(["Cluster", label]).size()
        top_per_cluster = label_counts.sort_values().groupby(level=0).tail(1)
        dominant = top_per_cluster.reset_index(level=label, name='Max_Label_Count')

        clusters_df = clusters_df.join(dominant, on='Cluster').rename(columns={label: 'Max_Label'})
        # Share of each cluster that carries its dominant label.
        clusters_df['Percent'] = clusters_df['Max_Label_Count'] / clusters_df['Cluster_Size']

        # Overall fraction of rows that carry their cluster's dominant label.
        score = clusters_df['Max_Label_Count'].sum() / clusters_df['Cluster_Size'].sum()
        return all_df, clusters_df, score, clusters_df['Percent'].mean(), silhouette_mean

    return all_df, clusters_df, 0, 0, silhouette_mean


def result_fun(df, clusters, silhouette):
    """Attach cluster labels and silhouette values to ``df`` and summarise per cluster.

    Returns ``(all_df, clusters_df, silhouette_mean)`` where ``all_df`` is
    ``df`` with leading 'Cluster' and 'Silhouette' columns, ``clusters_df``
    has one row per cluster ('Cluster', 'Cluster_Size', 'Silhouette_Mean')
    and ``silhouette_mean`` is the mean over all silhouette values.
    """
    cluster_col = pd.Series(clusters, name='Cluster')
    silhouette_col = pd.Series(silhouette, name='Silhouette')
    all_df = pd.concat([cluster_col, silhouette_col, df], axis=1)

    # Per cluster: number of rows and mean silhouette.
    per_cluster = all_df.groupby('Cluster').agg(
        Cluster_Size=('Cluster', 'size'),
        Silhouette_Mean=('Silhouette', 'mean'),
    )
    return all_df, per_cluster.reset_index(), silhouette_col.mean()


def transform_data(idf, replace):
    """Fill every missing value of ``idf`` with ``replace``, modifying ``idf`` in place.

    Nothing is returned; the caller's frame itself is changed.
    """
    idf.fillna(value=replace, inplace=True)


def min_max_normalization(value, x, y, xnew, ynew):
    """Linearly map ``value`` from the range ``x..y`` onto the range ``xnew..ynew``."""
    target_span = ynew - xnew
    offset = value - x
    source_span = y - x
    return xnew + target_span * offset / source_span


def optimal_cluster_num(idf):
    """Pick a cluster count from the number of rows in ``idf``.

    Up to two rows the row count itself is returned. Above that the row
    count is interpolated piecewise-linearly between fixed anchor points and
    truncated to an int; anything beyond 6000 rows gets 150 clusters.
    """
    row_count = len(idf)
    if row_count <= 2:
        return row_count

    # (rows_from, rows_to, clusters_from, clusters_to) for each linear segment.
    segments = (
        (2, 10, 2, 5),
        (10, 100, 5, 15),
        (100, 200, 15, 20),
        (200, 400, 20, 30),
        (400, 1000, 30, 75),
        (1000, 6000, 75, 150),
    )
    for rows_from, rows_to, clusters_from, clusters_to in segments:
        if row_count <= rows_to:
            return int(min_max_normalization(row_count, rows_from, rows_to, clusters_from, clusters_to))
    return 150


def one_hot_encoder(idf):
    """Encode each row of ``idf`` as a 0/1 vector over the distinct cell values.

    Every cell is compared by its string form. The output has one column per
    distinct stringified value found anywhere in ``idf`` (sorted, with the
    placeholder '*' excluded) and one row per input row; a cell is 1 when
    that value occurs anywhere in the row, else 0. The result carries a
    fresh 0..n-1 index.

    Parameters
    ----------
    idf : pandas.DataFrame
        Table whose cells are treated as categorical tokens.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicator matrix.
    """
    # Stringify once and use the same representation for both the vocabulary
    # and the per-row membership test; otherwise non-string cells (numbers,
    # NaN) get a column that can never be set.
    str_df = idf.astype(str)
    unique_values = np.unique(str_df.values).tolist()
    if '*' in unique_values:
        unique_values.remove('*')

    data = []
    for row in str_df.itertuples(index=False, name=None):
        values_set = set(row)
        data.append([int(word in values_set) for word in unique_values])
    return pd.DataFrame(data=data, columns=unique_values)


def comapre_pcap_to_cluster(diff_df, pcap_name):
    """Compare one pcap's row with the mean row of the cluster it belongs to.

    Parameters
    ----------
    diff_df : pandas.DataFrame
        One row per pcap with the columns 'Cluster', 'Silhouette', 'pcap',
        optionally 'Label', plus numeric feature columns.
    pcap_name : str
        Value of the 'pcap' column selecting the row to inspect. It must
        match exactly one row of its cluster (more than one match makes the
        single-column rename below fail).

    Returns
    -------
    result : pandas.DataFrame
        Indexed by the kept column names, with columns 'pcap' (the pcap's
        value), 'Average' (cluster mean, NaN for non-numeric columns) and
        'diff' (absolute difference), sorted by 'diff' descending and limited
        to rows where 'pcap' differs from 'Average'.
    average_by_pcap : float
        ``1 - mean(diff)``. The mean runs over every row with a numeric diff,
        which includes the 'Cluster' and 'Silhouette' rows.

    Raises
    ------
    IndexError
        If no row of ``diff_df`` has ``pcap == pcap_name``.
    """
    meta_cols = ['Cluster', 'Silhouette', 'pcap', 'Label']

    matches = diff_df[diff_df['pcap'] == pcap_name]
    if matches.empty:
        raise IndexError('pcap {!r} not found in diff_df'.format(pcap_name))
    pcap_cluster_num = matches['Cluster'].values[0]

    # Restrict to the pcap's cluster and drop feature columns that are
    # constant 0 or constant 1 inside it (their mean is exactly 0 or 1).
    cluster_rows = diff_df[diff_df['Cluster'] == pcap_cluster_num].reset_index(drop=True)
    kept = [x for x in cluster_rows.columns
            if x in meta_cols or cluster_rows[x].mean() not in [0, 1]]
    cluster_rows = cluster_rows[kept]
    pcap_df = cluster_rows[cluster_rows['pcap'] == pcap_name].reset_index(drop=True)

    t = pcap_df.T
    t.columns = ['pcap']

    # numeric_only=True: the string columns ('pcap', 'Label') cannot be
    # averaged; they simply get NaN in 'Average' after the join.
    cluster_mean = cluster_rows.groupby('Cluster').mean(numeric_only=True).reset_index().iloc[0]
    t = t.join(cluster_mean.rename('Average'))

    # Coerce the (object-typed) pcap column to numbers so that string rows
    # yield NaN instead of relying on object-dtype arithmetic.
    t['diff'] = (pd.to_numeric(t['pcap'], errors='coerce') - t['Average']).abs()
    t = t.sort_values('diff', ascending=False)
    average_by_pcap = 1 - t['diff'].mean()
    result = t[t['pcap'] != t['Average']]
    return result, average_by_pcap
    
    
def new_score(diff_df):
    """Add a 'New_Score' column holding each pcap's similarity to its own cluster.

    For every value of the 'pcap' column the second return value of
    ``comapre_pcap_to_cluster`` is stored. ``diff_df`` is modified in place
    and also returned.
    """
    similarities = [
        comapre_pcap_to_cluster(diff_df, name)[1]
        for name in diff_df['pcap'].values
    ]
    diff_df['New_Score'] = similarities
    return diff_df
    
# def comapre_pcap_to_pcap(diff_df, pcap_name, pcap_name2):
#     cluster_num = 48
#     # max_label = 'sip503-C-SBC-Unavailable'
#     max_label = clusters_df[clusters_df['Cluster']==cluster_num]['Max_Label'].values[0]
#     t= all_df[(all_df['Cluster']==cluster_num) & (all_df['Label'] != max_label)] #  & (clusters_df['Silhouette']<0.1) & (clusters_df['Label'] != 'success')]
#     t = t.head(1).T
#     clusters_mean = all_df[(all_df['Cluster']==cluster_num) & (all_df['Label'] == max_label)].groupby("Cluster").mean().reset_index()
#     clusters_mean = clusters_mean.T
#     t = t.join(clusters_mean)
#     # pd.set_option('display.max_rows', None)
#     t.columns = ['pcap', 'Average']
#     return t[t['pcap'] != t['Average']]
    
    
def cluster_info(idf, cluster_num):
    """Summarise one cluster: its mean row and its extreme members.

    Parameters
    ----------
    idf : pandas.DataFrame
        Frame with at least the columns 'Cluster', 'Silhouette' and
        'New_Score'; may also hold 'pcap', 'Label' and numeric features.
    cluster_num
        Value of the 'Cluster' column selecting the cluster.

    Returns
    -------
    tuple of five transposed DataFrames
        ``(clusters_mean, min_silouhete, max_silouhete, min_sim, max_sim)``:
        the mean of the numeric columns that are not constant 0 or 1 inside
        the cluster, followed by the rows with the lowest / highest
        'Silhouette' and the lowest / highest 'New_Score'.
    """
    meta_cols = ['Cluster', 'Silhouette', 'pcap', 'Label']
    cluster_df = idf[idf['Cluster'] == cluster_num]

    # Drop feature columns whose mean inside the cluster is exactly 0 or 1.
    kept = [x for x in cluster_df.columns
            if x in meta_cols or cluster_df[x].mean() not in [0, 1]]
    # numeric_only=True: 'pcap' / 'Label' are strings and cannot be averaged;
    # without it recent pandas versions raise instead of skipping them.
    clusters_mean = cluster_df[kept].groupby("Cluster").mean(numeric_only=True).reset_index()

    silhouettes = cluster_df['Silhouette']
    scores = cluster_df['New_Score']
    min_silouhete = cluster_df[silhouettes == silhouettes.min()]
    max_silouhete = cluster_df[silhouettes == silhouettes.max()]
    min_sim = cluster_df[scores == scores.min()]
    max_sim = cluster_df[scores == scores.max()]
    return clusters_mean.T, min_silouhete.T, max_silouhete.T, min_sim.T, max_sim.T


In [67]:
# --- Run parameters --------------------------------------------------------
# Input table and how to read it.
data_path = input_folder_path / 'dry_run_sip.csv'
index_col = 0

# Value used to fill missing cells; None leaves them untouched.
replace_none = None

# Ground-truth column (None disables label scoring) and columns kept out of the features.
label = 'Label'
ignore_cols = ['pcap', 'Label', 'Vendor']

# If True calls one_hot_encoder
use_encoder = True

# specify number of cluster. If -1 calculate optimal_cluster_num
number_of_clusters = -1

In [68]:
# Load the raw table, drop exact duplicate rows and renumber the index from 0.
df = (
    pd.read_csv(data_path, index_col=index_col)
    .drop_duplicates()
    .reset_index(drop=True)
)
# Optionally fill missing cells (in place).
if replace_none is not None:
    transform_data(df, replace_none)
df

Unnamed: 0,pcap,sip 1,sip 2,sip 3,sip 4,sip 5,sip 6,sip 7,sip 8,sip 9,...,sip 238,sip 239,sip 240,sip 241,sip 242,sip 243,sip 244,sip 245,sip 246,Label
0,PASSED-3PTY_003a-20200716_103917_20200716_1040...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...","P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",P-CSCF-ACK-S-CSCF,UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,pass_3pty
1,FAILED-FUNC_ICBVBL_001-20200701_082101_2020070...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...","P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",P-CSCF-ACK-S-CSCF,UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
2,FAILED-FUNC_ICBVBL_001-20200625_091222_2020062...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
3,FAILED-FUNC_ICBVBL_001-20200625_101931_2020062...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
4,1eFTS8reAN_timothy.vogel@verizonwireless.com_S...,UE-REGISTER-P-CSCF,P-CSCF-401-UE,P-CSCF-200-UE,UE-INVITE-P-CSCF,P-CSCF-100-UE,P-CSCF-480&&Q.850;cause=31-UE,UE-ACK-P-CSCF,-483-,*,...,*,*,*,*,*,*,*,*,*,fail_sip480_PCSCFNotReply
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,PASSED-3PTY_001a-20200615_092358__RCA_PoC_12_1...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""ScscfSessionHe...","P-CSCF02A-407&&SIP;cause=407;text=""ScscfSessio...",P-CSCF-ACK-S-CSCF,UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,pass_3pty
182,Hy3vANpPwn_timothy.vogel@verizonwireless.com_S...,UE-REGISTER-P-CSCF,P-CSCF-401-UE,P-CSCF-200-UE,UE-INVITE-P-CSCF,P-CSCF-100-UE,P-CSCF-480&&Q.850;cause=31-UE,UE-ACK-P-CSCF,*,*,...,*,*,*,*,*,*,*,*,*,fail_sip480_PCSCFNotReply
183,PASSED-3PTY_003a-20200626_091744_20200626_0920...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,pass_3pty
184,PASSED-3PTY_002a-20200626_091744_20200626_0920...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",UE-ACK-P-CSCF02A,UE-INVITE-P-CSCF02A,...,*,*,*,*,*,*,*,*,*,pass_3pty


In [69]:
# Feature matrix: every column except the identifier / label columns,
# optionally turned into 0/1 indicators.
clustering_data = df.loc[:, [col for col in df.columns if col not in ignore_cols]]
if use_encoder:
    clustering_data = one_hot_encoder(clustering_data)
clustering_data

Unnamed: 0,-100-,-100-P-CSCF02A,-100-P-SBC,-100-S-CSCF,-180-P-CSCF02A,-180-S-CSCF,-200-MSC,-200-P-CSCF,-200-P-CSCF02A,-200-S-CSCF,...,"UE-BYE&&SIP;cause=200;text=""User Triggered""-P-CSCF","UE-BYE&&SIP;text=""User Triggered""-P-CSCF","UE-BYE&&SIP;text=""the user on hook""-P-CSCF02A",UE-CANCEL-P-CSCF,UE-CANCEL-P-CSCF02A,UE-INVITE-P-CSCF,UE-INVITE-P-CSCF02A,UE-PRACK-P-CSCF,UE-REGISTER-P-CSCF,UE-REGISTER-P-CSCF02A
0,0,1,0,1,1,1,0,0,1,1,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,0,1,0,1,1,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,1
182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
183,0,1,0,1,1,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,0
184,0,1,0,1,1,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,1


In [70]:
# -1 means "derive the cluster count from the number of rows".
if number_of_clusters == -1:
    number_of_clusters = optimal_cluster_num(clustering_data)

params = dict(
    n_clusters=number_of_clusters,
    init='k-means++',
    max_iter=120,
    n_init=25,
    random_state=1,
)
clusterer = KMeans(**params)
clusters, silhouette = cluster(clusterer, clustering_data)
print('number of clusters {}'.format(number_of_clusters))


number of clusters 19


In [71]:
# Score the clustering against the labels and persist the per-row assignment.
all_df, clusters_df, score, percent_mean, silhouette_mean = score_fun(
    df, clusters, silhouette, label
)
print('Score {} Percent {} Silhouette {}'.format(score, percent_mean, silhouette_mean))
all_df.to_csv(output_folder_path / 'clustered_data.csv')
all_df

Score 0.9354838709677419 Percent 0.9222488038277512 Silhouette 0.6456192344937166


Unnamed: 0,Cluster,Silhouette,pcap,sip 1,sip 2,sip 3,sip 4,sip 5,sip 6,sip 7,...,sip 238,sip 239,sip 240,sip 241,sip 242,sip 243,sip 244,sip 245,sip 246,Label
0,3,0.641336,PASSED-3PTY_003a-20200716_103917_20200716_1040...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...","P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",P-CSCF-ACK-S-CSCF,...,*,*,*,*,*,*,*,*,*,pass_3pty
1,5,0.776211,FAILED-FUNC_ICBVBL_001-20200701_082101_2020070...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...","P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",P-CSCF-ACK-S-CSCF,...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
2,5,0.776211,FAILED-FUNC_ICBVBL_001-20200625_091222_2020062...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
3,5,0.776211,FAILED-FUNC_ICBVBL_001-20200625_101931_2020062...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",...,*,*,*,*,*,*,*,*,*,fail-WrongPin-BlockedPin
4,15,0.196301,1eFTS8reAN_timothy.vogel@verizonwireless.com_S...,UE-REGISTER-P-CSCF,P-CSCF-401-UE,P-CSCF-200-UE,UE-INVITE-P-CSCF,P-CSCF-100-UE,P-CSCF-480&&Q.850;cause=31-UE,UE-ACK-P-CSCF,...,*,*,*,*,*,*,*,*,*,fail_sip480_PCSCFNotReply
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,18,0.361494,PASSED-3PTY_001a-20200615_092358__RCA_PoC_12_1...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""ScscfSessionHe...","P-CSCF02A-407&&SIP;cause=407;text=""ScscfSessio...",P-CSCF-ACK-S-CSCF,...,*,*,*,*,*,*,*,*,*,pass_3pty
182,15,0.284036,Hy3vANpPwn_timothy.vogel@verizonwireless.com_S...,UE-REGISTER-P-CSCF,P-CSCF-401-UE,P-CSCF-200-UE,UE-INVITE-P-CSCF,P-CSCF-100-UE,P-CSCF-480&&Q.850;cause=31-UE,UE-ACK-P-CSCF,...,*,*,*,*,*,*,*,*,*,fail_sip480_PCSCFNotReply
183,7,0.899798,PASSED-3PTY_003a-20200626_091744_20200626_0920...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",...,*,*,*,*,*,*,*,*,*,pass_3pty
184,7,0.644610,PASSED-3PTY_002a-20200626_091744_20200626_0920...,UE-INVITE-P-CSCF02A,P-CSCF02A-100-UE,P-CSCF-INVITE-S-CSCF,S-CSCF-100-P-CSCF,"S-CSCF-407&&SIP;cause=407;text=""CC_IMS_SESS_NE...",P-CSCF-ACK-S-CSCF,"P-CSCF02A-407&&SIP;cause=407;text=""CC_IMS_SESS...",...,*,*,*,*,*,*,*,*,*,pass_3pty


In [72]:
# With labels available, show only the clusters that mix more than one label.
r = (
    clusters_df
    if label is None
    else clusters_df[clusters_df['Cluster_Size'] != clusters_df['Max_Label_Count']]
)
r

Unnamed: 0,Cluster,Cluster_Size,Silhouette_Mean,Max_Label,Max_Label_Count,Percent
6,6,11,0.19339,fail_sip500ServiceUnavailable_PCSCFNotReply,4,0.363636
8,8,12,0.788026,fail-WrongPin-BlockedPin,10,0.833333
9,9,4,0.407271,fail-WrongPin-BlockedPin,3,0.75
10,10,11,0.450686,pass_basic,10,0.909091
13,13,3,0.498394,fail_label_sip503-P-SBC-Link-Down,2,0.666667


In [73]:
pcap_name = '1eFTS8reAN_timothy.vogel@verizonwireless.com_Scheduled_PS911_v4_Nokia_4TX_20-09-29_06_18_00_5662_+16692481432_352410090315321_Galaxy S9_starqltesq-user 9 PPR1.180610.011 G960USQS7CSK5 release-keys_011825'

# Put the cluster metadata next to the feature matrix, then compare the
# chosen pcap with the average of its cluster.
cols = ['Cluster', 'Silhouette', 'pcap'] + ([] if label is None else [label])
diff_df = pd.concat([all_df[cols], clustering_data], axis=1)
diff_df, similarity = comapre_pcap_to_cluster(diff_df, pcap_name)
print(similarity)
diff_df

0.6567075603310479


Unnamed: 0,pcap,Average,diff
-483-,0,0.909091,0.909091
P-CSCF-200-UE,0,0.818182,0.818182
P-CSCF-401-UE,0,0.636364,0.636364
UE-REGISTER-P-CSCF,0,0.636364,0.636364
P-CSCF-500-UE,1,0.454545,0.545455
UE-CANCEL-P-CSCF,0,0.454545,0.454545
P-CSCF-487-UE,0,0.363636,0.363636
UE-ACK-P-CSCF,1,0.727273,0.272727
-100-,0,0.090909,0.0909091
-300-,0,0.090909,0.0909091


In [74]:
cluster_number = 6

# Rebuild metadata + features, score every pcap against its own cluster,
# then inspect the selected cluster.
cols = ['Cluster', 'Silhouette', 'pcap'] + ([] if label is None else [label])
diff_df = pd.concat([all_df[cols], clustering_data], axis=1)
scored_diff_df = new_score(diff_df)
clusters_mean, min_silouhete, max_silouhete, min_sim, max_sim = cluster_info(
    scored_diff_df, cluster_number
)
clusters_mean.head(40)

Unnamed: 0,0
Cluster,6.0
Silhouette,0.19339
-100-,0.090909
-300-,0.090909
-483-,0.909091
-ACK-,0.090909
P-CSCF-200-UE,0.818182
P-CSCF-401-UE,0.636364
P-CSCF-480&&Q.850;cause=34-UE,0.090909
P-CSCF-487-UE,0.363636
