In [1]:
import multiprocessing
import uuid
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path

import pandas as pd
from sklearn.cluster import KMeans

import clustering_utils as utils
import pcap_generic_parser_helper as parser

# How to Run
1- Create your project folder in ./data/input

2- Place your pcap and json files within your project folder

3- Set the project_name below

4- Run the Notebook from the start

5- Check the parser and clustering output:
> All parsers output: ./data/output/<project_name>/all_parser_output.csv
<br>
> SIP-only parser output: ./data/output/<project_name>/sip_parser_output.csv
<br>
> Clustering output: ./data/output/<project_name>/clustered_data.csv

6- Check the notebook clustering_analysis for further insights into clustering results

In [None]:
# Name of the project sub-folder; it is appended to both the data/input and
# data/output locations that the next cell builds.
project_name = 'example_project'

In [None]:
# File names of the CSV reports written into the project's output folder
sip_output_filename = 'sip_parser_output.csv'
gtp_output_filename = 'gtp_parser_output.csv'
diameter_output_filename = 'diameter_parser_output.csv'
gtp_sip_output_csv_filename = 'gtp_sip_parser_output.csv'
all_output_csv_filename = 'all_parser_output.csv'

# Project folders live next to the notebook directory, under ../data/{input,output}
input_folder_path = Path.cwd().parent.joinpath('data/input', project_name)
output_folder_path = Path.cwd().parent.joinpath('data/output', project_name)

# Each execution of this cell gets its own uniquely named folder for converted JSON files
json_folder_path = output_folder_path / 'jsonfiles_{}'.format(uuid.uuid4())
json_folder_path.mkdir(parents=True, exist_ok=True)

# Worker pool shared by the parsing cells below: one process per CPU reported by the OS
num_processors = multiprocessing.cpu_count()
p = Pool(processes=num_processors)

In [None]:
# Collect the *.pcap captures located directly in ./data/input/<project_name>
pcap_files = list(input_folder_path.glob('*.pcap'))
len(pcap_files)

In [None]:
# Convert PCAP to JSON in parallel: every capture is paired with the shared JSON folder
pcaps_list = p.starmap(
    parser.convert_pcap,
    [(json_folder_path, pcap_file) for pcap_file in pcap_files],
)

In [None]:
# JSON files shipped in ./data/input/<project_name> come first,
# followed by the ones found in this run's JSON folder
json_files = [
    *input_folder_path.glob('*.json'),
    *json_folder_path.glob('*.json'),
]
len(json_files)

# Sip Parser

In [None]:
# Read and parse the packets of every JSON file with the SIP parser, spread over
# the worker pool; the result is a list with one parser result per JSON file.
sip_parsed = p.map(parser.read_parse_sip, json_files)

In [None]:
# Build the SIP frame: missing cells become "*", the first column becomes the
# index (named 'pcap'), and every remaining column label gets a 'sip ' prefix
sip_df_out_final = pd.DataFrame(data=sip_parsed).fillna("*")
sip_df_out_final = (
    sip_df_out_final
    .set_index(sip_df_out_final.columns[0])
    .rename_axis('pcap')
    .add_prefix('sip ')
)

In [None]:
# Persist the SIP parser result in the project's output folder, then display it
sip_df_out_final.to_csv(output_folder_path.joinpath(sip_output_filename))
sip_df_out_final

# GTPv2 Parser

In [None]:
# Get all possible gtpv2 causes: one collection of cause values per JSON file
protocols_list_from_tshark = p.map(parser.tshark_aggregate_gtp_cause, json_files)
# Union the per-file results so every distinct cause appears exactly once
values = set().union(*protocols_list_from_tshark)
# Drop the empty-string entry when present (discard is a no-op otherwise)
values.discard('')
# Sort so the cause order, and therefore the order of the generated
# 'gtpv2 cause = ...' columns, is identical on every run; plain set iteration
# order can change between kernel sessions. key=str keeps sorting safe even if
# the values are not all of one type.
gtp_causes = sorted(values, key=str)

In [None]:
# Parse every JSON file with the GTPv2 parser; each task receives the same list of causes
gtp_parse_output = p.starmap(
    parser.read_parse_gtp,
    [(json_file, gtp_causes) for json_file in json_files],
)

In [None]:
# Build the GTPv2 frame: missing cells become "*", the first column becomes the
# index (named 'pcap'), the three counters get readable names, and every
# column label is prefixed with 'gtpv2 '
gtp_df_out_final = pd.DataFrame(data=gtp_parse_output).fillna("*")
gtp_df_out_final = (
    gtp_df_out_final
    .set_index(gtp_df_out_final.columns[0])
    .rename_axis('pcap')
    .rename(columns={1: 'n Requests', 2: 'n Responses', 3: 'n unanswered requests'})
    .add_prefix('gtpv2 ')
)

# Column i + 4 is relabelled with the i-th entry of gtp_causes
# (presumably the parser emits its per-cause values in that order - TODO confirm)
gtp_cause_labels = {
    'gtpv2 ' + str(position): 'gtpv2 cause = ' + str(cause)
    for position, cause in enumerate(gtp_causes, start=4)
}
gtp_df_out_final = gtp_df_out_final.rename(columns=gtp_cause_labels)

In [None]:
# Persist the GTPv2 parser result in the project's output folder, then display it
gtp_df_out_final.to_csv(output_folder_path.joinpath(gtp_output_filename))
gtp_df_out_final

# Diameter Parser

In [None]:
# Get all possible diameter result codes: one collection of codes per JSON file
diameter_list_from_tshark = p.map(parser.tshark_aggregate_diameter_result_code, json_files)
# Union the per-file results so every distinct result code appears exactly once
values = set().union(*diameter_list_from_tshark)
# Drop the empty-string entry when present (discard is a no-op otherwise)
values.discard('')
# Sort so the code order, and therefore the order of the generated
# 'diameter result code = ...' columns, is identical on every run; plain set
# iteration order can change between kernel sessions. key=str keeps sorting
# safe even if the values are not all of one type.
diameter_result_codes = sorted(values, key=str)

In [None]:
# Parse every JSON file with the Diameter parser; each task receives the same list of result codes
diameter_parse_output = p.starmap(
    parser.read_parse_diameter,
    [(json_file, diameter_result_codes) for json_file in json_files],
)

In [None]:
# Build the Diameter frame: missing cells become "*", the first column becomes
# the index (named 'pcap'), the three counters get readable names, and every
# column label is prefixed with 'diameter '
diameter_df_out_final = pd.DataFrame(data=diameter_parse_output).fillna("*")
diameter_df_out_final = (
    diameter_df_out_final
    .set_index(diameter_df_out_final.columns[0])
    .rename_axis('pcap')
    .rename(columns={1: 'n Requests', 2: 'n Responses', 3: 'n unanswered requests'})
    .add_prefix('diameter ')
)

# Column i + 4 is relabelled with the i-th entry of diameter_result_codes
# (presumably the parser emits its per-code values in that order - TODO confirm)
diameter_code_labels = {
    'diameter ' + str(position): 'diameter result code = ' + str(code)
    for position, code in enumerate(diameter_result_codes, start=4)
}
diameter_df_out_final = diameter_df_out_final.rename(columns=diameter_code_labels)

In [None]:
# Persist the Diameter parser result in the project's output folder, then display it
diameter_df_out_final.to_csv(output_folder_path.joinpath(diameter_output_filename))
diameter_df_out_final

# Merge sip, gtpv2, and diameter Parser Outputs

In [None]:
# Combine the GTPv2 and SIP results on the shared 'pcap' key and export them
sip_gtp_df = gtp_df_out_final.merge(sip_df_out_final, on='pcap')
sip_gtp_df.to_csv(output_folder_path.joinpath(gtp_sip_output_csv_filename))

In [None]:
# Combine the three parser outputs into one frame, aligned on their shared 'pcap' index.
# DataFrame.join is called without `how`, so pandas' default (a left join) applies:
# the diameter frame's rows are the ones that are kept.
# NOTE(review): the GTP/SIP file above is built with pd.merge (inner join by default),
# so the two exports can differ in which captures they contain - confirm this is intended.
all_parser_out_df = diameter_df_out_final.join([gtp_df_out_final, sip_df_out_final])
all_parser_out_df.to_csv(output_folder_path / all_output_csv_filename)
all_parser_out_df

# Cluster Parser Output

In [None]:
# Clustering configuration, read by the cells below.
ignore_cols = ['pcap']  # columns left out of the clustering features
label = None  # forwarded unchanged to utils.score_fun
# Column-name prefixes; matching columns are kept as they are and skip the one-hot encoder
static_cols = ('gtpv2 n ', 'gtpv2 cause =', 'diameter n ', 'diameter result code =')
replace_none = None  # when not None, it is passed to utils.transform_data together with the data
use_encoder = True  # If True calls one_hot_encoder
number_of_clusters = -1  # specify number of cluster. If -1 calculate optimal_cluster_num

In [None]:
# Move the 'pcap' index into a regular column, but only while it is not a column yet.
# An unconditional reset_index() is not idempotent: re-running this cell would add
# the previous integer index as an extra 'index' column, which would then end up
# among the clustering features.
if 'pcap' not in all_parser_out_df.columns:
    all_parser_out_df = all_parser_out_df.reset_index()
# Remove rows that are exact duplicates of an earlier row
all_parser_out_df = all_parser_out_df.drop_duplicates()
if replace_none is not None:
    # The return value is not used, so transform_data presumably edits the
    # frame in place - TODO confirm against clustering_utils
    utils.transform_data(all_parser_out_df, replace_none)
all_parser_out_df

In [None]:
# Keep every column except the ignored ones as clustering features
feature_names = [name for name in all_parser_out_df.columns if name not in ignore_cols]
clustering_data = all_parser_out_df[feature_names]
if use_encoder:
    # Columns whose name starts with one of the static prefixes stay untouched;
    # all remaining columns go through the one-hot encoder
    static_names = [name for name in feature_names if name.startswith(static_cols)]
    df_static = clustering_data[static_names]
    df_dynamic = clustering_data[[name for name in feature_names if name not in df_static.columns]]

    # Static columns first, encoded columns after them
    clustering_data = pd.concat([df_static, utils.one_hot_encoder(df_dynamic)], axis=1)
clustering_data

In [None]:
# -1 means "not chosen by the user": ask the helper for a cluster count instead
if number_of_clusters == -1:
    number_of_clusters = utils.optimal_cluster_num(clustering_data)

# KMeans settings; random_state is fixed so repeated runs start from the same seed
params = dict(
    n_clusters=number_of_clusters,
    init='k-means++',
    max_iter=120,
    n_init=25,
    random_state=1,
)
clusterer = KMeans(**params)
clusters, silhouette = utils.cluster(clusterer, clustering_data)
print(f'number of clusters {number_of_clusters}')

In [None]:
# Score the clustering, report the mean silhouette and export the result as CSV
all_df, clusters_df, score, percent_mean, silhouette_mean = utils.score_fun(
    all_parser_out_df, clusters, silhouette, label
)
print(f'Silhouette Mean {silhouette_mean}')
all_df.to_csv(output_folder_path.joinpath('clustered_data.csv'))
all_df

In [None]:
# Display clusters_df, the second value returned by utils.score_fun in the previous cell
clusters_df