__Created and Maintained by Boon < <boon.siew@illumio.com> >__

__NOTE:__
This script generates the following outputs:<br>
    1. CSVs with consolidated flows output - "vens-to-vens_consolidated_output.csv" and "iplists-to-vens_consolidated_output.csv"<br>
    2. CSV with consolidated flows output without ports and protocols - "consolidated_output-noports.csv"<br>
    3. CSV with Consumer VENs without APP label assigned - "consumer_hostname_without_applabel_output.csv"<br>
    4. CSV with Provider VENs without APP label assigned - "provider_hostname_without_applabel_output.csv"<br>
    5. HTML with graphs - "vens-to-vens_sunburst_chart.html" and "iplists-to-vens_sunburst_chart.html"<br>
    
- pip3 install -U kaleido
- pip3 install -U pandas
- pip3 install -U plotly

**Update the following Variables:**

In [None]:
# Path of the CSV file that the notebook loads with read_csv_file().
csv_file_path="TrafficData_10_16_2023__4_03_09_PM.csv"

Should we enable the location view? (default is False)

In [None]:
# Forwarded as `location` to both sunburst generators; when truthy the
# "loc" columns are added to the chart hierarchy.
location=True

Sunburst chart depth level (default 2, up to 6)

In [None]:
# Requested sunburst depth; a later cell clamps it to the 2..6 range
# before the charts are drawn.
maxdepth=6

<span style='background:Red;font-weight:bold;'> ### DO NOT EDIT FROM HERE ###</span>

In [None]:
import sys
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [None]:
def read_csv_file(file_path):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when the file
            does not exist. Other read errors propagate unchanged.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        sys.exit(1)
    else:
        return frame

In [None]:
def preprocess_data(df):
    """Fill missing labels with 'NO LABEL' and parse the detection timestamps.

    The frame is modified in place and also returned.

    Labels are filled only on rows matching one of three conditions, applied
    in this order:
      1. Both 'Consumer Hostname' and 'Provider Hostname' are present:
         all six consumer/provider label columns are filled.
      2. 'Consumer IPList' is present: the provider label columns are filled.
      3. 'Provider IPList' is present: the consumer label columns are filled.

    Args:
        df: DataFrame holding the hostname, IPList, label and
            'First Detected' / 'Last Detected' columns used below.

    Returns:
        Tuple ``(df, time_difference)`` where ``time_difference`` is the latest
        'Last Detected' minus the earliest 'First Detected'.
    """
    consumer_labels = ['Consumer app', 'Consumer env', 'Consumer loc']
    provider_labels = ['Provider app', 'Provider env', 'Provider loc']

    def label_missing(rows, columns):
        # Only the selected rows receive the filled values; other rows keep
        # whatever they had (including missing values).
        df.loc[rows, columns] = df[columns].fillna('NO LABEL')

    # Condition 1: both endpoints have a hostname.
    both_hostnames = df['Consumer Hostname'].notna() & df['Provider Hostname'].notna()
    label_missing(both_hostnames, consumer_labels + provider_labels)

    # Condition 2: the consumer side is an IPList.
    label_missing(df['Consumer IPList'].notna(), provider_labels)

    # Condition 3: the provider side is an IPList.
    label_missing(df['Provider IPList'].notna(), consumer_labels)

    for timestamp_column in ('First Detected', 'Last Detected'):
        df[timestamp_column] = pd.to_datetime(df[timestamp_column])

    # Span covered by the dataset: latest sighting minus earliest sighting.
    time_difference = df['Last Detected'].max() - df['First Detected'].min()

    return df, time_difference

In [None]:
def sunburst_chart_output(df, path, maxdepth, chart_name, hover_name, hover_data, labels):
    """Render a sunburst chart, save it as HTML and display it.

    Args:
        df: Grouped DataFrame containing the `path` columns and a 'Count' column.
        path: Column names defining the sunburst hierarchy, outermost first.
        maxdepth: Number of hierarchy levels rendered at once.
        chart_name: File name of the HTML output, created inside the
            notebook-level `directory`.
        hover_name: Column shown as the hover title.
        hover_data: Columns added to the hover tooltip.
        labels: Mapping of column name to display name.

    Relies on the notebook-level global `directory` being defined.
    """
    fig = make_subplots(rows=1, cols=1)
    sunburst_chart = px.sunburst(
        df,
        path=path,
        values='Count',
        color='Count',
        color_continuous_scale='Viridis',
        hover_name=hover_name,
        hover_data=hover_data,
        labels=labels,
        maxdepth=maxdepth,
    )

    fig.add_trace(sunburst_chart.data[0])
    fig.update_layout(
        # Plotly Express keeps the continuous colour scale (Viridis) and the
        # colour-bar title on its own figure's layout.coloraxis. Only the trace
        # is copied above, so the colour axis has to be carried over as well or
        # the requested colour scale is lost.
        coloraxis=sunburst_chart.layout.coloraxis,
        legend_title_text="Legend",
        legend_traceorder="normal",  # Change trace order in the legend
        margin=dict(t=0, l=0, r=0, b=0),
    )
    fig.update_traces(textinfo='label+percent entry', insidetextorientation='radial')

    chart_path = os.path.join(directory, chart_name)

    # write_html emits the same document as to_html() but always encodes it as
    # UTF-8, so non-ASCII labels do not depend on the platform default encoding.
    fig.write_html(chart_path)

    fig.show()

In [None]:
def generate_sunburst_from_dataframe(df, columns, chart_name, path, hover_data, labels):
    """Count flows per unique `columns` combination, save them and chart them.

    Writes '<chart_name>_consolidated_output.csv' into the notebook-level
    `directory`, then hands the counts to sunburst_chart_output(), which
    produces '<chart_name>_sunburst_chart.html'. If grouping yields no rows, a
    message is printed and nothing is written.

    Args:
        df: Flow DataFrame to group.
        columns: Column names to group by.
        chart_name: Prefix used for the CSV and HTML file names.
        path: Sunburst hierarchy; its last entry is used as the hover title.
        hover_data: Columns added to the hover tooltip.
        labels: Mapping of column name to display name.

    Note:
        The chart depth is taken from the notebook-level `maxdepth` variable,
        not from a parameter of this function.
    """
    grouped = df.groupby(columns).size().reset_index(name='Count')

    if grouped.empty:
        print("No data to generate Sunburst chart.")
        return

    grouped.to_csv(
        os.path.join(directory, f'{chart_name}_consolidated_output.csv'),
        index=False,
    )

    sunburst_chart_output(
        grouped,
        path,
        maxdepth,
        f'{chart_name}_sunburst_chart.html',
        path[-1],  # the innermost hierarchy level is used as the hover title
        hover_data,
        labels,
    )

In [None]:
def generate_vens_to_vens_sunburst_chart(df, maxdepth=2, location=False):
    """Build the 'vens-to-vens' sunburst chart and its consolidated CSV.

    Keeps only flows where the consumer has a name or hostname AND the provider
    has a name or hostname, then delegates to generate_sunburst_from_dataframe().

    Args:
        df: Pre-processed flow DataFrame.
        maxdepth: Accepted for interface compatibility but not referenced in
            this body; generate_sunburst_from_dataframe() reads the
            notebook-level `maxdepth` instead.
        location: When truthy, the 'loc' level is placed in front of the
            'env' and 'app' levels on both sides of the hierarchy.
    """
    consumer_known = df['Consumer Name'].notna() | df['Consumer Hostname'].notna()
    provider_known = df['Provider Name'].notna() | df['Provider Hostname'].notna()
    df_filtered = df.loc[consumer_known & provider_known]

    columns = [
        'Consumer app', 'Consumer env', 'Consumer loc',
        'Provider app', 'Provider env', 'Provider loc',
        'Port', 'Protocol',
    ]

    # Hierarchy runs provider-first; each side goes (loc ->) env -> app.
    levels = ['env', 'app']
    if location:
        levels = ['loc'] + levels
    path = [f'Provider {level}' for level in levels] + [f'Consumer {level}' for level in levels]

    hover_data = ['Count', 'Consumer app', 'Consumer env', 'Provider app', 'Provider env']
    labels = {'Consumer app': 'Consumer App', 'Provider app': 'Provider App'}

    generate_sunburst_from_dataframe(df_filtered, columns, 'vens-to-vens', path, hover_data, labels)

In [None]:
def generate_iplists_to_vens_sunburst_chart(df, maxdepth=2, location=False):
    """Build the 'iplists-to-vens' sunburst chart and its consolidated CSV.

    Keeps only flows whose consumer is an IPList and whose provider has a
    hostname, then delegates to generate_sunburst_from_dataframe().

    Args:
        df: Pre-processed flow DataFrame.
        maxdepth: Accepted for interface compatibility but not referenced in
            this body; generate_sunburst_from_dataframe() reads the
            notebook-level `maxdepth` instead.
        location: When truthy, 'Provider loc' becomes the outermost level.
    """
    df_filtered = df[df['Consumer IPList'].notna() & df['Provider Hostname'].notna()]
    columns = ['Consumer IPList', 'Provider app', 'Provider env', 'Provider loc', 'Port', 'Protocol']

    if location:
        path = ['Provider loc', 'Provider env', 'Provider app', 'Consumer IPList']
    else:
        path = ['Provider env', 'Provider app', 'Consumer IPList']

    hover_data = ['Count', 'Consumer IPList', 'Provider app', 'Provider env']
    # NOTE(review): 'Consumer iplist' does not match the 'Consumer IPList'
    # column name, so this entry probably has no effect - confirm intent.
    labels = {'Consumer iplist': 'Consumer IPList', 'Provider app': 'Provider App'}

    # Pass the filtered frame: the unfiltered one would also count flows whose
    # provider has no hostname, which the filter above is meant to exclude.
    generate_sunburst_from_dataframe(df_filtered, columns, 'iplists-to-vens', path, hover_data, labels)

In [None]:
def visualize_ports_usage(df, group, chart_name):
    """Chart and tabulate how often each `group` combination occurs.

    Args:
        df: Flow DataFrame.
        group: Column names to group by. The first and last entries are also
            the two columns summarised by table_view().
        chart_name: Name forwarded to stacked_chart_output().
    """
    # One row per unique combination, with the number of matching rows.
    hit_counts = df.groupby(group).size().reset_index(name='Count')
    stacked_chart_output(hit_counts, chart_name)

    # The tables are built from the ungrouped frame, not from hit_counts.
    table_view(df, group[0], group[-1])

In [None]:
def table_view(df, provider, consumer):
    """Display two tables with the value counts of two columns.

    Args:
        df: Flow DataFrame.
        provider: Name of the column summarised in the first table.
        consumer: Name of the column summarised in the second table.

    Both tables are built first and then shown, in that order.
    """
    def hit_count_table(column):
        # value_counts() orders the rows by descending frequency; the columns
        # are renamed so the table header reads '<column>' / 'Count'.
        counts = df[column].value_counts().reset_index()
        counts.columns = [column, 'Count']

        figure = ff.create_table(counts)
        figure.update_layout(
            title=f"{column} Unique Hits Counts",
            margin=dict(t=50, l=10, r=10, b=10)
        )
        return figure

    tables = [hit_count_table(column) for column in (provider, consumer)]

    for table in tables:
        table.show()

In [None]:
def stacked_chart_output(grouped, chart_name):
    """Plot hit counts per provider application, one bar chart per port range.

    Three charts are produced (ports 0-1023, 1024-49151 and 49152-65535), saved
    as PNG files under the notebook-level `directory` and then displayed.

    Args:
        grouped: DataFrame with 'Provider app', 'Port' and 'Count' columns.
            A 'Port Range' column is added to it in place.
        chart_name: Prefix for the chart titles and the PNG file names.

    Note:
        PNG export goes through ``write_image``; the notebook header lists
        kaleido as a requirement, presumably for this step.
    """
    # (name used in the PNG file name, range label, inclusive lower/upper port)
    port_bands = (
        ('low', '0-1023', 0, 1023),
        ('medium', '1024-49151', 1024, 49151),
        ('high', '49152-65535', 49152, 65535),
    )

    def group_ports(port):
        for _, band_label, lowest, highest in port_bands:
            if lowest <= port <= highest:
                return band_label
        # Anything outside the three bands; these rows are not charted.
        return 'Other'

    grouped['Port Range'] = grouped['Port'].apply(group_ports)

    # Build all charts first so the save/show order below stays low, medium, high.
    charts = []
    for band_name, band_label, _, _ in port_bands:
        chart = px.bar(
            grouped[grouped['Port Range'] == band_label],
            x='Provider app', y='Count', color='Port', text='Count',
            labels={'Provider app': 'Provider Application', 'Count': 'Count'},
            title=f'{chart_name} Count of Unique Hits ({band_label}) per Provider Application (Stacked)',
        )
        charts.append((band_name, chart))

    for _, chart in charts:
        chart.update_traces(textposition='inside')

    chart_path = os.path.join(directory, chart_name)

    for band_name, chart in charts:
        chart.write_image(f"{chart_path}-fig_{band_name}_ports.png")

    for _, chart in charts:
        chart.show()

In [None]:
def generate_consumer_and_provider_hostname_without_applabel(df):
    """Report consumer and provider hosts whose app label is 'NO LABEL'.

    Side effects:
        * Every missing value in the six label columns of `df` is replaced
          with 'NO LABEL' in place.
        * Unless both reports are empty, both
          'consumer_hostname_without_applabel_output.csv' and
          'provider_hostname_without_applabel_output.csv' are written to the
          notebook-level `directory` (an empty report is still written).

    Args:
        df: Flow DataFrame with hostname, IP and label columns for both sides.

    Returns:
        Tuple ``(consumer_report, provider_report)``. Each element is a
        DataFrame of hostname/IP/label combinations with a 'Count' column, or
        the string "No concern" when that side has no unlabelled hosts.
    """
    columns_to_replace = ['Consumer app', 'Consumer env', 'Consumer loc', 'Provider app', 'Provider env', 'Provider loc']
    df[columns_to_replace] = df[columns_to_replace].fillna('NO LABEL')

    def unlabelled_hosts(side):
        # Rows on this side without an app label, counted per host.
        keys = [f'{side} Hostname', f'{side} IP', f'{side} app', f'{side} env', f'{side} loc']
        return df[df[f'{side} app'] == 'NO LABEL'].groupby(keys).size().reset_index(name='Count')

    consumer_hostname_grouped = unlabelled_hosts('Consumer')
    provider_hostname_grouped = unlabelled_hosts('Provider')

    # Nothing to report on either side: skip the CSV files entirely.
    if consumer_hostname_grouped.empty and provider_hostname_grouped.empty:
        return "No concern", "No concern"

    reports = (
        ('consumer', consumer_hostname_grouped),
        ('provider', provider_hostname_grouped),
    )
    for prefix, report in reports:
        report.to_csv(
            os.path.join(directory, f'{prefix}_hostname_without_applabel_output.csv'),
            index=False,
        )

    return (
        "No concern" if consumer_hostname_grouped.empty else consumer_hostname_grouped,
        "No concern" if provider_hostname_grouped.empty else provider_hostname_grouped,
    )

In [None]:
def group_and_save_outputs(dataset, name):
    """Write one CSV per provider (app, env, loc) combination.

    Each combination gets a directory '<app> | <env> | <loc>', created relative
    to the current working directory, holding '<name>_<app>_<env>_<loc>.csv'
    with the rows of that combination.

    Args:
        dataset: DataFrame with 'Provider app', 'Provider env' and
            'Provider loc' columns.
        name: Prefix for the CSV file names.
    """
    grouped = dataset.groupby(['Provider app', 'Provider env', 'Provider loc'])

    # Iterate through each unique combination and create CSV files
    for group_keys, group_data in grouped:
        # A '/' in any label would act as a path separator and point the CSV at
        # a directory that does not exist, so every component is sanitised (not
        # only the app label). str() keeps non-string labels from raising.
        provider_app, provider_env, provider_loc = (
            str(key).replace('/', '_') for key in group_keys
        )

        directory_name = f"{provider_app} | {provider_env} | {provider_loc}"
        os.makedirs(directory_name, exist_ok=True)

        csv_filename = f"{name}_{provider_app}_{provider_env}_{provider_loc}.csv"
        csv_path = os.path.join(directory_name, csv_filename)

        group_data.to_csv(csv_path, index=False)

In [None]:
def process_csv_file(csv_file_path, output_prefix, columns):
    """Visualise port usage for a consolidated CSV and split it per provider.

    When the file does not exist, a message is printed and nothing else happens.

    Args:
        csv_file_path: Path of the consolidated CSV to read.
        output_prefix: Name passed to visualize_ports_usage() and
            group_and_save_outputs().
        columns: Grouping columns passed to visualize_ports_usage().
    """
    if not os.path.exists(csv_file_path):
        print(f"File not found: {csv_file_path}, skipping visualization.")
        return

    flows = read_csv_file(csv_file_path)
    visualize_ports_usage(flows, columns, output_prefix)
    group_and_save_outputs(flows, output_prefix)

In [None]:
def output_workloads(df, columns, filename):
    """Count rows per unique `columns` combination and save the result.

    Args:
        df: Flow DataFrame.
        columns: Column names to group by.
        filename: Name of the CSV file created in the notebook-level
            `directory`.

    Returns:
        DataFrame with the grouping columns plus 'Count', sorted by 'Count'
        in descending order.
    """
    ranked = (
        df.groupby(columns)
        .size()
        .reset_index(name='Count')
        .sort_values(by='Count', ascending=False)
    )

    ranked.to_csv(os.path.join(directory, filename), index=False)

    return ranked

<span style='background:Red;font-weight:bold;'> ### DO NOT EDIT TILL HERE ###</span>

In [None]:
# Load the CSV and fill missing labels / parse timestamps.
# time_difference is the span between the earliest 'First Detected' and the
# latest 'Last Detected' value.
df = read_csv_file(csv_file_path)
df, time_difference = preprocess_data(df)

# Output folder for the generated CSV/HTML/PNG files. The helper functions
# above read this name as a global, so it must be set before they are called.
directory = "consolidated_outputs"
os.makedirs(directory, exist_ok=True)

In [None]:
# Each result is either a DataFrame of hosts without an app label or the
# string "No concern". The call also fills every missing label in df with
# 'NO LABEL' in place, which the cells below build on.
consumer_hostname_without_applabel, provider_hostname_without_applabel = generate_consumer_and_provider_hostname_without_applabel(df)

In [None]:
# Keep the sunburst depth within the supported 2..6 range.
maxdepth = max(min(maxdepth, 6), 2)

 <span style='color:Blue;font-weight:bold;font-size: 20px'> Outputs:</span> 

In [None]:
# time_difference comes from preprocess_data(): latest 'Last Detected' minus
# earliest 'First Detected'.
print(f"Dataset timeframe: {time_difference}")

<span style='font-size: 20px'> <span style='color:Blue;font-weight:bold'> INFO:</span> Port Hit Count (refer: "port_usage_count.csv")</span>

In [None]:
# Save the per (Protocol, Port) row counts to port_usage_count.csv; the
# returned frame is not displayed here.
output_workloads(df, ['Protocol', 'Port'], "port_usage_count.csv")
# Show separate value-count tables for the Protocol and Port columns.
table_view(df, 'Protocol', 'Port')

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Outgoing Port Hit Count (refer: "outgoing_port_usage_count.csv")</span>

In [None]:
# Row counts per consumer label set, protocol and port: saved to
# outgoing_port_usage_count.csv and printed, highest count first.
print(output_workloads(df, ['Consumer app', 'Consumer env', 'Consumer loc', 'Protocol', 'Port'], "outgoing_port_usage_count.csv"))

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Incoming Port Hit Count (refer: "incoming_port_usage_count.csv")</span>

In [None]:
# Row counts per provider label set, protocol and port: saved to
# incoming_port_usage_count.csv and printed, highest count first.
print(output_workloads(df, ['Provider app', 'Provider env', 'Provider loc', 'Protocol', 'Port'], "incoming_port_usage_count.csv"))

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Consumer VENs/UMWLs Hit Count (refer: 'consumer_hit_counts.csv')</span>

In [None]:
# Row counts per consumer hostname and label set: saved to
# consumer_hit_counts.csv and printed, highest count first.
print(output_workloads(df, ['Consumer Hostname', 'Consumer app', 'Consumer env', 'Consumer loc'], 'consumer_hit_counts.csv'))

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Provider VENs/UMWLs Hit Count (refer: 'provider_hit_counts.csv')</span>

In [None]:
# Row counts per provider hostname and label set: saved to
# provider_hit_counts.csv and printed, highest count first.
print(output_workloads(df, ['Provider Hostname', 'Provider app', 'Provider env', 'Provider loc'], 'provider_hit_counts.csv'))

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Consumer VENs without APP label assigned (refer: 'consumer_hostname_without_applabel_output.csv')</span>

In [None]:
# Prints either the DataFrame of consumer hosts without an app label or the
# string "No concern".
print(consumer_hostname_without_applabel)

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Provider VENs without APP label assigned (refer: 'provider_hostname_without_applabel_output.csv')</span>

In [None]:
# Prints either the DataFrame of provider hosts without an app label or the
# string "No concern".
print(provider_hostname_without_applabel)

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Sunburst Chart for VENs to VENs visualization <br>

This Sunburst chart displays a hierarchical representation of data. (refer: 'vens-to-vens_sunburst_chart.html')<br>

Layer 1: Provider environment<br>
Layer 2: Provider application within Provider environment<br>
Layer 3: Consumer environment outbound to the Provider Application<br>
Layer 4: Consumer application within Consumer environment<br></span>

In [None]:
# Writes vens-to-vens_consolidated_output.csv and
# vens-to-vens_sunburst_chart.html into `directory`, then shows the chart.
# NOTE(review): the maxdepth keyword is not read inside the generator; the
# chart depth comes from the notebook-level maxdepth variable.
generate_vens_to_vens_sunburst_chart(df, maxdepth=maxdepth, location=location)

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> VENs to VENs Ports Usage by Provider App Centric</span>

In [None]:
# Reads the consolidated CSV written by the sunburst cell above; if that file
# is missing, a message is printed and the step is skipped.
process_csv_file(f"{directory}/vens-to-vens_consolidated_output.csv", "VENs-to-VENs", ['Provider app', 'Port', 'Protocol', 'Consumer app'])

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> Sunburst Chart for IPLists to VENs visualization <br>

This Sunburst chart displays a hierarchical representation of data. (refer: 
'iplists-to-vens_sunburst_chart.html')<br>

Layer 1: Provider environment<br>
Layer 2: Provider application within Provider environment<br>
Layer 3: Consumer IPLists outbound to the Provider Application<br>
Layer 4: Consumer IPs within Consumer IPLists<br></span>

In [None]:
# Writes iplists-to-vens_consolidated_output.csv and
# iplists-to-vens_sunburst_chart.html into `directory`, then shows the chart.
# NOTE(review): the maxdepth keyword is not read inside the generator; the
# chart depth comes from the notebook-level maxdepth variable.
generate_iplists_to_vens_sunburst_chart(df, maxdepth=maxdepth, location=location)

<span style='font-size: 20px'><span style='color:Blue;font-weight:bold;'> INFO:</span> IPLists to VENs Ports Usage by Provider App Centric</span>

In [None]:
# Reads the consolidated CSV written by the sunburst cell above; if that file
# is missing, a message is printed and the step is skipped.
process_csv_file(f"{directory}/iplists-to-vens_consolidated_output.csv", "IPLists-to-VENs", ['Provider app', 'Port', 'Protocol', 'Consumer IPList'])