__Created and Maintained by Boon < <boon.siew@illumio.com> >__

__NOTE:__
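This script generates the following outputs (`{chart}` is `vens-to-vens` or `iplists-to-vens`):<br>
    1. CSV with consolidated flows output - "{chart}_consolidated_output.csv"<br>
    2. CSV with consolidated flows output without ports and protocols - "{chart}_consolidated_output-noports_info.csv"<br>
    3. CSV with Consumer VENs without APP label assigned - "consumer_hostname_without_applabel_output.csv"<br>
    4. CSV with Provider VENs without APP label assigned - "provider_hostname_without_applabel_output.csv"<br>
    5. HTML with graph - "{chart}_sunburst_chart.html"<br>
    6. PNG images of the port-usage bar charts - "VENs-to-VENs-fig_low_ports.png", "VENs-to-VENs-fig_medium_ports.png", "VENs-to-VENs-fig_high_ports.png" (and the same three with the "IPLists-to-VENs" prefix)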
    
- pip3 install -U kaleido
- pip3 install -U pandas
- pip3 install -U plotly

**Update the following Variables:**

In [None]:
# Path of the traffic CSV export to analyse.
# Previously used inputs, kept as examples:
#   "TrafficData 18_09_2023, 17_45_12.csv"
#   "prod_to_nonprod.csv"
csv_file_path = "1e23bcd3-1b45-4822-bdba-2c9cd1ddc116.csv"

Should we enable the location view? (default is False)

In [None]:
# When True, the sunburst paths also include the 'Provider loc' / 'Consumer loc' levels.
location = False

Sunburst chart depth level (default 2, up to 6). Note: a later cell clamps this value to the range 2–4 before the charts are drawn.

In [None]:
# Number of sunburst rings rendered at once (passed to plotly as `maxdepth`).
maxdepth = 4

<span style='background:Red;font-weight:bold;'> ### DO NOT EDIT FROM HERE ###</span>

In [None]:
import sys
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [None]:
def read_csv_file(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    If the file does not exist, a "File not found" message is printed and the
    process is terminated through ``sys.exit(1)``.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        sys.exit(1)
    return frame

In [None]:
def preprocess_data(df):
    """Normalise label and IP-list columns and measure the dataset's time span.

    The frame is modified in place and also returned:

    * missing values in the six app/env/loc label columns become 'NO LABEL';
    * for each side (Consumer, Provider), rows with neither a hostname nor an
      IP list get 'NOT IN IPLIST' as their IP list;
    * 'First Detected' and 'Last Detected' are converted with ``pd.to_datetime``.

    Args:
        df: Traffic DataFrame containing the columns referenced above.

    Returns:
        Tuple ``(df, time_difference)`` where ``time_difference`` is the latest
        'Last Detected' minus the earliest 'First Detected'.
    """
    label_columns = ['Consumer app', 'Consumer env', 'Consumer loc', 'Provider app', 'Provider env', 'Provider loc']
    # Replace whole columns rather than writing through ``df.loc[:, cols]``:
    # an in-place write of strings into a column that is entirely NaN
    # (float64) is a deprecated dtype upcast in recent pandas releases.
    df[label_columns] = df[label_columns].fillna('NO LABEL')

    for side in ('Consumer', 'Provider'):
        iplist_column = f'{side} IPList'
        unmatched = df[f'{side} Hostname'].isna() & df[iplist_column].isna()
        # ``mask`` returns a new Series, so the dtype may widen to object
        # without an in-place incompatible-dtype assignment.
        df[iplist_column] = df[iplist_column].mask(unmatched, 'NOT IN IPLIST')

    df['First Detected'] = pd.to_datetime(df['First Detected'])
    df['Last Detected'] = pd.to_datetime(df['Last Detected'])

    # Span between the earliest first-seen and the latest last-seen timestamp.
    earliest_timestamp = df['First Detected'].min()
    latest_timestamp = df['Last Detected'].max()
    time_difference = latest_timestamp - earliest_timestamp

    return df, time_difference

In [None]:
def sunburst_chart_output(df, path, maxdepth, chart_name, hover_name, hover_data, labels):
    """Build a sunburst chart from a counts frame, save it as HTML and show it.

    Args:
        df: DataFrame with a 'Count' column plus every column named in ``path``.
        path: Column names forming the sunburst hierarchy, outermost parent first.
        maxdepth: Number of hierarchy levels rendered at once.
        chart_name: File name of the HTML file to write.
        hover_name: Column whose value is used as the hover title.
        hover_data: Columns shown in the hover tooltip.
        labels: Mapping of column name to display label.
    """
    fig = make_subplots(rows=1, cols=1)
    sunburst_chart = px.sunburst(
        df,
        path=path,
        values='Count',
        color='Count',
        color_continuous_scale='Viridis',
        hover_name=hover_name,  # Column chosen by the caller as the hover title
        hover_data=hover_data,
        labels=labels,
        maxdepth=maxdepth,
    )

    # NOTE(review): only the trace is copied out of the Plotly Express figure;
    # its layout (where the colour-axis settings presumably live) is not.
    # Confirm that the Viridis scale is actually applied in the final chart.
    fig.add_trace(sunburst_chart.data[0])
    fig.update_layout(
        legend_title_text="Legend",
        legend_traceorder="normal",  # Change trace order in the legend
    )
    fig.update_traces(textinfo='label+percent entry', insidetextorientation='radial')
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))

    # Write explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on, or mis-encode, non-ASCII label text.
    with open(chart_name, 'w', encoding='utf-8') as f:
        f.write(fig.to_html())

    fig.show()

In [None]:
def generate_sunburst_from_dataframe(df, columns, merge_columns, chart_name, path, hover_data, labels):
    """Aggregate flows, export the aggregates to CSV and draw the sunburst chart.

    Two row counts are computed from ``df``: one grouped by ``columns`` and one
    grouped by ``merge_columns``. When the first is empty a message is printed
    and nothing is written. Otherwise both are saved as
    ``{chart_name}_consolidated_output.csv`` and
    ``{chart_name}_consolidated_output-noports_info.csv``, and the second is
    passed to ``sunburst_chart_output`` with the last entry of ``path`` as the
    hover title.

    The chart depth is read from the notebook-level ``maxdepth`` variable; it
    is not a parameter of this function.
    """
    per_flow_counts = df.groupby(columns).size().reset_index(name='Count')
    per_label_counts = df.groupby(merge_columns).size().reset_index(name='Count')

    # Guard clause: nothing to export or plot.
    if per_flow_counts.empty:
        print("No data to generate Sunburst chart.")
        return

    per_flow_counts.to_csv(f'{chart_name}_consolidated_output.csv', index=False)
    per_label_counts.to_csv(f'{chart_name}_consolidated_output-noports_info.csv', index=False)

    sunburst_chart_output(
        per_label_counts,
        path,
        maxdepth,
        f'{chart_name}_sunburst_chart.html',
        path[-1],
        hover_data,
        labels,
    )

In [None]:
def generate_vens_to_vens_sunburst_chart(df, maxdepth=2, location=False):
    """Export and chart the flows where both consumer and provider are VENs.

    Only rows that have both a 'Consumer Hostname' and a 'Provider Hostname'
    are forwarded to ``generate_sunburst_from_dataframe`` under the chart name
    'vens-to-vens'.

    Args:
        df: Traffic DataFrame.
        maxdepth: Accepted for API compatibility but not forwarded; the
            downstream helper reads the notebook-level ``maxdepth`` variable.
        location: When True, the chart hierarchy also includes the
            'Provider loc' and 'Consumer loc' levels.
    """
    df_filtered = df[(~df['Consumer Hostname'].isna()) & (~df['Provider Hostname'].isna())]
    columns = ['Consumer Hostname', 'Consumer app', 'Consumer env', 'Consumer loc', 'Provider Hostname', 'Provider app', 'Provider env', 'Provider loc', 'Port', 'Protocol']
    merge_columns = ['Consumer app', 'Consumer env', 'Consumer loc', 'Provider app', 'Provider env', 'Provider loc']

    if location:
        path = ['Provider loc', 'Provider env', 'Provider app', 'Consumer loc', 'Consumer env', 'Consumer app']
    else:
        path = ['Provider env', 'Provider app', 'Consumer env', 'Consumer app']

    hover_data = ['Count', 'Consumer app', 'Consumer env', 'Provider app', 'Provider env']
    labels = {'Consumer app': 'Consumer App', 'Provider app': 'Provider App'}

    # Pass the filtered frame: the label-only aggregation downstream does not
    # group by hostname, so the unfiltered frame would count non-VEN flows too.
    generate_sunburst_from_dataframe(df_filtered, columns, merge_columns, 'vens-to-vens', path, hover_data, labels)

In [None]:
def generate_iplists_to_vens_sunburst_chart(df, maxdepth=2, location=False):
    """Export and chart the flows from IP-list consumers to VEN providers.

    Only rows that have both a 'Consumer IPList' and a 'Provider Hostname'
    are forwarded to ``generate_sunburst_from_dataframe`` under the chart name
    'iplists-to-vens'.

    Args:
        df: Traffic DataFrame.
        maxdepth: Accepted for API compatibility but not forwarded; the
            downstream helper reads the notebook-level ``maxdepth`` variable.
        location: When True, 'Provider loc' is added as the outermost level.
    """
    df_filtered = df[~df['Consumer IPList'].isna() & ~df['Provider Hostname'].isna()]
    columns = ['Consumer IP', 'Consumer IPList', 'Provider app', 'Provider env', 'Provider loc', 'Port', 'Protocol']
    merge_columns = ['Consumer IP', 'Consumer IPList', 'Provider app', 'Provider env', 'Provider loc']

    if location:
        path = ['Provider loc', 'Provider env', 'Provider app', 'Consumer IPList']
    else:
        path = ['Provider env', 'Provider app', 'Consumer IPList']

    hover_data = ['Count', 'Consumer IPList', 'Provider app', 'Provider env']
    # The key must match the column name exactly; the previous 'Consumer iplist'
    # key never matched any column.
    labels = {'Consumer IPList': 'Consumer IPList', 'Provider app': 'Provider App'}

    # Pass the filtered frame so flows to providers without a hostname are
    # left out of the aggregates.
    generate_sunburst_from_dataframe(df_filtered, columns, merge_columns, 'iplists-to-vens', path, hover_data, labels)

In [None]:
def visualize_ports_usage(df, group, chart_name):
    """Show the port-usage bar charts and hit-count tables for ``df``.

    Rows are counted per combination of the ``group`` columns and the result
    goes to ``stacked_chart_output``. ``table_view`` then receives the original
    frame together with the first and the last column name of ``group``.
    """
    row_counts = df.groupby(group).size()
    stacked_chart_output(row_counts.reset_index(name='Count'), chart_name)

    first_column, last_column = group[0], group[-1]
    table_view(df, first_column, last_column)

In [None]:
def table_view(df, provider, consumer):
    """Display two tables with the value counts of two columns of ``df``.

    Args:
        df: DataFrame to summarise.
        provider: Column summarised in the first table.
        consumer: Column summarised in the second table.
    """
    def _hit_count_table(column):
        # One row per distinct value of `column` plus the number of rows carrying it.
        counts = df[column].value_counts().reset_index()
        counts.columns = [column, 'Count']

        figure = ff.create_table(counts)
        figure.update_layout(
            title=f"{column} Unique Hits Counts",
            margin=dict(t=50, l=10, r=10, b=10)
        )
        return figure

    # Build both tables first, then display them in provider/consumer order.
    tables = [_hit_count_table(provider), _hit_count_table(consumer)]
    for figure in tables:
        figure.show()

In [None]:
def stacked_chart_output(grouped, chart_name):
    """Draw, save and show one stacked bar chart per port range.

    A 'Port Range' column is added to ``grouped`` in place. For each of the
    ranges 0-1023, 1024-49151 and 49152-65535 a bar chart of 'Count' per
    'Provider app' (coloured by 'Port') is created, written to
    ``{chart_name}-fig_<low|medium|high>_ports.png`` and displayed.

    Args:
        grouped: DataFrame with 'Provider app', 'Port' and 'Count' columns.
        chart_name: Prefix used in chart titles and image file names.
    """
    def group_ports(port):
        # Map a port number onto the label of the range containing it.
        if 0 <= port <= 1023:
            return '0-1023'
        if 1024 <= port <= 49151:
            return '1024-49151'
        if 49152 <= port <= 65535:
            return '49152-65535'
        return 'Other'

    grouped['Port Range'] = grouped['Port'].apply(group_ports)

    # (file-name slug, port-range label); rows labelled 'Other' are not charted.
    buckets = [('low', '0-1023'), ('medium', '1024-49151'), ('high', '49152-65535')]

    charts = []
    for slug, port_range in buckets:
        rows_in_range = grouped[grouped['Port Range'] == port_range]
        bar_chart = px.bar(
            rows_in_range, x='Provider app', y='Count', color='Port', text='Count',
            labels={'Provider app': 'Provider Application', 'Count': 'Count'},
            title=f'{chart_name} Count of Unique Hits ({port_range}) per Provider Application (Stacked)',
        )
        charts.append((slug, bar_chart))

    # Keep the phases separate (style all, save all, show all) so a failure
    # while saving still happens before anything is displayed.
    for _, bar_chart in charts:
        bar_chart.update_traces(textposition='inside')

    for slug, bar_chart in charts:
        bar_chart.write_image(f"{chart_name}-fig_{slug}_ports.png")

    for _, bar_chart in charts:
        bar_chart.show()

In [None]:
def generate_consumer_and_provider_hostname_without_applabel(df):
    """List the consumer and provider hosts that have no app label.

    Missing values in the six app/env/loc label columns of ``df`` are replaced
    with 'NO LABEL' (the frame is modified). For each side, rows whose app
    label is 'NO LABEL' are counted per hostname, IP, app, env and loc. The two
    results are written to 'consumer_hostname_without_applabel_output.csv' and
    'provider_hostname_without_applabel_output.csv'.

    Returns:
        Tuple ``(consumer_hostname_grouped, provider_hostname_grouped)``.
    """
    label_columns = ['Consumer app', 'Consumer env', 'Consumer loc', 'Provider app', 'Provider env', 'Provider loc']
    df[label_columns] = df[label_columns].fillna('NO LABEL')

    def _unlabelled_hosts(side):
        # Count the rows of one side ('Consumer' or 'Provider') lacking an app label.
        group_keys = [f'{side} Hostname', f'{side} IP', f'{side} app', f'{side} env', f'{side} loc']
        without_app = df[df[f'{side} app'] == 'NO LABEL']
        return without_app.groupby(group_keys).size().reset_index(name='Count')

    consumer_hostname_grouped = _unlabelled_hosts('Consumer')
    provider_hostname_grouped = _unlabelled_hosts('Provider')

    consumer_hostname_grouped.to_csv('consumer_hostname_without_applabel_output.csv', index=False)
    provider_hostname_grouped.to_csv('provider_hostname_without_applabel_output.csv', index=False)

    return consumer_hostname_grouped, provider_hostname_grouped

In [None]:
def show_time_difference(time_difference):
    """Break a time span into whole days, hours and minutes.

    Args:
        time_difference: A ``datetime.timedelta``-like object exposing
            ``days`` and ``seconds`` (``pandas.Timedelta`` qualifies).

    Returns:
        Dict with integer 'Days', 'Hours' and 'Minutes'. Hours and minutes
        cover only the part of the span left after the whole days, so two days
        and three hours yields Days=2, Hours=3 rather than Hours=51.
    """
    # ``seconds`` is the sub-day remainder (0..86399); ``total_seconds()``
    # would count the whole days again.
    hours, leftover_seconds = divmod(time_difference.seconds, 3600)
    minutes = leftover_seconds // 60

    return {
        'Days': time_difference.days,
        'Hours': int(hours),
        'Minutes': int(minutes)
    }

<span style='background:Red;font-weight:bold;'> ### DO NOT EDIT TILL HERE ###</span>

In [None]:
# Read the configured traffic export and normalise it in one step.
# `time_difference` is the span between the earliest and latest detection.
df, time_difference = preprocess_data(read_csv_file(csv_file_path))

In [None]:
# Collect (and write to CSV) the hosts on either side that have no app label.
(
    consumer_hostname_without_applabel,
    provider_hostname_without_applabel,
) = generate_consumer_and_provider_hostname_without_applabel(df)

In [None]:
# Keep the configured chart depth within the supported range of 2 to 4.
maxdepth = max(min(maxdepth, 4), 2)

 <span style='color:Blue;font-weight:bold;'> Outputs:</span> 

In [None]:
# Report how much time the dataset spans.
time = show_time_difference(time_difference)
print(
    f"Dataset timeframe: {time['Days']} Days, {time['Hours']} Hours, {time['Minutes']} Minutes."
)

<span style='color:Blue;font-weight:bold;'> INFO:</span> Consumer VENs without APP label assigned

In [None]:
# Plain-text dump of the consumer rows returned by
# generate_consumer_and_provider_hostname_without_applabel in an earlier cell.
print(consumer_hostname_without_applabel)

<span style='color:Blue;font-weight:bold;'> INFO:</span> Provider VENs without APP label assigned

In [None]:
# Plain-text dump of the provider rows returned by
# generate_consumer_and_provider_hostname_without_applabel in an earlier cell.
print(provider_hostname_without_applabel)

<span style='color:Blue;font-weight:bold;'> INFO:</span> Sunburst Chart for VENs to VENs visualization <br>

This Sunburst chart displays a hierarchical representation of data.<br>

Layer 1: Provider environment<br>
Layer 2: Provider application within Provider environment<br>
Layer 3: Consumer environment outbound to the Provider Application<br>
Layer 4: Consumer application within Consumer environment<br>

In [None]:
# Writes the 'vens-to-vens_*' CSV/HTML outputs and displays the chart.
generate_vens_to_vens_sunburst_chart(
    df,
    maxdepth=maxdepth,
    location=location,
)

<span style='color:Blue;font-weight:bold;'> INFO:</span> VENs to VENs Ports Usage by Provider App Centric

In [None]:
# Re-load the per-flow CSV written by the VENs-to-VENs sunburst step and chart
# its port usage. A dedicated variable is used so the user-configured
# `csv_file_path` is not overwritten; otherwise re-running the load cell
# would read this derived file instead of the original export.
vens_consolidated_csv = "./vens-to-vens_consolidated_output.csv"
if os.path.exists(vens_consolidated_csv):
    rdf = read_csv_file(vens_consolidated_csv)
    columns = ['Provider app', 'Port', 'Protocol', 'Consumer app']
    visualize_ports_usage(rdf, columns, "VENs-to-VENs")
else:
    print(f"File not found: {vens_consolidated_csv}, skipping visualization.")

<span style='color:Blue;font-weight:bold;'> INFO:</span> Sunburst Chart for IPLists to VENs visualization <br>

This Sunburst chart displays a hierarchical representation of data.<br>

Layer 1: Provider environment<br>
Layer 2: Provider application within Provider environment<br>
Layer 3: Consumer IPLists outbound to the Provider Application<br>
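Layer 4: Consumer IPs within Consumer IPLists (note: the chart path defined in `generate_iplists_to_vens_sunburst_chart` currently ends at Consumer IPList, so this layer is not drawn)<br>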

In [None]:
# Writes the 'iplists-to-vens_*' CSV/HTML outputs and displays the chart.
generate_iplists_to_vens_sunburst_chart(
    df,
    maxdepth=maxdepth,
    location=location,
)

<span style='color:Blue;font-weight:bold;'> INFO:</span> IPLists to VENs Ports Usage by Provider App Centric

In [None]:
# Re-load the per-flow CSV written by the IPLists-to-VENs sunburst step and
# chart its port usage. A dedicated variable is used so the user-configured
# `csv_file_path` is not overwritten, and the same variable is used for the
# existence check, the read and the message.
iplists_consolidated_csv = "./iplists-to-vens_consolidated_output.csv"
if os.path.exists(iplists_consolidated_csv):
    idf = read_csv_file(iplists_consolidated_csv)
    columns = ['Provider app', 'Port', 'Protocol', 'Consumer IPList']
    visualize_ports_usage(idf, columns, "IPLists-to-VENs")
else:
    print(f"File not found: {iplists_consolidated_csv}, skipping visualization.")