In [None]:
from IPython.display import clear_output

import panel as pn
import ipywidgets as widgets
import pandas as pd
import altair as alt
import pathlib
import datetime
from vega_datasets import data
import os

# Column labels of the e-mail log. TIMESTAMP, RECIPIENT, SENDER and SUBJECT
# are used below to index the loaded DataFrame; the remaining labels are
# presumably further columns of the same CSV (not referenced in this cell).
TIMESTAMP = "Received At"
RECIPIENT = "Recipient Address"
SENDER = "Sender Address"
SUBJECT = "Subject"
SIZE = "Size"
STATUS = "Status"
SRC_IP = "From IP"
DST_IP = "To IP"

# Names of the derived columns created by the aggregation helpers below.
COUNT = "COUNT"
USER_SUBJECT_COUNT = "USER_SUBJECT_COUNT"
TIME_BIN = "Timestamp"

# User options: labels of the projection types offered by the toggle buttons.
# NOTE(review): TIME_PLOT_DSC is not referenced anywhere in this cell.
TIME_PLOT_DSC = "Time plot"
TOP_RECEIVED_DSC = "Most received emails"
TOP_SENT_DSC = "Most sent emails"
TOP_SUBJECT_DSC = "Most emails by subject"
TIME_HISTOGRAM = "Sent mails in time"


# Location of the e-mail log, relative to the directory the notebook runs in.
prefix_file_path = os.path.join("Task 2", "536535_data", "536535_data")
phishing_file_path = os.path.join(prefix_file_path, "ms365_phishing", "emails.csv")
dir_path = pathlib.Path().resolve()

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed around it.
phishing_data: pd.DataFrame = pd.read_csv(dir_path / phishing_file_path)

# Load Panel's 'vega' extension.
pn.extension('vega')

# Style shared by the widgets that carry long description labels.
style = {'description_width': 'initial'}

# Number of entries shown in the "top N" bar charts. The options are strings;
# the consumer converts the selected value with int().
top_n_widget = widgets.Dropdown(
    options=[str(n) for n in range(10, 60, 10)],
    value='10',
    description='Top N stats',
    disabled=False,
    style=style,
)

# Which projection of the data is plotted.
data_type_widget = widgets.ToggleButtons(
    options=[
        TOP_RECEIVED_DSC,
        TOP_SENT_DSC,
        TOP_SUBJECT_DSC,
        TIME_HISTOGRAM,
    ],
    description='Projection type:',
    disabled=False,
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    style={'button_width': '200px'},
    layout={'width': '520px'},
)

# Nothing is redrawn until this button is clicked.
apply_changes_widget = widgets.Button(
    description='Apply changes',
    tooltip='Apply changes',
    disabled=False,
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    style=style,
)

# Inspected window as (start, end) hour offsets; 168 hours = one week.
period_widget = widgets.IntRangeSlider(
    min=0,
    max=168,
    value=[0, 168],
    step=1,
    description="Time range in hours (max 1 week from the first event)",
    disabled=False,
    style=style,
    layout={'width': '520px'},
)

# Recipient filter; starts disabled and is toggled by data_type_on_change.
user_input_widget = widgets.Textarea(
    value='ALL',
    description='User UCO as email recipient (seperated by ,)',
    disabled=True,
    style=style,
    layout=widgets.Layout(width='80%'),
)

# Controls in the order in which they are displayed.
ui_widgets = [
    top_n_widget,
    period_widget,
    data_type_widget,
    user_input_widget,
    apply_changes_widget,
]

# Detail table shown under the plot; starts with the header row only.
table_widget = widgets.HTML(value=phishing_data.iloc[:0].to_html())


def display_widgets():
    """Show every control in ``ui_widgets``, in list order."""
    display(*ui_widgets)

def get_first_time_flow_string():
    """Return the smallest value of the timestamp column of ``phishing_data``.

    The value is returned as stored in the column (presumably an ISO 8601
    string, so the minimum is the earliest e-mail -- confirm against the CSV).
    """
    timestamps = phishing_data[TIMESTAMP]
    return timestamps.min()

def get_last_time_flow_string():
    """Return the largest value of the timestamp column of ``phishing_data``.

    The value is returned as stored in the column (presumably an ISO 8601
    string, so the maximum is the latest e-mail -- confirm against the CSV).
    """
    timestamps = phishing_data[TIMESTAMP]
    return timestamps.max()

def get_first_time_flow():
    """Return the smallest timestamp of ``phishing_data`` as a ``datetime``."""
    earliest = get_first_time_flow_string()
    return datetime.datetime.fromisoformat(earliest)

def get_last_time_flow():
    """Return the largest timestamp of ``phishing_data`` as a ``datetime``."""
    latest = get_last_time_flow_string()
    return datetime.datetime.fromisoformat(latest)

def to_datetime(x):
    """Parse the ISO 8601 string *x* into a ``datetime.datetime``."""
    parse_iso = datetime.datetime.fromisoformat
    return parse_iso(x)

# Timestamp of the earliest e-mail in the data set.
FIRST_TIMESTAMP = get_first_time_flow()
# Start of the hour that contains the earliest e-mail; origin of the hourly
# bins. Microseconds are cleared as well, otherwise the origin would not lie
# exactly on the hour for timestamps with fractional seconds.
FIRST_TIMESTAMP_ROUNDED_DOWN = FIRST_TIMESTAMP.replace(minute=0, second=0, microsecond=0)
# Timestamp of the latest e-mail in the data set.
LAST_TIMESTAMP = get_last_time_flow()

def to_hours_since_start(x):
    """Return the whole hours elapsed between the bin origin and timestamp *x*.

    *x* is an ISO 8601 string. The origin is ``FIRST_TIMESTAMP_ROUNDED_DOWN``.
    The result is a float because it comes from floor division of float
    seconds.
    """
    elapsed_seconds = (to_datetime(x) - FIRST_TIMESTAMP_ROUNDED_DOWN).total_seconds()
    whole_hours = elapsed_seconds // 3600
    return whole_hours

def plus_start_time(x):
    """Return the start of the one-hour bin that timestamp *x* falls into.

    *x* is an ISO 8601 string. The hour offset returned by
    ``to_hours_since_start`` is measured from ``FIRST_TIMESTAMP_ROUNDED_DOWN``,
    so it is added back to that same origin. Adding it to the un-rounded first
    timestamp would shift every bin by the first event's minutes and seconds.
    """
    whole_hours = to_hours_since_start(x)
    return FIRST_TIMESTAMP_ROUNDED_DOWN + datetime.timedelta(hours=whole_hours)

def process_by_top_n(current_data, top_n):
    """Return the *top_n* rows of *current_data* with the largest COUNT value.

    An empty frame is handed back unchanged.
    """
    if not current_data.empty:
        current_data = current_data.nlargest(top_n, columns=COUNT)
    return current_data

def process_by_unique_recip_to_subject(processed_data, original_data):
    """Attach the number of distinct recipients per subject to *processed_data*.

    *original_data* holds the individual e-mails. The count is left-merged
    onto *processed_data* by subject, in the USER_SUBJECT_COUNT column.
    """
    # One row per distinct (recipient, subject) pair ...
    pair_rows = (
        original_data
        .groupby([RECIPIENT, SUBJECT])
        .size()
        .reset_index(name=USER_SUBJECT_COUNT)
    )
    # ... so counting rows per subject gives the number of distinct recipients.
    recipients_per_subject = (
        pair_rows
        .groupby(SUBJECT)
        .size()
        .reset_index(name=USER_SUBJECT_COUNT)
    )
    return processed_data.merge(recipients_per_subject, how='left', on=SUBJECT)

def process_by_time_range(current_data, time_period):
    """Keep the rows of *current_data* received inside *time_period*.

    *time_period* is a ``(start, end)`` pair of hour offsets. Both are counted
    from the first timestamp of the global ``phishing_data`` (not of
    *current_data*), and both bounds are inclusive. An empty frame is returned
    unchanged.
    """
    if current_data.empty:
        return current_data

    start_offset, end_offset = time_period

    # Look the first timestamp up once instead of once per bound.
    first_event = get_first_time_flow()
    start = first_event + datetime.timedelta(hours=start_offset)
    end = first_event + datetime.timedelta(hours=end_offset)

    # Parse the timestamp column a single time and reuse it for both bounds.
    received_at = current_data[TIMESTAMP].apply(to_datetime)
    return current_data[(received_at >= start) & (received_at <= end)]

def process_by_recipient(current_data, recipients_str):
    """Keep the rows whose recipient address starts with one of the given names.

    *recipients_str* is a comma-separated list of local parts (the text before
    ``@``). ``"ALL"``, an empty string, or input without any usable entry
    disables the filter and returns *current_data* unchanged. Whitespace
    around the whole input and around each entry is ignored.
    """
    import re  # local import keeps this block self-contained

    requested = recipients_str.strip()
    if requested in ("", "ALL"):
        return current_data

    # Entries come from a free-text field. Escape them so characters such as
    # "." or "+" are matched literally instead of being read as regex syntax.
    names = (token.strip() for token in requested.split(','))
    patterns = ["^" + re.escape(name) + "@" for name in names if name]
    if not patterns:
        return current_data

    recipients_regex = "|".join(patterns)
    # na=False: rows without a recipient never match and the mask stays boolean.
    matches = current_data[RECIPIENT].str.contains(pat=recipients_regex, regex=True, na=False)
    return current_data[matches]

def process_data_phishing_subject(current_data, top_n, time_period):
    """Return the *top_n* most frequent subjects inside *time_period*.

    The result has one row per subject. COUNT holds the number of e-mails and
    USER_SUBJECT_COUNT the number of distinct recipients. Both are computed
    from the same time-filtered rows, so the two figures describe the same
    set of e-mails.
    """
    in_range = process_by_time_range(current_data, time_period)
    subject_counts = in_range.groupby(SUBJECT).size().reset_index(name=COUNT)
    # Count recipients on the filtered rows too; using the unfiltered data
    # would report recipients from outside the selected time range.
    subject_counts = process_by_unique_recip_to_subject(subject_counts, in_range)

    return process_by_top_n(subject_counts, top_n)

def process_data_phishing(current_data: pd.DataFrame, top_n, time_period, direction):
    """Return the *top_n* most frequent values of column *direction*.

    Rows are first restricted to *time_period*. The result has one row per
    distinct value of *direction*, with the number of e-mails in COUNT.
    """
    in_range = process_by_time_range(current_data, time_period)
    per_address = in_range.groupby(direction).size()
    counted = per_address.reset_index(name=COUNT)
    return process_by_top_n(counted, top_n)

def process_to_time_histogram(current_data: pd.DataFrame, top_n, time_period, recipients_str: str):
    """Prepare the rows for the hourly histogram.

    Rows are restricted to *time_period* and to the recipients named in
    *recipients_str*, and each remaining row gets its hour bin in the TIME_BIN
    column. *top_n* is accepted for signature parity with the other
    ``process_*`` helpers and is not used. The input frame is never modified.
    """
    in_range = process_by_time_range(current_data, time_period)
    selected = process_by_recipient(in_range, recipients_str)

    # assign() returns a new frame. Writing through .loc on the filtered frame
    # triggers SettingWithCopyWarning and, for an empty input, would add the
    # column to the caller's own DataFrame.
    return selected.assign(**{TIME_BIN: selected[TIMESTAMP].apply(plus_start_time)})

def make_subject_plot(current_data):
    """Build the horizontal bar chart of e-mail counts per subject.

    Returns an ``alt.JupyterChart`` with an interval selection named
    ``"brush"``. For an empty frame it prints a log line and returns ``None``.
    """
    if current_data.empty:
        print("Log: The data set is empty")
        return None

    green_scale = alt.Scale(range=['lightgreen', 'green'])
    tooltips = [
        alt.Tooltip(SUBJECT, title="Subject"),
        alt.Tooltip(USER_SUBJECT_COUNT + ":Q", title="Different recipients with this subject"),
        alt.Tooltip(COUNT + ":Q", title="Email count"),
    ]

    bars = alt.Chart(current_data).mark_bar().encode(
        x=alt.X(COUNT),
        y=alt.Y(SUBJECT, sort='-x'),
        color=alt.Color(COUNT, scale=green_scale),
        tooltip=tooltips,
    )
    sized = bars.properties(title="Top emails by subject", width=800, height=500)

    selection = alt.selection_interval(name="brush")
    return alt.JupyterChart(sized.add_params(selection))

def make_received_sent_plot(current_data, emails_direction):
    """Build the horizontal bar chart of e-mail counts per address.

    *emails_direction* names the address column placed on the y axis. The
    title uses the current value of ``top_n_widget``. Returns an
    ``alt.JupyterChart`` with an interval selection named ``"brush"``. For an
    empty frame it prints a log line and returns ``None``.
    """
    if current_data.empty:
        print("Log: The data set is empty")
        return None

    green_scale = alt.Scale(range=['lightgreen', 'green'])
    chart_title = f"Top {top_n_widget.value} {emails_direction}es"

    bars = alt.Chart(current_data).mark_bar().encode(
        x=alt.X(COUNT),
        y=alt.Y(emails_direction, sort='-x'),
        color=alt.Color(COUNT, scale=green_scale),
        tooltip=[alt.Tooltip(COUNT + ":Q", title="Email count")],
    )
    sized = bars.properties(title=chart_title, width=800, height=500)

    selection = alt.selection_interval(name="brush")
    return alt.JupyterChart(sized.add_params(selection))

def make_time_histogram(current_data, time_range):
    """Build the histogram of e-mails per hour.

    *current_data* must carry the TIME_BIN column. *time_range* is the
    ``(start, end)`` pair of hour offsets being shown. Returns an
    ``alt.JupyterChart`` with an interval selection named ``"brush"``. For an
    empty frame it prints a log line and returns ``None``.
    """
    if current_data.empty:
        print("Log: The data set is empty")
        return

    start, end = time_range
    # Vega-Lite rejects maxbins below 2, which a slider range zero or one
    # hour wide would otherwise produce.
    max_bins = max(end - start, 2)

    brush = alt.selection_interval(name="brush")
    plot = alt.Chart(current_data).mark_bar().encode(
        alt.X(f"{TIME_BIN}:T", bin=alt.Bin(maxbins=max_bins, binned=True), timeUnit="yearmonthdatehours"),
        y='count()',
        tooltip=[alt.Tooltip("count()", title="Email count"),
                 alt.Tooltip(TIME_BIN, title="Time", timeUnit="yearmonthdatehours")],
        color=alt.Color("count()",
                   scale=alt.Scale(range=['lightgreen', 'green'])),
    ).properties(
        title="Distribution of phishing emails in time",
        width=800,
        height=500,
    ).add_params(brush)

    return alt.JupyterChart(plot)

def data_type_on_change(v):
    """Enable the recipient text box only for the time-histogram projection.

    *v* is the argument supplied by the widget callback; it is not used.
    """
    user_input_widget.disabled = data_type_widget.value != TIME_HISTOGRAM

def on_select_time_range(change):
    """Fill the detail table with the e-mails inside the brushed time interval.

    The first and last brush values are read as epoch milliseconds, converted
    to whole-hour offsets from ``FIRST_TIMESTAMP_ROUNDED_DOWN`` and passed to
    ``process_by_time_range``. Without a usable selection the table shows the
    header only.

    NOTE(review): ``process_by_time_range`` adds the offsets to the un-rounded
    first timestamp, while they are measured here from the rounded-down one --
    confirm that the resulting shift is intended.
    NOTE(review): the subtraction needs timezone-aware data timestamps, since
    the brush values are converted to aware UTC datetimes -- confirm.
    """
    selection_key = "yearmonthdatehours_Timestamp"
    sel = change.new.value

    if sel is not None and selection_key in sel:
        utc = datetime.timezone.utc
        hour_offsets = []
        for millis in (sel[selection_key][0], sel[selection_key][-1]):
            moment = datetime.datetime.fromtimestamp(millis / 1000, utc)
            elapsed = moment - FIRST_TIMESTAMP_ROUNDED_DOWN
            hour_offsets.append(elapsed.total_seconds() // 3600)
        filtered = process_by_time_range(phishing_data, tuple(hour_offsets))
    else:
        filtered = phishing_data.iloc[:0]

    table_widget.value = filtered.to_html()

def on_select_subject(change):
    """Fill the detail table with the e-mails whose subject is brushed.

    Without a selection, or when the selection holds no subjects, the table
    shows the header only.
    """
    sel = change.new.value
    has_subjects = sel is not None and SUBJECT in sel

    if has_subjects:
        chosen_rows = phishing_data[SUBJECT].isin(sel[SUBJECT])
        filtered = phishing_data[chosen_rows]
    else:
        filtered = phishing_data.iloc[:0]

    table_widget.value = filtered.to_html()

def on_select_email_address(change):
    """Fill the detail table with the e-mails of the brushed addresses.

    The selection carries either recipient or sender addresses; recipients
    take precedence when both are present. Matching rows are also restricted
    to the time range currently set on ``period_widget``. Without a usable
    selection the table shows the header only.
    """
    sel = change.new.value

    # Which address column the selection refers to, if any.
    column = None
    if sel is not None:
        column = next((name for name in (RECIPIENT, SENDER) if name in sel), None)

    if column is None:
        filtered = phishing_data.iloc[:0]
    else:
        filtered = phishing_data[phishing_data[column].isin(sel[column])]
        filtered = process_by_time_range(filtered, period_widget.value)

    table_widget.value = filtered.to_html()

def on_change(v):
    """Redraw the controls and the plot for the currently selected projection.

    Registered as the click handler of ``apply_changes_widget``; *v* is the
    argument supplied by the callback and is not used. When the selected
    projection yields no data, only the controls are redrawn. The plot
    builders have already logged the reason in that case.
    """
    clear_output(wait=True)
    top_n = int(top_n_widget.value)
    display_widgets()

    projection = data_type_widget.value
    period = period_widget.value
    plot = None
    on_select = None

    if projection == TOP_RECEIVED_DSC:
        processed_data = process_data_phishing(phishing_data, top_n, period, RECIPIENT)
        plot = make_received_sent_plot(processed_data, RECIPIENT)
        on_select = on_select_email_address

    elif projection == TOP_SENT_DSC:
        processed_data = process_data_phishing(phishing_data, top_n, period, SENDER)
        plot = make_received_sent_plot(processed_data, SENDER)
        on_select = on_select_email_address

    elif projection == TOP_SUBJECT_DSC:
        processed_data = process_data_phishing_subject(phishing_data, top_n, period)
        plot = make_subject_plot(processed_data)
        on_select = on_select_subject

    elif projection == TIME_HISTOGRAM:
        processed_data = process_to_time_histogram(phishing_data, top_n, period, user_input_widget.value)
        plot = make_time_histogram(processed_data, period)
        on_select = on_select_time_range

    # The builders return None for an empty data set (and no branch matches an
    # unknown projection). Touching plot.selections would then raise.
    if plot is None:
        return

    plot.selections.observe(on_select, ["brush"])
    display(widgets.VBox([plot, table_widget]))

# Show the controls once; the plot is built when "Apply changes" is clicked.
display_widgets()
apply_changes_widget.on_click(on_change)
# observe() replaces the deprecated on_trait_change(). The handler only needs
# to run when the selected projection (the `value` trait) changes.
data_type_widget.observe(data_type_on_change, names='value')


Dropdown(description='Top N stats', options=('10', '20', '30', '40', '50'), style=DescriptionStyle(description…

IntRangeSlider(value=(0, 168), description='Time range in hours (max 1 week from the first event)', layout=Lay…

ToggleButtons(button_style='info', description='Projection type:', layout=Layout(width='520px'), options=('Mos…

Textarea(value='ALL', description='User UCO as email recipient (seperated by ,)', disabled=True, layout=Layout…

Button(button_style='success', description='Apply changes', style=ButtonStyle(), tooltip='Apply changes')

VBox(children=(JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data…

{'COUNT': [0.27166668891906737, 0.27666668891906737], 'Recipient Address': ['766826@muni.cz']}
{'COUNT': [0.27166668891906737, 0.2933333301544189], 'Recipient Address': ['766826@muni.cz']}
{'COUNT': [0.27166668891906737, 0.3750000476837158], 'Recipient Address': ['766826@muni.cz', '700015@phil.muni.cz']}
{'COUNT': [0.27166668891906737, 0.40833333015441897], 'Recipient Address': ['766826@muni.cz', '700015@phil.muni.cz']}
