In [17]:
from IPython.display import clear_output

import panel as pn
import ipywidgets as widgets
import pandas as pd
import altair as alt
import pathlib
import datetime
from vega_datasets import data
import os
import pycountry

# Column names. LOCATION, TIMESTAMP and IP_ADDR are read from the sign-in
# data; COUNT is the name given to the aggregate column produced by the
# group-by helpers below.
LOCATION = "Country"
TIMESTAMP = "Creation Time"
COUNT = "COUNT"
IP_ADDR = "Client IP Address"

# Name of the derived column that holds the hourly bin of each sign-in.
TIME_BIN = "Time since first flow"

# User options: labels of the view-selector toggle buttons.
LOG_MAP = "Log in map"
LOG_CHART = "Log in chart"
SINGIN_ATTEMPTS = "Sign in attempts"

    
    
# Locations of the input CSV files, relative to the current working directory.
prefix_file_path = os.path.join("Task 2", "536535_data", "536535_data")
events_file_path = os.path.join(prefix_file_path, "m365_user_sessions", "events.csv")
sign_ins_file_path = os.path.join(prefix_file_path, "ms365_user_analysis", "sign_ins.csv")
dir_path = pathlib.Path().resolve()

# read_csv already returns a DataFrame, so no extra DataFrame(...) wrapper
# is needed around it.
sign_in_data: pd.DataFrame = pd.read_csv(dir_path / sign_ins_file_path)

# Initialise Panel with its 'vega' extension.
pn.extension('vega')

# Style dict shared by several of the widgets below.
style = {'description_width': 'initial'}

# How many of the highest-count rows to keep. The options are strings;
# on_change converts the chosen value with int().
top_n_widget = widgets.Dropdown(
    options=['10', '20', '30', '40', '50'],
    value='10',
    description='Top N stats',
    disabled=False,
    style = style,
)

# Selects which of the three views on_change renders.
data_type_widget = widgets.ToggleButtons(
    options=[LOG_MAP, LOG_CHART, 
             SINGIN_ATTEMPTS],
    description='Data type:',
    disabled=False,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    style = {'button_width':'200px'}, layout={'width': '520px'},
)

# Nothing is redrawn until this button is clicked; its click handler is
# registered at the bottom of the cell.
apply_changes_widget = widgets.Button(
    description='Apply changes',
    tooltip='Apply changes',
    disabled=False,
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    style = style,
)
# (start, end) offsets in hours, passed to the processing functions as
# time_period. The upper bound of 168 is 7 * 24 hours.
period_widget = widgets.IntRangeSlider(
    min=0,
    max=168,
    value=[0, 168],
    step=1,
    description="Time range in hours (max 1 week from the first event)",
    disabled=False,
    style = style,
    layout={'width': '520px'}
)

# HTML table shown under the chart. It starts from a zero-row slice of the
# sign-in data and is overwritten by the selection callbacks.
table_widget = widgets.HTML(value=sign_in_data.iloc[:0].to_html())

# Controls in the order display_widgets() shows them.
ui_widgets = [top_n_widget, data_type_widget, 
              period_widget, apply_changes_widget,
              ]

def get_first_time_flow_string():
    """Return the minimum of the sign-in data's timestamp column."""
    timestamps = sign_in_data[TIMESTAMP]
    return timestamps.min()

def get_last_time_flow_string():
    """Return the maximum of the sign-in data's timestamp column."""
    timestamps = sign_in_data[TIMESTAMP]
    return timestamps.max()

def get_first_time_flow():
    """Return the earliest sign-in timestamp parsed into a ``datetime``."""
    earliest_text = get_first_time_flow_string()
    return datetime.datetime.fromisoformat(earliest_text)

def get_last_time_flow():
    """Return the latest sign-in timestamp parsed into a ``datetime``."""
    latest_text = get_last_time_flow_string()
    return datetime.datetime.fromisoformat(latest_text)

def to_datetime(x):
    """Parse the ISO-format string *x* into a ``datetime.datetime``."""
    parse_iso = datetime.datetime.fromisoformat
    return parse_iso(x)

# Earliest sign-in timestamp, parsed once when the cell runs.
FIRST_TIMESTAMP = get_first_time_flow()
# Start of the hour containing the first sign-in; the origin for the hourly
# offsets. microsecond is cleared as well, otherwise a first timestamp with
# fractional seconds would leave the origin slightly off the hour.
FIRST_TIMESTAMP_ROUNDED_DOWN = FIRST_TIMESTAMP.replace(minute=0, second=0, microsecond=0)
# Latest sign-in timestamp.
LAST_TIMESTAMP = get_last_time_flow()

def to_hours_since_start(x):
    """Return the whole hours between the rounded-down start and timestamp *x*.

    *x* is an ISO-format timestamp string. The result is produced by floor
    division of the elapsed seconds by 3600.
    """
    elapsed = to_datetime(x) - FIRST_TIMESTAMP_ROUNDED_DOWN
    elapsed_seconds = elapsed.total_seconds()
    return elapsed_seconds // 3600

def plus_start_time(x):
    """Return the start of the hourly bin that timestamp string *x* falls in.

    The hour offset from to_hours_since_start() is measured from
    FIRST_TIMESTAMP_ROUNDED_DOWN, so it is added back to that same origin.
    Adding it to the unrounded FIRST_TIMESTAMP would shift every bin label by
    the first event's minutes and seconds, and could place a label after the
    events it represents.
    """
    whole_hours = to_hours_since_start(x)
    return FIRST_TIMESTAMP_ROUNDED_DOWN + datetime.timedelta(hours=whole_hours)

def display_widgets():
    """Display every control in ``ui_widgets``, in list order."""
    # display() accepts several objects and shows them one after another.
    display(*ui_widgets)

def process_world_map_location(current_data: pd.DataFrame):
    """Add the columns the world map needs to a per-country count table.

    Args:
        current_data: Frame with a LOCATION column of ISO 3166-1 alpha-2
            codes and a COUNT column.

    Returns:
        ``current_data`` itself when it is empty. Otherwise a new frame with
        two extra columns: ``country_iso_numeric`` (the ISO numeric code as an
        int) and ``country_count`` (``"<code> - <count>"``). Rows whose code
        pycountry cannot resolve are dropped, because they cannot be placed on
        the map. The input frame is not modified.
    """
    if current_data.empty:
        return current_data

    def iso_numeric(alpha_2_code):
        # Depending on version and input, pycountry either returns None or
        # raises LookupError (KeyError is a subclass) for unknown or
        # non-string codes.
        try:
            country = pycountry.countries.get(alpha_2=alpha_2_code)
        except LookupError:
            return None
        if country is None:
            return None
        # pycountry stores the code as a zero-padded string such as "036".
        # NOTE(review): the world_110m TopoJSON appears to use integer ids,
        # so the lookup key is converted to int -- confirm against the data.
        return int(country.numeric)

    numeric_ids = current_data[LOCATION].apply(iso_numeric)
    resolved = numeric_ids.notna()

    # Boolean indexing plus assign() builds new frames, so the caller's data
    # is never written to.
    world_map_data = current_data[resolved].assign(
        country_iso_numeric=numeric_ids[resolved].astype(int),
    )
    return world_map_data.assign(
        country_count=world_map_data[LOCATION] + " - " + world_map_data[COUNT].astype(str)
    )
    
def process_location(current_data: pd.DataFrame):
    """Count rows per LOCATION value.

    Returns ``current_data`` unchanged when it is empty; otherwise a frame
    with one row per location and the row count in the COUNT column.
    """
    if not current_data.empty:
        rows_per_location = current_data.groupby(LOCATION).size()
        return rows_per_location.reset_index(name=COUNT)
    return current_data

def process_by_top_n(current_data, top_n):
    """Keep the *top_n* rows with the largest COUNT values.

    An empty frame is returned as is.
    """
    if not current_data.empty:
        return current_data.nlargest(n=top_n, columns=COUNT)
    return current_data

def process_data_geo(current_data, top_n, time_period, projection_type):
    """Build the per-country count table used by the geographic views.

    The pipeline is: restrict to *time_period*, count rows per location, add
    the map columns when *projection_type* is ``"map"``, then keep the
    *top_n* largest counts. An empty input is returned unchanged.
    """
    if current_data.empty:
        return current_data

    within_period = process_by_time_range(pd.DataFrame(current_data), time_period)
    per_location = process_location(within_period)

    # Only the world map needs the extra country-code columns.
    if projection_type == "map":
        per_location = process_world_map_location(per_location)

    return process_by_top_n(per_location, top_n)

def process_by_time_range(current_data, time_period):
    """Keep the rows whose TIMESTAMP lies inside *time_period*.

    Args:
        current_data: Frame with a TIMESTAMP column of ISO-format strings.
        time_period: ``(start_offset, end_offset)`` in hours, measured from
            the first sign-in timestamp. Both bounds are inclusive.

    Returns:
        ``current_data`` itself when it is empty, otherwise the filtered rows.
    """
    if current_data.empty:
        return current_data

    start_offset, end_offset = time_period

    # Look up the reference timestamp once and reuse it for both bounds.
    first_flow = get_first_time_flow()
    start = first_flow + datetime.timedelta(hours=start_offset)
    end = first_flow + datetime.timedelta(hours=end_offset)

    # Parse the column once instead of once per comparison.
    timestamps = current_data[TIMESTAMP].apply(to_datetime)
    return current_data[(timestamps >= start) & (timestamps <= end)]


def process_to_time_plot(current_data):
    """Count rows per (TIMESTAMP, IP_ADDR) pair and store the count in COUNT."""
    grouped = pd.DataFrame(current_data).groupby([TIMESTAMP, IP_ADDR])
    pair_counts = grouped.size()
    return pair_counts.reset_index(name=COUNT)
    
def process_data_sign(current_data, top_n, time_period):
    """Prepare the sign-in rows for the time plot.

    Args:
        current_data: Frame with a TIMESTAMP column.
        top_n: Accepted so the signature matches the geographic pipeline;
            not used here.
        time_period: ``(start_offset, end_offset)`` in hours, forwarded to
            process_by_time_range().

    Returns:
        ``current_data`` itself when it is empty. Otherwise a new frame
        restricted to *time_period*, with a TIME_BIN column holding the
        hourly bin of each row.
    """
    if current_data.empty:
        return current_data

    # The filter result is a slice of the caller's frame. Take an explicit
    # copy before adding a column so pandas does not emit
    # SettingWithCopyWarning and the input is never touched.
    processed_data = process_by_time_range(current_data, time_period).copy()
    processed_data[TIME_BIN] = processed_data[TIMESTAMP].apply(plus_start_time)

    return processed_data
    

def make_world_map(world_map_data):
    """Build the choropleth world map of log-in attempts per country.

    Args:
        world_map_data: Frame with ``country_iso_numeric``, COUNT and LOCATION
            columns; it is joined onto the map shapes by their ``id``.

    Returns:
        An ``alt.JupyterChart`` wrapping a white background layer and a
        coloured foreground layer in the ``naturalEarth1`` projection.
    """
    source = alt.topo_feature(data.world_110m.url, "countries")
    # The selection is named "brush" because on_change registers its observer
    # under that name; an auto-generated name would leave that observer
    # unattached.
    brush = alt.selection_interval(
        name="brush",
        encodings=["longitude", "latitude"],
        empty=False,
        # NOTE(review): "longidute" looks like a typo, possibly for
        # "latitude". It is kept as is because changing it would alter the
        # initial selection -- confirm the intended value.
        value={"longitude": [-50, -110],
               "longidute": [-50, -110]}
    )
    background = alt.Chart(source).mark_geoshape(fill="white", 
                                                 stroke="lightgrey").add_params(brush)
    # Clicking a legend entry highlights the shapes that have that count.
    selection = alt.selection_point(fields=[COUNT], bind='legend')
    foreground = (
        alt.Chart(source)
        .mark_geoshape(stroke="black", strokeWidth=0.15)
        .transform_lookup(
            lookup="id",
            from_=alt.LookupData(world_map_data, "country_iso_numeric", [COUNT, LOCATION]),
        )
        .transform_calculate(
            legend_label="datum.Country + ' - ' + datum.COUNT",
            lon="datum.geometry.coordinates[0]",
            lat="datum.geometry.coordinates[1]",
        )
        .encode(
            color=alt.Color(
                field=COUNT,
                scale=alt.Scale(scheme="lightgreyred"),
                legend=alt.Legend(
                    title="Country - Log in Attempts",
                    labelExpr="datum.label",
                ),
            ),
            opacity=alt.when(selection).then(alt.value(1)).otherwise(alt.value(0.1)),
            longitude="Longitude:Q",
            latitude="Latitude:Q",
            tooltip=[
                alt.Tooltip(f"{LOCATION}:N", title="Country"),
                alt.Tooltip(f"{COUNT}:Q", title="Log in attempts"),
            ],            
            
        ).add_params(
            selection,
            brush
        )
    )

    final_map = (
        (background + foreground)
        .configure_view(strokeWidth=0)
        .properties(width=800, height=500)
        .project("naturalEarth1")
        
    )
    # The former bare `final_map.add_params(brush)` call was dropped:
    # add_params returns a new chart and the result was discarded, so it had
    # no effect. Both layers already carry the brush.

    return alt.JupyterChart(final_map)
    
def make_location_graph(graph_data, top_n):
    """Build a horizontal bar chart of counts per location.

    The first column of *graph_data* goes on the y axis and the second on the
    x axis and colour scale. The chart is 1000 px tall when *top_n* is 30 or
    more, otherwise 600 px. Returns an ``alt.JupyterChart`` with an interval
    selection named "brush".
    """
    label_column = graph_data.columns[0]
    value_column = graph_data.columns[1]
    chart_height = 600 if top_n < 30 else 1000

    brush = alt.selection_interval(name="brush")

    bars = alt.Chart(graph_data).mark_bar().encode(
        x=value_column,
        y=alt.Y(label_column, sort='-x'),
        tooltip=[COUNT],
        color=alt.Color(
            value_column,
            scale=alt.Scale(range=['lightgreen', 'green']),
        ),
    )
    sized = bars.properties(
        width=600,
        height=chart_height,
        autosize=alt.AutoSizeParams(type='fit', contains='padding'),
    )

    return alt.JupyterChart(sized.add_params(brush))

def make_time_plot(processed_data, top_n):
    """Build the line chart of sign-in attempts per hour.

    Args:
        processed_data: Frame with a TIME_BIN column of timestamps.
        top_n: Accepted for signature parity with the other chart builders;
            not used here.

    Returns:
        An ``alt.JupyterChart`` whose "brush" selection is an interval over
        the x axis.
    """
    # An interval selection reports {field: [start, end]}, which is the shape
    # on_select_time_range reads. A point selection without fields/encodings
    # reports a list of row indices, so the table was never filled.
    brush = alt.selection_interval(name="brush", encodings=["x"])
    time_plot = alt.Chart(processed_data).mark_line(point=True).encode(
        alt.X(f"{TIME_BIN}:T", bin=alt.Bin(maxbins=168, binned=True), timeUnit="yearmonthdatehours"),
        y='count()',
        tooltip=[alt.Tooltip("count()", title="Sign in attempts"),
                 alt.Tooltip(f"{TIME_BIN}", title="Time", timeUnit="yearmonthdatehours")],
        color=alt.Color("count()",
                   scale=alt.Scale(range=['lightgreen', 'green'])),
    ).properties(
        title="Distribution of sign in attempts in time",
        width=800, 
        height=500,
    ).add_params(brush)
    return alt.JupyterChart(time_plot)

def on_select_country(change):
    """Selection callback: list the sign-ins of the selected countries.

    Reads the new selection value from *change*. When it contains a LOCATION
    entry, the table shows the sign-in rows whose location is in that entry;
    otherwise the table shows no rows.
    """
    picked = change.new.value
    has_locations = picked is not None and LOCATION in picked

    if has_locations:
        rows = sign_in_data[sign_in_data[LOCATION].isin(picked[LOCATION])]
    else:
        rows = sign_in_data.iloc[:0]

    table_widget.value = rows.to_html()

def on_select_time_range(change):
    """Selection callback: list the sign-ins inside the brushed hour range.

    The selection is expected to map the time-unit field to a
    ``[start, end]`` pair of epoch milliseconds. Any other value (None, an
    empty selection, a list of indices) clears the table.

    NOTE(review): like the original code, this subtracts
    FIRST_TIMESTAMP_ROUNDED_DOWN from a UTC-aware datetime, which only works
    when the sign-in timestamps are timezone-aware -- confirm against the data.
    """
    sel = change.new.value
    field = "yearmonthdatehours_Time since first flow"
    bounds = sel.get(field) if isinstance(sel, dict) else None

    if not bounds or len(bounds) < 2:
        filtered = sign_in_data.iloc[:0]
    else:
        def whole_hours(epoch_ms):
            # Hours from the rounded-down start, floored to the hourly bin.
            moment = datetime.datetime.fromtimestamp(epoch_ms / 1000, datetime.timezone.utc)
            return (moment - FIRST_TIMESTAMP_ROUNDED_DOWN).total_seconds() // 3600

        low_ms, high_ms = sorted(bounds[:2])
        # The hour offsets are measured from the rounded-down start, so they
        # are applied to that same origin. Re-applying them to the unrounded
        # first timestamp shifted the window by up to an hour.
        start = FIRST_TIMESTAMP_ROUNDED_DOWN + datetime.timedelta(hours=whole_hours(low_ms))
        # The upper bound is exclusive and one hour later, so the last
        # brushed hourly bin is included in full.
        end = FIRST_TIMESTAMP_ROUNDED_DOWN + datetime.timedelta(hours=whole_hours(high_ms) + 1)

        timestamps = sign_in_data[TIMESTAMP].apply(to_datetime)
        filtered = sign_in_data[(timestamps >= start) & (timestamps < end)]

    table_widget.value = filtered.to_html()

def on_change(v):
    """Click handler for "Apply changes": redraw the controls and the chart.

    Clears the cell output, shows the controls again, builds the data for the
    selected view from the current widget values, and displays the chart above
    the detail table. Prints a log line instead when there is nothing to plot.

    Args:
        v: The object passed by the button's click event; not used.
    """
    clear_output(wait=True)
    top_n = int(top_n_widget.value)
    display_widgets()

    view = data_type_widget.value
    period = period_widget.value

    if view == LOG_MAP:
        processed_data = process_data_geo(sign_in_data, top_n, period, "map")
    elif view == LOG_CHART:
        # The bar chart needs only the per-country counts. Asking for the
        # "map" projection would run the country-code lookup for nothing.
        processed_data = process_data_geo(sign_in_data, top_n, period, "chart")
    elif view == SINGIN_ATTEMPTS:
        processed_data = process_data_sign(sign_in_data, top_n, period)
    else:
        # Guard so an unexpected option cannot leave the data or the plot unbound.
        print(f"Log: Unknown data type {view!r}")
        return

    if processed_data.empty:
        print("Log: The data set is empty")
        return

    if view == LOG_MAP:
        plot: alt.JupyterChart = make_world_map(processed_data)
        plot.selections.observe(on_select_country, ["brush"])
    elif view == LOG_CHART:
        plot = make_location_graph(processed_data, top_n)
        plot.selections.observe(on_select_country, ["brush"])
    else:
        plot = make_time_plot(processed_data, top_n)
        plot.selections.observe(on_select_time_range, ["brush"])

    display(widgets.VBox([plot, table_widget]))


# Show the controls once. No chart is built until the button is clicked,
# at which point on_change runs.
display_widgets()
apply_changes_widget.on_click(on_change)
 

Dropdown(description='Top N stats', options=('10', '20', '30', '40', '50'), style=DescriptionStyle(description…

ToggleButtons(button_style='info', description='Data type:', index=2, layout=Layout(width='520px'), options=('…

IntRangeSlider(value=(0, 168), description='Time range in hours (max 1 week from the first event)', layout=Lay…

Button(button_style='success', description='Apply changes', style=ButtonStyle(), tooltip='Apply changes')

VBox(children=(JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data…