In [None]:
from IPython.display import clear_output

import panel as pn
import ipywidgets as widgets
import pandas as pd
import altair as alt
import pathlib
import datetime
from vega_datasets import data
import os
import pycountry

# Column names used on the sign-in DataFrame. COUNT is not read from the CSV:
# it is the name given to the computed count column (see process_location).
LOCATION = "Country"
TIMESTAMP = "Creation Time"
COUNT = "COUNT"
IP_ADDR = "Client IP Address"

# Labels of the "Data type" toggle buttons; on_change compares the selected
# label against these to decide which view to build.
LOG_MAP = "Log in map"
LOG_CHART = "Log in chart"
SINGIN_ATTEMPTS = "Sign in attempts"

    
    
# Relative locations of the CSV exports below the data directory.
prefix_file_path = os.path.join("Task 2", "536535_data", "536535_data")
# NOTE(review): events_file_path is not referenced in the visible part of this notebook.
events_file_path = os.path.join(prefix_file_path, "m365_user_sessions", "events.csv")
sign_ins_file_path = os.path.join(prefix_file_path, "ms365_user_analysis", "sign_ins.csv")
# pathlib.Path() is the current working directory, so the data is resolved
# relative to wherever the notebook kernel was started.
dir_path = pathlib.Path().resolve()

# Loaded once at cell execution; every view filters/aggregates this frame.
sign_in_data: pd.DataFrame = pd.DataFrame(pd.read_csv(dir_path / sign_ins_file_path))

# Enable Panel's 'vega' extension.
pn.extension('vega')

# Style shared by several widgets below.
# presumably 'initial' keeps long descriptions from being truncated -- TODO confirm
style = {'description_width': 'initial'}

# How many top entries to keep. The options are strings; on_change converts
# the selected value with int().
top_n_widget = widgets.Dropdown(
    options=['10', '20', '30', '40', '50'],
    value='10',
    description='Top N stats',
    disabled=False,
    style = style,
)

# Which view to render; on_change compares the selected label against the
# LOG_MAP / LOG_CHART / SINGIN_ATTEMPTS constants.
data_type_widget = widgets.ToggleButtons(
    options=[LOG_MAP, LOG_CHART, 
             SINGIN_ATTEMPTS],
    description='Data type:',
    disabled=False,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    style = {'button_width':'200px'}, layout={'width': '520px'},
)

# Nothing is redrawn until this button is clicked (on_change is registered
# as its click handler at the bottom of the cell).
apply_changes_widget = widgets.Button(
    description='Apply changes',
    tooltip='Apply changes',
    disabled=False,
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    style = style,
)
# Pair of hour offsets (start, end). process_by_time_range adds them to the
# first event's timestamp; 168 hours = 7 days.
period_widget = widgets.IntRangeSlider(
    min=0,
    max=168,
    value=[0, 168],
    step=1,
    description="Time range in hours (max 1 week from the first event)",
    disabled=False,
    style = style,
    layout={'width': '520px'}
)

# Display order of the controls, used by display_widgets().
ui_widgets = [top_n_widget, data_type_widget, 
              period_widget, apply_changes_widget,
              ]

def get_first_time_flow_string():
    """Return the minimum value of the ``TIMESTAMP`` column of ``sign_in_data``.

    The value is returned exactly as stored in the frame; no parsing is done
    here.
    """
    timestamps = sign_in_data[TIMESTAMP]
    return timestamps.min()

def get_last_time_flow_string():
    """Return the maximum value of the ``TIMESTAMP`` column of ``sign_in_data``.

    The value is returned exactly as stored in the frame; no parsing is done
    here.
    """
    timestamps = sign_in_data[TIMESTAMP]
    return timestamps.max()

def get_first_time_flow():
    """Return the minimum ``TIMESTAMP`` value parsed as a ``datetime.datetime``.

    The raw value comes from ``get_first_time_flow_string`` and is parsed
    with ``datetime.fromisoformat``.
    """
    raw_value = get_first_time_flow_string()
    return datetime.datetime.fromisoformat(raw_value)

def get_last_time_flow():
    """Return the maximum ``TIMESTAMP`` value parsed as a ``datetime.datetime``.

    The raw value comes from ``get_last_time_flow_string`` and is parsed
    with ``datetime.fromisoformat``.
    """
    raw_value = get_last_time_flow_string()
    return datetime.datetime.fromisoformat(raw_value)

def to_datetime(x):
    """Parse an ISO-format date/time string into a ``datetime.datetime``.

    A plain one-argument wrapper around ``datetime.fromisoformat`` so it can
    be handed to ``Series.apply``.
    """
    parsed = datetime.datetime.fromisoformat(x)
    return parsed

def display_widgets():
    """Render every control in ``ui_widgets``, in list order."""
    for control in ui_widgets:
        display(control)

def process_world_map_location(current_data: pd.DataFrame):
    """Add the columns the world map needs to a per-country count frame.

    Two columns are added:

    * ``country_iso_numeric`` -- the ``numeric`` attribute of the pycountry
      record whose ``alpha_2`` code equals the ``LOCATION`` value, or ``None``
      when pycountry has no record for that code.
    * ``country_count`` -- the text ``"<LOCATION> - <COUNT>"``.

    Args:
        current_data: Frame with ``LOCATION`` and ``COUNT`` columns.

    Returns:
        A new frame with the extra columns; the input frame is not modified.
        An empty input is returned unchanged.
    """
    if current_data.empty:
        return current_data

    def iso_numeric(alpha_2):
        # Depending on the pycountry release an unknown code either yields
        # None or raises KeyError; neither should take the whole view down.
        try:
            country = pycountry.countries.get(alpha_2=alpha_2)
        except LookupError:
            return None
        return None if country is None else country.numeric

    # Explicit copy: pd.DataFrame(frame) may share data with its argument, in
    # which case adding a column would leak into the caller's frame.
    world_map_data = current_data.copy()
    world_map_data['country_iso_numeric'] = world_map_data[LOCATION].apply(iso_numeric)
    world_map_data = world_map_data.assign(
        country_count=world_map_data[LOCATION] + " - " + world_map_data[COUNT].astype(str)
    )
    return world_map_data
    
def process_location(current_data: pd.DataFrame):
    """Count the rows for each distinct ``LOCATION`` value.

    Returns a frame with one row per location and two columns, ``LOCATION``
    and ``COUNT``. An empty input is returned unchanged.
    """
    if current_data.empty:
        return current_data

    rows_per_location = current_data.groupby(LOCATION)[LOCATION].size()
    return rows_per_location.reset_index(name=COUNT)

def process_by_top_n(current_data, top_n):
    """Keep the ``top_n`` rows with the largest ``COUNT`` values.

    An empty frame is handed back unchanged.
    """
    if not current_data.empty:
        return current_data.nlargest(top_n, columns=COUNT)
    return current_data

def process_data_geo(current_data, top_n, time_period, projection_type):
    """Build the per-country sign-in counts for the geographic views.

    Steps, in order: restrict to the time window, count rows per location,
    add the map columns when ``projection_type`` is ``"map"``, and finally
    keep the ``top_n`` largest counts.

    Args:
        current_data: Raw sign-in frame.
        top_n: Number of locations to keep.
        time_period: ``(start, end)`` hour offsets, passed to
            ``process_by_time_range``.
        projection_type: ``"map"`` enables the world-map enrichment; any
            other value skips it.

    Returns:
        The aggregated frame, or the input itself when it is empty.
    """
    if current_data.empty:
        return current_data

    in_window = process_by_time_range(pd.DataFrame(current_data), time_period)
    per_location = process_location(in_window)

    if projection_type == "map":
        per_location = process_world_map_location(per_location)

    return process_by_top_n(per_location, top_n)

def process_by_time_range(current_data, time_period):
    """Keep only the rows whose ``TIMESTAMP`` falls inside a time window.

    The window is expressed as hour offsets from the first event returned by
    ``get_first_time_flow``; both bounds are inclusive.

    Args:
        current_data: Frame whose ``TIMESTAMP`` column holds ISO-format
            strings (each value is parsed with ``to_datetime``).
        time_period: ``(start_offset, end_offset)`` in hours.

    Returns:
        An independent copy of the matching rows, so callers may add or
        overwrite columns without touching ``current_data``. An empty input
        is returned unchanged.
    """
    if current_data.empty:
        return current_data

    start_offset, end_offset = time_period

    # Look the reference point up once; both bounds are relative to it.
    first_event = get_first_time_flow()
    start = first_event + datetime.timedelta(hours=start_offset)
    end = first_event + datetime.timedelta(hours=end_offset)

    # Parse the column a single time and reuse it for both comparisons.
    event_times = current_data[TIMESTAMP].apply(to_datetime)
    in_window = (event_times >= start) & (event_times <= end)

    return current_data[in_window].copy()


def process_to_time_plot(current_data):
    """Count rows per (``TIMESTAMP``, ``IP_ADDR``) pair.

    Returns a frame with the columns ``TIMESTAMP``, ``IP_ADDR`` and ``COUNT``.
    The previous debug ``print`` of the whole frame has been removed.
    """
    time_plot_data = pd.DataFrame(current_data)

    return time_plot_data.groupby([TIMESTAMP, IP_ADDR]).size().reset_index(name=COUNT)
    
def process_data_sign(current_data, top_n, time_period):
    """Build the frame behind the "sign in attempts" line chart.

    The rows inside ``time_period`` are laid out on a complete grid of
    (hourly timestamp x client IP address) so every address has a point for
    every hour between the first and last event in the window.

    Columns added to the result:

    * ``count`` -- number of grid rows sharing the row's IP address and
      application name (0 where the grid slot has no matching event).
    * ``offset_count`` -- ``count`` plus the position of the IP address in
      the list of unique addresses.

    Grid slots without a matching event get the application name
    ``'No Activity'``.

    Args:
        current_data: Raw sign-in frame.
        top_n: Accepted for signature parity with the other processors;
            currently unused.
        time_period: ``(start, end)`` hour offsets, passed to
            ``process_by_time_range``.

    Returns:
        The filled frame, or an empty frame when the input is empty or no
        event falls inside the window.
    """
    app_column = 'Application Display Name'

    if current_data.empty:
        return current_data

    # Work on an independent copy so the dtype conversion below never writes
    # into a slice of the caller's frame.
    time_plot_data = process_by_time_range(pd.DataFrame(current_data), time_period).copy()
    if time_plot_data.empty:
        # Nothing in the window: min()/max() would be NaT and pd.date_range
        # rejects NaT bounds. Callers already handle an empty result.
        return time_plot_data

    time_plot_data[TIMESTAMP] = pd.to_datetime(time_plot_data[TIMESTAMP])

    # One grid point per hour, anchored at the first event in the window.
    time_range = pd.date_range(
        start=time_plot_data[TIMESTAMP].min(),
        end=time_plot_data[TIMESTAMP].max(),
        freq=pd.Timedelta(hours=1),
    )
    ip_addresses = time_plot_data[IP_ADDR].unique()
    multi_index = pd.MultiIndex.from_product(
        [time_range, ip_addresses],
        names=[TIMESTAMP, IP_ADDR],
    )

    # NOTE(review): the join matches on the exact timestamp, so an event that
    # does not sit precisely on the hourly grid is not carried over -- confirm
    # that the input is always aligned this way.
    filled_df = pd.DataFrame(index=multi_index).reset_index()
    filled_df = filled_df.merge(
        time_plot_data,
        on=[TIMESTAMP, IP_ADDR],
        how="left",
    )

    # Must run before the application name is filled in: slots without an
    # event have a missing name at this point and end up with a count of 0.
    filled_df['count'] = (
        filled_df.groupby([IP_ADDR, app_column])[TIMESTAMP].transform('count').fillna(0)
    )
    filled_df[app_column] = filled_df[app_column].fillna('No Activity')

    # Shift each address by its index so the lines do not overlap on the chart.
    ip_address_offset = {ip: i for i, ip in enumerate(ip_addresses)}
    filled_df['offset_count'] = filled_df['count'] + filled_df[IP_ADDR].map(ip_address_offset)

    return filled_df
    

def make_world_map(world_map_data):
    """Draw a world map coloured by the number of log in attempts per country.

    Country shapes come from the ``world_110m`` topology of ``vega_datasets``.
    Each shape's ``id`` is looked up against the ``country_iso_numeric``
    column of ``world_map_data`` to pull in the ``COUNT`` and ``LOCATION``
    values used for the colour and the tooltip.

    Args:
        world_map_data: Frame with ``country_iso_numeric``, ``COUNT`` and
            ``LOCATION`` columns.

    Returns:
        The layered map wrapped in an ``alt.JupyterChart``.
    """
    countries = alt.topo_feature(data.world_110m.url, "countries")

    # Plain white layer drawn underneath the data layer.
    base_layer = alt.Chart(countries).mark_geoshape(fill="white")

    count_lookup = alt.LookupData(world_map_data, "country_iso_numeric", [COUNT, LOCATION])
    count_legend = alt.Legend(
        title="Country - Log in Attempts",
        labelExpr="datum.label",
    )
    count_color = alt.Color(
        field=COUNT,
        scale=alt.Scale(scheme="lightgreyred"),
        legend=count_legend,
    )
    hover_info = [
        alt.Tooltip(f"{LOCATION}:N", title="Country"),
        alt.Tooltip(f"{COUNT}:Q", title="Log in attempts"),
    ]

    data_layer = alt.Chart(countries).mark_geoshape(stroke="black", strokeWidth=0.15)
    data_layer = data_layer.transform_lookup(lookup="id", from_=count_lookup)
    data_layer = data_layer.transform_calculate(
        legend_label="datum.Country + ' - ' + datum.COUNT"
    )
    data_layer = data_layer.encode(color=count_color, tooltip=hover_info)

    layered = base_layer + data_layer
    layered = layered.configure_view(strokeWidth=0)
    layered = layered.properties(width=800, height=500)
    layered = layered.project("naturalEarth1")

    return alt.JupyterChart(layered)
    
def make_location_graph(graph_data, top_n):
    """Draw a horizontal bar chart from the first two columns of ``graph_data``.

    The first column supplies the bar labels (y axis, sorted by descending
    x) and the second column the bar lengths and colours.

    Args:
        graph_data: Frame whose first two columns are plotted; it must also
            contain the ``COUNT`` column, which is shown in the tooltip.
        top_n: Number of bars requested; 30 or more switches to a taller
            chart.

    Returns:
        The chart wrapped in an ``alt.JupyterChart``.
    """
    label_column = graph_data.columns[0]
    value_column = graph_data.columns[1]

    point = alt.selection_point(name="select", on="click", fields=["Category"])

    if top_n >= 30:
        chart_height = 1000
    else:
        chart_height = 600

    bar_color = alt.Color(
        value_column,
        scale=alt.Scale(range=['lightgreen', 'green']),
    )
    bars = alt.Chart(graph_data).mark_bar().encode(
        x=value_column,
        y=alt.Y(label_column, sort='-x'),
        tooltip=[COUNT],
        color=bar_color,
    )
    sized = bars.properties(
        width=600,
        height=chart_height,
        autosize=alt.AutoSizeParams(type='fit', contains='padding'),
    )

    return alt.JupyterChart(sized.add_params(point))

def make_time_plot(processed_data, top_n):
    """Draw one line per client IP address over time.

    Args:
        processed_data: Frame with the ``TIMESTAMP`` and ``IP_ADDR`` columns
            plus ``offset_count``, ``count`` and ``Application Display Name``.
        top_n: Not used by this chart.

    Returns:
        The ``alt.Chart`` itself (not wrapped in a ``JupyterChart``).
    """
    hover_fields = [
        'Creation Time:T',
        'Client IP Address:N',
        'Application Display Name:N',
        'count:Q',
    ]
    y_axis = alt.Y('offset_count:Q', title="Sign-in Attempts (Offset by IP Address)")

    lines = alt.Chart(processed_data).mark_line(point=True)
    encoded = lines.encode(
        x=f'{TIMESTAMP}:T',
        y=y_axis,
        color=f'{IP_ADDR}:N',
        tooltip=hover_fields,
    )

    return encoded.properties(
        width=800,
        height=400,
        title="Occurrences of IP Addresses Over Time",
    )

def on_change(v):
    """Redraw the output for the current widget selection.

    Registered as the click handler of ``apply_changes_widget``. Clears the
    cell output, re-displays the controls, processes ``sign_in_data`` for
    the selected view and displays the resulting chart.

    Args:
        v: Argument supplied by the widget callback; not used.
    """
    clear_output(wait=True)
    top_n = int(top_n_widget.value)
    display_widgets()

    selected_view = data_type_widget.value
    time_period = period_widget.value

    # Only the map needs the world-map enrichment (ISO numeric lookup); the
    # bar chart plots the plain per-country counts.
    if selected_view == LOG_MAP:
        processed_data = process_data_geo(sign_in_data, top_n, time_period, "map")
    elif selected_view == LOG_CHART:
        processed_data = process_data_geo(sign_in_data, top_n, time_period, "chart")
    elif selected_view == SINGIN_ATTEMPTS:
        processed_data = process_data_sign(sign_in_data, top_n, time_period)
    else:
        # Guard against a label that has no view, instead of failing later
        # on an unbound variable.
        print(f"Log: Unknown data type {selected_view!r}")
        return

    if processed_data.empty:
        print("Log: The data set is empty")
        return

    if selected_view == LOG_MAP:
        plot = make_world_map(processed_data)
    elif selected_view == LOG_CHART:
        plot = make_location_graph(processed_data, top_n)
    else:
        plot = make_time_plot(processed_data, top_n)

    display(plot)


# Show the controls once when the cell runs; charts appear only after the
# "Apply changes" button is clicked, which invokes on_change.
display_widgets()
apply_changes_widget.on_click(on_change)
 

Dropdown(description='Top N stats', options=('10', '20', '30', '40', '50'), style=DescriptionStyle(description…

ToggleButtons(button_style='info', description='Data type:', index=2, layout=Layout(width='520px'), options=('…

IntRangeSlider(value=(0, 168), description='Time range in hours (max 1 week from the first event)', layout=Lay…

Button(button_style='success', description='Apply changes', style=ButtonStyle(), tooltip='Apply changes')

                Creation Time           Client IP Address  Unnamed: 0  \
0   2024-11-04 00:13:09+01:00              185.152.64.171        15.0   
1   2024-11-04 00:13:09+01:00              185.152.64.171        17.0   
2   2024-11-04 00:13:09+01:00  2001:718:812:101::ffff:20c         NaN   
3   2024-11-04 00:13:09+01:00                203.0.113.45         NaN   
4   2024-11-04 00:13:09+01:00                  192.0.2.10         NaN   
..                        ...                         ...         ...   
441 2024-11-07 16:13:09+01:00              185.152.64.171         NaN   
442 2024-11-07 16:13:09+01:00  2001:718:812:101::ffff:20c         NaN   
443 2024-11-07 16:13:09+01:00                203.0.113.45         NaN   
444 2024-11-07 16:13:09+01:00                  192.0.2.10         NaN   
445 2024-11-07 16:13:09+01:00               198.51.100.25         NaN   

    Application Display Name          Sign-in Type   Status Failure Reason  \
0           Microsoft Office  NoninteractiveS