In [16]:
from pathlib import Path
import pandas as pd
import pygwalker as pyg
import os
import csv
import shutil
from datetime import datetime, timedelta

# Root directory of the report tree; every report path in this notebook is built from it.
REPORT_DIR = 'report'
# Filled in place by read_all_data_dir() with (bot_dir_name, [measurement_dir_names]) tuples.
g_data_dir = []

def list_directories(directory):
    """Return the sorted names of the sub-directories of ``directory``.

    Plain files are skipped so that a stray file in the report tree is never
    mistaken for a bot or measurement directory. An entry named ``combined``
    is skipped as well, because that name is used for generated output.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        list[str]: Sub-directory names in ascending order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        NotADirectoryError: If ``directory`` is not a directory.
    """
    return sorted(
        p.name
        for p in Path(directory).iterdir()
        if p.is_dir() and p.name != 'combined'
    )


def read_all_data_dir(root):
    """Rebuild ``g_data_dir`` from the directory tree under ``root``.

    ``g_data_dir`` is emptied and then refilled in place with one
    ``(bot_name, [measurement_names])`` tuple per entry that
    ``list_directories(root)`` reports. A notice is printed when nothing
    was found.

    Args:
        root: Path string of the directory that contains the bot entries.
    """
    g_data_dir.clear()
    for bot_name in list_directories(root):
        measurement_names = []
        # Register the bot first, then fill its measurement list in place.
        g_data_dir.append((bot_name, measurement_names))
        measurement_names.extend(list_directories(os.sep.join((root, bot_name))))

    if not g_data_dir:
        print('No data to analyze.')

def get_report_dir(bot, measurement):
    """Return the path ``<REPORT_DIR>/<bot>/<measurement>`` for one measurement of a bot."""
    return '{}/{}/{}'.format(REPORT_DIR, bot, measurement)


def get_measure_time(m):
    """Parse a measurement name into its start and end timestamps.

    The name is split on ``_``; the first two fields must each look like
    ``YYYY-mm-dd-HH-MM-SS``. Any further fields are ignored.

    Args:
        m: Measurement name, e.g. ``2024-06-29-14-57-08_2024-06-29-15-08-33``.

    Returns:
        tuple[datetime, datetime]: ``(start, end)``.
    """
    stamp_format = '%Y-%m-%d-%H-%M-%S'
    fields = m.split('_')
    return tuple(datetime.strptime(fields[pos], stamp_format) for pos in (0, 1))

def get_bot_id_prefix(bot_dir):
    """Return the text that follows the last underscore of ``bot_dir``.

    For ``2024_06_10_04_03_20_mirai_87cb7390`` this is ``87cb7390``. A name
    without any underscore is returned unchanged.
    """
    return bot_dir.rpartition('_')[2]


def cnc_stats_combine_same_ip(df):
    def custom_agg(group):
        start_time_min = group['start_time'].min()
        end_time_max = group['end_time'].max()
        attemp_start_min = group['attemp_start'].min()
        return pd.Series({
            'bot_id': group['bot_id'].unique()[0],
            'measure_start': group['measure_start'].unique()[0],
            'measure_end': group['measure_end'].unique()[0],
            'port': ','.join(group['port'].astype(str))[:256],
            'packet_cnt': group['packet_cnt'].sum(),
            'total_bytes': group['total_bytes'].sum(),
            'syn_cnt': group['syn_cnt'].sum(),
            'fin_cnt': group['fin_cnt'].sum(),
            'rst_cnt': group['rst_cnt'].sum(),
            'start_time': start_time_min,
            'end_time': end_time_max,
            'attemp_start': attemp_start_min,
            'is_old_ip': group['is_old_ip'].unique()[0],
            'duration': end_time_max - start_time_min
        })

    grouped = df.groupby('ip')
    result = grouped.apply(custom_agg, include_groups=False).reset_index()
    result = result[['ip'] + [col for col in result.columns if col != 'ip']]

    # filter out records whose total_bytes > 0 and is_old_ip == False which are mistakenly counted as cnc
    # those IPs are actually attacking packets whose SYN==1 and ACK==1 but tcp_len > 0
    result = result[~((result['is_old_ip'] == False) & (result['total_bytes'] > 0))]
    return result


def _tag_measurement(frame, bot_id, measure_start, measure_end):
    """Stamp every row of ``frame`` (in place) with its measurement window and bot id."""
    frame['measure_start'] = measure_start
    frame['measure_end'] = measure_end
    frame['bot_id'] = bot_id
    return frame


def combine_all_measurements(bot, measurements, filter_date=datetime(2024, 6, 29, 0, 0, 0)):
    """Merge the per-measurement reports of one bot into ``<REPORT_DIR>/<bot>/combined``.

    Any existing ``combined`` directory of the bot is deleted and recreated.
    For every measurement that passes the date filter, the files
    ``cnc-status.csv``, ``cnc-stats.csv`` and ``attacks.csv`` are read when
    present, tagged with the bot id and the measurement window, and collected.
    The collected frames are written as one CSV per kind, together with a
    ``measurements.csv`` summary (duration in hours). A file is only written
    when at least one frame of its kind was collected.

    Args:
        bot: Name of the bot directory under ``REPORT_DIR``.
        measurements: Iterable of measurement directory names, each of the
            form ``<start>_<end>`` as understood by ``get_measure_time``.
        filter_date: Measurements that start after this instant are skipped.
            Pass ``None`` to keep every measurement. The default keeps the
            previous hard-coded cut-off.
            TODO: filter out measurement from 2024-06-29 when tool support multiple CnCs

    Returns:
        str: Always ``"combined"``, the name of the output sub-directory.
    """
    combined_dir = f'{REPORT_DIR}/{bot}/combined'
    if os.path.exists(combined_dir):
        shutil.rmtree(combined_dir)
    os.makedirs(combined_dir)

    bid = get_bot_id_prefix(bot)
    m_list = []
    status_frames = []
    stats_frames = []
    attack_frames = []

    for m in measurements:
        base = get_report_dir(bot, m)
        ms, me = get_measure_time(m)

        if filter_date is not None and ms > filter_date:
            print(f'measurement of {ms}~{me} filtered out!')
            continue

        m_list.append({
            'bot_id': bid,
            'measure_start': ms,
            'measure_end': me,
            'duration': (me - ms).total_seconds() / 3600,  # hours
        })

        # Known CnC addresses of this measurement; stays empty without a status file.
        cnc_ips = []
        f = f'{base}/cnc-status.csv'
        if os.path.isfile(f):
            d = _tag_measurement(pd.read_csv(f), bid, ms, me)
            cnc_ips = d['cnc_ip'].unique()
            status_frames.append(d)

        f = f'{base}/cnc-stats.csv'
        if os.path.isfile(f):
            d = _tag_measurement(pd.read_csv(f), bid, ms, me)
            d['start_time'] = pd.to_datetime(d['start_time'])
            # Always parse end_time when the file provides it. Previously it was
            # only parsed when 'duration' was missing, so a file carrying both
            # columns kept end_time as text and broke the time arithmetic in
            # the per-IP aggregation.
            if 'end_time' in d.columns:
                d['end_time'] = pd.to_datetime(d['end_time'])
            if 'duration' not in d.columns:
                d['duration'] = d['end_time'] - d['start_time']
            d['duration'] = pd.to_timedelta(d['duration'])
            if 'end_time' not in d.columns:
                d['end_time'] = d['start_time'] + d['duration']
            d['duration'] = d['duration'].dt.total_seconds()
            d['attemp_start'] = d['start_time'] - d['measure_start']
            d['is_old_ip'] = d['ip'].isin(cnc_ips)

            # Rows with syn_cnt < 2 could be scans; filtering them out is
            # currently disabled.

            # Combine records with the same IP: only the CnC IP matters, not the port.
            stats_frames.append(cnc_stats_combine_same_ip(d))

        f = f'{base}/attacks.csv'
        if os.path.isfile(f):
            d = _tag_measurement(pd.read_csv(f), bid, ms, me)
            d['duration'] = pd.to_timedelta(d['duration']).dt.total_seconds()
            attack_frames.append(d)

    if m_list:
        df_m = pd.DataFrame(m_list, columns=['bot_id', 'measure_start', 'measure_end', 'duration'])
        df_m.to_csv(f'{combined_dir}/measurements.csv', index=False)

    outputs = (
        (status_frames, 'cnc-status.csv'),
        (stats_frames, 'cnc-stats.csv'),
        (attack_frames, 'attacks.csv'),
    )
    for frames, file_name in outputs:
        if frames:
            pd.concat(frames, ignore_index=True).to_csv(f'{combined_dir}/{file_name}', index=False)

    return "combined"


def combine_all_reports(bots):
    """Merge the combined reports of all bots into ``<REPORT_DIR>/combined``.

    Any existing ``<REPORT_DIR>/combined`` directory is deleted and recreated.
    ``combine_all_measurements`` is first run for every bot. Then each bot's
    ``combined`` directory is read and the frames of each report kind are
    concatenated into one CSV. A CSV is only written when at least one bot
    supplied a file of that kind.

    Args:
        bots: Sequence of ``(bot_name, measurement_names)`` pairs.

    Returns:
        str: Always ``"combined"``, the name of the output sub-directory.
    """
    target_dir = f'{REPORT_DIR}/combined'
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    report_names = ('measurements.csv', 'cnc-status.csv', 'cnc-stats.csv', 'attacks.csv')
    collected = {report: [] for report in report_names}

    # Refresh every bot's own combined directory before reading from it.
    for bot_name, bot_measurements in bots:
        combine_all_measurements(bot_name, bot_measurements)

    for bot_name, _ in bots:
        for report in report_names:
            source = f'{REPORT_DIR}/{bot_name}/combined/{report}'
            if os.path.isfile(source):
                collected[report].append(pd.read_csv(source))

    for report in report_names:
        frames = collected[report]
        if frames:
            pd.concat(frames, ignore_index=True).to_csv(f'{REPORT_DIR}/combined/{report}', index=False)

    return "combined"


def _read_menu_choice(upper):
    """Read from ``input()`` until the user enters an integer in ``0..upper`` inclusive."""
    while True:
        try:
            choice = int(input())
        except ValueError:
            choice = -1  # not a number: treat like any other out-of-range answer
        if 0 <= choice <= upper:
            return choice
        print(f'Please enter a number between 0 and {upper}.')


def input_measurement_menu():
    """Let the user pick a bot and a measurement and return the report directory.

    The bots in ``g_data_dir`` are listed first; answering ``0`` builds and
    selects the combined report across all bots. Otherwise the measurements
    of the chosen bot are listed; answering ``0`` there builds and selects
    that bot's combined report.

    Answers are validated: anything that is not an integer within the listed
    range is rejected and asked for again. Previously such input raised an
    exception, or, for negative numbers, silently selected the wrong entry.

    Returns:
        str | None: Path of the selected report directory, or ``None`` when
        ``g_data_dir`` is empty.
    """
    if not g_data_dir:
        return None

    print('\nChoose bot, 0 for combined report for all bots:')
    for i, (bot_name, _) in enumerate(g_data_dir, start=1):
        print(f'    {i}. {bot_name}')
    b_idx = _read_menu_choice(len(g_data_dir))
    if b_idx == 0:
        m_dir = combine_all_reports(g_data_dir)
        return f'{REPORT_DIR}/{m_dir}'

    bot_name, bot_measurements = g_data_dir[b_idx - 1]
    print('\nChoose measurement, 0 for combined report for all measurements:')
    for i, m in enumerate(bot_measurements, start=1):
        print(f'    {i}. {m}')
    m_idx = _read_menu_choice(len(bot_measurements))
    if m_idx == 0:
        m_dir = combine_all_measurements(bot_name, bot_measurements)
    else:
        m_dir = bot_measurements[m_idx - 1]

    return f'{REPORT_DIR}/{bot_name}/{m_dir}'

# Start: scan REPORT_DIR once so that g_data_dir lists the available bots and measurements.
read_all_data_dir(REPORT_DIR)

In [17]:
# Ask which report to analyze; `base` is the directory the viewer cells read their CSV files from.
# It is None when no data was found.
base = input_measurement_menu()


Choose bot, 0 for combined report for all bots:
    1. 2024_06_10_04_03_20_mirai_87cb7390
    2. 2024_06_10_11_49_29_mirai_114187eb
    3. 2024_06_10_12_15_28_mirai_e2961432
    4. 2024_06_10_13_28_20_mirai_6e00e8d4
    5. 2024_06_11_02_56_58_mirai_d2053d4c
    6. 2024_06_11_06_05_31_mirai_6fdf5b4b
    7. 2024_06_11_06_05_54_mirai_361cd32a
    8. 2024_06_11_16_23_06_mirai_a25372d1
    9. 2024_06_12_06_37_05_mirai_c087ad31
    10. 2024_06_13_10_52_18_mirai_33f9236e
    11. 2024_06_13_15_53_11_gafgyt_a9ce0201
    12. 2024_06_13_19_30_16_mirai_2a80f110
    13. 2024_06_14_07_02_54_mirai_03573b20
    14. 2024_06_14_12_54_42_gafgyt_571f149f
    15. 2024_06_15_06_46_13_gafgyt_e9626f6b
    16. 2024_06_16_08_47_47_mirai_19a6d211
    17. 2024_06_16_12_32_14_mirai_dd1b3e85
    18. 2024_06_17_12_24_31_mirai_12d7da7f
    19. 2024_06_17_14_47_16_mirai_6b941c86
    20. 2024_06_18_07_25_16_mirai_14a11457
    21. 2024_06_18_07_30_41_mirai_e87d52ee
    22. 2024_06_18_08_18_24_gafgyt_5dbaeaca
    23. 20

 0


measurement of 2024-06-29 14:57:08~2024-06-29 15:08:33 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 18:14:03 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 18:15:02 filtered out!
measurement of 2024-06-29 14:57:10~2024-06-29 15:13:57 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 15:17:05 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 15:17:34 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 15:15:51 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 15:09:06 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 18:15:03 filtered out!
measurement of 2024-06-29 14:57:08~2024-06-29 18:15:03 filtered out!


In [3]:
# Open the measurement summary of the selected report in pygwalker.
f = f'{base}/measurements.csv'
if not os.path.isfile(f):
    print('file not exist!')
else:
    df = pd.read_csv(f)
    walker = pyg.walk(df, kernel_computation=True)

Box(children=(HTML(value='\n<div id="ifr-pyg-00061c4a6d72f2bbQbg6XLn0Pako5vK3" style="height: auto">\n    <hea…

In [4]:
# Open the CnC status table of the selected report in pygwalker.
f = f'{base}/cnc-status.csv'
if not os.path.isfile(f):
    print('file not exist!')
else:
    df = pd.read_csv(f)
    walker = pyg.walk(df, kernel_computation=True)

Box(children=(HTML(value='\n<div id="ifr-pyg-00061c4a6dfbcacfLq38GH7iu9jngNsE" style="height: auto">\n    <hea…

In [None]:
# Open the per-IP CnC statistics of the selected report in pygwalker.
f = f'{base}/cnc-stats.csv'
if not os.path.isfile(f):
    print('file not exist!')
else:
    df = pd.read_csv(f)
    walker = pyg.walk(df, kernel_computation=True)

Box(children=(HTML(value='\n<div id="ifr-pyg-00061c4ae86b37b7BXqKaz8v03l1CtE5" style="height: auto">\n    <hea…

In [55]:
# Open the attack table of the selected report in pygwalker.
f = f'{base}/attacks.csv'
if not os.path.isfile(f):
    print('file not exist!')
else:
    df = pd.read_csv(f)
    walker = pyg.walk(df, kernel_computation=True)

Box(children=(HTML(value='\n<div id="ifr-pyg-00061c3717573ea7dK3feG1UWy5XHDOv" style="height: auto">\n    <hea…