In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib import style

import datetime
from datetime import datetime

## Definice funkci

In [None]:
def find_communication_pairs(df: pd.DataFrame) -> dict:
    """Assign a unique identifier to every communication found in ``df``.

    A communication is the ordered pair ``(srcIP, dstIP)`` taken row by row
    from the columns ``'srcIP'`` and ``'dstIP'``.

    Returns:
        dict mapping each ``(srcIP, dstIP)`` tuple to an integer id. Ids start
        at 1 and are handed out in order of first appearance in ``df``.
    """
    pair_ids = {}
    for pair in zip(df['srcIP'], df['dstIP']):
        # setdefault leaves already-known pairs untouched; a new pair gets the
        # next free id, which is always the current dict size plus one.
        pair_ids.setdefault(pair, len(pair_ids) + 1)
    return pair_ids

def filter_by_time_abs(df: pd.DataFrame, start: datetime, end: datetime) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``'TimeStamp'`` lies within ``[start, end]``.

    Both boundary values are included in the result.
    """
    stamps = df['TimeStamp']
    not_before_start = stamps >= start
    not_after_end = stamps <= end
    return df[not_before_start & not_after_end]

def filter_by_time_rel(df: pd.DataFrame, start: 'float | None' = None, end: 'float | None' = None) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``'Relative Time'`` lies within ``[start, end]``.

    Both boundary values are included. Python has no function overloading (a
    second ``def`` with the same name simply replaces the first), so both call
    shapes are handled by this single definition:

    * ``filter_by_time_rel(df, start, end)`` - explicit lower and upper bound.
    * ``filter_by_time_rel(df, end)`` or ``filter_by_time_rel(df, end=...)`` -
      upper bound only; the lower bound defaults to ``0.0``.

    Raises:
        TypeError: if neither bound is given.
    """
    if end is None:
        if start is None:
            raise TypeError("filter_by_time_rel() requires at least the 'end' bound")
        # Two-argument form: the single positional bound is the upper limit.
        start, end = 0.0, start
    elif start is None:
        start = 0.0

    rel_time = df['Relative Time']
    return df[(rel_time >= start) & (rel_time <= end)]

## Nacteni a predzpracovani dataframu

In [None]:
# Load the capture (semicolon-separated CSV).
df = pd.read_csv("data/mega104-14-12-18-ioa.csv", sep=";")
# Alternative capture:
# df = pd.read_csv("data/mega104-17-12-18-ioa.csv", sep=";")


# Add a 'commId' column holding the id of the (srcIP, dstIP) communication
# each row belongs to.
communication_pairs = find_communication_pairs(df)
# One pass over the two zipped columns instead of a row-wise DataFrame.apply:
# apply(axis=1) materialises a Series per row, which is needlessly slow for a
# plain dict lookup. Values and row order are unchanged.
df['commId'] = [communication_pairs[pair] for pair in zip(df['srcIP'], df['dstIP'])]

# Convert the 'TimeStamp' column to datetime. The format only carries the time
# of day, so the parsed values have no meaningful calendar date.
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'], format="%H:%M:%S.%f")

## Filtrace

In [None]:
# Example: filtering by absolute timestamp. Disabled for now - the missing date
# part of the timestamps still needs to be resolved.
if False:
    time_format = '%H:%M:%S.%f'
    start = datetime.strptime("17:15:49.97", time_format)
    end = datetime.strptime("17:15:50.17", time_format)
    filtered_df = filter_by_time_abs(df, start, end)

# Example: filtering by relative time - keeps rows whose 'Relative Time' is
# between 0 and 60 inclusive (presumably seconds - TODO confirm the unit).
filtered_df = filter_by_time_rel(df, 0, 60)


# Uncomment to run the remaining cells on the filtered subset only:
# df = filtered_df

## Pocet paketu v jednotlivych komunikacich

In [None]:
# Bar chart: number of packets (rows) per communication id.
all_comm_ids = list(communication_pairs.values())
comm_ids = [str(comm_id) for comm_id in all_comm_ids]

# Count every id in a single pass instead of rescanning the whole column once
# per id. reindex() restores the id order used for the bars and reports 0 for
# ids that have no rows in df (e.g. after df was replaced by a filtered subset).
comm_count = df['commId'].value_counts().reindex(all_comm_ids, fill_value=0).tolist()

plt.bar(x=comm_ids, height=comm_count)
plt.xlabel('commId')
plt.ylabel('Pocet paketu')

# Put the packet count as a label slightly above each bar.
# default=0 keeps the cell from raising when there are no communications.
offset = max(comm_count, default=0) / 25
for position, count in enumerate(comm_count):
    plt.text(position, count + offset, count, ha='center', color='white',
             bbox=dict(facecolor='red', alpha=.8))

plt.show()

# Legend: print which source -> destination pair each id stands for.
for (src_ip, dst_ip), comm_id in communication_pairs.items():
    print(f"{comm_id}: {src_ip} -> {dst_ip}")

## Pocet paketu v jednotlivych casovych usecich

In [None]:
bin_size = 3600  # bin width in seconds
# Bin edges 0, bin_size, 2*bin_size, ... extended one bin past the largest
# 'Relative Time' so the last packets still fall inside a bin.
upper_limit = df['Relative Time'].max() + bin_size
x = np.arange(0, upper_limit, bin_size)
n, bins, _ = plt.hist(df['Relative Time'], bins=x, facecolor='gray', align='mid')
plt.show()

# Ad-hoc listing of the packet count per hour (relative hours, not wall-clock).
# NOTE(review): the edges are divided by a fixed 3600 and truncated, so the
# labels are only meaningful while bin_size is a whole number of hours.
for lower_edge, upper_edge, packet_count in zip(bins[:-1], bins[1:], n):
    print(f"{int(lower_edge/3600)} - {int(upper_edge/3600)}: {int(packet_count)}")

## Korelace mezi 'ipLen' a 'len'

In [None]:
# Correlation between the 'ipLen' and 'len' columns.
col1, col2 = df['ipLen'], df['len']
correlation = col1.corr(col2)
print(f"Hodnota korelace: {correlation}")

# Simple scatter plot with a fitted regression line to visualise the relation.
sns.regplot(
    x=col1,
    y=col2,
    marker="+",
)
plt.show()

In [None]:
# Correlation matrix between all numeric columns.
# Non-numeric columns (the IP address strings, the parsed 'TimeStamp') are
# dropped explicitly: since pandas 2.0 DataFrame.corr() no longer skips them
# silently and raises instead. select_dtypes works on old and new pandas alike.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()
# Last expression of the cell: rendered as a colour-coded table.
corr.style.background_gradient(cmap='coolwarm')