<a href="https://colab.research.google.com/github/alihoda/network-analysis/blob/main/traffic_nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
!pip install --upgrade plotly

In [2]:
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as sts
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load Data and Preprocessing

In [11]:
def read_csv(csv_path, cols, datetime_col, datetime_format='%d/%m/%Y%H:%M:%S', pkt_cols=None):
  """
  Read a csv file and build a datetime-indexed dataframe restricted to the requested columns.

  ## Parameters
  - csv_path: The csv file path (anything `pd.read_csv` accepts).
  - cols: Names of the columns to keep (datetime and IP address columns).
  - datetime_col: Name of the column which contains the datetime; it becomes the index.
  - datetime_format: The format of the datetime_col.
  - pkt_cols: If the dataset has separate columns for forward and backward packets,
    specify them as a two-item list; their sum is stored in a new `pktCount` column.

  If pkt_cols is not given, the dataset is expected to already have a column
  named `pktCount` which contains the total packets.

  ## Returns
  The dataframe indexed by datetime_col, holding the other `cols` plus `pktCount`
  (when that column exists).

  ## Raises
  KeyError if a name in `cols`, `pkt_cols` or `datetime_col` is not a column of the file.
  """

  df = pd.read_csv(csv_path)
  if pkt_cols:
    df['pktCount'] = df[pkt_cols[0]] + df[pkt_cols[1]]

  # Keep the requested columns plus the ones this function itself needs
  # (datetime_col for the index, pktCount as the derived total), without duplicates.
  keep = list(dict.fromkeys([*cols, datetime_col]))
  if 'pktCount' in df.columns and 'pktCount' not in keep:
    keep.append('pktCount')

  # The selection must be assigned to take effect; copy so the writes below
  # go to an independent frame instead of a slice of the full table.
  df = df.loc[:, keep].copy()

  df[datetime_col] = pd.to_datetime(df[datetime_col], format=datetime_format)
  df.set_index(datetime_col, inplace=True)
  return df

In [4]:
def to_stationary(win_df, win_mean, win_start, win_end, df):
  """
  Check whether the given window is stationary and, if it is not,
  subtract the window's mean from its packet counts in the main dataframe.

  The check is the Augmented Dickey-Fuller test on the window's `pktCount`:
  a p-value above 0.05 is treated as non-stationary.

  ## Parameters
  - win_df: The window's dataframe to check its stationarity.
  - win_mean: The window's mean value
  - win_start: The window's start position (row number) in the main dataframe
  - win_end: The window's end position (exclusive) in the main dataframe
  - df: The main dataframe; its `pktCount` rows win_start:win_end are overwritten in place.

  Nothing is returned.
  """

  p_value = sts.adfuller(win_df['pktCount'])[1]
  if p_value > 0.05:
    centered = (win_df['pktCount'] - win_mean).to_numpy()

    # Mean-centred counts are fractional; make the column float up front so the
    # write below does not depend on pandas silently upcasting an integer column.
    if df['pktCount'].dtype.kind != 'f':
      df['pktCount'] = df['pktCount'].astype(float)

    # Single positional write on the frame itself. The chained form
    # df['pktCount'].iloc[...] = ... may assign into a temporary Series
    # and leave df untouched.
    pkt_pos = df.columns.get_loc('pktCount')
    df.iloc[win_start:win_end, pkt_pos] = centered

In [6]:
def extract_time_nodes(df, win_size=10):
  """
  Find the timestamps where the traffic level changes between consecutive windows.

  The dataframe is walked in adjacent, non-overlapping windows of `win_size` rows.
  Each reference window is compared with the test window right after it: when the
  [mean - std, mean + std] bands of their `pktCount` values do not overlap, the
  last index value of the reference window is recorded as a time node.

  ## Parameters
  - df: The main dataframe; it must have a `pktCount` column. Both windows are
    passed to `to_stationary`, which may rewrite `pktCount` in df in place.
  - win_size: Number of rows in each window (must be at least 1).

  ## Returns
  A list of index values (one per detected change), in order of appearance.

  ## Raises
  ValueError if win_size is smaller than 1.
  """
  if win_size < 1:
    # A non-positive step would never advance the windows.
    raise ValueError('win_size must be at least 1')

  min_rows = 5  # windows with fewer rows than this are not compared

  time_nodes = []
  test_win_start = 0
  test_win_end = win_size

  while test_win_end < len(df):

    # Slide forward: the previous test window becomes the reference window.
    ref_win_start, ref_win_end = test_win_start, test_win_end
    test_win_start = ref_win_end
    test_win_end = test_win_start + win_size

    ref_win = df.iloc[ref_win_start:ref_win_end]
    test_win = df.iloc[test_win_start:test_win_end]

    # Compare row counts: DataFrame.size is rows * columns, so it would let
    # very short windows through on a multi-column frame.
    if len(ref_win) < min_rows or len(test_win) < min_rows:
      continue

    to_stationary(ref_win, ref_win['pktCount'].mean(), ref_win_start, ref_win_end, df)
    to_stationary(test_win, test_win['pktCount'].mean(), test_win_start, test_win_end, df)

    # NOTE(review): the statistics below are read from the window slices taken
    # before to_stationary ran; whether they see its in-place change to df
    # depends on pandas view/copy behaviour - confirm which values are intended.
    ref_win_mean = round(ref_win['pktCount'].mean(), 2)
    ref_win_std = round(ref_win['pktCount'].std(), 2)
    test_win_mean = round(test_win['pktCount'].mean(), 2)
    test_win_std = round(test_win['pktCount'].std(), 2)

    ref_low, ref_high = ref_win_mean - ref_win_std, ref_win_mean + ref_win_std
    test_low, test_high = test_win_mean - test_win_std, test_win_mean + test_win_std

    # Closed-interval overlap test. Checking only whether an endpoint lies
    # strictly inside the other band misses two identical bands, which would
    # then be reported as a change. Bands that merely touch count as overlapping.
    bands_overlap = ref_low <= test_high and test_low <= ref_high

    if not bands_overlap:
      time_nodes.append(ref_win.index.values[-1])

  return time_nodes

In [7]:
def plot(df, nodes, title):
  """
  Plot the packet counts of the given dataframe with its time nodes.

  ## Parameters
  - df: Dataframe whose index goes on the x axis and whose `pktCount` column goes on the y axis.
  - nodes: Iterable of time nodes; each one is drawn as a dashed green vertical line.
  - title: Title of the figure.

  The figure is shown; nothing is returned.
  """

  fig = px.line(x=df.index, y=df['pktCount'])
  for node in nodes:
    # Place the line at the node converted to a pandas Timestamp, so the
    # converted value is actually used rather than the raw node.
    fig.add_vline(x=pd.to_datetime(node), line_dash='dash', line_color='green')
  fig.update_layout(title=title, xaxis_title='DateTime', yaxis_title='Packets')
  fig.show()

# Working with Data

In [None]:
# Columns to keep from the dataset and the two packet-count columns that are
# summed into pktCount by read_csv.
cols = ['Source.IP', 'Destination.IP', 'Timestamp']
pkt_cols = ['Total.Fwd.Packets', 'Total.Backward.Packets']
csv_path = 'drive/MyDrive/data/Dataset-Unicauca-Version2-87Atts.csv'

# The day to analyse; used both to slice the data and as the figure title.
day = '2017-04-26'

df_comp = read_csv(csv_path=csv_path, cols=cols, datetime_col='Timestamp', pkt_cols=pkt_cols)

# Work on an independent copy of the one-day slice: extract_time_nodes rewrites
# pktCount in place (through to_stationary), which should not act on a slice
# of df_comp.
df = df_comp.loc[day].copy()
time_nodes = extract_time_nodes(df)
plot(df, time_nodes, day)
