<a href="https://colab.research.google.com/github/alammobaDar/CCADMACL_PROJECT_COM231ML/blob/main/POST_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os
import pandas as pd

# Mounting the Drive

In [2]:
drive.mount("/content/drive", force_remount=True)

path = "/content/drive/My Drive/Datasets/packets_raw.csv"

df = pd.read_csv(path, sep=',', on_bad_lines='skip', engine='python')

Mounted at /content/drive


# RAW VERSION OF THE DATASET

In [3]:
df.head()

Unnamed: 0,frame.time_epoch,frame.len,ip.src,ip.dst,ip.proto,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport
0,1744194000.0,60.0,203.178.162.159,95.147.81.104,1.0,,,,
1,1744194000.0,1324.0,,,,52278.0,443.0,,
2,1744194000.0,60.0,203.178.162.159,38.209.248.201,1.0,,,,
3,1744194000.0,74.0,199.219.149.64,210.34.198.238,6.0,25188.0,1080.0,,
4,1744194000.0,74.0,162.70.131.74,150.67.151.48,6.0,55292.0,8085.0,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11762985 entries, 0 to 11762984
Data columns (total 9 columns):
 #   Column            Dtype  
---  ------            -----  
 0   frame.time_epoch  float64
 1   frame.len         float64
 2   ip.src            object 
 3   ip.dst            object 
 4   ip.proto          float64
 5   tcp.srcport       float64
 6   tcp.dstport       float64
 7   udp.srcport       float64
 8   udp.dstport       float64
dtypes: float64(7), object(2)
memory usage: 807.7+ MB


# Managing Null Values

In [5]:
# We are dropping values that has null on IP Proto because we are keeping our dataset above the IP(this is where the user activity manifests)
df = df.dropna(subset=["ip.proto"])

# Keeping protocols separate is not reliable to distinguish user activity anymore in the modern web. That's why we have combine them
df["src_port"] = df["tcp.srcport"].fillna(df["udp.srcport"])
df["dst_port"] = df["tcp.dstport"].fillna(df["udp.dstport"])

# Drop data points that has no port because it will have noise
df = df.dropna(subset=["src_port", "dst_port"])





In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7384999 entries, 3 to 11762983
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   frame.time_epoch  float64
 1   frame.len         float64
 2   ip.src            object 
 3   ip.dst            object 
 4   ip.proto          float64
 5   tcp.srcport       float64
 6   tcp.dstport       float64
 7   udp.srcport       float64
 8   udp.dstport       float64
 9   src_port          float64
 10  dst_port          float64
dtypes: float64(9), object(2)
memory usage: 676.1+ MB


In [7]:
df["flow_id"] = (
    df["ip.src"] + "-" +
    df["ip.dst"] + "-" +
    df["src_port"].astype(str) + "-" +
    df["dst_port"].astype(str) + "-" +
    df["ip.proto"].astype(str)
)

In [8]:
df.head()

Unnamed: 0,frame.time_epoch,frame.len,ip.src,ip.dst,ip.proto,tcp.srcport,tcp.dstport,udp.srcport,udp.dstport,src_port,dst_port,flow_id
3,1744194000.0,74.0,199.219.149.64,210.34.198.238,6.0,25188.0,1080.0,,,25188.0,1080.0,199.219.149.64-210.34.198.238-25188.0-1080.0-6.0
4,1744194000.0,74.0,162.70.131.74,150.67.151.48,6.0,55292.0,8085.0,,,55292.0,8085.0,162.70.131.74-150.67.151.48-55292.0-8085.0-6.0
7,1744194000.0,66.0,172.219.225.42,131.113.151.154,6.0,443.0,48565.0,,,443.0,48565.0,172.219.225.42-131.113.151.154-443.0-48565.0-6.0
9,1744194000.0,78.0,150.67.111.120,143.245.10.217,17.0,,,65150.0,34586.0,65150.0,34586.0,150.67.111.120-143.245.10.217-65150.0-34586.0-...
10,1744194000.0,66.0,195.47.191.192,203.178.179.153,6.0,59608.0,830.0,,,59608.0,830.0,195.47.191.192-203.178.179.153-59608.0-830.0-6.0


In [9]:
flows = df.groupby("flow_id").agg(
    packet_count=("frame.len", "count"),
    byte_count=("frame.len", "sum"),
    avg_pkt_len=("frame.len", "mean"),
    std_pkt_len=("frame.len", "std"),
    start_time=("frame.time_epoch", "min"),
    end_time=("frame.time_epoch", "max"),
)
flows["flow_duration"] = flows["end_time"] - flows["start_time"]

In [10]:
flows.head()

Unnamed: 0_level_0,packet_count,byte_count,avg_pkt_len,std_pkt_len,start_time,end_time,flow_duration
flow_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0.246.249-133.5.39.235-21338.0-23.0-6.0,2,132.0,66.0,0.0,1744194000.0,1744194000.0,1.007828
1.0.77.248-133.5.199.102-10604.0-7211.0-17.0,4,4220.0,1055.0,0.0,1744194000.0,1744194000.0,4.430177
1.1.7.24-202.152.220.91-48579.0-23.0-6.0,2,148.0,74.0,0.0,1744194000.0,1744194000.0,0.994511
1.1.7.24-202.24.162.124-45888.0-23.0-6.0,9,506.0,56.222222,6.666667,1744194000.0,1744194000.0,35.942059
1.10.126.203-133.5.24.24-40320.0-23.0-6.0,1,66.0,66.0,,1744194000.0,1744194000.0,0.0


In [12]:
flows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1445998 entries, 1.0.246.249-133.5.39.235-21338.0-23.0-6.0 to 99.86.92.118-203.178.190.20-443.0-15900.0-6.0
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   packet_count   1445998 non-null  int64  
 1   byte_count     1445998 non-null  float64
 2   avg_pkt_len    1445998 non-null  float64
 3   std_pkt_len    254529 non-null   float64
 4   start_time     1445998 non-null  float64
 5   end_time       1445998 non-null  float64
 6   flow_duration  1445998 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 88.3+ MB


In [13]:
flows = flows[
    (flows["packet_count"] >= 5) &
    (flows["flow_duration"] > 0) &
    (flows["packet_count"] <= 2000)
]

In [14]:
flows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98380 entries, 1.1.7.24-202.24.162.124-45888.0-23.0-6.0 to 99.86.92.118-203.178.190.20-443.0-15900.0-6.0
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   packet_count   98380 non-null  int64  
 1   byte_count     98380 non-null  float64
 2   avg_pkt_len    98380 non-null  float64
 3   std_pkt_len    98380 non-null  float64
 4   start_time     98380 non-null  float64
 5   end_time       98380 non-null  float64
 6   flow_duration  98380 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 6.0+ MB


In [15]:
flows = flows.reset_index()
flows.rename(columns={"index": "flow_id"}, inplace=True)

flows[["ip_src", "ip_dst", "src_port", "dst_port", "ip_proto"]] = flows["flow_id"].str.split("-", expand=True)

flows["src_port"] = flows["src_port"].astype(float)
flows["dst_port"] = flows["dst_port"].astype(float)
flows["ip_proto"] = flows['ip_proto'].astype(float)

In [16]:
flows.head()

Unnamed: 0,flow_id,packet_count,byte_count,avg_pkt_len,std_pkt_len,start_time,end_time,flow_duration,ip_src,ip_dst,src_port,dst_port,ip_proto
0,1.1.7.24-202.24.162.124-45888.0-23.0-6.0,9,506.0,56.222222,6.666667,1744194000.0,1744194000.0,35.942059,1.1.7.24,202.24.162.124,45888.0,23.0,6.0
1,1.108.145.110-133.15.123.54-55106.0-4343.0-6.0,5,330.0,66.0,0.0,1744194000.0,1744194000.0,15.045203,1.108.145.110,133.15.123.54,55106.0,4343.0,6.0
2,1.108.186.229-131.113.191.161-61153.0-443.0-6.0,9,2738.0,304.222222,438.458316,1744194000.0,1744194000.0,1.258956,1.108.186.229,131.113.191.161,61153.0,443.0,6.0
3,1.11.42.26-202.152.241.42-61516.0-11194.0-17.0,9,2096.0,232.888889,154.320644,1744194000.0,1744194000.0,51.536455,1.11.42.26,202.152.241.42,61516.0,11194.0,17.0
4,1.111.109.212-131.112.204.247-36373.0-993.0-6.0,5,429.0,85.8,28.110496,1744194000.0,1744194000.0,0.134992,1.111.109.212,131.112.204.247,36373.0,993.0,6.0


In [17]:
flows = flows.drop("flow_id", axis=1)

In [18]:
flows_sampled = flows.groupby("ip_proto").apply(lambda x: x.sample(min(len(x),1000), random_state=42)).reset_index(drop=True)

  flows_sampled = flows.groupby("ip_proto").apply(lambda x: x.sample(min(len(x),1000), random_state=42)).reset_index(drop=True)


In [19]:
flows_sampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   packet_count   2000 non-null   int64  
 1   byte_count     2000 non-null   float64
 2   avg_pkt_len    2000 non-null   float64
 3   std_pkt_len    2000 non-null   float64
 4   start_time     2000 non-null   float64
 5   end_time       2000 non-null   float64
 6   flow_duration  2000 non-null   float64
 7   ip_src         2000 non-null   object 
 8   ip_dst         2000 non-null   object 
 9   src_port       2000 non-null   float64
 10  dst_port       2000 non-null   float64
 11  ip_proto       2000 non-null   float64
dtypes: float64(9), int64(1), object(2)
memory usage: 187.6+ KB


In [20]:
flows_sampled.to_csv("flows_sampled_2000.csv", index=False)