In [1]:
#importing all necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 

#check version
print("Pandas version: " + pd.__version__)
print("SK Learn version: " + sklearn.__version__)


Pandas version: 2.2.2
SK Learn version: 1.5.0


In [2]:
#load csv file with the correct delimiter 
file_path = 'malware-capture-csv/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv'
df = pd.read_csv(file_path, delimiter='|')

In [3]:
#take a look, displaying the first few rows to verify correct loading 
print("Initial data:") 
print(df.head())

Initial data:
             ts                 uid      id.orig_h  id.orig_p       id.resp_h  \
0  1.545404e+09   CrDn63WjJEmrWGjqf  192.168.1.195      41040  185.244.25.235   
1  1.545404e+09  CY9lJW3gh1Eje4usP6  192.168.1.195      41040  185.244.25.235   
2  1.545404e+09   CcFXLynukEDnUlvgl  192.168.1.195      41040  185.244.25.235   
3  1.545404e+09   CDrkrSobGYxHhYfth  192.168.1.195      41040  185.244.25.235   
4  1.545404e+09  CTWZQf2oJSvq6zmPAc  192.168.1.195      41042  185.244.25.235   

   id.resp_p proto service  duration orig_bytes  ... local_resp missed_bytes  \
0         80   tcp       -  3.139211          0  ...          -            0   
1         80   tcp       -         -          -  ...          -            0   
2         80   tcp       -         -          -  ...          -            0   
3         80   tcp    http  1.477656        149  ...          -         2896   
4         80   tcp       -  3.147116          0  ...          -            0   

       history ori

In [10]:
#check for duplicates 
duplicates = df.duplicated()
num_duplicates = duplicates.sum()
print(f"Number of duplicate rows: {num_duplicates}")

if num_duplicates > 0:
    print("Duplicate rows:")
    print(df[duplicates])


Number of duplicate rows: 0


In [11]:
#check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
ts                    0
uid                   0
id.orig_h             0
id.orig_p             0
id.resp_h             0
id.resp_p             0
proto                 0
service               0
duration              0
orig_bytes            0
resp_bytes            0
conn_state            0
local_orig            0
local_resp            0
missed_bytes          0
history               0
orig_pkts             0
orig_ip_bytes         0
resp_pkts             0
resp_ip_bytes         0
tunnel_parents        0
label                 0
detailed-label    21222
dtype: int64


In [4]:
#display the initial columns
print("Initial columns:")
print(df.columns)

Initial columns:
Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
       'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
       'tunnel_parents', 'label', 'detailed-label'],
      dtype='object')


In [5]:
#defined the required columns with desired labels
required_columns = {
    'ts': 'Timestamp', 
    'id.orig_h': 'Source_IP', 
    'id.resp_h': 'Destination_IP', 
    'id.orig_p': 'Source_Port', 
    'id.resp_p': 'Destination_Port', 
    'proto': 'Protocol', 
    'duration': 'Duration', 
    'orig_bytes': 'Bytes_Sent', 
    'resp_bytes': 'Bytes_Received', 
    'label': 'Label'
}

#addtional columns if available 
optional_columns = {
    'orig_country': 'Source_Country', 
    'resp_country': 'Destination_Country', 
    'detailed_label': 'Malware_Type', 
    'orig_pkts': 'Packets_Sent', 
    'resp_pkts': 'Packets_Received'
}


In [8]:
#combine required and optional columns
all_columns = {**required_columns, **optional_columns}

#filter the dataframe to keep only the relevant columns
filtered_columns = [col for col in all_columns.keys() if col in df.columns]
df_filtered = df[filtered_columns].copy()

#rename the columns
df_filtered.rename(columns=all_columns, inplace=True)

#display the cleaned DataFrame
print("Cleaned Data:")
print(df_filtered.head())

      Timestamp      Source_IP  Destination_IP  Source_Port  Destination_Port  \
0  1.545404e+09  192.168.1.195  185.244.25.235        41040                80   
1  1.545404e+09  192.168.1.195  185.244.25.235        41040                80   
2  1.545404e+09  192.168.1.195  185.244.25.235        41040                80   
3  1.545404e+09  192.168.1.195  185.244.25.235        41040                80   
4  1.545404e+09  192.168.1.195  185.244.25.235        41042                80   

  Protocol  Duration Bytes_Sent Bytes_Received   Label  Packets_Sent  \
0      tcp  3.139211          0              0  Benign             3   
1      tcp         -          -              -  Benign             1   
2      tcp         -          -              -  Benign             1   
3      tcp  1.477656        149         128252  Benign            94   
4      tcp  3.147116          0              0  Benign             3   

   Packets_Received  
0                 0  
1                 0  
2             