In [2]:
#importing all necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#check version
print("Pandas version: " + pd.__version__)

Pandas version: 2.2.2


In [3]:
#load csv file with the correct delimiter 
file_path = 'malware-capture-csv/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv'
df = pd.read_csv(file_path, delimiter='|')

In [4]:
#take a look, displaying the first few rows to verify correct loading 
print("Initial data:") 
print(df.head())

Initial data:
             ts                 uid      id.orig_h  id.orig_p      id.resp_h  \
0  1.547127e+09  CXY5uG2sSmjJ0grfY2  192.168.1.197      58312  104.24.96.120   
1  1.547127e+09   Ce3AJzwzXwM3Z1XBg  192.168.1.197      45082  104.24.97.120   
2  1.547127e+09  CJgnSb3XpbbOcMHKUd  192.168.1.197      58316  104.24.96.120   
3  1.547127e+09   Cq43w4aHlsW8nXZ3l  192.168.1.197      59357    192.168.1.1   
4  1.547127e+09  C5uLwl2hGy10y9PSr6  192.168.1.197      39686    192.168.1.1   

   id.resp_p proto service  duration orig_bytes  ... local_resp missed_bytes  \
0         80   tcp    http  3.909013         83  ...          -         4380   
1         80   tcp    http  4.767024        150  ...          -            0   
2         80   tcp       -  3.107228          0  ...          -            0   
3         53   udp     dns  0.029483         58  ...          -            0   
4         53   udp     dns  0.001249         58  ...          -            0   

      history orig_pkts 

In [5]:
#display the initial columns
print("Initial columns:")
print(df.columns)

Initial columns:
Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
       'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
       'tunnel_parents', 'label', 'detailed-label'],
      dtype='object')


In [11]:
#defined the required columns with desired labels
required_columns = {
    'ts': 'Timestamp', 
    'id_orig_h': 'Source_IP', 
    'id_resp_h': 'Destination_IP', 
    'id_orig_p': 'Source_Port', 
    'id_resp_p': 'Destination_Port', 
    'proto': 'Protocol', 
    'duration': 'Duration', 
    'orig_bytes': 'Bytes_Sent', 
    'resp_bytes': 'Bytes_Received', 
    'label': 'Label'
}

#addtional columns if available 
optional_columns = {
    'orig_country': 'Source_Country', 
    'resp_country': 'Destination_Country', 
    'detailed_label': 'Malware_Type', 
    'orig_pkts': 'Packets_Sent', 
    'resp_pkts': 'Packets_Received'
}



Initial columns:
Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
       'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
       'tunnel_parents', 'label', 'detailed-label'],
      dtype='object')


In [10]:
#combine required and optional columns
all_columns = {**required_columns, **optional_columns}

#filter the dataframe to keep only the relevant columns
filtered_columns = [col for col in all_columns.keys() if col in df.columns]
df_filtered = df[filtered_columns].copy()

#rename the columns
df_filtered.rename(columns=all_columns, inplace=True)

#display the first few rows of the cleaned dataframe
print(df_filtered)


         Timestamp Protocol      Duration Bytes_Sent Bytes_Received  \
0     1.547127e+09      tcp      3.909013         83          67212   
1     1.547127e+09      tcp      4.767024        150          67212   
2     1.547127e+09      tcp      3.107228          0              0   
3     1.547127e+09      udp      0.029483         58            146   
4     1.547127e+09      udp      0.001249         58            146   
...            ...      ...           ...        ...            ...   
4421  1.547156e+09      udp             -          -              -   
4422  1.547156e+09      udp      0.003995         48             48   
4423  1.547157e+09      udp      0.005252         48             48   
4424  1.547127e+09      tcp  30007.122211       8499           8025   
4425  1.547157e+09      udp             -          -              -   

                         Label  Packets_Sent  Packets_Received  
0     Malicious   FileDownload            54                50  
1              Ma