In [1]:
import pandas as pd
import os
import gc

# --- Paths for CTU-13 dataset ---
# Every sensor file lives at 'sensor<N>/sensor<N>.binetflow', so the mapping
# (sensor number -> relative file path) is generated instead of typed out.
dataset_files = {
    sensor_id: f'sensor{sensor_id}/sensor{sensor_id}.binetflow'
    for sensor_id in ('1', '2', '3')
}
# --- End paths ---

# Relative directory that receives the sampled files
output_folder = 'sampled_data'

# Registry of the COMPLETE DataFrames, keyed like dataset_files.
# It is module-level, so it remains available to the cells run after this one.
dict_of_dataframes = dict()

In [2]:
print("Starting process: Reading data & taking samples...")

# Step 1: make sure the destination folder for the samples is present.
# exist_ok=True turns this into a no-op when the folder already exists.
# A failure is only reported (best effort); the cell keeps running.
try:
    os.makedirs(output_folder, exist_ok=True)
except Exception as folder_error:
    print(f"Failed to create folder '{output_folder}': {folder_error}")
else:
    print(f"Output folder '{output_folder}' has been prepared.")


# 2. Loop, read, save to dict, take samples, and save samples

# Number of rows to draw per label category.
# Keys are (label contains 'botnet', label contains 'spam').
SAMPLES_PER_CATEGORY = {
    (True, False): 3,   # botnet, not spam
    (True, True): 3,    # botnet and spam
    (False, False): 4,  # neither botnet nor spam
}
# Seed used for every random draw so the samples are reproducible
SAMPLE_SEED = 42


def _sample_by_label(df, samples_per_category=SAMPLES_PER_CATEGORY, seed=SAMPLE_SEED):
    """Draw a small, shuffled sample of `df` stratified by its 'Label' text.

    Rows are grouped by whether 'Label' contains 'botnet' and/or 'spam'
    (case-insensitive; missing labels count as containing neither). From each
    requested group at most the configured number of rows is drawn, so a
    group with too few rows yields what it has (with a warning) instead of
    making DataFrame.sample raise ValueError.

    Args:
        df: DataFrame with a string 'Label' column.
        samples_per_category: mapping of (has_botnet, has_spam) -> row count.
        seed: random_state passed to every DataFrame.sample call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the sampled rows in
        shuffled order; empty (columns preserved) if no group had any rows.

    Raises:
        KeyError: if `df` has no 'Label' column.
    """
    # Compute each text mask once; every category reuses them.
    has_botnet = df['Label'].str.contains('botnet', case=False, na=False)
    has_spam = df['Label'].str.contains('spam', case=False, na=False)

    parts = []
    for (want_botnet, want_spam), wanted in samples_per_category.items():
        botnet_mask = has_botnet if want_botnet else ~has_botnet
        spam_mask = has_spam if want_spam else ~has_spam
        group = df[botnet_mask & spam_mask]

        # Cap the request at the group size: sampling more rows than exist
        # (without replacement) is an error in pandas.
        take = min(wanted, len(group))
        if take < wanted:
            print(
                f"Warning: category (botnet={want_botnet}, spam={want_spam}) "
                f"only has {len(group)} rows. Taking {take} instead of {wanted}."
            )
        if take:
            parts.append(group.sample(n=take, random_state=seed))

    if not parts:
        # pd.concat refuses an empty list; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    combined = pd.concat(parts, ignore_index=True)
    # Shuffle so the categories are not stored in contiguous runs.
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)


for key, filepath in dataset_files.items():
    print(f"\n--- Processing Dataset {key} ---")
    print(f"Reading file: {filepath}...")

    # a. Read file. Only a failure here means the dataset is really skipped.
    try:
        df = pd.read_csv(filepath, low_memory=False)
    except FileNotFoundError:
        print(f"WARNING: File not found at '{filepath}'. Dataset {key} will be skipped.")
        continue
    except Exception as e:
        print(f"Failed to process file {filepath}: {e}. Dataset {key} will be skipped.")
        continue
    total_rows = len(df)

    # b. Save COMPLETE DataFrame to dictionary so later cells can access it
    dict_of_dataframes[key] = df
    print(f"Dataset {key} ({total_rows} rows) has been loaded into memory.")

    # c-e. Sample and save. A failure here does not undo step b, so it is
    # reported separately instead of claiming the dataset was skipped.
    try:
        # c/d. Take the label-stratified sample from the DataFrame just read
        sampled_df = _sample_by_label(df)
        print(f"Sampled {len(sampled_df)} rows by label category from file {key}.")

        # e. Mirror the input's parent folder (e.g. 'sensor1') under output_folder
        original_folder = os.path.dirname(filepath)
        folder_name = os.path.basename(original_folder)
        folder_output_path = os.path.join(output_folder, folder_name)
        os.makedirs(folder_output_path, exist_ok=True)

        # Keep the original file name inside the mirrored folder
        base_filename = os.path.basename(filepath)
        output_filepath = os.path.join(folder_output_path, base_filename)

        sampled_df.to_csv(output_filepath, index=False)
        print(f"Success! Sample saved to '{output_filepath}'")
    except Exception as e:
        print(
            f"Failed to sample/save dataset {key}: {e}. "
            f"The full dataset remains loaded in memory."
        )

print("\n--- All processes completed ---")
print(f"Successfully loaded {len(dict_of_dataframes)} datasets into memory.")

Memulai proses: Membaca data & mengambil sampel...
Folder output 'sampled_data' telah disiapkan.

--- Memproses Dataset 1 ---
Membaca file: sensor1/sensor1.binetflow...
Dataset 1 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 1...
Sukses! Sampel disimpan ke 'sampled_data\sensor1\sensor1.binetflow'

--- Memproses Dataset 2 ---
Membaca file: sensor2/sensor2.binetflow...
Dataset 2 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 2...
Sukses! Sampel disimpan ke 'sampled_data\sensor2\sensor2.binetflow'

--- Memproses Dataset 3 ---
Membaca file: sensor3/sensor3.binetflow...
Dataset 3 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 3...
Sukses! Sampel disimpan ke 'sampled_data\sensor3\sensor3.binetflow'

--- Semua proses selesai ---
Berhasil memuat 3 dataset ke memori.


In [3]:
# Check which keys (datasets) are available in the dictionary
print(f"Datasets successfully loaded into memory: {list(dict_of_dataframes.keys())}")

# Example: access and display the first 5 rows of one COMPLETE dataset.
# Change preview_key to inspect a different dataset.
preview_key = '3'

# Membership is tested up front rather than wrapping the whole cell in
# `except KeyError`, so a KeyError raised while rendering the frame is not
# misreported as "dataset not loaded".
if preview_key in dict_of_dataframes:
    print(f"\nDisplaying .head() of dataset '{preview_key}' (retrieved from memory):")

    # This is how you access the COMPLETE data.
    # NOTE: despite the "5" in its name, df_5_full holds dataset preview_key;
    # the name is kept unchanged in case other cells refer to it.
    df_5_full = dict_of_dataframes[preview_key]

    display(df_5_full.head())
    print(f"Size of dataset '{preview_key}' in memory: {df_5_full.shape}")
else:
    print(f"Dataset '{preview_key}' was not successfully loaded (possibly error or not found).")

Dataset yang berhasil dimuat ke memori: ['1', '2', '3']

Menampilkan .head() dari dataset '3' (diambil dari memori):


Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 10:56:23,2261.78833,udp,147.32.85.123,13815,<->,69.114.239.94,26915,CON,0.0,0.0,10,1890,535,flow=Background-UDP-Established,0,-,3
1,2022-07-07 13:01:26,0.000509,udp,125.204.77.235,58432,<->,147.32.84.229,13363,CON,0.0,0.0,2,134,74,flow=Background-UDP-Established,0,-,3
2,2022-07-07 11:04:09,0.020055,udp,147.32.84.59,35054,<->,147.32.80.9,53,CON,0.0,0.0,2,420,128,flow=To-Background-UDP-CVUT-DNS-Server,0,-,3
3,2022-07-07 15:38:13,10.082277,tcp,147.32.84.165,2900,->,205.188.186.137,587,FSPA_FSPA,0.0,0.0,70,8782,5540,flow=From-Botnet-V54-TCP-Established-SPAM,1,virut,3
4,2022-07-07 15:02:54,9.013451,tcp,147.32.84.165,4307,->,64.12.90.98,25,S_,0.0,,3,186,186,flow=From-Botnet-V54-TCP-Attempt-SPAM,1,virut,3


Ukuran dataset '3' di memori: (100, 18)
