In [1]:
import pandas as pd
import os
import gc

# --- CTU-13 input files ---
# Every scenario lives in its own folder and uses the same file name, so the
# path table is derived from the scenario numbers. Keys are the scenario
# numbers as strings.
dataset_files = {
    str(scenario): f'scenario_dataset_{scenario}/dataset_result.binetflow'
    for scenario in (1, 2, 5, 9, 13)
}
# --- End of input files ---

# Folder that receives the sampled output files
output_folder = 'sampled_data'

# Maps each dataset key to its COMPLETE DataFrame.
# Starts empty; it is a notebook-level variable, so later cells can fill and read it.
dict_of_dataframes = {}

In [2]:
# Read every dataset listed in `dataset_files`, keep the complete DataFrame in
# `dict_of_dataframes`, and write a small label-stratified sample of each one
# to `<output_folder>/<original folder name>/<original file name>`.
print("Starting process: Reading data & taking samples...")

# Number of rows to draw from each label category of a dataset.
rows_per_category = {
    'botnet without spam': 3,
    'botnet with spam': 3,
    'neither botnet nor spam': 4,
}

# 1. Create output folder if it doesn't exist
# Best effort only: the per-dataset os.makedirs call below creates any missing
# parent folders, and a failure there is reported for that dataset.
try:
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder '{output_folder}' has been prepared.")
except Exception as e:
    print(f"Failed to create folder '{output_folder}': {e}")


# 2. Loop, read, save to dict, take samples, and save samples
for key, filepath in dataset_files.items():
    try:
        print(f"\n--- Processing Dataset {key} ---")
        print(f"Reading file: {filepath}...")

        # a. Read file
        df = pd.read_csv(filepath, low_memory=False)
        total_rows = len(df)

        # b. Keep the COMPLETE DataFrame so later cells can access it by key.
        # This happens before sampling, so the full data stays available even
        # if the sampling or saving steps below fail.
        dict_of_dataframes[key] = df
        print(f"Dataset {key} ({total_rows} rows) has been loaded into memory.")

        # c. Classify rows by label. Each mask is computed once; matching is
        # case-insensitive and rows with a missing Label match neither keyword.
        is_botnet = df['Label'].str.contains('botnet', case=False, na=False)
        is_spam = df['Label'].str.contains('spam', case=False, na=False)
        category_masks = {
            'botnet without spam': is_botnet & ~is_spam,
            'botnet with spam': is_botnet & is_spam,
            'neither botnet nor spam': ~is_botnet & ~is_spam,
        }

        # d. Draw the sample for each category. The request is capped at the
        # number of available rows, because DataFrame.sample raises ValueError
        # when asked for more rows than exist (sampling without replacement).
        category_samples = []
        for category, wanted in rows_per_category.items():
            candidates = df[category_masks[category]]
            take = min(wanted, len(candidates))
            if take < wanted:
                print(
                    f"Warning: Dataset {key} only has {len(candidates)} "
                    f"'{category}' rows. Taking {take} instead of {wanted}."
                )
            category_samples.append(candidates.sample(n=take, random_state=42))

        # Combine the per-category samples into one DataFrame
        sampled_df = pd.concat(category_samples, ignore_index=True)

        # Shuffle the row order so the categories are interleaved
        sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"Took {len(sampled_df)} random rows from file {key}.")

        # e. Determine output file name and save sample
        original_folder = os.path.dirname(filepath)  # e.g. 'scenario_dataset_1'
        folder_name = os.path.basename(original_folder)  # last component of that folder path

        # Create folder inside the output folder with the original folder name
        folder_output_path = os.path.join(output_folder, folder_name)
        os.makedirs(folder_output_path, exist_ok=True)

        base_filename = os.path.basename(filepath)  # Get original file name
        output_filepath = os.path.join(folder_output_path, base_filename)  # Combine with output folder

        sampled_df.to_csv(output_filepath, index=False)
        print(f"Success! Sample saved to '{output_filepath}'")

    except FileNotFoundError:
        print(f"WARNING: File not found at '{filepath}'. Dataset {key} will be skipped.")
    except Exception as e:
        print(f"Failed to process file {filepath}: {e}. Dataset {key} will be skipped.")

print("\n--- All processes completed ---")
print(f"Successfully loaded {len(dict_of_dataframes)} datasets into memory.")

Memulai proses: Membaca data & mengambil sampel...
Folder output 'sampled_data' telah disiapkan.

--- Memproses Dataset 1 ---
Membaca file: scenario_dataset_1/dataset_result.binetflow...
Dataset 1 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 1...
Sukses! Sampel disimpan ke 'sampled_data\scenario_dataset_1\dataset_result.binetflow'

--- Memproses Dataset 2 ---
Membaca file: scenario_dataset_2/dataset_result.binetflow...
Dataset 2 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 2...
Sukses! Sampel disimpan ke 'sampled_data\scenario_dataset_2\dataset_result.binetflow'

--- Memproses Dataset 5 ---
Membaca file: scenario_dataset_5/dataset_result.binetflow...
Dataset 5 (100 baris) telah dimuat ke memori.
Mengambil 100 baris acak dari file 5...
Sukses! Sampel disimpan ke 'sampled_data\scenario_dataset_5\dataset_result.binetflow'

--- Memproses Dataset 9 ---
Membaca file: scenario_dataset_9/dataset_result.binetflow...
Dataset 9 (100 baris) telah

In [3]:
# Report the keys of every dataset currently held in the dictionary
print(f"Datasets successfully loaded into memory: {list(dict_of_dataframes)}")

# Example: look up the COMPLETE dataset '5' and preview it.
# A missing key means the loading cell never stored that dataset.
try:
    print("\nDisplaying .head() of dataset '5' (retrieved from memory):")

    # Complete DataFrames are retrieved from the dictionary by their key
    df_5_full = dict_of_dataframes['5']

    preview = df_5_full.head()
    display(preview)
    print(f"Size of dataset '5' in memory: {df_5_full.shape}")

except KeyError:
    print("Dataset '5' was not successfully loaded (possibly error or not found).")

Dataset yang berhasil dimuat ke memori: ['1', '2', '5', '9', '13']

Menampilkan .head() dari dataset '5' (diambil dari memori):


Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2020-01-01 00:21:26,12.810612,tcp,147.32.86.44,2705,->,147.32.80.13,3128,SPA_SPA,0.0,0.0,10,2297,1494,flow=To-Background-CVUT-Proxy
1,2020-01-01 00:33:15,0.000144,udp,147.32.84.138,48568,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,flow=To-Background-UDP-CVUT-DNS-Server
2,2020-01-01 07:29:42,0.000381,udp,147.32.84.138,36854,<->,147.32.80.9,53,CON,0.0,0.0,2,206,78,flow=To-Background-UDP-CVUT-DNS-Server
3,2020-01-01 02:45:21,32.58025,tcp,147.32.84.165,1531,->,205.188.186.137,587,FSPA_FSPA,0.0,0.0,34,3963,2002,flow=From-Botnet-V46-TCP-Established-SPAM
4,2020-01-01 07:27:06,7.832476,tcp,147.32.84.165,1528,->,205.188.186.137,587,FSPA_FSPA,0.0,0.0,59,7125,4142,flow=From-Botnet-V46-TCP-Established-SPAM


Ukuran dataset '5' di memori: (100, 15)
