In [1]:
#!pip install kafka-python

In [2]:
#!pip install tqdm

In [3]:
import pandas as pd
from kafka import KafkaProducer
from datetime import datetime
from json import dumps
from tqdm import tqdm

In [4]:
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda x: 
                         dumps(x).encode('utf-8'))
if producer.bootstrap_connected():
    print(f"Successfully connected to bootstrap server")
else:
    print("Couldn't connect to bootstrap server.")

TOPIC_NAME = "ml-raw-dns"

Successfully connected to bootstrap server


In [5]:
def produce_message(producer_instance, topic, message):
    producer_instance.send(topic, message)
    producer_instance.flush()
    return

In [6]:
# Load
df = pd.read_csv("combined_test_df.csv")

# Drop auto‑generated index column if present
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

# Convert labels to binary
df["label"] = df["label"].apply(lambda x: 0 if x == "BENIGN" else 1)

# Specify the top features exactly
best_features = [
    "bwd packets/s",
    "destination port",
    "init_win_bytes_forward",
    "flow packets/s",
    "bwd packet length min",
    "down/up ratio",
    "psh flag count",
    "act_data_pkt_fwd",
    "total fwd packets",
    "subflow fwd bytes",
]

# Select only those columns + the label
df = df[best_features + ["label"]]

# Shuffle and reset index
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
df.to_csv("combined_test_df_phase2.csv", index=False)

print("Dataset updated: dropped 'Unnamed: 0', converted labels, reshuffled rows.")
print("New shape:", df.shape)
print(df.head())

Dataset updated: dropped 'Unnamed: 0', converted labels, reshuffled rows.
New shape: (94615, 11)
   bwd packets/s  destination port  init_win_bytes_forward  flow packets/s  \
0      35.825280                53                      -1       71.650560   
1       8.792677                53                      -1       17.585355   
2       0.000000                81                     396     -620.086957   
3       0.485000             33208                     229        0.582001   
4       0.081689                80                     251        0.163379   

   bwd packet length min  down/up ratio  psh flag count  act_data_pkt_fwd  \
0                     56              1               0                 3   
1                     89              1               0                 0   
2                      0              1               0                 0   
3                      6              5               0                 0   
4                      0              1          

In [7]:
with open("combined_test_df_phase2.csv") as f:
    start_time = datetime.now()
    for i, line in tqdm(enumerate(f)):
        produce_message(producer_instance=producer, topic=TOPIC_NAME, message=line)
    end_time = datetime.now()
    print(f"Batch took {end_time-start_time} time for ingesting data")

print("Ingestion Completed")

94616it [12:19, 128.00it/s]

Batch took 0:12:19.946195 time for ingesting data
Ingestion Completed



