In [1]:
#!pip install kafka-python

In [2]:
#!pip install tqdm

In [3]:
import pandas as pd
from kafka import KafkaProducer
from datetime import datetime
from json import dumps
from tqdm import tqdm

In [4]:
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda x: 
                         dumps(x).encode('utf-8'))
if producer.bootstrap_connected():
    print(f"Successfully connected to bootstrap server")
else:
    print("Couldn't connect to bootstrap server.")

TOPIC_NAME = "ml-raw-dns"

Successfully connected to bootstrap server


In [5]:
def produce_message(producer_instance, topic, message):
    producer_instance.send(topic, message)
    producer_instance.flush()
    return

In [6]:
df = pd.read_csv("combined_test_df.csv")

df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

df['label'] = df['label'].apply(lambda x: 0 if x == 'BENIGN' else 1)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.to_csv("combined_test_df.csv", index=False)

print("Dataset updated: dropped 'Unnamed: 0', converted labels, reshuffled rows.")
print("New shape:", df.shape)
print(df.head())

Dataset updated: dropped 'Unnamed: 0', converted labels, reshuffled rows.
New shape: (94615, 79)
   destination port  flow duration  total fwd packets  total backward packets  \
0             44018          39970                  3                       1   
1                53          31148                  2                       2   
2               443       64999522                 30                      19   
3                80          59720                  3                       2   
4                80      103702830                 15                       3   

   total length of fwd packets  total length of bwd packets  \
0                          108                            6   
1                           56                           88   
2                         4480                         7625   
3                          430                          110   
4                         2541                            6   

   fwd packet length max  fwd packet 

In [7]:
with open("combined_test_df.csv") as f:
    start_time = datetime.now()
    for i, line in tqdm(enumerate(f)):
        produce_message(producer_instance=producer, topic=TOPIC_NAME, message=line)
    end_time = datetime.now()
    print(f"Batch took {end_time-start_time} time for ingesting data")

print("Ingestion Completed")

94616it [09:10, 171.92it/s]

Batch took 0:09:10.461012 time for ingesting data
Ingestion Completed



