### Importing necessary modules:

In [51]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import classification_report

### Reading training and testing data

In [52]:
train_data= pd.read_parquet('UNSW_NB15_training-set.parquet')

In [53]:
train_data

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.000011,udp,-,INT,2,0,496,0,90909.093750,1.803636e+08,...,0,0,1,1,0,0,0,0,Normal,0
1,0.000008,udp,-,INT,2,0,1762,0,125000.000000,8.810000e+08,...,0,0,1,1,0,0,0,0,Normal,0
2,0.000005,udp,-,INT,2,0,1068,0,200000.000000,8.544000e+08,...,0,0,1,1,0,0,0,0,Normal,0
3,0.000006,udp,-,INT,2,0,900,0,166666.656250,6.000000e+08,...,0,0,2,1,0,0,0,0,Normal,0
4,0.000010,udp,-,INT,2,0,2126,0,100000.000000,8.504000e+08,...,0,0,2,1,0,0,0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0.000005,udp,-,INT,2,0,104,0,200000.000000,8.320000e+07,...,0,0,1,1,0,0,0,0,Normal,0
82328,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,1.241044e+05,...,0,0,1,1,0,0,0,0,Normal,0
82329,0.000000,arp,-,INT,1,0,46,0,0.000000,0.000000e+00,...,0,0,1,1,0,0,0,1,Normal,0
82330,0.000000,arp,-,INT,1,0,46,0,0.000000,0.000000e+00,...,0,0,1,1,0,0,0,1,Normal,0


In [54]:
test_data= pd.read_parquet('UNSW_NB15_testing-set.parquet')

In [55]:
test_data

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,1.415894e+04,...,0,0,1,1,0,0,0,0,Normal,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8.395112e+03,...,0,0,1,1,0,0,0,0,Normal,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1.572272e+03,...,0,0,1,1,0,0,0,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2.740179e+03,...,0,0,1,1,1,1,0,0,Normal,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8.561499e+03,...,0,0,2,1,0,0,0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,24,13,0,0,0,0,Generic,1
175337,0.505762,tcp,-,FIN,10,8,620,354,33.612648,8.826286e+03,...,0,0,1,1,0,0,0,0,Shellcode,1
175338,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,3,3,0,0,0,0,Generic,1
175339,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,30,14,0,0,0,0,Generic,1


### Setting columns that we need

In [56]:
columns = ['dur', 'proto', 'sbytes', 'dbytes', 'spkts', 'dpkts', 'rate', 'sload', 'dload', 'label']

In [57]:
train_data = train_data[columns]

In [58]:
train_data.head()

Unnamed: 0,dur,proto,sbytes,dbytes,spkts,dpkts,rate,sload,dload,label
0,1.1e-05,udp,496,0,2,0,90909.09375,180363632.0,0.0,0
1,8e-06,udp,1762,0,2,0,125000.0,881000000.0,0.0,0
2,5e-06,udp,1068,0,2,0,200000.0,854400000.0,0.0,0
3,6e-06,udp,900,0,2,0,166666.65625,600000000.0,0.0,0
4,1e-05,udp,2126,0,2,0,100000.0,850400000.0,0.0,0


In [59]:
test_data = test_data[columns]

In [60]:
test_data.head()

Unnamed: 0,dur,proto,sbytes,dbytes,spkts,dpkts,rate,sload,dload,label
0,0.121478,tcp,258,172,6,4,74.087486,14158.942383,8495.365234,0
1,0.649902,tcp,734,42014,14,38,78.473373,8395.112305,503571.3125,0
2,1.623129,tcp,364,13186,8,16,14.170161,1572.271851,60929.230469,0
3,1.681642,tcp,628,770,12,12,13.677108,2740.178955,3358.62207,0
4,0.449454,tcp,534,268,10,6,33.373825,8561.499023,3987.059814,0


### Preprocessing:

##### extracting numerical columns

In [61]:
numeric_cols = train_data.select_dtypes(include=np.number).columns.tolist()

##### removing 'label' column:`

In [62]:
if 'label' in numeric_cols:
    numeric_cols.remove('label')

In [63]:
print("Numerical Columns:", numeric_cols)

Numerical Columns: ['dur', 'sbytes', 'dbytes', 'spkts', 'dpkts', 'rate', 'sload', 'dload']


##### extracting categorical columns

In [64]:
categorical_cols = train_data.select_dtypes(exclude=np.number).columns.tolist()

In [65]:
print("Categorical Columns:", categorical_cols)

Categorical Columns: ['proto']


##### Initializing a StandardScaler for numerical features

In [66]:
numerical_transformer = StandardScaler()

##### Initializing a OneHotEncoder for categorical features

In [67]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

##### Applying transformers to relevant columns using ColumnTransformer (Defining the processing steps)

In [68]:
preprocessing_steps = ColumnTransformer(transformers=[('num', numerical_transformer, numeric_cols),('cat', categorical_transformer, categorical_cols)])

###### Update the pipeline with the new steps

In [69]:
preprocess_pipeline = Pipeline(steps=[('preprocessor', preprocessing_steps)])

##### Reapply transformations to both training and testing data

In [70]:
x_train_preprocessed = preprocess_pipeline.fit_transform(train_data.drop('label', axis=1))

In [71]:
x_test_preprocessed = preprocess_pipeline.transform(test_data.drop('label', axis=1))

In [72]:
# Display the shapes of the preprocessed datasets
x_train_preprocessed.shape,x_test_preprocessed.shape

((82332, 139), (175341, 139))

In [73]:
y_train = train_data['label']
y_test = test_data['label']

In [74]:
y_train.shape

(82332,)

In [75]:
y_test.shape

(175341,)

### Applying genetic algorithm:

In [76]:
population_size = 100

In [77]:
chromosome_length = x_train_preprocessed.shape[1]
print(chromosome_length)

139


In [78]:
population = np.random.randint(0, 2, (population_size, chromosome_length))
print(population)

[[1 1 1 ... 1 0 0]
 [1 1 0 ... 1 0 0]
 [1 0 1 ... 0 1 0]
 ...
 [1 0 1 ... 0 1 1]
 [0 0 1 ... 1 0 1]
 [1 1 0 ... 1 0 1]]


##### function to calculate fitness

In [79]:
accuracy = 0.0
precision =0.0

def calculate_fitness(individual, features, labels):
    if isinstance(features, np.ndarray):
        features_dense = features
    else:
        features_dense = features.toarray()
    individual = np.array(individual).reshape(-1)
    prediction_scores = np.dot(features_dense, individual)
    predictions = prediction_scores > 0.5
    true_positives = np.sum((predictions == 1) & (labels == 1))
    true_negatives = np.sum((predictions == 0) & (labels == 0))
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    return true_positives * 2 + true_negatives - false_positives - 2 * false_negatives

##### Function to delect parents to generate a new generation:

In [80]:
def select(population, fitness):
    fitness_shifted = fitness - np.min(fitness) + 1e-3
    probability = fitness_shifted / np.sum(fitness_shifted)
    indices = np.random.choice(np.arange(population_size), size=population_size, p=probability)
    return population[indices]


##### Crossover :)

In [81]:
def crossover(parent1, parent2):
    point = np.random.randint(1, chromosome_length - 1)
    child1 = np.concatenate((parent1[:point], parent2[point:]))
    child2 = np.concatenate((parent2[:point], parent1[point:]))
    return child1, child2

##### Mutations:

In [82]:
def mutate(individual, mutation_rate=0.01):
    for i in range(chromosome_length):
        if np.random.rand() < mutation_rate:
            individual[i] = 1 - individual[i]
    return individual


##### Executing the genetic algorithm:

##### For multiple populations

In [83]:
population_sizes = [50, 100, 200]
for pop_size in population_sizes:
    print(f"Population Size: {pop_size}")
    population_size = pop_size
    population = np.random.randint(0, 2, (population_size, chromosome_length))
    best_chromosome = None
    best_fitness_score = -np.inf
    for generation in range(int(30)):  
        fitness = np.array([calculate_fitness(ind, x_train_preprocessed, y_train) for ind in population])
        if np.max(fitness) > best_fitness_score:
            best_fitness_score = np.max(fitness)
            best_chromosome = population[np.argmax(fitness)]
        population = select(population, fitness)
        next_population = []
        for i in range(0, population_size, 2):
            parent1, parent2 = population[i], population[i+1]
            child1, child2 = crossover(parent1, parent2)
            next_population.extend([child1, child2])
        population = np.array([mutate(ind) for ind in next_population])
        print(f"Generation {generation}: Best Fitness - {best_fitness_score}")
    print()

Population Size: 50
Generation 0: Best Fitness - 49692
Generation 1: Best Fitness - 51682
Generation 2: Best Fitness - 51682
Generation 3: Best Fitness - 55466
Generation 4: Best Fitness - 55466
Generation 5: Best Fitness - 55466
Generation 6: Best Fitness - 66368
Generation 7: Best Fitness - 66368
Generation 8: Best Fitness - 66368
Generation 9: Best Fitness - 66368
Generation 10: Best Fitness - 66368
Generation 11: Best Fitness - 66368
Generation 12: Best Fitness - 67160
Generation 13: Best Fitness - 67296
Generation 14: Best Fitness - 67296
Generation 15: Best Fitness - 67296
Generation 16: Best Fitness - 67296
Generation 17: Best Fitness - 67296
Generation 18: Best Fitness - 67296
Generation 19: Best Fitness - 67296
Generation 20: Best Fitness - 67296
Generation 21: Best Fitness - 67296
Generation 22: Best Fitness - 67296
Generation 23: Best Fitness - 67378
Generation 24: Best Fitness - 67378
Generation 25: Best Fitness - 67378
Generation 26: Best Fitness - 67378
Generation 27: Bes

##### Experiment to on varied mutation ratess:

In [84]:
mutation_rates = [0.01, 0.05, 0.1]

for mutation_rate in mutation_rates:
    print(f"Mutation Rate: {mutation_rate}")
    best_chromosome = None
    best_fitness_score = -np.inf
    for generation in range(int(30)):  
        fitness = np.array([calculate_fitness(ind, x_train_preprocessed, y_train) for ind in population])
        if np.max(fitness) > best_fitness_score:
            best_fitness_score = np.max(fitness)
            best_chromosome = population[np.argmax(fitness)]
        population = select(population, fitness)
        next_population = []
        for i in range(0, population_size, 2):
            parent1, parent2 = population[i], population[i+1]
            child1, child2 = crossover(parent1, parent2)
            next_population.extend([child1, child2])
        population = np.array([mutate(ind, mutation_rate) for ind in next_population])
        print(f"Generation {generation}: Best Fitness - {best_fitness_score}")
    print()

Mutation Rate: 0.01
Generation 0: Best Fitness - 67058
Generation 1: Best Fitness - 67058
Generation 2: Best Fitness - 67058
Generation 3: Best Fitness - 67058
Generation 4: Best Fitness - 67744
Generation 5: Best Fitness - 67744
Generation 6: Best Fitness - 67744
Generation 7: Best Fitness - 67744
Generation 8: Best Fitness - 67744
Generation 9: Best Fitness - 67744
Generation 10: Best Fitness - 67744
Generation 11: Best Fitness - 67744
Generation 12: Best Fitness - 67744
Generation 13: Best Fitness - 67744
Generation 14: Best Fitness - 67744
Generation 15: Best Fitness - 67744
Generation 16: Best Fitness - 67744
Generation 17: Best Fitness - 67744
Generation 18: Best Fitness - 67744
Generation 19: Best Fitness - 67744
Generation 20: Best Fitness - 67744
Generation 21: Best Fitness - 67744
Generation 22: Best Fitness - 67744
Generation 23: Best Fitness - 67744
Generation 24: Best Fitness - 67858
Generation 25: Best Fitness - 67858
Generation 26: Best Fitness - 67858
Generation 27: Bes

#### Crossover experiment (single-point, multi-point, uniform)

In [85]:
def single_point_crossover(parent1, parent2):
    point = np.random.randint(1, len(parent1) - 1)
    child1 = np.concatenate((parent1[:point], parent2[point:]))
    child2 = np.concatenate((parent2[:point], parent1[point:]))
    return child1, child2

In [86]:
def multi_point_crossover(parent1, parent2):
    num_points = np.random.randint(1, len(parent1) - 1)
    points = sorted(np.random.choice(range(1, len(parent1)), num_points, replace=False))
    child1 = np.copy(parent1)
    child2 = np.copy(parent2)
    for i in range(0, len(points), 2):
        if i < len(points) - 1:
            child1[points[i]:points[i+1]], child2[points[i]:points[i+1]] = child2[points[i]:points[i+1]], child1[points[i]:points[i+1]]
    return child1, child2

In [87]:
def uniform_crossover(parent1, parent2, prob=0.5):
    child1 = np.copy(parent1)
    child2 = np.copy(parent2)
    for i in range(len(parent1)):
        if np.random.rand() < prob:
            child1[i], child2[i] = child2[i], child1[i]
    return child1, child2

In [88]:
# Experiment 2: Crossover Type Variation
crossover_types = ["single-point", "multi-point", "uniform"]

for cross_type in crossover_types:
    print(f"Crossover Type: {cross_type}")
    if cross_type == "single-point":
        crossover_func = single_point_crossover
    elif cross_type == "multi-point":
        crossover_func = multi_point_crossover
    elif cross_type == "uniform":
        crossover_func = uniform_crossover
    best_chromosome = None
    best_fitness_score = -np.inf
    for generation in range(int(30)):  
        fitness = np.array([calculate_fitness(ind, x_train_preprocessed, y_train) for ind in population])
        if np.max(fitness) > best_fitness_score:
            best_fitness_score = np.max(fitness)
            best_chromosome = population[np.argmax(fitness)]
        population = select(population, fitness)
        next_population = []
        for i in range(0, population_size, 2):
            parent1, parent2 = population[i], population[i+1]
            child1, child2 = crossover_func(parent1, parent2)
            next_population.extend([child1, child2])
        population = np.array([mutate(ind) for ind in next_population])
        print(f"Generation {generation}: Best Fitness - {best_fitness_score}")
    print()

Crossover Type: single-point
Generation 0: Best Fitness - 61808
Generation 1: Best Fitness - 67326
Generation 2: Best Fitness - 67326
Generation 3: Best Fitness - 67326
Generation 4: Best Fitness - 67326
Generation 5: Best Fitness - 67326
Generation 6: Best Fitness - 67326
Generation 7: Best Fitness - 67326
Generation 8: Best Fitness - 67326
Generation 9: Best Fitness - 67326
Generation 10: Best Fitness - 67326
Generation 11: Best Fitness - 67326
Generation 12: Best Fitness - 67326
Generation 13: Best Fitness - 67326
Generation 14: Best Fitness - 67326
Generation 15: Best Fitness - 67326
Generation 16: Best Fitness - 67326
Generation 17: Best Fitness - 67392
Generation 18: Best Fitness - 67392
Generation 19: Best Fitness - 67592
Generation 20: Best Fitness - 67932
Generation 21: Best Fitness - 67932
Generation 22: Best Fitness - 67932
Generation 23: Best Fitness - 67932
Generation 24: Best Fitness - 67932
Generation 25: Best Fitness - 67932
Generation 26: Best Fitness - 67932
Generatio