In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
label_encoder = LabelEncoder()

#PREPROCESSING

In [9]:
#load dataset
final_df = pd.read_csv('final_df.csv', sep=',')
final_df.head()

#undersampling kelas "Benign", "Background", "Bruteforce", dan "Bruteforce-XML" terbanyak ke 10k
#pisahkan masing-masing kelas
benign_df = final_df[final_df['traffic'] == 'Benign']
background_df = final_df[final_df['traffic'] == 'Background']
probing_df = final_df[final_df['traffic'] == 'Probing']
bruteforce_df = final_df[final_df['traffic'] == 'Bruteforce']
bruteforce_xml_df = final_df[final_df['traffic'] == 'Bruteforce-XML']
xmr_gcc_df = final_df[final_df['traffic'] == 'XMRIGCC CryptoMiner']

# Undersampling masing-masing kelas ke 10k
benign_sampled = benign_df.sample(10000, random_state=42)
background_sampled = background_df.sample(10000, random_state=42)
probing_sampled = probing_df.sample(10000, random_state=42)
bruteforce_sampled = bruteforce_df.sample(10000, random_state=42)
bruteforce_xml_sampled = bruteforce_xml_df.sample(10000, random_state=42)
xmr_gcc_sampled = xmr_gcc_df.sample(10000, random_state=42)

#Gabungkan kembali dataset
final_df = pd.concat([benign_sampled, background_sampled, probing_sampled, bruteforce_sampled, bruteforce_xml_sampled, xmr_gcc_sampled])
final_df.reset_index(drop=True, inplace=True)

#Pisahkan dataset dengan target column dan non target column
X_balanced_subset = final_df.drop('traffic', axis=1)
y_balanced_subset = final_df['traffic']

# Fit and transform label
y_encoded = label_encoder.fit_transform(y_balanced_subset)
final_df['traffic'] = y_encoded

In [10]:
final_df.head()

Unnamed: 0,response_port,flow_duration,forward_packets_per_sec,flow_packets_per_sec,down_up_ratio,flow_SYN_flags,flow_RST_flags,forward_PSH_flags,backward_PSH_flags,forward_URG_flags,...,forward_bulk_bytes,backward_bulk_bytes,forward_bulk_packets,backward_bulk_rate,active,forward_initial_window_size,forward_last_window_size,origin_host_encoded,response_host_encoded,traffic
0,0.0,-0.677583,-0.758512,-0.615617,0.732951,2.139421,1.584303,-0.600242,-0.380092,-0.623884,...,-0.6884,-0.695891,-0.741051,1.512756,-0.814934,-0.596758,0.0,-1.021097,0.0,1
1,0.0,1.477153,-0.75887,1.720209,0.732951,-0.467416,1.584303,-0.600242,-0.380092,-0.623884,...,1.452643,1.437006,-0.741051,1.512756,-0.764215,-0.596758,0.0,-1.021097,0.0,1
2,0.0,-0.67702,1.369743,1.720209,-1.364347,-0.467416,1.584303,-0.600242,-0.380092,-0.623884,...,-0.6884,-0.695891,1.349434,-0.661045,-0.039634,-0.596758,0.0,0.979339,0.0,1
3,0.0,-0.677176,1.369743,1.720209,0.732951,-0.467416,1.584303,1.665994,-0.380092,1.602863,...,1.452643,1.437006,-0.741051,1.512756,-0.254824,-0.596758,0.0,0.979339,0.0,1
4,0.0,1.477153,-0.712538,-0.559525,0.732951,-0.467416,-0.631193,-0.600242,-0.380092,-0.623884,...,-0.6884,-0.695891,-0.741051,-0.661045,-0.454916,-0.596758,0.0,-1.021097,0.0,1


#MODELLING

In [11]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_balanced_subset, y_balanced_subset, test_size=0.3, random_state=42)

#MLP
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(
    activation='tanh',  # Logistic activation (sigmoid) in hidden layers
    solver='adam',          # Optimizer, 'adam' is commonly used
    alpha=0.01,             # Ridge penalty factor
    batch_size=400,         # Number of instances for batch processing
    learning_rate_init=0.001, # Initial learning rate
    max_iter=5000,           # Maximum number of iterations
    tol=1e-6,               # Tolerance for the optimization
    hidden_layer_sizes=(10, ), # Number of hidden units (2 in your case)
    random_state=42,
    verbose=True
)

# Train the model
mlp.fit(X_train, y_train)

# Make predictions
y_pred = mlp.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import balanced_accuracy_score
accuracy = accuracy_score(y_test, y_pred) # For classification
print(f"Accuracy: {accuracy:.4f}")
#mse = mean_squared_error(y_test, y_pred) # For regression (if your task was regression instead)
#print(f"Mean Squared Error: {mse:.4f}")
balanced_acc = balanced_accuracy_score(y_test, y_pred) # Calculate balanced accuracy
print(f"Balanced Accuracy: {balanced_acc:.4f}")



Iteration 1, loss = 1.91618594
Iteration 2, loss = 1.82038634
Iteration 3, loss = 1.78889804
Iteration 4, loss = 1.77632288
Iteration 5, loss = 1.76964868
Iteration 6, loss = 1.76523828
Iteration 7, loss = 1.76148398
Iteration 8, loss = 1.75814121
Iteration 9, loss = 1.75468111
Iteration 10, loss = 1.75131790
Iteration 11, loss = 1.74793445
Iteration 12, loss = 1.74451830
Iteration 13, loss = 1.74117656
Iteration 14, loss = 1.73776234
Iteration 15, loss = 1.73448039
Iteration 16, loss = 1.73123170
Iteration 17, loss = 1.72800314
Iteration 18, loss = 1.72505165
Iteration 19, loss = 1.72218901
Iteration 20, loss = 1.71940098
Iteration 21, loss = 1.71685945
Iteration 22, loss = 1.71457300
Iteration 23, loss = 1.71231697
Iteration 24, loss = 1.71016863
Iteration 25, loss = 1.70816174
Iteration 26, loss = 1.70623838
Iteration 27, loss = 1.70451871
Iteration 28, loss = 1.70287390
Iteration 29, loss = 1.70123993
Iteration 30, loss = 1.69965697
Iteration 31, loss = 1.69823395
Iteration 32, los

#TESTING

In [12]:
test_ori = pd.read_csv('test.csv', sep=',')
test_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138805 entries, 0 to 138804
Data columns (total 42 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            138805 non-null  object 
 1   origin_host                   138805 non-null  object 
 2   origin_port                   138805 non-null  int64  
 3   response_host                 138805 non-null  object 
 4   response_port                 138805 non-null  int64  
 5   flow_duration                 86780 non-null   float64
 6   forward_packets_per_sec       96316 non-null   float64
 7   backward_packets_per_sec      91138 non-null   float64
 8   flow_packets_per_sec          90465 non-null   float64
 9   down_up_ratio                 98791 non-null   float64
 10  flow_FIN_flags                115082 non-null  float64
 11  flow_SYN_flags                108899 non-null  float64
 12  flow_RST_flags                93683 non-null

In [5]:
test = pd.read_csv('test_for_training.csv', sep=',')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138805 entries, 0 to 138804
Data columns (total 25 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   response_port                138805 non-null  float64
 1   flow_duration                138805 non-null  float64
 2   forward_packets_per_sec      138805 non-null  float64
 3   flow_packets_per_sec         138805 non-null  float64
 4   down_up_ratio                138805 non-null  float64
 5   flow_SYN_flags               138805 non-null  float64
 6   flow_RST_flags               138805 non-null  float64
 7   forward_PSH_flags            138805 non-null  float64
 8   backward_PSH_flags           138805 non-null  float64
 9   forward_URG_flags            138805 non-null  float64
 10  forward_pkts_payload         138805 non-null  float64
 11  backward_pkts_payload        138805 non-null  float64
 12  flow_pkts_payload            138805 non-null  float64
 13 

In [14]:
predictions = mlp.predict(test)

In [17]:
predictions_df = pd.DataFrame(predictions, columns=['Predictions'])

predictions_df.head()

Unnamed: 0,Predictions
0,Bruteforce
1,Benign
2,Bruteforce-XML
3,Bruteforce-XML
4,XMRIGCC CryptoMiner
