In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the training set from a CSV file located relative to the working directory.
df = pd.read_csv("Train_Data.csv")

## Data features


In [3]:
# Count the distinct values in each column of the training frame.
df.nunique()

Unnamed: 0,0
duration,1594
protocoltype,3
service,65
flag,10
srcbytes,3022
dstbytes,8367
land,2
wrongfragment,1
urgent,3
hot,25


In [4]:
# Preview the first rows of the training frame (head() defaults to five rows).
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag,attack
0,0,tcp,netbios_dgm,REJ,0,0,0,0,0,0,...,0.06,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21,1
1,0,tcp,smtp,SF,1239,400,0,0,0,0,...,0.45,0.04,0.0,0.0,0.11,0.0,0.02,0.0,18,0
2,0,tcp,http,SF,222,945,0,0,0,0,...,1.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,21,0
3,0,tcp,http,SF,235,1380,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,0
4,0,tcp,uucp_path,REJ,0,0,0,0,0,0,...,0.01,0.08,0.0,0.0,0.0,0.0,1.0,1.0,19,1


In [5]:
# Report the (rows, columns) shape of the training frame.
print("Training data shape:", df.shape)

Training data shape: (86845, 43)


In [6]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
duration,0
protocoltype,0
service,0
flag,0
srcbytes,0
dstbytes,0
land,0
wrongfragment,0
urgent,0
hot,0


## Splitting the data into features and target

In [7]:
# Separate the label column ('attack') from the predictor columns.
y = df['attack']
X = df.drop('attack', axis=1)


In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [9]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = X.select_dtypes(include='object').columns

In [10]:
# Numerical branch: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [11]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fitting are ignored at transform time
# rather than raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [12]:
# Route each group of columns through its matching preprocessing branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [13]:
#X.nunique()

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [78]:

# Adding Gaussian noise to numerical features.
# The noise is drawn from a seeded Generator so that the perturbed data, the
# fitted model and every metric derived from them are reproducible under
# Restart & Run All. Drawing from the unseeded global NumPy RNG would give
# different data on every execution.
noise_rng = np.random.default_rng(42)

# Work on copies so the clean splits stay available and this cell is idempotent.
X_train_noisy = X_train.copy()
X_val_noisy = X_val.copy()

# Add noise only to numerical columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

for col in num_cols:
    # NOTE(review): training noise is centred at 0.5 while validation noise is
    # centred at 0 — confirm this offset between the two splits is intentional.
    X_train_noisy[col] += noise_rng.normal(0.5, 2, X_train.shape[0])  # Mean=0.5, Std=2
    X_val_noisy[col] += noise_rng.normal(0, 2, X_val.shape[0])  # Mean=0, Std=2


In [79]:
# Chain the shared preprocessing with a random-forest classifier so the same
# transformations are applied at fit time and at predict time.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Fit the whole pipeline on the noise-perturbed training split.
model_pipeline.fit(X_train_noisy, y_train)

In [80]:
# Predict labels for the noise-perturbed validation split.
y_pred = model_pipeline.predict(X_val_noisy)

# Summarise validation performance; the three metrics are independent of each other.
class_report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

In [81]:
# classification_report returns a multi-line string. Printing it renders the
# table, whereas a bare expression shows the repr with literal "\n" escapes.
print(class_report)

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     13452\n           1       1.00      1.00      1.00      8260\n\n    accuracy                           1.00     21712\n   macro avg       1.00      1.00      1.00     21712\nweighted avg       1.00      1.00      1.00     21712\n'

In [82]:
# Load the test set from a CSV file located relative to the working directory.
test_data = pd.read_csv("Test_Data.csv")

In [83]:
# Make predictions on the test data
# NOTE(review): the pipeline was fitted on noise-perturbed features (X_train_noisy),
# while test_data is passed through exactly as loaded — confirm this mismatch
# between training and inference inputs is intended.
test_predictions = model_pipeline.predict(test_data)


In [84]:
# Display the predicted labels for the test set.
test_predictions

array([1, 0, 1, ..., 1, 0, 1])

In [85]:
# Wrap the predicted labels in a single-column frame named 'attack'.
predictions_df = pd.DataFrame({'attack': test_predictions})


In [86]:
# Write the predictions to disk without the row index.
predictions_df.to_csv(path_or_buf='my submission.csv', index=False)


In [87]:
# Read the submission file back in so its contents can be inspected.
xyz = pd.read_csv("my submission.csv")

In [88]:
# Preview the first rows of the re-loaded submission file.
xyz.head()

Unnamed: 0,attack
0,1
1,0
2,1
3,1
4,1


In [89]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


# Metrics calculation on the validation predictions (each score is independent).
f1 = f1_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

# Display results, each rounded to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.9994
Precision: 0.9993
Recall: 0.9993
F1 Score: 0.9993

Confusion Matrix:
 [[13446     6]
 [    6  8254]]
