In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
X = pd.read_csv('Train_data.csv')
# NOTE(review): `y` holds the contents of Test_data.csv, not the training labels.
# The name is kept unchanged so any later cell that reads `y` keeps working.
y = pd.read_csv('Test_data.csv')

# Check for NaN values and drop or fill
print("NaN values in each column:\n", X.isnull().sum())

# Assuming 'class' is your label.
# Drop whole rows with a missing label (instead of dropping NaNs from the label
# column alone) so that features and labels always have the same rows; otherwise
# X_features could end up with more rows than X_labels and the split would fail.
X_labeled = X.dropna(subset=['class'])
X_labels = X_labeled['class']
X_features = X_labeled.drop('class', axis=1)

# Handle categorical variables
categorical_features = X_features.select_dtypes(include=['object']).columns.tolist()
X_features = pd.get_dummies(X_features, columns=categorical_features)

# Split data before scaling so the validation rows cannot influence the
# statistics the scaler learns.
X_train, X_val, y_train, y_val = train_test_split(X_features, X_labels, test_size=0.2, random_state=42)

# Scale features: fit on the training rows only, then apply the same
# transformation to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Scaled copy of the full feature matrix, kept under its original name for any
# code that still refers to it (uses the training-only statistics).
X_features_scaled = scaler.transform(X_features)



NaN values in each column:
 duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate  

In [2]:
# Report how many labels are missing before filtering
print("Total NaN labels before removal:", X_labels.isna().sum())

# Place features and labels side by side (aligned on the index) and keep only
# the rows that actually carry a label.
X_combined = pd.concat([X_features, X_labels], axis=1).dropna(subset=['class'])

# Separate the filtered table back into features and labels
X_features = X_combined.drop(columns='class')
X_labels = X_combined['class']

# Show which distinct label values remain
print("Unique label values after NaN removal:", X_labels.unique())

Total NaN labels before removal: 0
Unique label values after NaN removal: ['normal' 'anomaly']


In [3]:
# Check data types
print(X_features.dtypes)

# Check for any object types that shouldn't be there.
# The columns are selected once and their count is tested explicitly:
# Index.any() evaluates the truthiness of the column *names*, so a remaining
# object column with a falsy name (e.g. '' or 0) would go unnoticed.
object_columns = X_features.select_dtypes(include=['object']).columns
if len(object_columns) > 0:
    print("Categorical data still present:", object_columns)
else:
    print("All data is numeric.")

duration          int64
src_bytes         int64
dst_bytes         int64
land              int64
wrong_fragment    int64
                  ...  
flag_S1           uint8
flag_S2           uint8
flag_S3           uint8
flag_SF           uint8
flag_SH           uint8
Length: 118, dtype: object
All data is numeric.


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and keep the result as a DataFrame.
scaler = StandardScaler()
# fit_transform returns a bare array, so both the column names and the original
# row index are passed back in. Without `index=` the frame would get a fresh
# 0..n-1 index and no longer share an index with X_labels if rows were dropped
# earlier.
X_features = pd.DataFrame(
    scaler.fit_transform(X_features),
    columns=X_features.columns,
    index=X_features.index,
)
# NOTE(review): the scaler is fitted on all rows here, before the
# train/validation split below, so validation rows contribute to the scaling
# statistics. Consider fitting on the training split only.

In [5]:
# Define a mapping for labels
label_mapping = {'normal': 0, 'anomaly': 1}

# Apply the mapping to convert labels to integers.
# Values that are already one of the target codes pass through unchanged, so
# re-running this cell does not turn every label into NaN the way a plain
# Series.map(label_mapping) would on already-encoded labels.
encoded_labels = X_labels.map(lambda value: label_mapping.get(value, value))

# Fail loudly on anything that is neither a known label nor a known code,
# rather than letting it reach the model as a NaN/garbage target.
unexpected_labels = encoded_labels[~encoded_labels.isin(list(label_mapping.values()))]
if not unexpected_labels.empty:
    raise ValueError(f"Unexpected label values: {unexpected_labels.unique().tolist()}")

X_labels = encoded_labels.astype('int64')

# Confirm conversion
print("Labels after conversion:", X_labels.unique())

Labels after conversion: [0 1]


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    X_labels,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Model Definition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def create_model(input_shape):
    """Build and compile a small fully connected binary classifier.

    Args:
        input_shape: Number of input features per sample (an int); it is used
            as the length of the model's one-dimensional input.

    Returns:
        A compiled ``Sequential`` model: three ReLU ``Dense`` layers
        (64, 32, 16 units) followed by a single sigmoid unit, compiled with
        the Adam optimizer, binary cross-entropy loss and accuracy metric.
    """
    # Imported locally so this definition does not depend on another cell's imports.
    from tensorflow.keras.layers import Input

    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # `input_shape=` to the first Dense layer, which Keras warns about
        # when used inside a Sequential model.
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        # Single sigmoid unit: output is a probability for the positive class
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# One input per feature column of the training matrix
model = create_model(X_train.shape[1])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Model Training
# Fit for 50 epochs in mini-batches of 32 samples; the held-out split is scored
# after every epoch and the per-epoch metrics are returned in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9224 - loss: 0.2097 - val_accuracy: 0.9875 - val_loss: 0.0412
Epoch 2/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9872 - loss: 0.0377 - val_accuracy: 0.9909 - val_loss: 0.0312
Epoch 3/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9919 - loss: 0.0243 - val_accuracy: 0.9917 - val_loss: 0.0275
Epoch 4/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9917 - loss: 0.0281 - val_accuracy: 0.9903 - val_loss: 0.0289
Epoch 5/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9928 - loss: 0.0209 - val_accuracy: 0.9911 - val_loss: 0.0269
Epoch 6/50
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9924 - loss: 0.0211 - val_accuracy: 0.9909 - val_loss: 0.0270
Epoch 7/50
[1m630/630[0m 

In [9]:
# Model Evaluation
# Score the fitted model on the training split (silently) and report it
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
print(f"Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}")

# Same measurement on the held-out validation split
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}")

Training Accuracy: 0.9976, Training Loss: 0.0065
Validation Accuracy: 0.9956, Validation Loss: 0.0404
