In [1]:
!unzip /content/archive.zip -d /content/dataset


Archive:  /content/archive.zip
  inflating: /content/dataset/KDDTest+.arff  
  inflating: /content/dataset/KDDTest+.txt  
  inflating: /content/dataset/KDDTest-21.arff  
  inflating: /content/dataset/KDDTest-21.txt  
  inflating: /content/dataset/KDDTest1.jpg  
  inflating: /content/dataset/KDDTrain+.arff  
  inflating: /content/dataset/KDDTrain+.txt  
  inflating: /content/dataset/KDDTrain+_20Percent.arff  
  inflating: /content/dataset/KDDTrain+_20Percent.txt  
  inflating: /content/dataset/KDDTrain1.jpg  
  inflating: /content/dataset/index.html  
  inflating: /content/dataset/nsl-kdd/KDDTest+.arff  
  inflating: /content/dataset/nsl-kdd/KDDTest+.txt  
  inflating: /content/dataset/nsl-kdd/KDDTest-21.arff  
  inflating: /content/dataset/nsl-kdd/KDDTest-21.txt  
  inflating: /content/dataset/nsl-kdd/KDDTest1.jpg  
  inflating: /content/dataset/nsl-kdd/KDDTrain+.arff  
  inflating: /content/dataset/nsl-kdd/KDDTrain+.txt  
  inflating: /content/dataset/nsl-kdd/KDDTrain+_20Percent.arff 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load NSL-KDD dataset
# If you're using the raw content, you can save it as `nsl_kdd.csv` and load:
df = pd.read_csv('dataset/KDDTrain+.txt', header=None)

# Assign column names (based on NSL-KDD documentation)
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty"
]
df.columns = column_names

# Drop difficulty level (not useful for classification)
df.drop('difficulty', axis=1, inplace=True)

# Convert label into binary: 'normal' vs 'attack'
df['label'] = df['label'].apply(lambda x: 'normal' if x == 'normal' else 'attack')

# Encode categorical features
cat_cols = ['protocol_type', 'service', 'flag']
df[cat_cols] = df[cat_cols].apply(LabelEncoder().fit_transform)

# Split into features and target
X = df.drop('label', axis=1)
y = df['label']

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)  # 0 for attack, 1 for normal

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[11746    27]
 [    7 13415]]

Classification Report:
               precision    recall  f1-score   support

      attack       1.00      1.00      1.00     11773
      normal       1.00      1.00      1.00     13422

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

Accuracy Score: 0.9986505258979956


In [5]:
import joblib
# Save the trained model to a file
joblib.dump(model, 'rf_model.pkl')

['rf_model.pkl']

In [6]:
import joblib
import pandas as pd

# Load your trained model
model = joblib.load('rf_model.pkl')  # Random Forest model

# Function to predict if input is normal (0) or attack (1)
def predict(input_data):
    # Convert input (dictionary) to a DataFrame
    df = pd.DataFrame([input_data])

    # Predict using the model
    result = model.predict(df)[0]  # Get the first (and only) prediction
    return int(result)


In [17]:
import numpy as np
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

# Load the trained model
model = joblib.load('rf_model.pkl')

# Define column names that the model expects (41 features in total)
columns = [
    'protocol_type', 'service', 'flag', 'duration', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_hot_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
]

new_data = pd.DataFrame([
    ['tcp', 'ftp_data', 'SF', 491, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 150, 25, 0.17, 0.03, 0.17, 0.00, 0.00, 0.00, 0.05, 0.00, 0.0]  # Added one more value (e.g., 0) to make 41 columns
], columns=columns)

# Handle categorical columns (e.g., 'protocol_type', 'service', 'flag')
categorical_columns = ['protocol_type', 'service', 'flag']

# Apply LabelEncoder to each categorical column
encoder = LabelEncoder()

for col in categorical_columns:
    new_data[col] = encoder.fit_transform(new_data[col])

# Predict using the trained model
prediction = model.predict(new_data)

# Output the prediction result
print("Result:", "Attack" if prediction == 1 else "Normal")


Result: Attack


