In [None]:
!pip install pandas scikit-learn matplotlib seaborn



# **Load and Inspect the Datasets:**

In [None]:
import pandas as pd

# Load the training and test CSV files from their hard-coded /content paths.
train_df = pd.read_csv('/content/KDDTrain.csv')
test_df = pd.read_csv('/content/KDDTest.csv')

# Display basic information about the datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Training Data Info:")
train_df.info()
print("\nTesting Data Info:")
test_df.info()

# Display the first few rows of the training dataset
print("\nFirst few rows of the training dataset:")
print(train_df.head())


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 no

# **Data Preprocessing:**

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Discard every row that contains at least one missing value, in both splits.
train_df = train_df.dropna(axis=0, how='any')
test_df = test_df.dropna(axis=0, how='any')

# Columns holding categories rather than numbers. The label column
# 'attack_class' is encoded the same way as the categorical features.
categorical_columns = ['protocol_type', 'service', 'flag', 'attack_class']

# One LabelEncoder per categorical column, each fitted on the training split only.
label_encoders = dict(
    (name, LabelEncoder().fit(train_df[name])) for name in categorical_columns
)

# Function to safely transform categorical variables
def transform_with_unknown(label_encoder, series):
    """Encode ``series`` with ``label_encoder``, registering unseen values first.

    Values present in ``series`` but absent from ``label_encoder.classes_`` are
    appended to ``classes_`` (the encoder is modified in place) before
    ``transform`` is called, so those values do not make the call fail.

    New values are appended in order of first appearance in ``series``. This
    keeps the resulting codes reproducible from run to run; iterating a ``set``
    of strings instead gives an order that can differ between interpreter
    sessions, and with it the codes assigned to the new values.

    Parameters
    ----------
    label_encoder : object
        Fitted encoder exposing ``classes_`` and ``transform`` (a
        ``sklearn.preprocessing.LabelEncoder`` in this notebook).
    series : pandas.Series
        Values to encode.

    Returns
    -------
    The result of ``label_encoder.transform(series)``.
    """
    known_values = set(label_encoder.classes_)
    # Series.unique() reports values in order of appearance, so this list is
    # deterministic for a given input.
    unknown_values = [value for value in series.unique() if value not in known_values]
    if unknown_values:
        # New values go after the classes the encoder already knows.
        label_encoder.classes_ = np.append(label_encoder.classes_, unknown_values)
    return label_encoder.transform(series)

# Replace each categorical column with its integer codes. The training split
# uses the fitted encoder directly; the test split goes through
# transform_with_unknown so values unseen during fitting are registered first.
for col in label_encoders:
    le = label_encoders[col]
    train_df[col] = le.transform(train_df[col])
    test_df[col] = transform_with_unknown(le, test_df[col])

# Standardise every column except the label 'attack_class'. The scaler's
# statistics come from the training split and are reused on the test split.
numerical_columns = train_df.columns.difference(['attack_class'])
scaler = StandardScaler().fit(train_df[numerical_columns])
train_df[numerical_columns] = scaler.transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])

# Show the head of the processed training frame
print("Processed Training Data:")
print(train_df.head())


Processed Training Data:
   duration  protocol_type   service      flag  src_bytes  dst_bytes  \
0 -0.110249      -0.124706 -0.686785  0.751111  -0.007679  -0.004919   
1 -0.110249       2.219312  0.781428  0.751111  -0.007737  -0.004919   
2 -0.110249      -0.124706  1.087305 -0.736235  -0.007762  -0.004919   
3 -0.110249      -0.124706 -0.442083  0.751111  -0.007723  -0.002891   
4 -0.110249      -0.124706 -0.442083  0.751111  -0.007728  -0.004814   

       land  wrong_fragment    urgent       hot  ...  dst_host_same_srv_rate  \
0 -0.014089       -0.089486 -0.007736 -0.095076  ...               -0.782367   
1 -0.014089       -0.089486 -0.007736 -0.095076  ...               -1.161030   
2 -0.014089       -0.089486 -0.007736 -0.095076  ...               -0.938287   
3 -0.014089       -0.089486 -0.007736 -0.095076  ...                1.066401   
4 -0.014089       -0.089486 -0.007736 -0.095076  ...                1.066401   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0  

# **Feature and Target Separation:**

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except 'attack_class'; the label is that column.
X_train, y_train = train_df.drop(columns=['attack_class']), train_df['attack_class']
X_test, y_test = test_df.drop(columns=['attack_class']), test_df['attack_class']

# Hold out 20% of the training rows for validation (seeded shuffle).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# **Model Training and Evaluation:**

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train Isolation Forest model (fitted on the features only; no labels are passed)
iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train_split)

# Predict on validation data. predict() returns -1 for outliers and +1 for
# inliers; recode that as 1 = anomaly, 0 = normal.
iso_val_preds = [1 if pred == -1 else 0 for pred in iso_forest.predict(X_val_split)]

# y_val_split holds the LabelEncoder codes of 'attack_class', so it cannot be
# scored directly against the 0/1 predictions above: codes 0 and 1 are just two
# arbitrary classes. Collapse the ground truth into the same binary coding
# (1 = any attack, 0 = normal) before computing metrics.
# NOTE(review): assumes benign traffic is labelled 'normal' in 'attack_class'
# -- confirm against the CSV.
attack_classes = list(label_encoders['attack_class'].classes_)
if 'normal' not in attack_classes:
    raise ValueError(
        "Expected a 'normal' label in attack_class to build the binary ground "
        f"truth for Isolation Forest; found classes: {attack_classes}"
    )
# A class's encoded value is its position in classes_.
normal_code = attack_classes.index('normal')
y_val_binary = (y_val_split != normal_code).astype(int)

# Evaluate the model against the binary ground truth
iso_accuracy = accuracy_score(y_val_binary, iso_val_preds)
iso_precision = precision_score(y_val_binary, iso_val_preds, average='weighted')
iso_recall = recall_score(y_val_binary, iso_val_preds, average='weighted')
iso_f1 = f1_score(y_val_binary, iso_val_preds, average='weighted')

print(f'Isolation Forest - Accuracy: {iso_accuracy}, Precision: {iso_precision}, Recall: {iso_recall}, F1 Score: {iso_f1}')


Isolation Forest - Accuracy: 0.00472315935701528, Precision: 3.7649622439836684e-05, Recall: 0.00472315935701528, F1 Score: 7.470143237311839e-05


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded Random Forest classifier on the training portion
rf = RandomForestClassifier(random_state=42).fit(X_train_split, y_train_split)

# Class predictions for the held-out validation portion
rf_val_preds = rf.predict(X_val_split)

# Score the predictions; precision, recall and F1 use average='weighted'
rf_accuracy = accuracy_score(y_true=y_val_split, y_pred=rf_val_preds)
rf_precision = precision_score(y_true=y_val_split, y_pred=rf_val_preds, average='weighted')
rf_recall = recall_score(y_true=y_val_split, y_pred=rf_val_preds, average='weighted')
rf_f1 = f1_score(y_true=y_val_split, y_pred=rf_val_preds, average='weighted')

print(
    f'Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, '
    f'Recall: {rf_recall}, F1 Score: {rf_f1}'
)


Random Forest - Accuracy: 0.9984917642389363, Precision: 0.9984929300781247, Recall: 0.9984917642389363, F1 Score: 0.9984050216988444


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Ensemble Model Training and Evaluation:**

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Prepare the meta features: one row per sample, column 0 is the Isolation
# Forest prediction, column 1 is the Random Forest predicted class code.
# NOTE(review): the training meta features come from predictions the base
# models make on the same rows they were fitted on; out-of-fold predictions
# are the usual stacking practice -- consider switching.
# NOTE(review): the Random Forest column is a class code fed to a linear model
# as a plain number; one-hot codes or predict_proba outputs may suit it better.
# Changing the layout here also requires changing the test-set cell.
train_meta_features = np.column_stack((iso_forest.predict(X_train_split), rf.predict(X_train_split)))
val_meta_features = np.column_stack((iso_forest.predict(X_val_split), rf.predict(X_val_split)))

# Train the meta model. The recorded run stopped at the solver's default
# iteration limit without converging, so the cap is raised; increase it further
# if the convergence warning still appears.
meta_model = LogisticRegression(random_state=42, max_iter=1000)
meta_model.fit(train_meta_features, y_train_split)

# Predict on validation data using the meta model
val_meta_preds = meta_model.predict(val_meta_features)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_val_split, val_meta_preds)
ensemble_precision = precision_score(y_val_split, val_meta_preds, average='weighted')
ensemble_recall = recall_score(y_val_split, val_meta_preds, average='weighted')
ensemble_f1 = f1_score(y_val_split, val_meta_preds, average='weighted')

print(f'Ensemble Model - Accuracy: {ensemble_accuracy}, Precision: {ensemble_precision}, Recall: {ensemble_recall}, F1 Score: {ensemble_f1}')


Ensemble Model - Accuracy: 0.9372891446715618, Precision: 0.9107324794071925, Recall: 0.9372891446715618, F1 Score: 0.9213599596443796


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Stack the two base-model predictions for the test set, in the same column
# order used when the meta model was fitted (Isolation Forest, then Random Forest).
test_meta_features = np.column_stack([iso_forest.predict(X_test), rf.predict(X_test)])

# Meta-model predictions for the test set
test_meta_preds = meta_model.predict(test_meta_features)

# Score the ensemble on the test set; precision, recall and F1 use average='weighted'
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_meta_preds)
test_precision = precision_score(y_true=y_test, y_pred=test_meta_preds, average='weighted')
test_recall = recall_score(y_true=y_test, y_pred=test_meta_preds, average='weighted')
test_f1 = f1_score(y_true=y_test, y_pred=test_meta_preds, average='weighted')

print(
    f'Ensemble Model Test Set - Accuracy: {test_accuracy}, Precision: {test_precision}, '
    f'Recall: {test_recall}, F1 Score: {test_f1}'
)


Ensemble Model Test Set - Accuracy: 0.6878992193044713, Precision: 0.5087245624144348, Recall: 0.6878992193044713, F1 Score: 0.5795289177354543


  _warn_prf(average, modifier, msg_start, len(result))
