In [1]:
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree  import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'sklearn'

# Reading the Training Dataset

In [None]:
train = pd.read_csv("Train_data.csv")

In [None]:
train.info()

In [None]:
train.shape

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
train.describe(include='object')

# Exploratory Data Analysis (EDA)

In [None]:
train['class'].value_counts()

In [None]:
# Bar chart of how many rows fall into each value of the `class` column.
sns.countplot(data=train, x='class')

In [None]:
# Take the slice labels from the counts themselves: value_counts() orders by
# frequency, so hard-coded labels could be attached to the wrong slices.
class_counts = train['class'].value_counts()
plt.pie(class_counts, labels=class_counts.index, autopct="%0.2f")
plt.show()

In [None]:
train['protocol_type'].value_counts()

In [None]:
# Bar chart of how many rows use each value of `protocol_type`.
sns.countplot(data=train, x='protocol_type')

In [None]:
# Take the slice labels from the counts themselves: value_counts() orders by
# frequency, so hard-coded labels could be attached to the wrong slices.
protocol_counts = train['protocol_type'].value_counts()
plt.pie(protocol_counts, labels=protocol_counts.index, autopct="%0.2f")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn on a large canvas
# so the individual cells stay readable.
numeric_corr = train.corr(numeric_only=True)
_, corr_ax = plt.subplots(figsize=(40, 30))
sns.heatmap(numeric_corr, ax=corr_ax)

# Data Preprocessing

- Missing Values
- Duplicates
- Encoding 
- Scaling
- Feature Selection
- Dataset Splitting

## Missing Values

In [None]:
train.isnull().sum()

## Duplicates

In [None]:
print(f"Number of duplicates: {train.duplicated().sum()}")

In [None]:
train.head()

## Label Encoding

In [None]:
# One fitted LabelEncoder per column name, shared by every call to `le` so that
# a frame encoded later receives the same integer codes as the first one.
label_encoders = {}


def le(df, encoders=None):
    """Label-encode every object-dtype column of ``df`` in place.

    The first time a column name is seen, a new ``LabelEncoder`` is fitted on
    that column and stored in ``encoders``. Later calls reuse the stored
    encoder instead of refitting, so the same category always maps to the
    same integer across frames.

    Args:
        df: DataFrame whose object-dtype columns are replaced by integer codes.
        encoders: Optional dict mapping column name to a fitted
            ``LabelEncoder``. Defaults to the module-level ``label_encoders``.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when a column
            contains a value its stored encoder was not fitted on.
    """
    if encoders is None:
        encoders = label_encoders
    for col in df.columns:
        if df[col].dtype == 'object':
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])


le(train)

In [None]:
train

## Feature Selection

In [None]:
train['num_outbound_cmds']

In [None]:
# Drop the redundant feature. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
train.drop(columns=['num_outbound_cmds'], inplace=True, errors='ignore')

In [None]:

# Separate the predictors from the target column.
X = train.drop(['class'], axis=1)
y = train['class']

# Seed the forest so recursive feature elimination selects the same columns
# on every run; an unseeded forest can change the selection between runs.
rfc = RandomForestClassifier(random_state=2)

# Recursively eliminate features until 10 remain.
rfe = RFE(rfc, n_features_to_select=10)
rfe = rfe.fit(X, y)

# get_support() yields one boolean per column of X, in column order, so a
# plain zip pairs each flag with its column name.
feature_map = list(zip(rfe.get_support(), X.columns))
selected_features = [name for keep, name in feature_map if keep]
top_features = pd.DataFrame({'Features': selected_features})
top_features.index = top_features.index + 1  # 1-based row labels for display
     

In [None]:
top_features

## Standard Scaler

In [None]:
# Select the columns from `train` rather than re-indexing `X`: after this cell
# `X` is a NumPy array, so `X[selected_features]` would fail on a second run.
scale = StandardScaler()
X = scale.fit_transform(train[selected_features])
# NOTE(review): the scaler is fitted before the train/test split below, so the
# held-out rows influence the scaling statistics - consider fitting on X_train only.

## Dataset Splitting

In [None]:
# Keep 80% of the rows for training and the rest for evaluation; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=2,
)

# Model Selection

## Support Vector Machine

## Decision-Tree Classifier

In [None]:
# Seed the tree so its metrics are reproducible; an unseeded tree can differ
# from run to run.
dtc = DecisionTreeClassifier(max_depth=15, criterion='entropy', random_state=2)
dtc.fit(X_train, y_train)

# Score the fitted tree on the held-out split.
y_pred = dtc.predict(X_test)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('============================== Decision-Tree Classifier ==============================')
print("Accuracy: ", accuracy_score(y_test,y_pred))
print("Precision: ", precision_score(y_test,y_pred))
print("Classification Report:\n", report)

# Confusion matrix: rows are the actual labels, columns the predicted ones.
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Confusion Matrix")
plt.show()

## Logistic Regression

In [None]:
# Train a logistic-regression baseline and score it on the held-out split.
lrc = LogisticRegression()
lrc.fit(X_train, y_train)
y_pred = lrc.predict(X_test)

report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('============================== Logistic Regression ==============================')
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Classification Report:\n", report)

# Confusion matrix: rows are the actual labels, columns the predicted ones.
_, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title("Confusion Matrix")
plt.show()

## Random Forest Classifier

In [None]:
# Train a seeded 50-tree random forest and score it on the held-out split.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('============================== Random Forest Classifier ==============================')
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Classification Report:\n", report)

# Confusion matrix: rows are the actual labels, columns the predicted ones.
_, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title("Confusion Matrix")
plt.show()

# Evaluation Metrics

In [None]:
# Classifiers to compare, keyed by the short name shown on the charts.
# Only models that are actually built in this notebook are listed: the other
# entries (svc, knc, bnb, abc, bc, etc, xgb) referred to variables that are
# never defined here, so the cell raised NameError.
clfs = {
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def metrics(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Precision, F1 and recall are computed with ``average='weighted'``.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        X_train, y_train: data the estimator is fitted on.
        X_test, y_test: data the fitted estimator is scored against.

    Returns:
        tuple: ``(accuracy, precision, f1, recall)``, in that order.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    weighted = {'average': 'weighted'}
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted, **weighted),
        f1_score(y_test, predicted, **weighted),
        recall_score(y_test, predicted, **weighted),
    )

In [None]:
# Per-metric score lists, filled in the same order as `clfs`.
accuracy_scores = []
precision_scores = []
f1_scores = []
recall_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision, current_f1, current_recall = metrics(clf, X_train, y_train, X_test, y_test)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    f1_scores.append(current_f1)
    recall_scores.append(current_recall)

# Collect the scores into the table that the comparison charts read from.
# The column names match the x/y arguments used by those charts.
performance_df = pd.DataFrame({
    'Algorithm': list(clfs.keys()),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'F1-score': f1_scores,
    'Recall': recall_scores,
})

# Comparing Performance of Different Machine Learning Algorithms

In [None]:
performance_df

## Accuracy Scores of Different Machine Learning Algorithms

In [None]:
# One bar per algorithm, showing its accuracy on the held-out split.
accuracy_ax = sns.barplot(data=performance_df, x='Algorithm', y='Accuracy', palette='dark')
accuracy_ax.set_xlabel('Algorithms')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Accuracy Scores for Different Algorithms')
accuracy_ax.figure.tight_layout()
plt.show()

## Precision Scores of Different Machine Learning Algorithms

In [None]:
# One bar per algorithm, showing its weighted precision on the held-out split.
precision_ax = sns.barplot(data=performance_df, x='Algorithm', y='Precision', palette='dark')
precision_ax.set_xlabel('Algorithms')
precision_ax.set_ylabel('Precision')
precision_ax.set_title('Precision Scores for Different Algorithms')
precision_ax.figure.tight_layout()
plt.show()


## F1 Scores of Different Machine Learning Algorithms

In [None]:
# One bar per algorithm, showing its weighted F1 score on the held-out split.
f1_ax = sns.barplot(data=performance_df, x='Algorithm', y='F1-score', palette='dark')
f1_ax.set_xlabel('Algorithms')
f1_ax.set_ylabel('F1-Score')
f1_ax.set_title('F1 Scores for Different Algorithms')
f1_ax.figure.tight_layout()
plt.show()


## Recall Scores of Different Machine Learning Algorithms

In [None]:
# One bar per algorithm, showing its weighted recall on the held-out split.
recall_ax = sns.barplot(data=performance_df, x='Algorithm', y='Recall', palette='dark')
recall_ax.set_xlabel('Algorithms')
recall_ax.set_ylabel('Recall')
recall_ax.set_title('Recall Scores for Different Algorithms')
recall_ax.figure.tight_layout()
plt.show()

# Prediction For Test Dataset

## Reading the Testing Dataset

In [None]:
test = pd.read_csv("Test_data.csv")

## Exploratory Data Analysis (EDA)

In [None]:
test.info()

In [None]:
test.shape

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
test['num_outbound_cmds']

In [None]:
# Drop the redundant feature. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
test.drop(columns=['num_outbound_cmds'], inplace=True, errors='ignore')

## Missing Values

In [None]:
test.isnull().sum()

## Duplicates

In [None]:
print(f"Number of duplicates: {test.duplicated().sum()}")

In [None]:
# Remove fully duplicated rows from the test frame, modifying it in place.
test.drop_duplicates(inplace=True)

In [None]:
test.describe()

In [None]:
test.describe(include='object')

## Label Encoding

In [None]:
test

In [None]:
le(test)

In [None]:
test

## Feature Selection


In [None]:
# Reuse the scaler that was fitted on the training features. Fitting a fresh
# StandardScaler on the test set would standardise it with different means and
# variances from the ones the models were trained on.
X = test[selected_features]
X_scaled = scale.transform(X)

## Prediction of Test Data

In [None]:
# The decision tree is used for the final predictions because it produced
# fewer false positives than the other models.
# NOTE(review): the output is presumably the label-encoded class values, not
# the original class names - confirm before reporting results.
dtc.predict(X_scaled)