In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("DataToDetect.csv")

In [4]:
df.shape

(4898431, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 10 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   duration       int64 
 1   protocol_type  object
 2   service        object
 3   flag           object
 4   src_bytes      int64 
 5   dst_bytes      int64 
 6   land           int64 
 7   count          int64 
 8   srv_count      int64 
 9   result         object
dtypes: int64(6), object(4)
memory usage: 373.7+ MB


In [6]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,count,srv_count,result
0,0,tcp,http,SF,215,45076,0,1,1,0
1,0,tcp,http,SF,162,4528,0,2,2,0
2,0,tcp,http,SF,236,1228,0,1,1,0
3,0,tcp,http,SF,233,2032,0,2,2,0
4,0,tcp,http,SF,239,486,0,3,3,0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load your DataFrame (assuming it's already loaded as `df`)

# Preprocessing
# 1. Inspect the data
print(df.info())
print(df.describe())

# 2. Handle missing values
df.fillna(method='ffill', inplace=True)  # Example: forward fill

# 3. Encode categorical variables
df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])

# 4. Encode the target variable (result)
label_encoder = LabelEncoder()
df['result'] = label_encoder.fit_transform(df['result'])  # "normal" -> 0, "abnormal" -> 1

# 5. Feature scaling
scaler = StandardScaler()
df[['duration', 'src_bytes', 'dst_bytes', 'land', 'count', 'srv_count']] = scaler.fit_transform(
    df[['duration', 'src_bytes', 'dst_bytes', 'land', 'count', 'srv_count']]
)

# 6. Split the data into features and target
X = df.drop('result', axis=1)  # Features
y = df['result']                # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Save the model
joblib.dump(model, 'trained_model.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 10 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   duration       int64 
 1   protocol_type  object
 2   service        object
 3   flag           object
 4   src_bytes      int64 
 5   dst_bytes      int64 
 6   land           int64 
 7   count          int64 
 8   srv_count      int64 
 9   result         object
dtypes: int64(6), object(4)
memory usage: 373.7+ MB
None
           duration     src_bytes     dst_bytes          land         count  \
count  4.898431e+06  4.898431e+06  4.898431e+06  4.898431e+06  4.898431e+06   
mean   4.834243e+01  1.834621e+03  1.093623e+03  5.716116e-06  3.349734e+02   
std    7.233298e+02  9.414311e+05  6.450123e+05  2.390833e-03  2.119908e+02   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    0.000000e+00  4.500000e+01  0.000000e+00  0.000000e+00  1.210000e+02   
50%    0.000000e+00  5.200000e+0

  df.fillna(method='ffill', inplace=True)  # Example: forward fill


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    784998
           1       1.00      1.00      1.00    194689

    accuracy                           1.00    979687
   macro avg       1.00      1.00      1.00    979687
weighted avg       1.00      1.00      1.00    979687

[[784954     44]
 [    18 194671]]


In [None]:
# Hyperparameter tuning (optional)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid_search.fit(X_train, y_train)