In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("realistic_network_data.csv")

In [4]:
df.shape

(10000000, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   duration       float64
 1   protocol_type  object 
 2   service        object 
 3   flag           object 
 4   src_bytes      int64  
 5   dst_bytes      int64  
 6   land           int64  
 7   count          int64  
 8   srv_count      int64  
 9   result         object 
dtypes: float64(1), int64(5), object(4)
memory usage: 762.9+ MB


In [6]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,count,srv_count,result
0,15.6,icmp,finger,OTH,2039,575,0,12,9,normal
1,13.7,arp,cifs,ECE,2507,684,0,17,6,normal
2,7.3,eigrp,ncp,SYN,1894,1836,0,15,4,normal
3,7.9,ospf,irc,SHR,4500,3293,0,8,2,normal
4,1.8,tcp,irc,CWR,2524,477,0,18,6,normal


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load your DataFrame (assuming it's already loaded as `df`)

# Preprocessing
# 1. Inspect the data
print(df.info())
print(df.describe())

# 2. Handle missing values
df.fillna(method='ffill', inplace=True)  # Example: forward fill

# 3. Encode categorical variables
df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])

# 4. Encode the target variable (result)
label_encoder = LabelEncoder()
df['result'] = label_encoder.fit_transform(df['result'])  # "normal" -> 0, "abnormal" -> 1

# 5. Feature scaling
scaler = StandardScaler()
df[['duration', 'src_bytes', 'dst_bytes', 'land', 'count', 'srv_count']] = scaler.fit_transform(
    df[['duration', 'src_bytes', 'dst_bytes', 'land', 'count', 'srv_count']]
)

# 6. Split the data into features and target
X = df.drop('result', axis=1)  # Features
y = df['result']                # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Save the model
joblib.dump(model, 'trained_model_synthetic_data.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   duration       float64
 1   protocol_type  object 
 2   service        object 
 3   flag           object 
 4   src_bytes      int64  
 5   dst_bytes      int64  
 6   land           int64  
 7   count          int64  
 8   srv_count      int64  
 9   result         object 
dtypes: float64(1), int64(5), object(4)
memory usage: 762.9+ MB
None
           duration     src_bytes     dst_bytes          land         count  \
count  1.000000e+07  1.000000e+07  1.000000e+07  1.000000e+07  1.000000e+07   
mean   1.080341e+02  3.000063e+03  1.500044e+03  1.001104e-01  1.850628e+01   
std    2.347941e+02  2.082080e+03  1.280447e+03  3.001472e-01  2.119799e+01   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00   
25%    6.200000e+00  1.388000e+03  4.190000e+02  0.000000e+00  6.000000e+00   
50%    

  df.fillna(method='ffill', inplace=True)  # Example: forward fill


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    398825
           1       1.00      1.00      1.00   1601175

    accuracy                           1.00   2000000
   macro avg       1.00      1.00      1.00   2000000
weighted avg       1.00      1.00      1.00   2000000

[[ 398762      63]
 [      0 1601175]]


['trained_model_synthetic_data.pkl']

In [None]:
# Hyperparameter tuning (optional)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid_search.fit(X_train, y_train)