# XSS Detection in form data using Machine Learning for attack discovery and mitigation 

## Information
- Kaggle XSS dataset (https://github.com/agentsmith1337/AgenticWAF/blob/main/Models/XSS/XSS_dataset.csv)
- Focus on SVMs and Forest Models
- XSS Detection can be done by detecting javascript and html alone, but this is a good broader starting point.
- Names and generic text has to be augmented into this data since real traffic rarely has code snippets as user input. 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

url = "https://raw.githubusercontent.com/agentsmith1337/AgenticWAF/main/Models/XSS/XSS_dataset.csv"

dat=pd.read_csv(url, encoding="unicode_escape")
dat.head()

Unnamed: 0,ï»¿,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [2]:
dat.shape

(13686, 3)

In [3]:
dat.columns

Index(['ï»¿', 'Sentence', 'Label'], dtype='object')

In [4]:
dat.info

<bound method DataFrame.info of          ï»¿                                           Sentence  Label
0          0  <li><a href="/wiki/File:Socrates.png" class="i...      0
1          1               <tt onmouseover="alert(1)">test</tt>      1
2          2  \t </span> <span class="reference-text">Steeri...      0
3          3  \t </span> <span class="reference-text"><cite ...      0
4          4  \t </span>. <a href="/wiki/Digital_object_iden...      0
...      ...                                                ...    ...
13681  13681             <img onpointerenter=alert(1)>XSS</img>      1
13682  13682  <source onbeforepaste="alert(1)" contenteditab...      1
13683  13683  <div draggable="true" contenteditable>drag me<...      1
13684  13684  <li><cite id="CITEREFDomingos2015" class="cita...      0
13685  13685                                         \t </span>      0

[13686 rows x 3 columns]>

We have to add nearly 8000 data rows consisting of generic text

In [5]:
from faker import Faker
import random
fake=Faker()
aug=[]
for i in range(500):
    choice=random.choice([fake.name, fake.address, fake.sentence, fake.email])
    aug.append({
        'text': choice(),
        'label': 0
    })
dat0=pd.DataFrame(aug)
dat0.head()
    

Unnamed: 0,text,label
0,jasminepeterson@example.org,0
1,"43646 Rodriguez Squares\nPort Thomas, VA 50635",0
2,"89093 Tran Port Suite 935\nLake Mark, NM 96621",0
3,Put focus without.,0
4,Jeffrey Martin,0


In [6]:
dat=dat.iloc[:, 1:]
dat.head()

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [7]:
dat1=dat.rename(columns={'Sentence':'text','Label':'label'})
dat1['text']=dat1['text'].str.strip()
dat1.head()

Unnamed: 0,text,label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"</span> <span class=""reference-text"">Steering ...",0
3,"</span> <span class=""reference-text""><cite cla...",0
4,"</span>. <a href=""/wiki/Digital_object_identif...",0


In [8]:
data=pd.concat([dat1, dat0], ignore_index=True)
data.head()

Unnamed: 0,text,label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"</span> <span class=""reference-text"">Steering ...",0
3,"</span> <span class=""reference-text""><cite cla...",0
4,"</span>. <a href=""/wiki/Digital_object_identif...",0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14186 entries, 0 to 14185
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14186 non-null  object
 1   label   14186 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 221.8+ KB


In [10]:
df=data.copy()

# Feature Engineering with TF-IDF for weightage of words in relation to the interesting class

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#--------------------------------------------------
# 1. Data Split
#--------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

#--------------------------------------------------
# 2. Create a Pipeline (Vectorizer + Model)
# Key setting: analyzer='char' is CRITICAL for XSS to catch partial tags like '<scr'
#--------------------------------------------------

pipeline = make_pipeline(
    TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=10000),
    SVC(kernel='linear', probability=True) # Linear Kernel for projection
)
#--------------------------------------------------
# 3. Train
pipeline.fit(X_train, y_train)
#--------------------------------------------------

#--------------------------------------------------
# 4. Evaluate
#--------------------------------------------------
print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1363
           1       1.00      1.00      1.00      1475

    accuracy                           1.00      2838
   macro avg       1.00      1.00      1.00      2838
weighted avg       1.00      1.00      1.00      2838



In [12]:
df['label'].value_counts()
df.head(10)

Unnamed: 0,text,label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"</span> <span class=""reference-text"">Steering ...",0
3,"</span> <span class=""reference-text""><cite cla...",0
4,"</span>. <a href=""/wiki/Digital_object_identif...",0
5,"<li id=""cite_note-118""><span class=""mw-cite-ba...",0
6,"<li><a href=""/wiki/Contextualism"" title=""Conte...",0
7,"<li id=""cite_note-Representing_causation-95""><...",0
8,"<tr><td class=""plainlist"" style=""padding:0 0.1...",0
9,</span>,0
