# SQL Injection ML Model

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

### Loading the Dataset 

In [2]:
# Read the labelled query corpus; later cells use its 'Query' and 'Label' columns.
data = pd.read_csv('Modified_SQL_Dataset.csv')
# Report the row count, then preview the leading rows as the cell's displayed value.
print(data.shape[0])
data.head(5)

30919


Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1


### Label 

In [3]:
# Distinct class labels, in order of first appearance in the file.
# The column is selected by name (as the vectorisation cell does) instead of by
# position, so the result does not depend on the CSV's column order.
label = data['Label'].unique()
label

array([1, 0], dtype=int64)

### Injections

In [4]:
# Rows labelled 1 are the injection payloads.
# The class value is matched explicitly: label[0] was 1 only because an injection
# row happens to come first in the file (unique() keeps order of appearance), so
# indexing into `label` would silently select the other class if the rows were
# reordered.
injected_text = data[data['Label'] == 1]
print("len = ",len(injected_text))
injected_text

len =  11382


Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
...,...,...
19330,â or 1 = 1 --,1
19331,or 'x' = 'x,1
19332,29%,1
19333,28%,1


### Benign (Non-Injection) Queries

In [5]:
# Rows labelled 0 are the benign (non-injection) queries.
# The class value is matched explicitly instead of via label[1], whose meaning
# depends on which class appears first in the file.
clean_text = data[data['Label'] == 0]
print(clean_text)
print('clean text: ',len(clean_text))

                                              Query  Label
11330                                     99745017c      0
11331                                      ejerci78      0
11332                                         47209      0
11333           calle valencia de don juan 161, 7?d      0
11334                                        b3r3al      0
...                                             ...    ...
30914         DELETE FROM door WHERE grow = 'small'      0
30915                          DELETE FROM tomorrow      0
30916                  SELECT wide ( s )  FROM west      0
30917  SELECT * FROM  ( SELECT slide FROM breath )       0
30918                      SELECT TOP 3 * FROM race      0

[19537 rows x 2 columns]
clean text:  19537


In [6]:
# Target vector: the 0/1 class column.
y = data['Label']

# Bag-of-words features. The custom token pattern keeps every run of word
# characters as a token, including single-character ones.
# NOTE(review): the vocabulary is learned from the whole corpus here, i.e. before
# the train/test split made in a later cell -- confirm that is acceptable.
vectorizer = CountVectorizer(
    token_pattern=r'\b\w+\b',
)
X = vectorizer.fit_transform(data['Query'])

In [7]:
# Show the sparse count vector of row 1 next to its raw query text.
# data.iloc[1, 0] addresses the cell purely by position. The chained form
# data.iloc[1][0] applied an integer key to a string-labelled Series, a positional
# fallback that pandas has deprecated.
print(X[1], data.iloc[1, 0])

  (0, 12062)	1
  (0, 23058)	1
  (0, 18116)	1
  (0, 15441)	1
  (0, 10965)	1
  (0, 19162)	1
  (0, 22273)	1
  (0, 22125)	2
  (0, 22269)	1
  (0, 12425)	1
  (0, 23066)	1 create user name identified by pass123 temporary tablespace temp default tablespace users;


### Splitting Into Training & Testing Data

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Decision Tree Classifier

In [9]:
# Fit a decision tree on the training split and predict the held-out split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion-matrix cells in scikit-learn's row-major order: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).flatten()

# Headline scores for the held-out split.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Each entry pairs an output template with a fraction that is printed as a percentage.
report = [
    ("accuracy: {:.2f}%", accuracy),
    ("recall: {:.2f}%", recall),
    ("precision: {:.2f}%", precision),
    ("F1-score: {:.2f}%", f1),
    ("True negative rate: {:.2f}%", tn / (tn + fp)),
    ("True positive rate: {:.2f}%", tp / (tp + fn)),
]
for template, value in report:
    print(template.format(value * 100))

accuracy: 81.47%
recall: 99.69%
precision: 66.73%
F1-score: 79.94%
True negative rate: 70.74%
True positive rate: 99.69%


## Support Vector Machine

In [10]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the training split.
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Predictions for the held-out split.
y_pred = clf.predict(X_test)

# Unpack the 2x2 confusion matrix row by row: [[tn, fp], [fn, tp]].
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Assemble the report first, then emit it in a single write.
summary = [
    "accuracy: {:.2f}%".format(accuracy * 100),
    "recall: {:.2f}%".format(recall * 100),
    "precision: {:.2f}%".format(precision * 100),
    "F1-score: {:.2f}%".format(f1 * 100),
    "True negative rate: {:.2f}%".format(tn / (tn + fp) * 100),
    "True positive rate: {:.2f}%".format(tp / (tp + fn) * 100),
]
print("\n".join(summary))

accuracy: 99.48%
recall: 99.00%
precision: 99.60%
F1-score: 99.30%
True negative rate: 99.77%
True positive rate: 99.00%


## LSTM - Long Short-Term Memory

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
import numpy as np

# Reload the corpus for the sequence model.
data = pd.read_csv('Modified_SQL_Dataset.csv')

MAX_WORDS = 10000  # vocabulary cap shared by the tokenizer and the embedding layer
MAX_LEN = 100      # sequence length handed to pad_sequences and the embedding layer

# Map every query to a sequence of integer token ids.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(data['Query'])
sequences = tokenizer.texts_to_sequences(data['Query'])

# NOTE(review): this rebinds X, y and the train/test splits that the
# CountVectorizer cells created. Any later cell that calls
# train_test_split(X, y, ...) will receive these padded id sequences if this
# cell ran first -- confirm that is intended.
X = pad_sequences(sequences, maxlen=MAX_LEN)

y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Embedding -> LSTM -> single sigmoid unit (binary classifier).
model = Sequential()
model.add(Embedding(MAX_WORDS, 128, input_length=MAX_LEN))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# training
model.fit(X_train, y_train, epochs=10, batch_size=32)

# testing
# The network ends in one sigmoid unit, so predict() returns a single probability
# per row. Taking argmax over that length-1 axis always yields 0 (every query
# classified as benign); threshold the probability at 0.5 instead.
y_pred = model.predict(X_test)
y_pred_classes = (y_pred.ravel() > 0.5).astype(int)

# data collection
# zero_division=0: if no row is predicted positive, report 0 rather than a
# misleading 100%.
accuracy = accuracy_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes, zero_division=0)
precision = precision_score(y_test, y_pred_classes, zero_division=0)
f1 = f1_score(y_test, y_pred_classes, zero_division=0)
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the predictions.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes, labels=[0, 1]).ravel()

# Evaluation
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("recall: {:.2f}%".format(recall * 100))
print("precision: {:.2f}%".format(precision * 100))
print("F1-score: {:.2f}%".format(f1 * 100))
# The two rates were previously printed under each other's label.
print("True Positive rate: {:.2f}%".format(tp / (tp + fn) * 100))
print("True Negative rate: {:.2f}%".format(tn / (tn + fp) * 100))

Epoch 1/10
Epoch 2/10

## Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# X and y are not created in this cell; they must already exist in the kernel
# from an earlier cell.

# Re-create the 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least-squares regression fitted directly against the label column.
regressor = LinearRegression().fit(X_train, y_train)

# Continuous predictions for the held-out rows.
y_pred = regressor.predict(X_test)

# Regression-style scores of those predictions against the true labels.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for template, score in (
    ("Mean Squared Error: {:.2f}", mse),
    ("R-squared: {:.2f}", r2),
):
    print(template.format(score))


## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# X and y are not created in this cell; they must already exist in the kernel
# from an earlier cell.

# Re-create the 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic-regression classifier trained on the training split.
clf = LogisticRegression(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = clf.predict(X_test)

# Confusion matrix first, so its four cells are available for the rate lines below.
conf_matrix = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = conf_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Each entry pairs an output template with a fraction that is printed as a percentage.
report = (
    ("Accuracy: {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall: {:.2f}%", recall),
    ("F1 Score: {:.2f}%", f1),
    ("True negative rate: {:.2f}%", tn / (tn + fp)),
    ("True positive rate: {:.2f}%", tp / (tp + fn)),
)
for template, value in report:
    print(template.format(value * 100))
print("Confusion Matrix:")
print(conf_matrix)


## Random Forest 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# X and y are not created in this cell; they must already exist in the kernel
# from an earlier cell.

# Re-create the 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random-forest classifier; the seed fixes the randomness of tree construction.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = clf.predict(X_test)

# Scores on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix and its four cells in row-major order: tn, fp, fn, tp.
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.flatten()

# Build the report lines, then emit them in one write followed by the raw matrix.
summary = [
    "Accuracy: {:.2f}%".format(accuracy * 100),
    "Precision: {:.2f}%".format(precision * 100),
    "Recall: {:.2f}%".format(recall * 100),
    "F1 Score: {:.2f}%".format(f1 * 100),
    "True negative rate: {:.2f}%".format(tn / (tn + fp) * 100),
    "True positive rate: {:.2f}%".format(tp / (tp + fn) * 100),
    "Confusion Matrix:",
]
print("\n".join(summary))
print(conf_matrix)
