In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import re

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load data: read the CSV, then discard rows with missing values and exact
# duplicate rows, and renumber the index from 0.
data = (
    pd.read_csv("/content/Modified_SQL_Dataset.csv")
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Text preprocessing: resources shared by every call to preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one query string into a space-joined string of lemmas.

    The text is lowercased, tokenized with ``word_tokenize``, filtered against
    the English stop-word set, and each surviving token is lemmatized.
    No punctuation removal is performed.

    Args:
        text: The raw query string.

    Returns:
        The lemmatized, non-stop-word tokens joined by single spaces.
    """
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word not in stop_words
    ]
    return ' '.join(lemmas)

# Clean every raw query and keep the result next to the original text.
data['Cleaned_Query'] = data['Query'].apply(preprocess_text)

# Text vectorization. Counts are requested as float32 directly: the earlier
# "convert numeric columns" step selected columns that were *already* float32,
# matched nothing, and therefore left every feature column as int64.
vectorizer = CountVectorizer(min_df=2, max_df=0.7, dtype=np.float32)
posts = vectorizer.fit_transform(data['Cleaned_Query']).toarray()

transformed_posts = pd.DataFrame(posts)

# Rebuild `data` from its text/label columns so that re-running this cell
# replaces the count columns instead of appending a second copy of them.
# NOTE(review): assumes the CSV holds only 'Query' and 'Label' (the recorded
# output shows no other columns) — confirm before reusing on another file.
base_columns = ['Query', 'Label', 'Cleaned_Query']
data = pd.concat([data[base_columns], transformed_posts], axis=1)

# Features are 'Cleaned_Query' plus the count columns; selecting by name does
# not depend on the column order of the CSV.
X = data.drop(columns=['Query', 'Label'])
Y = data['Label']

# Every bag-of-words column must be float32; fail loudly if that is not so.
numeric_columns = X.columns[(X.dtypes == np.float32).to_numpy()]
assert len(numeric_columns) == posts.shape[1], (
    f"expected {posts.shape[1]} float32 feature columns, found {len(numeric_columns)}"
)

# Split the data into training and test sets (stratified on the label).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

# Print data types and shapes for verification
splits = (("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test))
for position, (split_name, split) in enumerate(splits):
    lead = "" if position == 0 else "\n"
    print(f"{lead}{split_name} data types:")
    print(split.dtypes)
    print(f"\n{split_name} shape:", split.shape)

X_train data types:
Cleaned_Query    object
0                 int64
1                 int64
2                 int64
3                 int64
                  ...  
6713              int64
6714              int64
6715              int64
6716              int64
6717              int64
Length: 6719, dtype: object

X_train shape: (24725, 6719)

y_train data types:
int64

y_train shape: (24725,)

X_test data types:
Cleaned_Query    object
0                 int64
1                 int64
2                 int64
3                 int64
                  ...  
6713              int64
6714              int64
6715              int64
6716              int64
6717              int64
Length: 6719, dtype: object

X_test shape: (6182, 6719)

y_test data types:
int64

y_test shape: (6182,)


In [None]:
# Debug: show the dimensions of the train and test feature frames.
for split_name, split_frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_name} shape:", split_frame.shape)

X_train shape: (24725, 6719)
X_test shape: (6182, 6719)


In [None]:
# Debug: show the per-column dtypes of the train and test feature frames.
for split_frame in (X_train, X_test):
    print(split_frame.dtypes)

Cleaned_Query    object
0                 int64
1                 int64
2                 int64
3                 int64
                  ...  
6713              int64
6714              int64
6715              int64
6716              int64
6717              int64
Length: 6719, dtype: object
Cleaned_Query    object
0                 int64
1                 int64
2                 int64
3                 int64
                  ...  
6713              int64
6714              int64
6715              int64
6716              int64
6717              int64
Length: 6719, dtype: object


In [None]:
# Seed the Python, NumPy and TensorFlow random generators so repeated runs of
# the notebook start from the same state.
tf.keras.utils.set_random_seed(42)

# Exclude 'Cleaned_Query' column from input data: the models only take the
# numeric bag-of-words columns.
X_train_numeric = X_train.drop(columns=['Cleaned_Query'])
X_test_numeric = X_test.drop(columns=['Cleaned_Query'])

# Autoencoder model
input_dim = X_train_numeric.shape[1]
encoding_dim = 64

# One input layer followed by two dense layers: the encoder compresses to
# `encoding_dim` units and the decoder (the output layer) expands back to
# `input_dim` units.
input_layer = Input(shape=(input_dim,))
encoder_layer = Dense(encoding_dim, activation='relu')(input_layer)
decoder_layer = Dense(input_dim, activation='relu')(encoder_layer)

autoencoder = Model(input_layer, decoder_layer)
# The targets are raw term counts and the decoder output is an unbounded ReLU,
# so reconstruction is a non-negative regression problem. Mean squared error
# fits that; binary cross-entropy requires targets and predictions in [0, 1].
autoencoder.compile(optimizer='adam', loss='mse')

# Fit the autoencoder model. The test split is passed as validation data only
# to report reconstruction loss; no callback or tuning step reads it.
autoencoder.fit(X_train_numeric, X_train_numeric, epochs=25, batch_size=64, shuffle=True, validation_data=(X_test_numeric, X_test_numeric))

# The encoder shares the trained input and bottleneck layers.
encoder = Model(input_layer, encoder_layer)

# Predict with the encoder model to obtain the compressed features.
encoded_train = encoder.predict(X_train_numeric)
encoded_test = encoder.predict(X_test_numeric)


In [None]:
# Each encoded vector is presented to the LSTM as `encoding_dim` time steps
# with a single feature per step; reshape once and reuse the result.
lstm_input_shape = (encoding_dim, 1)
train_sequences = encoded_train.reshape(encoded_train.shape[0], *lstm_input_shape)
test_sequences = encoded_test.reshape(encoded_test.shape[0], *lstm_input_shape)

# LSTM classifier: one recurrent layer and a sigmoid output unit.
model = Sequential([
    LSTM(64, input_shape=lstm_input_shape),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train, reporting loss/accuracy on the test sequences after every epoch.
history = model.fit(
    train_sequences,
    y_train,
    epochs=25,
    batch_size=64,
    validation_data=(test_sequences, y_test),
)

# Final evaluation on the test sequences.
loss, accuracy = model.evaluate(test_sequences, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


In [None]:
# Score the trained classifier on the test data.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted probabilities for the test sequences.
test_inputs = encoded_test.reshape(encoded_test.shape[0], encoding_dim, 1)
predictions = model.predict(test_inputs)

# Threshold the probabilities at 0.5 to obtain 0/1 class labels.
class_predictions = (predictions > 0.5).astype(int)
print("Predicted class labels:", class_predictions)

# Compare the predicted labels with the true test labels.
accuracy = accuracy_score(y_test, class_predictions)
precision = precision_score(y_test, class_predictions)
recall = recall_score(y_test, class_predictions)
f1 = f1_score(y_test, class_predictions)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}:", metric_value)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Recompute the thresholded test predictions so this cell does not depend on
# variables left behind by the metrics cell.
test_inputs = encoded_test.reshape(encoded_test.shape[0], encoding_dim, 1)
predictions = model.predict(test_inputs)
class_predictions = (predictions > 0.5).astype(int)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, class_predictions)

# Draw the matrix as an annotated heatmap.
class_names = ['Non-Malicious', 'Malicious']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()
