In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample


# Load the dataset
df = pd.read_csv('C:\\Users\\User\\Desktop\\url_dataset_updated.csv')

url_column_name = 'URL'  # Replace with your actual column name

# Remove 'http://' and 'https://' from all URLs
df[url_column_name] = df[url_column_name].str.replace('http://', '', regex=False)
df[url_column_name] = df[url_column_name].str.replace('https://', '', regex=False)


# Separate the dataset into malicious and benign
malicious_df = df[df['Label'] == 1]
benign_df = df[df['Label'] == 0]

# Randomly sample 150,000 entries from each
malicious_sampled_df = resample(malicious_df, n_samples=150000, random_state=42)
benign_sampled_df = resample(benign_df, n_samples=150000, random_state=42)

# Combine the sampled data
balanced_df = pd.concat([malicious_sampled_df, benign_sampled_df])

# Shuffle the combined dataset to mix malicious and benign URLs
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop all duplicates from balanced_df
balanced_df = balanced_df.drop_duplicates()

# Reset the index after dropping duplicates
balanced_df = balanced_df.reset_index(drop=True)

# Now, balanced_df contains the balanced dataset ready for further processing



In [10]:
# Feature Extraction
# For simplicity, we'll use TF-IDF on the URLs themselves. Advanced features can be added based on URL structure and content.
vectorizer = TfidfVectorizer()

# Prepare the data
X = balanced_df['URL']
y = balanced_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Model Training
# Create a pipeline that first transforms the data using TfidfVectorizer then applies RandomForestClassifier
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=20, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [12]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9046580631323061
Confusion Matrix:
 [[27659   514]
 [ 4820 22953]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.98      0.91     28173
           1       0.98      0.83      0.90     27773

    accuracy                           0.90     55946
   macro avg       0.91      0.90      0.90     55946
weighted avg       0.91      0.90      0.90     55946



In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs

# Load the dataset
df = pd.read_csv('C:\\Users\\User\\Desktop\\url_dataset_updated.csv')

url_column_name = 'URL'  # Replace with your actual column name

# Remove 'http://' and 'https://' from all URLs
df[url_column_name] = df[url_column_name].str.replace('http://', '', regex=False)
df[url_column_name] = df[url_column_name].str.replace('https://', '', regex=False)

# Separate the dataset into malicious and benign
malicious_df = df[df['Label'] == 1]
benign_df = df[df['Label'] == 0]

# Randomly sample 150,000 entries from each
malicious_sampled_df = resample(malicious_df, n_samples=150000, random_state=42)
benign_sampled_df = resample(benign_df, n_samples=150000, random_state=42)

# Combine the sampled data
balanced_df = pd.concat([malicious_sampled_df, benign_sampled_df])

# Shuffle the combined dataset to mix malicious and benign URLs
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop all duplicates from balanced_df
balanced_df = balanced_df.drop_duplicates()

# Reset the index after dropping duplicates
balanced_df = balanced_df.reset_index(drop=True)


# Feature extraction functions
def get_url_length(url):
    return len(url)

def get_dot_count(url):
    return url.count('.')

def get_hyphen_count_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def contains_security_sensitive_words(url):
    security_sensitive_words = ['login', 'signin', 'bank', 'account', 'verification', 'authenticate']
    return int(any(word in url for word in security_sensitive_words))

def get_directory_length(url):
    path = urlparse(url).path
    return len(path)

def get_sub_directory_count(url):
    path = urlparse(url).path
    return path.count('/') - 1

def contains_ip(url):
    domain = urlparse(url).netloc
    try:
        socket.inet_aton(domain)
        return 1
    except:
        return 0

def get_token_count_in_path(url):
    path = urlparse(url).path
    tokens = path.split('/')
    return len(tokens) - 1

def get_largest_token_length(url):
    path = urlparse(url).path
    tokens = path.split('/')
    if tokens:
        return max(len(token) for token in tokens)
    return 0

def get_average_token_length(url):
    path = urlparse(url).path
    tokens = [token for token in path.split('/') if token]
    if tokens:
        return np.mean([len(token) for token in tokens])
    return 0

def get_file_length(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return len(filename)

def get_dot_count_in_file(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return filename.count('.')

def get_delimiter_count_in_file(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    delimiters = ['.', '_', '-']
    return sum(filename.count(delimiter) for delimiter in delimiters)

def get_arguments_length(url):
    query = urlparse(url).query
    return len(query)

def get_number_of_arguments(url):
    query = urlparse(url).query
    return len(parse_qs(query))

def get_length_of_largest_argument_value(url):
    query = urlparse(url).query
    params = parse_qs(query)
    if params:
        return max(len(max(values, key=len)) for values in params.values())
    return 0

def get_max_delimiters_in_arguments(url):
    query = urlparse(url).query
    params = parse_qs(query)
    delimiters = ['&', '=', '-', '_']
    if params:
        return max(sum(value.count(delimiter) for delimiter in delimiters) for values in params.values() for value in values)
    return 0

# Apply feature extraction
features = balanced_df['URL'].apply(lambda x: pd.Series({
    'url_length': get_url_length(x),
    'dot_count': get_dot_count(x),
    'hyphen_count_domain': get_hyphen_count_in_domain(x),
    'security_sensitive_words': contains_security_sensitive_words(x),
    'directory_length': get_directory_length(x),
    'sub_directory_count': get_sub_directory_count(x),
    'contains_ip': contains_ip(x),
    'token_count_path': get_token_count_in_path(x),
    'largest_token_length': get_largest_token_length(x),
    'average_token_length': get_average_token_length(x),
    'file_length': get_file_length(x),
    'dot_count_in_file': get_dot_count_in_file(x),
    'delimiter_count_in_file': get_delimiter_count_in_file(x),
    'arguments_length': get_arguments_length(x),
    'number_of_arguments': get_number_of_arguments(x),
    'length_of_largest_argument_value': get_length_of_largest_argument_value(x),
    'max_delimiters_in_arguments': get_max_delimiters_in_arguments(x),
}))

# Concatenate original DF with features
balanced_df = pd.concat([balanced_df, features], axis=1)

# Define X and y
X = balanced_df.drop(['Label', 'URL'], axis=1)
y = balanced_df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model pipeline
model_pipeline = Pipeline([
    ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8814213706073714
Confusion Matrix:
 [[27134  1039]
 [ 5595 22178]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.96      0.89     28173
           1       0.96      0.80      0.87     27773

    accuracy                           0.88     55946
   macro avg       0.89      0.88      0.88     55946
weighted avg       0.89      0.88      0.88     55946



In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Tokenization and sequence padding parameters
max_len = 100  # Adjust based on the length of the longest URL in your dataset
max_words = 60000  # Adjust based on the size of your vocabulary

# Tokenize the URLs
tokenizer = Tokenizer(num_words=max_words, char_level=True)
tokenizer.fit_on_texts(balanced_df['URL'])
sequences = tokenizer.texts_to_sequences(balanced_df['URL'])

# Pad the sequences
data = pad_sequences(sequences, maxlen=max_len)

# Labels
labels = np.asarray(balanced_df['Label'])
labels = to_categorical(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# RNN model definition
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(SimpleRNN(32))
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_acc', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test_classes, y_pred_classes))
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9483072963214528
Confusion Matrix:
 [[27616   557]
 [ 2335 25438]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     28173
           1       0.98      0.92      0.95     27773

    accuracy                           0.95     55946
   macro avg       0.95      0.95      0.95     55946
weighted avg       0.95      0.95      0.95     55946



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Tokenization and sequence padding parameters
max_len = 100  # Adjust based on the length of the longest URL in your dataset
max_words = 10000  # Adjust based on the size of your vocabulary

# Tokenize the URLs
tokenizer = Tokenizer(num_words=max_words, char_level=True)
tokenizer.fit_on_texts(balanced_df['URL'])
sequences = tokenizer.texts_to_sequences(balanced_df['URL'])

# Pad the sequences
data = pad_sequences(sequences, maxlen=max_len)

# Labels
labels = np.asarray(balanced_df['Label'])
labels = to_categorical(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# RNN model definition
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Use Adam optimizer with a lower learning rate
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=20, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test_classes, y_pred_classes))
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
 109/1399 [=>............................] - ETA: 1:44 - loss: 0.1540 - acc: 0.9408