In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample


# Load the dataset
df = pd.read_csv('C:\\Users\\User\\Desktop\\url_dataset_updated.csv')

# Drop all duplicates from df
df = df.drop_duplicates()

# Reset the index after dropping duplicates
df = df.reset_index(drop=True)

url_column_name = 'URL'  # Replace with your actual column name

# Remove 'http://' and 'https://' from all URLs
df[url_column_name] = df[url_column_name].str.replace('http://', '', regex=False)
df[url_column_name] = df[url_column_name].str.replace('https://', '', regex=False)


# Separate the dataset into malicious and benign
malicious_df = df[df['Label'] == 1]
benign_df = df[df['Label'] == 0]

# Randomly sample 150,000 entries from each
malicious_sampled_df = resample(malicious_df, n_samples=150000, random_state=42)
benign_sampled_df = resample(benign_df, n_samples=150000, random_state=42)

# Combine the sampled data
balanced_df = pd.concat([malicious_sampled_df, benign_sampled_df])

# Shuffle the combined dataset to mix malicious and benign URLs
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)



# Now, balanced_df contains the balanced dataset ready for further processing



In [115]:
!pip install python-whois


Collecting python-whois
  Downloading python-whois-0.8.0.tar.gz (109 kB)
     ---------------------------------------- 0.0/109.6 kB ? eta -:--:--
     --- ------------------------------------ 10.2/109.6 kB ? eta -:--:--
     ---------- -------------------------- 30.7/109.6 kB 330.3 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/109.6 kB 409.6 kB/s eta 0:00:01
     ------------------------------------ 109.6/109.6 kB 577.9 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: python-whois
  Building wheel for python-whois (setup.py): started
  Building wheel for python-whois (setup.py): finished with status 'done'
  Created wheel for python-whois: filename=python_whois-0.8.0-py3-none-any.whl size=103273 sha256=3e2e949e2cb8cb4f94ea1142528916a862d06ce05f001a3e13086bf1f1a8a66a
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\8a\d4\1d\bab4b44ad52eadf1b

In [111]:
balanced_df

Unnamed: 0,URL,Label
0,etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/...,1
1,betterhealthsmoothies.com/Adobe/adobe-3D6/inde...,1
2,lloydsbank.deregister-payee-secure-auth.com/Lo...,1
3,archive.md,0
4,pkg00-atx.netgate.com,0
...,...,...
299995,infomation-fb-service.e82443.repl.co,1
299996,img-1000736.ad-score.com,0
299997,sosyalsat.com/help/home.html,1
299998,storageapi.fleek.co/12678f8a-04f9-4b69-a70f-49...,1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
import tldextract 
from collections import Counter
import math
import re
import whois
from datetime import datetime

def ensure_scheme(url):
    if not urlparse(url).scheme:
        url = 'http://' + url
    return url

# Feature extraction functions
def get_url_length(url):
    return len(url)

def get_dot_count(url):
    return url.count('.')

def contains_security_sensitive_words(url):
    security_sensitive_words = ['login', 'signin', 'auth', 'bank', 'update', 'account', 'verification', 'authenticate','authentication','verify','user']
    return int(any(word in url for word in security_sensitive_words))

def get_directory_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return len(path)

def get_sub_directory_count(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return path.count('/') - 1

def get_token_count_in_path(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    return len(tokens) - 1

def get_largest_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    if tokens:
        return max(len(token) for token in tokens)
    return 0

def get_average_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = [token for token in path.split('/') if token]
    if tokens:
        return np.mean([len(token) for token in tokens])
    return 0

def get_file_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return len(filename)

def get_dot_count_in_file(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return filename.count('.')

def get_delimiter_count_in_file(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    delimiters = ['.', '_', '-']
    return sum(filename.count(delimiter) for delimiter in delimiters)

def get_arguments_length(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(query)

def get_number_of_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(parse_qs(query))

def get_length_of_largest_argument_value(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    if params:
        return max(len(max(values, key=len)) for values in params.values())
    return 0

def get_max_delimiters_in_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    delimiters = ['&', '=', '-', '_']
    if params:
        return max(sum(value.count(delimiter) for delimiter in delimiters) for values in params.values() for value in values)
    return 0


def get_hyphen_count_in_domain(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return domain.count('-')

def contains_ip(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    try:
        socket.inet_aton(domain)
        return 1
    except:
        return 0
    
def get_domain_features(url):
    url = ensure_scheme(url)
    domain = urlparse(ensure_scheme(url)).netloc
    tokens = domain.split('.')
    
    # Domain Length
    domain_length = len(domain)
    
    # Count of Tokens in the Domain
    token_count = len(tokens)
    
    # Length of Largest Token in the Domain
    largest_token_length = max(len(token) for token in tokens) if tokens else 0
    
    # Average Domain Token Length
    average_token_length = sum(len(token) for token in tokens) / len(tokens) if tokens else 0
    
    return domain_length, token_count, largest_token_length, average_token_length

# New feature extraction functions
def get_special_character_count(url):
    special_characters = ['@', '=', '+', '*', '?', '&', '%', '$', '#', '!']
    return sum(url.count(char) for char in special_characters)

def get_entropy(url):
    # Count the frequency of each character in the string
    freq = Counter(url)
    # Calculate the probabilities
    probs = [count / len(url) for count in freq.values()]
    # Calculate the Shannon entropy
    entropy = -sum(p * math.log(p, 2) for p in probs if p > 0)
    return entropy

def check_url_shortened(url):
    shortened_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co']
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return int(domain in shortened_services)


def get_port_number(url):
    url = ensure_scheme(url)
    port = urlparse(url).port
    return port if port else -1  # Return -1 if no port specified

def get_subdomain_count(url):
    url = ensure_scheme(url)
    domain_parts = urlparse(url).netloc.split('.')
    # Count as subdomains any parts beyond the second-level domain and TLD
    return max(0, len(domain_parts) - 2)

def get_suspicious_tld(url):
    suspicious_tlds = ['xyz', 'top', 'loan', 'win', 'club']
    url = ensure_scheme(url)
    tld = urlparse(url).netloc.split('.')[-1]
    return int(tld in suspicious_tlds)

def get_numeric_ratio(url):
    numeric_chars = sum(c.isdigit() for c in url)
    return numeric_chars / len(url) if len(url) > 0 else 0

def get_word_count(url):
    words = re.findall(r'\w+', url)
    return len(words)

def get_url_is_internationalized(url):
    try:
        url.encode('ascii')
        return 0
    except UnicodeEncodeError:
        return 1
    


# Apply feature extraction
features = balanced_df['URL'].apply(lambda x: pd.Series({
    'url_length': get_url_length(x),
    'dot_count': get_dot_count(x),
    'hyphen_count_domain': get_hyphen_count_in_domain(x),
    'security_sensitive_words': contains_security_sensitive_words(x),
    'directory_length': get_directory_length(x),
    'sub_directory_count': get_sub_directory_count(x),
    'token_count_path': get_token_count_in_path(x),
    'largest_token_length': get_largest_token_length(x),
    'average_token_length': get_average_token_length(x),
    'file_length': get_file_length(x),
    'contains_ip': contains_ip(x),
    'dot_count_in_file': get_dot_count_in_file(x),
    'delimiter_count_in_file': get_delimiter_count_in_file(x),
    'arguments_length': get_arguments_length(x),
    'number_of_arguments': get_number_of_arguments(x),
    'length_of_largest_argument_value': get_length_of_largest_argument_value(x),
    'max_delimiters_in_arguments': get_max_delimiters_in_arguments(x),
    'special_character_count': get_special_character_count(x),
    'entropy': get_entropy(x),
    'url_shortened': check_url_shortened(x),
    'port_number': get_port_number(x),
    'subdomain_count': get_subdomain_count(x),
    'suspicious_tld': get_suspicious_tld(x),
    'numeric_ratio': get_numeric_ratio(x),
    'url_is_internationalized': get_url_is_internationalized(x),
    'domain_length': get_domain_features(x)[0],
    'domain_token_count': get_domain_features(x)[1],
    'largest_domain_token_length': get_domain_features(x)[2],
    'average_domain_token_length': get_domain_features(x)[3],
    'word_count': get_word_count(x)
}))


# Concatenate original DF with features
balanced_df = pd.concat([balanced_df, features], axis=1)


# Define X and y correctly
X = balanced_df.drop(['Label', 'URL'], axis=1)  # Features
y = balanced_df['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
# Since your features are already numerical, directly use RandomForestClassifier without TfidfVectorizer
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [WinError 10054] An existing connection was forcibly closed by the remote host
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - [WinError 10054] An existing connection was forcibly closed by the remote host
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - timed out
Error trying to connect to

In [113]:
balanced_df


Unnamed: 0,URL,Label,url_length,dot_count,hyphen_count_domain,security_sensitive_words,directory_length,sub_directory_count,token_count_path,largest_token_length,...,port_number,subdomain_count,suspicious_tld,numeric_ratio,url_is_internationalized,domain_length,domain_token_count,largest_domain_token_length,average_domain_token_length,word_count
0,etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/...,1,58.0,4.0,1.0,0.0,29.0,3.0,4.0,12.0,...,-1.0,2.0,0.0,0.068966,0.0,29.0,4.0,10.0,6.500000,10.0
1,betterhealthsmoothies.com/Adobe/adobe-3D6/inde...,1,51.0,2.0,0.0,0.0,26.0,2.0,3.0,9.0,...,-1.0,0.0,0.0,0.039216,0.0,25.0,2.0,21.0,12.000000,7.0
2,lloydsbank.deregister-payee-secure-auth.com/Lo...,1,53.0,3.0,3.0,1.0,10.0,0.0,1.0,9.0,...,-1.0,1.0,0.0,0.000000,0.0,43.0,3.0,28.0,13.666667,8.0
3,archive.md,0,10.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,0.0,0.0,0.000000,0.0,10.0,2.0,7.0,4.500000,2.0
4,pkg00-atx.netgate.com,0,21.0,2.0,1.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,1.0,0.0,0.095238,0.0,21.0,3.0,9.0,6.333333,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,infomation-fb-service.e82443.repl.co,1,36.0,3.0,2.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,2.0,0.0,0.138889,0.0,36.0,4.0,21.0,8.250000,6.0
299996,img-1000736.ad-score.com,0,24.0,2.0,2.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,1.0,0.0,0.291667,0.0,24.0,3.0,11.0,7.333333,5.0
299997,sosyalsat.com/help/home.html,1,28.0,2.0,0.0,0.0,15.0,1.0,2.0,9.0,...,-1.0,0.0,0.0,0.000000,0.0,13.0,2.0,9.0,6.000000,5.0
299998,storageapi.fleek.co/12678f8a-04f9-4b69-a70f-49...,1,75.0,3.0,0.0,0.0,56.0,1.0,2.0,43.0,...,-1.0,1.0,0.0,0.320000,0.0,19.0,3.0,10.0,5.666667,11.0


In [91]:
# Assuming 'balanced_df' contains your feature "entropy"
num_rows_entropy_greater_than_zero = (balanced_df['contains_ip'] > 0).sum()

print(f'Number of rows with "entropy" larger than 0: {num_rows_entropy_greater_than_zero}')


Number of rows with "entropy" larger than 0: 3269


In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming balanced_df is your DataFrame with URLs and labels

# Tokenization and sequence padding parameters
max_len = 100  # Adjust based on the length of the longest URL in your dataset
max_words = 60000  # Adjust based on the size of your vocabulary

# Tokenize the URLs
tokenizer = Tokenizer(num_words=max_words, char_level=True)
tokenizer.fit_on_texts(balanced_df['URL'])
sequences = tokenizer.texts_to_sequences(balanced_df['URL'])

# Pad the sequences
data = pad_sequences(sequences, maxlen=max_len)

# Labels
labels = np.asarray(balanced_df['Label'])
labels = to_categorical(labels)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


# Model definition
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Add return_sequences if stacking LSTM layers
model.add(Dropout(0.5))  # Adjust dropout rate as needed
model.add(Bidirectional(LSTM(32)))  # Second LSTM layer, without return_sequences
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))  # Add L2 regularization
model.add(Dropout(0.5))  # Adjust dropout rate as needed
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['acc'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_acc', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=30, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluation
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test_classes, y_pred_classes))
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 0.9651333333333333
Confusion Matrix:
 [[29643   419]
 [ 1673 28265]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     30062
           1       0.99      0.94      0.96     29938

    accuracy                           0.97     60000
   macro avg       0.97      0.97      0.97     60000
weighted avg       0.97      0.97      0.97     60000



In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Tokenization and sequence padding parameters
max_len = 200  # Adjust based on the length of the longest URL in your dataset
max_words = 60000  # Adjust based on the size of your vocabulary

# Tokenize the URLs
tokenizer = Tokenizer(num_words=max_words, char_level=True)
tokenizer.fit_on_texts(balanced_df['URL'])
sequences = tokenizer.texts_to_sequences(balanced_df['URL'])

# Pad the sequences
data = pad_sequences(sequences, maxlen=max_len)

# Labels
labels = np.asarray(balanced_df['Label'])
labels = to_categorical(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# RNN model definition
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(2, activation='softmax'))

# Use Adam optimizer with a lower learning rate
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_acc', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=30, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test_classes, y_pred_classes))
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))



Epoch 1/30
 162/1500 [==>...........................] - ETA: 4:47 - loss: 0.3578 - acc: 0.8515

KeyboardInterrupt: 

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from urllib.parse import urlparse, parse_qs
import math
import re
import whois
from datetime import datetime
import socket
from collections import Counter

def ensure_scheme(url):
    if not urlparse(url).scheme:
        url = 'http://' + url
    return url

def get_url_length(url):
    return len(url)

def get_dot_count(url):
    return url.count('.')

def contains_security_sensitive_words(url):
    security_sensitive_words = ['login', 'signin', 'auth', 'bank', 'update', 'account', 'verification', 'authenticate', 'authentication', 'verify', 'user']
    return int(any(word in url for word in security_sensitive_words))

def get_directory_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return len(path)

def get_sub_directory_count(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return path.count('/') - 1

def get_token_count_in_path(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    return len(tokens) - 1

def get_largest_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    if tokens:
        return max(len(token) for token in tokens)
    return 0

def get_average_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = [token for token in path.split('/') if token]
    if tokens:
        return np.mean([len(token) for token in tokens])
    return 0

def get_file_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return len(filename)

def get_dot_count_in_file(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return filename.count('.')

def get_delimiter_count_in_file(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    delimiters = ['.', '_', '-']
    return sum(filename.count(delimiter) for delimiter in delimiters)

def get_arguments_length(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(query)

def get_number_of_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(parse_qs(query))

def get_length_of_largest_argument_value(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    if params:
        return max(len(max(values, key=len)) for values in params.values())
    return 0

def get_max_delimiters_in_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    delimiters = ['&', '=', '-', '_']
    if params:
        return max(sum(value.count(delimiter) for delimiter in delimiters) for values in params.values() for value in values)
    return 0

def get_hyphen_count_in_domain(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return domain.count('-')

def contains_ip(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    try:
        socket.inet_aton(domain)
        return 1
    except:
        return 0

def get_domain_info(url):
    try:
        return whois.whois(url)
    except Exception as e:  # Catching all exceptions
        print(f"Error querying WHOIS data: {e}")
        return None

def get_domain_age_months(url):
    domain_info = get_domain_info(url)
    if domain_info and isinstance(domain_info.creation_date, (datetime, list)):
        creation_date = domain_info.creation_date if not isinstance(domain_info.creation_date, list) else domain_info.creation_date[0]
        today = datetime.now()
        return round((today - creation_date).days / 30) if creation_date else 0
    return 0

def get_domain_expiry_age_months(url):
    domain_info = get_domain_info(url)
    if domain_info and isinstance(domain_info.expiration_date, (datetime, list)):
        expiry_date = domain_info.expiration_date if not isinstance(domain_info.expiration_date, list) else domain_info.expiration_date[0]
        today = datetime.now()
        return round((expiry_date - today).days / 30) if expiry_date else 0
    return 0

def get_domain_updating_age_days(url):
    domain_info = get_domain_info(url)
    if domain_info and isinstance(domain_info.updated_date, (datetime, list)):
        updated_date = domain_info.updated_date if not isinstance(domain_info.updated_date, list) else domain_info.updated_date[0]
        today = datetime.now()
        return (today - updated_date).days if updated_date else 0
    return 0

def get_zip_code_of_domain_holder(url):
    domain_info = get_domain_info(url)
    if domain_info and domain_info.address:
        return domain_info.address[-1] if isinstance(domain_info.address, list) else ''
    return ''

def get_special_character_count(url):
    special_characters = ['@', '=', '+', '*', '?', '&', '%', '$', '#', '!']
    return sum(url.count(char) for char in special_characters)

def get_entropy(url):
    freq = Counter(url)
    probs = [count / len(url) for count in freq.values()]
    entropy = -sum(p * math.log(p, 2) for p in probs if p > 0)
    return entropy

def check_url_shortened(url):
    shortened_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co']
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return int(domain in shortened_services)

def get_port_number(url):
    url = ensure_scheme(url)
    port = urlparse(url).port
    return port if port else -1

def get_subdomain_count(url):
    url = ensure_scheme(url)
    domain_parts = urlparse(url).netloc.split('.')
    return max(0, len(domain_parts) - 2)

def get_suspicious_tld(url):
    suspicious_tlds = ['xyz', 'top', 'loan', 'win', 'club']
    url = ensure_scheme(url)
    tld = urlparse(url).netloc.split('.')[-1]
    return int(tld in suspicious_tlds)

def get_numeric_ratio(url):
    numeric_chars = sum(c.isdigit() for c in url)
    return numeric_chars / len(url) if len(url) > 0 else 0

def get_word_count(url):
    words = re.findall(r'\w+', url)
    return len(words)

def get_url_is_internationalized(url):
    try:
        url.encode('ascii')
        return 0
    except UnicodeEncodeError:
        return 1

# Assume balanced_df is your DataFrame containing URLs and Labels
# balanced_df = pd.DataFrame({'URL': ['example.com', 'another-example.com'], 'Label': [0, 1]})

# Feature extraction as an example
features = balanced_df['URL'].apply(lambda x: pd.Series({
    'url_length': get_url_length(x),
    'dot_count': get_dot_count(x),
    'security_sensitive_words': contains_security_sensitive_words(x),
    'directory_length': get_directory_length(x),
    'sub_directory_count': get_sub_directory_count(x),
    'token_count_path': get_token_count_in_path(x),
    'largest_token_length': get_largest_token_length(x),
    'average_token_length': get_average_token_length(x),
    'file_length': get_file_length(x),
    'dot_count_in_file': get_dot_count_in_file(x),
    'delimiter_count_in_file': get_delimiter_count_in_file(x),
    'arguments_length': get_arguments_length(x),
    'number_of_arguments': get_number_of_arguments(x),
    'length_of_largest_argument_value': get_length_of_largest_argument_value(x),
    'max_delimiters_in_arguments': get_max_delimiters_in_arguments(x),
    'hyphen_count_domain': get_hyphen_count_in_domain(x),
    'contains_ip': contains_ip(x),
    'special_character_count': get_special_character_count(x),
    'entropy': get_entropy(x),
    'url_shortened': check_url_shortened(x),
    'port_number': get_port_number(x),
    'subdomain_count': get_subdomain_count(x),
    'suspicious_tld': get_suspicious_tld(x),
    'numeric_ratio': get_numeric_ratio(x),
    'url_is_internationalized': get_url_is_internationalized(x),
    'domain_age_months': get_domain_age_months(x),
    'domain_expiry_age_months': get_domain_expiry_age_months(x),
    'domain_updating_age_days': get_domain_updating_age_days(x),
    'zip_code_of_domain_holder': get_zip_code_of_domain_holder(x),
    'word_count': get_word_count(x)
}))

# Append features to the DataFrame
balanced_df = pd.concat([balanced_df, features], axis=1)

# Define X and y
X = balanced_df.drop(['Label', 'URL'], axis=1)  # Features
y = balanced_df['Label']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Error querying WHOIS data: No match for "CA-SSL.NET".
>>> Last update of whois database: 2024-02-15T19:51:09Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized to access or query our Whois
database through the use of electronic processes that are high-volume and
automated except as reasonably necessary to register domain names or
modify existing registrations; the Data in VeriSign Global Registry
Services' ("VeriSign") Whois database is provided by VeriSign for
information purposes only, and to assist persons in obtaining information
about or related to a domain n

Error trying to connect to socket: closing socket - timed out
Error querying WHOIS data: No match for "DEREGISTER-PAYEE-SECURE-AUTH.COM".
>>> Last update of whois database: 2024-02-15T19:51:25Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized to access or query our Whois
database through the use of electronic processes that are high-volume and
automated except as reasonably necessary to register domain names or
modify existing registrations; the Data in VeriSign Global Registry
Services' ("VeriSign") Whois database is provided by VeriSign for
information purposes

Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error querying WHOIS data: No match for "COMMBANKNETCODE.COM".
>>> Last update of whois database: 2024-02-15T19:51:56Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized

Error trying to connect to socket: closing socket - timed out
Error querying WHOIS data: No match for "COIANASBSBSELOG.AZUREWEBSITES.NET".
>>> Last update of whois database: 2024-02-15T19:52:26Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized to access or query our Whois
database through the use of electronic processes that are high-volume and
automated except as reasonably necessary to register domain names or
modify existing registrations; the Data in VeriSign Global Registry
Services' ("VeriSign") Whois database is provided by VeriSign for
information purpose

KeyboardInterrupt: 