In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample


# Load the dataset
df = pd.read_csv('C:\\Users\\User\\Desktop\\url_dataset_updated.csv')

# Drop all duplicates from df
df = df.drop_duplicates()

# Reset the index after dropping duplicates
df = df.reset_index(drop=True)

url_column_name = 'URL'  # Replace with your actual column name

# Function to concatenate "https://" to URLs labeled with 0
def add_https(url, label):
    if label == 0:
        return "https://" + url
    else:
        return url

# Apply the function to the URL column
df[url_column_name] = df.apply(lambda row: add_https(row[url_column_name], row['Label']), axis=1)


# Separate the dataset into malicious and benign
malicious_df = df[df['Label'] == 1]
benign_df = df[df['Label'] == 0]

# Randomly sample 150,000 entries from each
malicious_sampled_df = resample(malicious_df, n_samples=150000, random_state=42)
benign_sampled_df = resample(benign_df, n_samples=150000, random_state=42)

# Combine the sampled data
balanced_df = pd.concat([malicious_sampled_df, benign_sampled_df])

# Shuffle the combined dataset to mix malicious and benign URLs
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)



# Now, balanced_df contains the balanced dataset ready for further processing



In [24]:
balanced_df




Unnamed: 0,URL,Label
0,http://etransfers.interac.ca-ssl.net/sh/2o05I9...,1
1,http://betterhealthsmoothies.com/Adobe/adobe-3...,1
2,http://lloydsbank.deregister-payee-secure-auth...,1
3,https://archive.md,0
4,https://pkg00-atx.netgate.com,0
...,...,...
299995,https://infomation-fb-service.e82443.repl.co,1
299996,https://img-1000736.ad-score.com,0
299997,http://sosyalsat.com/help/home.html,1
299998,https://storageapi.fleek.co/12678f8a-04f9-4b69...,1


In [111]:
balanced_df

Unnamed: 0,URL,Label
0,etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/...,1
1,betterhealthsmoothies.com/Adobe/adobe-3D6/inde...,1
2,lloydsbank.deregister-payee-secure-auth.com/Lo...,1
3,archive.md,0
4,pkg00-atx.netgate.com,0
...,...,...
299995,infomation-fb-service.e82443.repl.co,1
299996,img-1000736.ad-score.com,0
299997,sosyalsat.com/help/home.html,1
299998,storageapi.fleek.co/12678f8a-04f9-4b69-a70f-49...,1


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
import tldextract 
from collections import Counter
import math
import re
import whois
from datetime import datetime

def ensure_scheme(url):
    if not urlparse(url).scheme:
        url = 'http://' + url
    return url

# Feature extraction functions
def get_url_length(url):
    return len(url)

def get_dot_count(url):
    return url.count('.')

def contains_security_sensitive_words(url):
    security_sensitive_words = [
    'login', 'password', 'admin', 'root', 'secret', 'private', 'secure', 'confidential', 
    'bank', 'creditcard', 'account', 'authentication', 'authorization', 'session', 'token', 
    'apikey', 'ssl', 'https', 'secure', 'encrypted', 'auth', 'signin', 'signup', 'verification', 
    'resetpassword', 'change-password', 'forgot-password', 'otp', '2fa', 'phishing', 'malware', 
    'virus', 'trojan', 'exploit', 'hacker', 'attack', 'security', 'vulnerable', 'injection', 
    'xss', 'csrf', 'dos', 'ddos', 'bruteforce', 'firewall', 'vpn', 'proxy', 'tor', 'security-question', 
    'privacy-policy'
]

    return int(any(word in url for word in security_sensitive_words))

def get_directory_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return len(path)

def get_sub_directory_count(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return path.count('/') - 1

def get_token_count_in_path(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    return len(tokens) - 1

def get_largest_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    if tokens:
        return max(len(token) for token in tokens)
    return 0

def get_average_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = [token for token in path.split('/') if token]
    if tokens:
        return np.mean([len(token) for token in tokens])
    return 0

def get_file_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return len(filename)

def get_dot_count_in_file(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return filename.count('.')

def get_delimiter_count_in_file(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    delimiters = ['.', '_', '-']
    return sum(filename.count(delimiter) for delimiter in delimiters)

def get_arguments_length(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(query)

def get_number_of_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(parse_qs(query))

def get_length_of_largest_argument_value(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    if params:
        return max(len(max(values, key=len)) for values in params.values())
    return 0

def get_max_delimiters_in_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    delimiters = ['&', '=', '-', '_']
    if params:
        return max(sum(value.count(delimiter) for delimiter in delimiters) for values in params.values() for value in values)
    return 0


def get_hyphen_count_in_domain(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return domain.count('-')

def contains_ip(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    try:
        socket.inet_aton(domain)
        return 1
    except:
        return 0
    
def get_domain_features(url):
    url = ensure_scheme(url)
    domain = urlparse(ensure_scheme(url)).netloc
    tokens = domain.split('.')
    
    # Domain Length
    domain_length = len(domain)
    
    # Count of Tokens in the Domain
    token_count = len(tokens)
    
    # Length of Largest Token in the Domain
    largest_token_length = max(len(token) for token in tokens) if tokens else 0
    
    # Average Domain Token Length
    average_token_length = sum(len(token) for token in tokens) / len(tokens) if tokens else 0
    
    return domain_length, token_count, largest_token_length, average_token_length

# New feature extraction functions
def get_special_character_count(url):
    special_characters = ['@', '=', '+', '*', '?', '&', '%', '$', '#', '!']
    return sum(url.count(char) for char in special_characters)

def get_entropy(url):
    # Count the frequency of each character in the string
    freq = Counter(url)
    # Calculate the probabilities
    probs = [count / len(url) for count in freq.values()]
    # Calculate the Shannon entropy
    entropy = -sum(p * math.log(p, 2) for p in probs if p > 0)
    return entropy

def check_url_shortened(url):
    shortened_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co']
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return int(domain in shortened_services)

def get_subdomain_count(url):
    url = ensure_scheme(url)
    domain_parts = urlparse(url).netloc.split('.')
    # Count as subdomains any parts beyond the second-level domain and TLD
    return max(0, len(domain_parts) - 2)

def get_suspicious_tld(url):
    suspicious_tlds = ['xyz', 'top', 'loan', 'win', 'club']
    url = ensure_scheme(url)
    tld = urlparse(url).netloc.split('.')[-1]
    return int(tld in suspicious_tlds)

def get_numeric_ratio(url):
    numeric_chars = sum(c.isdigit() for c in url)
    return numeric_chars / len(url) if len(url) > 0 else 0

def get_word_count(url):
    words = re.findall(r'\w+', url)
    return len(words)
    

# Apply feature extraction
features = balanced_df['URL'].apply(lambda x: pd.Series({
    'url_length': get_url_length(x),
    'dot_count': get_dot_count(x),
    'hyphen_count_domain': get_hyphen_count_in_domain(x),
    'security_sensitive_words': contains_security_sensitive_words(x),
    'directory_length': get_directory_length(x),
    'sub_directory_count': get_sub_directory_count(x),
    'token_count_path': get_token_count_in_path(x),
    'largest_token_length': get_largest_token_length(x),
    'average_token_length': get_average_token_length(x),
    'file_length': get_file_length(x),
    'contains_ip': contains_ip(x),
    'dot_count_in_file': get_dot_count_in_file(x),
    'delimiter_count_in_file': get_delimiter_count_in_file(x),
    'arguments_length': get_arguments_length(x),
    'number_of_arguments': get_number_of_arguments(x),
    'length_of_largest_argument_value': get_length_of_largest_argument_value(x),
    'max_delimiters_in_arguments': get_max_delimiters_in_arguments(x),
    'special_character_count': get_special_character_count(x),
    'entropy': get_entropy(x),
    'url_shortened': check_url_shortened(x),
    'subdomain_count': get_subdomain_count(x),
    'suspicious_tld': get_suspicious_tld(x),
    'numeric_ratio': get_numeric_ratio(x),
    'domain_length': get_domain_features(x)[0],
    'domain_token_count': get_domain_features(x)[1],
    'largest_domain_token_length': get_domain_features(x)[2],
    'average_domain_token_length': get_domain_features(x)[3],
    'word_count': get_word_count(x)
}))


# Concatenate original DF with features
balanced_df = pd.concat([balanced_df, features], axis=1)


# Define X and y correctly
X = balanced_df.drop(['Label', 'URL'], axis=1)  # Features
y = balanced_df['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
# Since your features are already numerical, directly use RandomForestClassifier without TfidfVectorizer
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9494833333333333
Confusion Matrix:
 [[29158   904]
 [ 2127 27811]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95     30062
           1       0.97      0.93      0.95     29938

    accuracy                           0.95     60000
   macro avg       0.95      0.95      0.95     60000
weighted avg       0.95      0.95      0.95     60000



In [113]:
balanced_df


Unnamed: 0,URL,Label,url_length,dot_count,hyphen_count_domain,security_sensitive_words,directory_length,sub_directory_count,token_count_path,largest_token_length,...,port_number,subdomain_count,suspicious_tld,numeric_ratio,url_is_internationalized,domain_length,domain_token_count,largest_domain_token_length,average_domain_token_length,word_count
0,etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/...,1,58.0,4.0,1.0,0.0,29.0,3.0,4.0,12.0,...,-1.0,2.0,0.0,0.068966,0.0,29.0,4.0,10.0,6.500000,10.0
1,betterhealthsmoothies.com/Adobe/adobe-3D6/inde...,1,51.0,2.0,0.0,0.0,26.0,2.0,3.0,9.0,...,-1.0,0.0,0.0,0.039216,0.0,25.0,2.0,21.0,12.000000,7.0
2,lloydsbank.deregister-payee-secure-auth.com/Lo...,1,53.0,3.0,3.0,1.0,10.0,0.0,1.0,9.0,...,-1.0,1.0,0.0,0.000000,0.0,43.0,3.0,28.0,13.666667,8.0
3,archive.md,0,10.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,0.0,0.0,0.000000,0.0,10.0,2.0,7.0,4.500000,2.0
4,pkg00-atx.netgate.com,0,21.0,2.0,1.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,1.0,0.0,0.095238,0.0,21.0,3.0,9.0,6.333333,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,infomation-fb-service.e82443.repl.co,1,36.0,3.0,2.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,2.0,0.0,0.138889,0.0,36.0,4.0,21.0,8.250000,6.0
299996,img-1000736.ad-score.com,0,24.0,2.0,2.0,0.0,0.0,-1.0,0.0,0.0,...,-1.0,1.0,0.0,0.291667,0.0,24.0,3.0,11.0,7.333333,5.0
299997,sosyalsat.com/help/home.html,1,28.0,2.0,0.0,0.0,15.0,1.0,2.0,9.0,...,-1.0,0.0,0.0,0.000000,0.0,13.0,2.0,9.0,6.000000,5.0
299998,storageapi.fleek.co/12678f8a-04f9-4b69-a70f-49...,1,75.0,3.0,0.0,0.0,56.0,1.0,2.0,43.0,...,-1.0,1.0,0.0,0.320000,0.0,19.0,3.0,10.0,5.666667,11.0


In [3]:
# Assuming 'balanced_df' contains your feature "entropy"
num_rows_entropy_greater_than_zero = (balanced_df['port_number'] > 0).sum()

print(f'Number of rows with "entropy" larger than 0: {num_rows_entropy_greater_than_zero}')


Number of rows with "entropy" larger than 0: 55


In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming balanced_df is your DataFrame with URLs and labels

# Tokenization and sequence padding parameters
max_len = 100  # Adjust based on the length of the longest URL in your dataset
max_words = 60000  # Adjust based on the size of your vocabulary

# Tokenize the URLs
tokenizer = Tokenizer(num_words=max_words, char_level=True)
tokenizer.fit_on_texts(balanced_df['URL'])
sequences = tokenizer.texts_to_sequences(balanced_df['URL'])

# Pad the sequences
data = pad_sequences(sequences, maxlen=max_len)

# Labels
labels = np.asarray(balanced_df['Label'])
labels = to_categorical(labels)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


# Model definition
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Add return_sequences if stacking LSTM layers
model.add(Dropout(0.5))  # Adjust dropout rate as needed
model.add(Bidirectional(LSTM(32)))  # Second LSTM layer, without return_sequences
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))  # Add L2 regularization
model.add(Dropout(0.5))  # Adjust dropout rate as needed
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['acc'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_acc', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=30, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluation
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))
print("Confusion Matrix:\n", confusion_matrix(y_test_classes, y_pred_classes))
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 0.9651333333333333
Confusion Matrix:
 [[29643   419]
 [ 1673 28265]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     30062
           1       0.99      0.94      0.96     29938

    accuracy                           0.97     60000
   macro avg       0.97      0.97      0.97     60000
weighted avg       0.97      0.97      0.97     60000



In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset and preprocess as needed
# balanced_df = ...

# Tokenization and sequence padding parameters
max_len = 200  # Adjust based on the length of the longest URL in your dataset

# Tokenize the URLs using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(balanced_df['URL'].tolist(), padding=True, truncation=True, max_length=max_len, return_tensors='pt')

# Labels
labels = torch.tensor(balanced_df['Label'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(inputs['input_ids'], labels, test_size=0.2, random_state=42)

# Create PyTorch DataLoader for training and testing data
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=32)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3  # Adjust as needed
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[0],
                  'labels': batch[1]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[0]}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch[1].tolist())

# Convert predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Evaluate the model
print("Accuracy:", accuracy_score(true_labels, predictions))
print("Confusion Matrix:\n", confusion_matrix(true_labels, predictions))
print("Classification Report:\n", classification_report(true_labels, predictions))


  torch.utils._pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from urllib.parse import urlparse, parse_qs
import tldextract 
from collections import Counter
import math
import re
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold



def ensure_scheme(url):
    if not urlparse(url).scheme:
        url = 'http://' + url
    return url

# Feature extraction functions
def get_url_length(url):
    return len(url)

def get_dot_count(url):
    return url.count('.')

def contains_security_sensitive_words(url):
    security_sensitive_words = [
    'login', 'password', 'admin', 'root', 'secret', 'private', 'secure', 'confidential', 
    'bank', 'creditcard', 'account', 'authentication', 'authorization', 'session', 'token', 
    'apikey', 'ssl', 'https', 'secure', 'encrypted', 'auth', 'signin', 'signup', 'verification', 
    'resetpassword', 'change-password', 'forgot-password', 'otp', '2fa', 'phishing', 'malware', 
    'virus', 'trojan', 'exploit', 'hacker', 'attack', 'security', 'vulnerable', 'injection', 
    'xss', 'csrf', 'dos', 'ddos', 'bruteforce', 'firewall', 'vpn', 'proxy', 'tor', 'security-question', 
    'privacy-policy'
]

    return int(any(word in url for word in security_sensitive_words))

def get_directory_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return len(path)

def get_sub_directory_count(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    return path.count('/') - 1

def get_token_count_in_path(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    return len(tokens) - 1

def get_largest_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = path.split('/')
    if tokens:
        return max(len(token) for token in tokens)
    return 0

def get_average_token_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    tokens = [token for token in path.split('/') if token]
    if tokens:
        return np.mean([len(token) for token in tokens])
    return 0

def get_file_length(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return len(filename)

def get_dot_count_in_file(url):
    url = ensure_scheme(url)
    path = urlparse(url).path
    filename = path.split('/')[-1]
    return filename.count('.')

def get_delimiter_count_in_file(url):
    path = urlparse(url).path
    filename = path.split('/')[-1]
    delimiters = ['.', '_', '-']
    return sum(filename.count(delimiter) for delimiter in delimiters)

def get_arguments_length(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(query)

def get_number_of_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    return len(parse_qs(query))

def get_length_of_largest_argument_value(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    if params:
        return max(len(max(values, key=len)) for values in params.values())
    return 0

def get_max_delimiters_in_arguments(url):
    url = ensure_scheme(url)
    query = urlparse(url).query
    params = parse_qs(query)
    delimiters = ['&', '=', '-', '_']
    if params:
        return max(sum(value.count(delimiter) for delimiter in delimiters) for values in params.values() for value in values)
    return 0


def get_hyphen_count_in_domain(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return domain.count('-')

def contains_ip(url):
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    try:
        socket.inet_aton(domain)
        return 1
    except:
        return 0
    
def get_domain_features(url):
    url = ensure_scheme(url)
    domain = urlparse(ensure_scheme(url)).netloc
    tokens = domain.split('.')
    
    # Domain Length
    domain_length = len(domain)
    
    # Count of Tokens in the Domain
    token_count = len(tokens)
    
    # Length of Largest Token in the Domain
    largest_token_length = max(len(token) for token in tokens) if tokens else 0
    
    # Average Domain Token Length
    average_token_length = sum(len(token) for token in tokens) / len(tokens) if tokens else 0
    
    return domain_length, token_count, largest_token_length, average_token_length

# New feature extraction functions
def get_special_character_count(url):
    special_characters = ['@', '=', '+', '*', '?', '&', '%', '$', '#', '!']
    return sum(url.count(char) for char in special_characters)

def get_entropy(url):
    # Count the frequency of each character in the string
    freq = Counter(url)
    # Calculate the probabilities
    probs = [count / len(url) for count in freq.values()]
    # Calculate the Shannon entropy
    entropy = -sum(p * math.log(p, 2) for p in probs if p > 0)
    return entropy

def check_url_shortened(url):
    shortened_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co']
    url = ensure_scheme(url)
    domain = urlparse(url).netloc
    return int(domain in shortened_services)

def get_subdomain_count(url):
    url = ensure_scheme(url)
    domain_parts = urlparse(url).netloc.split('.')
    # Count as subdomains any parts beyond the second-level domain and TLD
    return max(0, len(domain_parts) - 2)

def get_suspicious_tld(url):
    suspicious_tlds = ['xyz', 'top', 'loan', 'win', 'club']
    url = ensure_scheme(url)
    tld = urlparse(url).netloc.split('.')[-1]
    return int(tld in suspicious_tlds)

def get_numeric_ratio(url):
    numeric_chars = sum(c.isdigit() for c in url)
    return numeric_chars / len(url) if len(url) > 0 else 0

def get_word_count(url):
    words = re.findall(r'\w+', url)
    return len(words)
    

# Apply feature extraction
features = balanced_df['URL'].apply(lambda x: pd.Series({
    'url_length': get_url_length(x),
    'dot_count': get_dot_count(x),
    'hyphen_count_domain': get_hyphen_count_in_domain(x),
    'security_sensitive_words': contains_security_sensitive_words(x),
    'directory_length': get_directory_length(x),
    'sub_directory_count': get_sub_directory_count(x),
    'token_count_path': get_token_count_in_path(x),
    'largest_token_length': get_largest_token_length(x),
    'average_token_length': get_average_token_length(x),
    'file_length': get_file_length(x),
    'contains_ip': contains_ip(x),
    'dot_count_in_file': get_dot_count_in_file(x),
    'delimiter_count_in_file': get_delimiter_count_in_file(x),
    'arguments_length': get_arguments_length(x),
    'number_of_arguments': get_number_of_arguments(x),
    'length_of_largest_argument_value': get_length_of_largest_argument_value(x),
    'max_delimiters_in_arguments': get_max_delimiters_in_arguments(x),
    'special_character_count': get_special_character_count(x),
    'entropy': get_entropy(x),
    'url_shortened': check_url_shortened(x),
    'subdomain_count': get_subdomain_count(x),
    'suspicious_tld': get_suspicious_tld(x),
    'numeric_ratio': get_numeric_ratio(x),
    'domain_length': get_domain_features(x)[0],
    'domain_token_count': get_domain_features(x)[1],
    'largest_domain_token_length': get_domain_features(x)[2],
    'average_domain_token_length': get_domain_features(x)[3],
    'word_count': get_word_count(x)
}))


# Concatenate original DF with features
balanced_df = pd.concat([balanced_df, features], axis=1)


# Define X and y correctly
X = balanced_df.drop(['Label', 'URL'], axis=1)  # Features
y = balanced_df['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
# Since your features are already numerical, directly use RandomForestClassifier without TfidfVectorizer
model = RandomForestClassifier(n_estimators=50, random_state=42)

feature_selection_pipeline = Pipeline([
    ('variance_threshold', VarianceThreshold(threshold=0)),
])

# Fit and transform the pipeline on the training data
X_train_transformed = feature_selection_pipeline.fit_transform(X_train, y_train)

# Transform the test data based on the fitted pipeline
X_test_transformed = feature_selection_pipeline.transform(X_test)

# Now, train your model on X_train_transformed and test on X_test_transformed
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Error fetching or parsing URL http://etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/continue.php: HTTPConnectionPool(host='etransfers.interac.ca-ssl.net', port=80): Max retries exceeded with url: /sh/2o05I9/bdesj/continue.php (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012170AAABD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/continue.php: HTTPConnectionPool(host='etransfers.interac.ca-ssl.net', port=80): Max retries exceeded with url: /sh/2o05I9/bdesj/continue.php (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012170AA8A50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://etransfers.interac.ca-ssl.net/sh/2o05I9/bdesj/continue.php: HTTPConnectionPool(host='etransfers.interac.ca-ssl.net', port=80): Max retries exceeded with url: /sh/2o05I9/bd

Error fetching or parsing URL http://hgdggdgfghygsugfytsfgssytstys.gq/83cbc: HTTPConnectionPool(host='hgdggdgfghygsugfytsfgssytstys.gq', port=80): Max retries exceeded with url: /83cbc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121709DC590>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://hgdggdgfghygsugfytsfgssytstys.gq/83cbc: HTTPConnectionPool(host='hgdggdgfghygsugfytsfgssytstys.gq', port=80): Max retries exceeded with url: /83cbc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121709DF050>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://hgdggdgfghygsugfytsfgssytstys.gq/83cbc: HTTPConnectionPool(host='hgdggdgfghygsugfytsfgssytstys.gq', port=80): Max retries exceeded with url: /83cbc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121709DF910>: Failed

Error fetching or parsing URL http://payleboncoinid.site: HTTPConnectionPool(host='payleboncoinid.site', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121707B3E90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://payleboncoinid.site: HTTPConnectionPool(host='payleboncoinid.site', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121707B3A10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://payleboncoinid.site: HTTPConnectionPool(host='payleboncoinid.site', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121707B3A10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://payleb

  soup = BeautifulSoup(html_content, 'html.parser')


Error fetching or parsing URL http://cc76387.tmweb.ru/0f19a279bcab94f/region.php?particulier: HTTPConnectionPool(host='cc76387.tmweb.ru', port=80): Max retries exceeded with url: /0f19a279bcab94f/region.php?particulier (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001217073F810>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://cc76387.tmweb.ru/0f19a279bcab94f/region.php?particulier: HTTPConnectionPool(host='cc76387.tmweb.ru', port=80): Max retries exceeded with url: /0f19a279bcab94f/region.php?particulier (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001217073EC50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://cc76387.tmweb.ru/0f19a279bcab94f/region.php?particulier: HTTPConnectionPool(host='cc76387.tmweb.ru', port=80): Max retries exceeded with url: /0f19a279bcab94f/region.php?particulier (

Error fetching or parsing URL http://frescofish.in/blessings/english: HTTPConnectionPool(host='frescofish.in', port=80): Max retries exceeded with url: /blessings/english (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121708E1E50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://frescofish.in/blessings/english: HTTPConnectionPool(host='frescofish.in', port=80): Max retries exceeded with url: /blessings/english (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121708E2090>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://frescofish.in/blessings/english: HTTPConnectionPool(host='frescofish.in', port=80): Max retries exceeded with url: /blessings/english (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121708E1850>: Failed to establish a new connection: [Errno 110

Error fetching or parsing URL http://80g10hb.cn: HTTPConnectionPool(host='www.80g10hb.cn', port=80): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://80g10hb.cn: HTTPConnectionPool(host='www.80g10hb.cn', port=80): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://80g10hb.cn: HTTPConnectionPool(host='www.80g10hb.cn', port=80): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://80g10hb.cn: HTTPConnectionPool(host='www.80g10hb.cn', port=80): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://grapecreek.com: HTTPSConnectionPool(host='grapecreek.com', port=443): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://grapecreek.com: HTTPSConnectionPool(host='www.grapecreek.com', port=443): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://gate2.murrelektronik.de: HTTPConnectionPool(host='gate2.murrelektronik.de', port=80): Max retries exceeded with url: / (Caused by C

Error fetching or parsing URL http://mos.cms.futurecdn.net: HTTPConnectionPool(host='mos.cms.futurecdn.net', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216FD32110>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://mos.cms.futurecdn.net: HTTPConnectionPool(host='mos.cms.futurecdn.net', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216FD31B90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://mos.cms.futurecdn.net: HTTPConnectionPool(host='mos.cms.futurecdn.net', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216FD324D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL h

Error fetching or parsing URL http://vsa07.thrivenextgen.com: HTTPConnectionPool(host='vsa07.thrivenextgen.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001216FDB4310>, 'Connection to vsa07.thrivenextgen.com timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://sucursalpersonas.transacionesbancolombia.com/mua/USER?scis=HGXmnhj015fu/mNq9r5ZriAKtHK71zoLgJkOib89pnw=: HTTPConnectionPool(host='sucursalpersonas.transacionesbancolombia.com', port=80): Max retries exceeded with url: /mua/USER?scis=HGXmnhj015fu/mNq9r5ZriAKtHK71zoLgJkOib89pnw= (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216FDE5810>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://sucursalpersonas.transacionesbancolombia.com/mua/USER?scis=HGXmnhj015fu/mNq9r5ZriAKtHK71zoLgJkOib89pnw=: HTTPConnectionPool(host='sucursalpersonas.tr

Error fetching or parsing URL http://delexpress.org: HTTPConnectionPool(host='delexpress.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216F8EBF50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://delexpress.org: HTTPConnectionPool(host='delexpress.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216F8EA210>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://delexpress.org: HTTPConnectionPool(host='delexpress.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216F8E8610>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://delexpress.org: HTTPConnectionPool(h

Error fetching or parsing URL http://wallet-api.urbanairship.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching or parsing URL http://profilefacebook-1139800146.agencija-klopotec.si/profile.html?countuser=c8f4a601226e1a16a48bab97ecf51889: HTTPConnectionPool(host='profilefacebook-1139800146.agencija-klopotec.si', port=80): Max retries exceeded with url: /profile.html?countuser=c8f4a601226e1a16a48bab97ecf51889 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B1DFA90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://profilefacebook-1139800146.agencija-klopotec.si/profile.html?countuser=c8f4a601226e1a16a48bab97ecf51889: HTTPConnectionPool(host='profilefacebook-1139800146.agencija-klopotec.si', port=80): Max retries exceeded with url: /profile.html?countuser=c8f4a601226e1a16a48bab97ecf51889 (Caused by NewConnectionError('<urllib3.

Error fetching or parsing URL http://balley6.com/mazon/amazon: HTTPConnectionPool(host='balley6.com', port=80): Max retries exceeded with url: /mazon/amazon (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012169A82B50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://balley6.com/mazon/amazon: HTTPConnectionPool(host='balley6.com', port=80): Max retries exceeded with url: /mazon/amazon (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012169A82B50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://balley6.com/mazon/amazon: HTTPConnectionPool(host='balley6.com', port=80): Max retries exceeded with url: /mazon/amazon (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012169A80D10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching o

Error fetching or parsing URL http://dataprivacyoffice.com.br/wp-admin/network/update-core/standard2land/0zde1mjy=/password.php: HTTPConnectionPool(host='dataprivacyoffice.com.br', port=80): Max retries exceeded with url: /wp-admin/network/update-core/standard2land/0zde1mjy=/password.php (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001216912BE90>, 'Connection to dataprivacyoffice.com.br timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://dataprivacyoffice.com.br/wp-admin/network/update-core/standard2land/0zde1mjy=/password.php: HTTPConnectionPool(host='dataprivacyoffice.com.br', port=80): Max retries exceeded with url: /wp-admin/network/update-core/standard2land/0zde1mjy=/password.php (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001216FBF38D0>, 'Connection to dataprivacyoffice.com.br timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://dataprivacyoffice.com.br/wp-admin/netwo



Error fetching or parsing URL http://xn--banxo-caisse-pargne-nzb.fr/58bb61e68a0dd06: HTTPConnectionPool(host='xn--banxo-caisse-pargne-nzb.fr', port=80): Max retries exceeded with url: /58bb61e68a0dd06 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012168736C10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://xn--banxo-caisse-pargne-nzb.fr/58bb61e68a0dd06: HTTPConnectionPool(host='xn--banxo-caisse-pargne-nzb.fr', port=80): Max retries exceeded with url: /58bb61e68a0dd06 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012168734D10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://xn--banxo-caisse-pargne-nzb.fr/58bb61e68a0dd06: HTTPConnectionPool(host='xn--banxo-caisse-pargne-nzb.fr', port=80): Max retries exceeded with url: /58bb61e68a0dd06 (Caused by NewConnectionError('<urllib3.connection.HTTP

Error fetching or parsing URL http://older-escaped.duckdns.org: HTTPConnectionPool(host='older-escaped.duckdns.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121754534D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://older-escaped.duckdns.org: HTTPConnectionPool(host='older-escaped.duckdns.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012175452390>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://older-escaped.duckdns.org: HTTPConnectionPool(host='older-escaped.duckdns.org', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000121754531D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error f

Error fetching or parsing URL http://nathanmcguirelaw.com/09bb63ac0f2a1e960e9d60941e03fcb7/verify.php?country_x=-&acct_x=id-ppl=pa324162.158.63.222=scrpg=9971370eb8ecb024894f3d7782f11aec1d03a38e91527cfb4379f125e29efb92s=$1$u0xotjfc$k/ouhrtwv9.qsslqvo6p8/w38gecyjhkgitpudn9ux5afspldtre2lxvhmy4saf0ioqb6z71zjrcknmwvbqotnbe1lmgkqxrfvuyspv6zkd3ih2wiqm5eg7ocra8tyhb4jslnzjwfoc9ap0uxd3497525542: HTTPConnectionPool(host='nathanmcguirelaw.com', port=80): Max retries exceeded with url: /09bb63ac0f2a1e960e9d60941e03fcb7/verify.php?country_x=-&acct_x=id-ppl=pa324162.158.63.222=scrpg=9971370eb8ecb024894f3d7782f11aec1d03a38e91527cfb4379f125e29efb92s=$1$u0xotjfc$k/ouhrtwv9.qsslqvo6p8/w38gecyjhkgitpudn9ux5afspldtre2lxvhmy4saf0ioqb6z71zjrcknmwvbqotnbe1lmgkqxrfvuyspv6zkd3ih2wiqm5eg7ocra8tyhb4jslnzjwfoc9ap0uxd3497525542 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001215D0D5910>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or 

Error fetching or parsing URL http://mtb.dns2.us/login/86eed61e3fd93082e3e47e5b1559c422/email.php?token=85168ba507495b931946c8afc3e8b55d7e79a7a1d797ba58d5d22ca7624ce0a2b4bcf040c2eb51112090f0a06e97e581bf249030e674be31ad27db0821878705: HTTPConnectionPool(host='mtb.dns2.us', port=80): Max retries exceeded with url: /login/86eed61e3fd93082e3e47e5b1559c422/email.php?token=85168ba507495b931946c8afc3e8b55d7e79a7a1d797ba58d5d22ca7624ce0a2b4bcf040c2eb51112090f0a06e97e581bf249030e674be31ad27db0821878705 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001210A3D9E90>, 'Connection to mtb.dns2.us timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://mtb.dns2.us/login/86eed61e3fd93082e3e47e5b1559c422/email.php?token=85168ba507495b931946c8afc3e8b55d7e79a7a1d797ba58d5d22ca7624ce0a2b4bcf040c2eb51112090f0a06e97e581bf249030e674be31ad27db0821878705: HTTPConnectionPool(host='mtb.dns2.us', port=80): Max retries exceeded with url: /login/86eed61e3fd93082e3e4

Error fetching or parsing URL http://marketing.jffalcom.com.br/DHL/dhl/dhl/dhl/59b18ff76e025c22ccf6d8333facb07b: HTTPConnectionPool(host='marketing.jffalcom.com.br', port=80): Max retries exceeded with url: /DHL/dhl/dhl/dhl/59b18ff76e025c22ccf6d8333facb07b (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B1BBD90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://marketing.jffalcom.com.br/DHL/dhl/dhl/dhl/59b18ff76e025c22ccf6d8333facb07b: HTTPConnectionPool(host='marketing.jffalcom.com.br', port=80): Max retries exceeded with url: /DHL/dhl/dhl/dhl/59b18ff76e025c22ccf6d8333facb07b (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B1B9ED0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://marketing.jffalcom.com.br/DHL/dhl/dhl/dhl/59b18ff76e025c22ccf6d8333facb07b: HTTPConnectionPool(host='market

Error fetching or parsing URL http://betaal-wijze.nl/knab: HTTPConnectionPool(host='betaal-wijze.nl', port=80): Max retries exceeded with url: /knab (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B7B9290>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://betaal-wijze.nl/knab: HTTPConnectionPool(host='betaal-wijze.nl', port=80): Max retries exceeded with url: /knab (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B7B92D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://betaal-wijze.nl/knab: HTTPConnectionPool(host='betaal-wijze.nl', port=80): Max retries exceeded with url: /knab (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001216B7B8F10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://bet

Error fetching or parsing URL http://dkds8srdioygieonmq6zsc.qwo231sdx.club/index/usps/index.html: HTTPConnectionPool(host='dkds8srdioygieonmq6zsc.qwo231sdx.club', port=80): Max retries exceeded with url: /index/usps/index.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012168A87210>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://dkds8srdioygieonmq6zsc.qwo231sdx.club/index/usps/index.html: HTTPConnectionPool(host='dkds8srdioygieonmq6zsc.qwo231sdx.club', port=80): Max retries exceeded with url: /index/usps/index.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000012168A87550>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching or parsing URL http://dkds8srdioygieonmq6zsc.qwo231sdx.club/index/usps/index.html: HTTPConnectionPool(host='dkds8srdioygieonmq6zsc.qwo231sdx.club', port=80): Max retries exceeded with url: 

Error fetching or parsing URL http://share-field-7570.yralecaeaghnrsn.workers.dev/99ec8f90-8267-49a3-82bb-a9ca2a44854d: object of type 'NoneType' has no len()
Error fetching or parsing URL http://share-field-7570.yralecaeaghnrsn.workers.dev/99ec8f90-8267-49a3-82bb-a9ca2a44854d: object of type 'NoneType' has no len()
Error fetching or parsing URL http://share-field-7570.yralecaeaghnrsn.workers.dev/99ec8f90-8267-49a3-82bb-a9ca2a44854d: object of type 'NoneType' has no len()
Error fetching or parsing URL http://share-field-7570.yralecaeaghnrsn.workers.dev/99ec8f90-8267-49a3-82bb-a9ca2a44854d: object of type 'NoneType' has no len()
Error fetching or parsing URL http://aep.devonway.com: HTTPConnectionPool(host='aep.devonway.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000012168ABF1D0>, 'Connection to aep.devonway.com timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://aep.devonway.com: HT

Error fetching or parsing URL http://87bde12c.dr.youme.im: HTTPConnectionPool(host='87bde12c.dr.youme.im', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001216888FAD0>, 'Connection to 87bde12c.dr.youme.im timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://87bde12c.dr.youme.im: HTTPConnectionPool(host='87bde12c.dr.youme.im', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000012168A44610>, 'Connection to 87bde12c.dr.youme.im timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://87bde12c.dr.youme.im: HTTPConnectionPool(host='87bde12c.dr.youme.im', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000012168AF0DD0>, 'Connection to 87bde12c.dr.youme.im timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://vovaer

Error fetching or parsing URL http://194.195.86.192/sh/CA72u10/mbmo/continue.php: HTTPConnectionPool(host='194.195.86.192', port=80): Max retries exceeded with url: /sh/CA72u10/mbmo/continue.php (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001216B651DD0>, 'Connection to 194.195.86.192 timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://194.195.86.192/sh/CA72u10/mbmo/continue.php: HTTPConnectionPool(host='194.195.86.192', port=80): Max retries exceeded with url: /sh/CA72u10/mbmo/continue.php (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000121698BC290>, 'Connection to 194.195.86.192 timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://194.195.86.192/sh/CA72u10/mbmo/continue.php: HTTPConnectionPool(host='194.195.86.192', port=80): Max retries exceeded with url: /sh/CA72u10/mbmo/continue.php (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000121698

Error fetching or parsing URL http://ns-1207.awsdns-22.org: HTTPConnectionPool(host='ns-1207.awsdns-22.org', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001215CD4EC90>, 'Connection to ns-1207.awsdns-22.org timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://ns-1207.awsdns-22.org: HTTPConnectionPool(host='ns-1207.awsdns-22.org', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001215CCA8D10>, 'Connection to ns-1207.awsdns-22.org timed out. (connect timeout=0.5)'))
Error fetching or parsing URL http://orvis-us.attn.tv: HTTPSConnectionPool(host='orvis-us.attn.tv', port=443): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://orvis-us.attn.tv: HTTPSConnectionPool(host='orvis-us.attn.tv', port=443): Read timed out. (read timeout=0.5)
Error fetching or parsing URL http://orvis-us.attn.tv: HTTPSConn

KeyboardInterrupt: 