In [4]:
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Import data
legit = pd.read_csv("datasets/unb-university/Benign_list_big_final.csv")
phishing = pd.read_csv("datasets/unb-university/phishing_dataset.csv")

print(legit.head(2))
print(phishing.head(2))

  http://1337x.to/torrent/1048648/American-Sniper-2014-MD-iTALiAN-DVDSCR-X264-BST-MT/
0  http://1337x.to/torrent/1110018/Blackhat-2015-...                                 
1  http://1337x.to/torrent/1122940/Blackhat-2015-...                                 
  http://v2.email-marketing.adminsimple.com/track/link?s=a879370e133bf6f71b5cc7ce0c2043e1&amp;AdministratorID=20238&amp;MemberID=21410&amp;CampaignID=1&amp;CampaignStatisticsID=1&amp;URL=http%3A%2F%2Fwww.cadivi.pro%2Faprobados%2Faprobados2012.php%3Futm_source%3Dv2.email-marketing.adminsimple.com
0  http://bid.openx.net/json?amp;amp;amp;amp;cid;...                                                                                                                                                                                                                                                    
1  http://webmail2.centurytel.net/hwebmail/servic...                                                                                                

In [6]:
legit.columns = ['url']
phishing.columns = ['url']

# **Preparing Data**

In [7]:
# Check null values
print(f"Legit set null values: {legit.isnull().sum()}")
print()
print(f"Phishing set null values: {phishing.isnull().sum()}")

Legit set null values: url    0
dtype: int64

Phishing set null values: url    0
dtype: int64


In [8]:
# Drop duplicates
print(f"Legit's duplicates num: {legit.duplicated().sum()}")
print(f"Phishing duplicates num: {phishing.duplicated().sum()}")

legit = legit.drop_duplicates()
phishing = phishing.drop_duplicates()
print()
print("After dropping duplicates")

print(f"Legit's duplicates num: {legit.duplicated().sum()}")
print(f"Phishing duplicates num: {phishing.duplicated().sum()}")

Legit's duplicates num: 0
Phishing duplicates num: 9

After dropping duplicates
Legit's duplicates num: 0
Phishing duplicates num: 0


In [9]:
# Check if there is imbalance
print(legit.shape)
print(phishing.shape)

(35377, 1)
(9955, 1)


In [10]:
# Add the right amount of phishing urls data from phishtank
phishtank = pd.read_csv('datasets/phishtank-phishing-urls.csv')

# Get the missing amount
legit_mal_diff = int(len(legit.index) - len(phishing.index))

# Randomly select the urls from phishtank dataset
phishtank = phishtank.sample(n=legit_mal_diff,random_state=42)
phishtank.shape

(25422, 1)

In [11]:
phishing_added = pd.concat([phishing, phishtank])

print(legit.shape)
print(phishing_added.shape)

(35377, 1)
(35377, 1)


In [12]:
# add a 'label' column to the dataframes
legit["label"] = 0
phishing_added["label"] = 1

In [14]:
# combine the two dataframes
df = pd.concat([legit, phishing_added], ignore_index=True)

# shuffle the dataframe's rows randomly
df = df.sample(frac=1, random_state=42) # Set random_state to an integer for reproducibility
df.reset_index(drop=True, inplace=True)

print(df.shape)

(70754, 2)


In [15]:
print(df.isnull().sum())
print(df.duplicated().sum())

url      0
label    0
dtype: int64
1


# **Transform Data**

In [17]:
from sklearn.model_selection import train_test_split

# Extract features (URLs) and labels
X = df['url'].values
y = df['label'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter  # Corrected import

# Download the punkt tokenizer (needed for word_tokenize)
nltk.download('punkt')

# Tokenize using nltk
def tokenize_text(text):
    return word_tokenize(text)


# Tokenize URLs
X_train_tokens = [tokenize_text(url) for url in X_train]
X_test_tokens = [tokenize_text(url) for url in X_test]

# Build vocabulary
flattened_tokens = [token for sublist in X_train_tokens for token in sublist]
counter = Counter(flattened_tokens)

# Filter out tokens that occur less than min_freq times
min_freq = 1
filtered_counter = {word: freq for word, freq in counter.items() if freq >= min_freq}

# Create Vocab
vocab = Vocab(filtered_counter)

# Find the maximum sequence length
max_sequence_length = max(len(tokens) for tokens in X_train_tokens + X_test_tokens)

# Convert tokens to numerical indices with padding, using <unk> for unknown tokens
unk_index = vocab['<unk>']

X_train_indices = torch.LongTensor([
    [vocab.get(token, unk_index) for token in tokens] + [0] * (max_sequence_length - len(tokens))
    for tokens in X_train_tokens
])

X_test_indices = torch.LongTensor([
    [vocab.get(token, unk_index) for token in tokens] + [0] * (max_sequence_length - len(tokens))
    for tokens in X_test_tokens
])



# Padding sequences
max_sequence_length = 100
X_train_padded = torch.nn.functional.pad(X_train_indices, (0, max_sequence_length - X_train_indices.size(1)))
X_test_padded = torch.nn.functional.pad(X_test_indices, (0, max_sequence_length - X_test_indices.size(1)))

# Create DataLoader
train_dataset = TensorDataset(X_train_padded, torch.FloatTensor(y_train))
test_dataset = TensorDataset(X_test_padded, torch.FloatTensor(y_test))

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: '<unk>'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.permute(0, 2, 1)
        conved = [nn.functional.relu(conv(embedded)) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Define model hyperparameters
vocab_size = len(TEXT.vocab)
embedding_dim = 50
n_filters = 128
filter_sizes = [3, 4, 5]
output_dim = 1
dropout = 0.5

# Create an instance of the model
model = CNNModel(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout)

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())


Epoch [1/10], Loss: 0.6865
Epoch [2/10], Loss: 0.6907
Epoch [3/10], Loss: 0.7028
Epoch [4/10], Loss: 0.7212
Epoch [5/10], Loss: 0.6932
Epoch [6/10], Loss: 0.6922
Epoch [7/10], Loss: 0.6933
Epoch [8/10], Loss: 0.6939
Epoch [9/10], Loss: 0.7008
Epoch [10/10], Loss: 0.6930


In [None]:
# Training loop
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in train_iterator:
        text, text_lengths = batch.url
        labels = batch.label
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
with torch.no_grad():
    for batch in test_iterator:
        text, text_lengths = batch.url
        labels = batch.label
        predictions = model(text).squeeze(1)
        # Evaluate and report metrics as needed


Accuracy on test set: 0.5000
