In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Import data
legit = pd.read_csv("datasets/unb-university/Benign_list_big_final.csv")
phishing = pd.read_csv("datasets/unb-university/phishing_dataset.csv")

print(legit.head(2))
print(phishing.head(2))

  http://1337x.to/torrent/1048648/American-Sniper-2014-MD-iTALiAN-DVDSCR-X264-BST-MT/
0  http://1337x.to/torrent/1110018/Blackhat-2015-...                                 
1  http://1337x.to/torrent/1122940/Blackhat-2015-...                                 
  http://v2.email-marketing.adminsimple.com/track/link?s=a879370e133bf6f71b5cc7ce0c2043e1&amp;AdministratorID=20238&amp;MemberID=21410&amp;CampaignID=1&amp;CampaignStatisticsID=1&amp;URL=http%3A%2F%2Fwww.cadivi.pro%2Faprobados%2Faprobados2012.php%3Futm_source%3Dv2.email-marketing.adminsimple.com
0  http://bid.openx.net/json?amp;amp;amp;amp;cid;...                                                                                                                                                                                                                                                    
1  http://webmail2.centurytel.net/hwebmail/servic...                                                                                                

In [3]:
legit.columns = ['url']
phishing.columns = ['url']

# **Preparing Data**

In [4]:
# Check null values
print(f"Legit set null values: {legit.isnull().sum()}")
print()
print(f"Phishing set null values: {phishing.isnull().sum()}")

Legit set null values: url    0
dtype: int64

Phishing set null values: url    0
dtype: int64


In [5]:
# Drop duplicates
print(f"Legit's duplicates num: {legit.duplicated().sum()}")
print(f"Phishing duplicates num: {phishing.duplicated().sum()}")

legit = legit.drop_duplicates()
phishing = phishing.drop_duplicates()
print()
print("After dropping duplicates")

print(f"Legit's duplicates num: {legit.duplicated().sum()}")
print(f"Phishing duplicates num: {phishing.duplicated().sum()}")

Legit's duplicates num: 0
Phishing duplicates num: 9

After dropping duplicates
Legit's duplicates num: 0
Phishing duplicates num: 0


In [6]:
# Check if there is imbalance
print(legit.shape)
print(phishing.shape)

(35377, 1)
(9955, 1)


In [7]:
# Add the right amount of phishing urls data from phishtank
phishtank = pd.read_csv('datasets/phishtank-phishing-urls.csv')

# Get the missing amount
legit_mal_diff = int(len(legit.index) - len(phishing.index))

# Randomly select the urls from phishtank dataset
phishtank = phishtank.sample(n=legit_mal_diff,random_state=42)
phishtank.shape

(25422, 1)

In [8]:
phishing_added = pd.concat([phishing, phishtank])

print(legit.shape)
print(phishing_added.shape)

(35377, 1)
(35377, 1)


In [9]:
# add a 'label' column to the dataframes
legit["label"] = 0
phishing_added["label"] = 1

In [10]:
# combine the two dataframes
df = pd.concat([legit, phishing_added], ignore_index=True)

# shuffle the dataframe's rows randomly
df = df.sample(frac=1, random_state=42) # Set random_state to an integer for reproducibility
df.reset_index(drop=True, inplace=True)

print(df.shape)

(70754, 2)


In [11]:
print(df.isnull().sum())
print(df.duplicated().sum())

url      0
label    0
dtype: int64
1


# **Transform Data**

In [12]:
import torch

# Assuming 'df' is your DataFrame containing URL data

# Tokenize URLs into characters
all_characters = set(''.join(df['url']))  # Get all unique characters in URLs
char_to_index = {char: idx for idx, char in enumerate(all_characters)}
index_to_char = {idx: char for char, idx in char_to_index.items()}

# Convert URLs to numerical sequences
max_sequence_length = max(len(url) for url in df['url'])  # Max URL length
num_sequences = len(df)
url_sequences = torch.zeros(num_sequences, max_sequence_length, dtype=torch.long)

for i, url in enumerate(df['url']):
    for t, char in enumerate(url):
        url_sequences[i][t] = char_to_index[char]

# Convert labels to PyTorch tensor
labels = torch.tensor(df['label'].values, dtype=torch.long)


In [13]:
import torch.nn as nn

class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output[:, -1, :])  # Get output from the last time step
        return output

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)


In [14]:
# Define hyperparameters
input_size = len(all_characters)
hidden_size = 128  # Adjust as needed
output_size = 2  # Assuming binary classification
num_layers = 1
learning_rate = 0.001
num_epochs = 10
batch_size = 64

# Create model, loss function, and optimizer
model = CharRNN(input_size, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Create DataLoader
dataset = torch.utils.data.TensorDataset(url_sequences, labels)
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for batch in loader:
        inputs, targets = batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 0.6865
Epoch [2/10], Loss: 0.6907
Epoch [3/10], Loss: 0.7028
Epoch [4/10], Loss: 0.7212
Epoch [5/10], Loss: 0.6932
Epoch [6/10], Loss: 0.6922
Epoch [7/10], Loss: 0.6933
Epoch [8/10], Loss: 0.6939
Epoch [9/10], Loss: 0.7008
Epoch [10/10], Loss: 0.6930


In [15]:
# Evaluate the trained model
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in loader:
        inputs, targets = batch
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == targets).sum().item()
        total_samples += targets.size(0)

accuracy = total_correct / total_samples
print(f'Accuracy on test set: {accuracy:.4f}')


Accuracy on test set: 0.5000


# **Train Test Split**

# **Model**