In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
url_df1 = pd.read_csv("./data/urldata.csv", index_col=0)
url_df1['result'].value_counts()

result
0    345738
1    104438
Name: count, dtype: int64

In [11]:
url_df2 = pd.read_csv("./data/malicious_phish.csv")
url_df2['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [12]:
def gen_result_col(row):
    if(row['type']=='benign'):
        return 0
    return 1

url_df2['result'] = url_df2.apply(gen_result_col, axis=1)
url_df2.head()

Unnamed: 0,url,type,result
0,br-icloud.com.br,phishing,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1


In [13]:
print(url_df1.shape)
print(url_df2.shape)

(450176, 3)
(651191, 3)


In [17]:
desired_col = ['url', 'result']
combined_url_df = pd.concat([url_df1[desired_col], url_df2[url_df2['result'] == 1][desired_col]], axis=0)

In [18]:
print(combined_url_df.shape)
print(combined_url_df['result'].value_counts())

(673264, 2)
result
0    345738
1    327526
Name: count, dtype: int64


In [19]:
combined_url_df.head()

Unnamed: 0,url,result
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [60]:
max_url_len = 0
for index, row in combined_url_df.iterrows():
    max_url_len = max(max_url_len, len(row['url']))

print(max_url_len)

2314


In [21]:
combined_url_df = combined_url_df.drop_duplicates()
print(combined_url_df.shape)
print(combined_url_df['result'].value_counts())

(662147, 2)
result
0    345738
1    316409
Name: count, dtype: int64


In [179]:
train_data, temp_data = train_test_split(combined_url_df, test_size=0.3, random_state=42)

# Split temp_data into validation and test sets
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Display the shapes of the resulting datasets
print("Train set shape:", train_data.shape)
print(train_data['result'].value_counts())

print("Validation set shape:", val_data.shape)
print(val_data['result'].value_counts())

print("Test set shape:", test_data.shape)
print(test_data['result'].value_counts())


Train set shape: (463502, 2)
result
0    242203
1    221299
Name: count, dtype: int64
Validation set shape: (99322, 2)
result
0    51816
1    47506
Name: count, dtype: int64
Test set shape: (99323, 2)
result
0    51719
1    47604
Name: count, dtype: int64


In [38]:
from bloom_filter2 import BloomFilter

In [52]:
n = 221300 #no of items to add 
p = 0.05 #false positive probability 
  
bloomfilter = BloomFilter(max_elements=n,error_rate=p)

In [53]:
for index, row in train_data[train_data['result']==1].iterrows():
    bloomfilter.add(row['url'])

In [54]:
fp = 0
for index, row in test_data[test_data['result']==0].iterrows():
    if(row['url'] in bloomfilter):
        fp+=1

print(fp)

2614


In [55]:
fpr = fp/51719
print(fpr)

0.050542353873818134


In [56]:
print(bloomfilter.num_bits_m / (1024*1024))

1.3159332275390625


In [57]:
print(bloomfilter.num_probes_k)

5


In [94]:
char_to_idx = {char: idx for idx, char in enumerate(set("".join(combined_url_df['url'])))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [95]:
vocab_size = len(char_to_idx)
print((char_to_idx))

{'q': 0, '{': 1, '¸': 2, '$': 3, '\x9b': 4, 'O': 5, 'ด': 6, '9': 7, '#': 8, '\x15': 9, '\x07': 10, 'B': 11, 'A': 12, 'c': 13, '\xa0': 14, 'ל': 15, 'Š': 16, 'ฟ': 17, 'ו': 18, '\x02': 19, '\x1b': 20, 'î': 21, '\x9c': 22, '\x18': 23, '§': 24, 'Ü': 25, 'ศ': 26, '\x8a': 27, '\x9d': 28, 'อ': 29, '÷': 30, '\n': 31, 'V': 32, '\x8c': 33, 'e': 34, 'ญ': 35, '\x1d': 36, 'Ú': 37, '\x88': 38, 'บ': 39, 'p': 40, 'R': 41, 'Ã': 42, '\x92': 43, '¢': 44, '‰': 45, '`': 46, 'Ó': 47, '\x81': 48, 'ה': 49, 'º': 50, '¯': 51, '\x12': 52, '“': 53, '-': 54, '^': 55, 'y': 56, '”': 57, 'ต': 58, 'h': 59, '£': 60, 'ว': 61, 'ผ': 62, '\x82': 63, 'ื': 64, 'þ': 65, '‚': 66, 'í': 67, 'ู': 68, 'á': 69, 'ฅ': 70, 'W': 71, 'Â': 72, 'Þ': 73, 'า': 74, 'כ': 75, 'ë': 76, 'Ë': 77, '่': 78, '\x7f': 79, '!': 80, '©': 81, '\x8f': 82, 'ๅ': 83, '成': 84, 'Ç': 85, 'ถ': 86, '±': 87, '\x14': 88, 'Q': 89, 'ข': 90, 'Õ': 91, '¾': 92, '\x80': 93, 'ง': 94, '\x0f': 95, '•': 96, 'Æ': 97, '_': 98, 'ö': 99, '用': 100, '\x03': 101, '}': 102, '\x9e': 1

In [180]:
import model
import torch
from tqdm import tqdm
import torch.nn as nn
from util import CharacterLevelDataset
import torch.optim as optim
from torch.utils.data import DataLoader

In [181]:
import torch
from torch.utils.data import Dataset

class CharacterLevelDataset(Dataset):
    def __init__(self, data, labels, max_seq_length):
        self.data = data
        self.labels = labels
        self.max_seq_length = max_seq_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Truncate or pad sequences to max_seq_length
        sequence = self.data[idx][:self.max_seq_length].ljust(self.max_seq_length)
        # Convert sequence to tensor of character indices
        sequence_tensor = torch.tensor([char_to_idx[char] for char in sequence])
        # Convert label to tensor
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sequence_tensor, label_tensor

In [182]:
max_seq_len = 16
# train_data = train_data.head(10)
train_dataset = CharacterLevelDataset(train_data['url'].values, train_data['result'].values, max_seq_len)
batch_size = 16
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [183]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [184]:
class CharacterLevelGRU(nn.Module):
    def __init__(self, vocab_size, hidden_size, embedding_dim):
        super(CharacterLevelGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size,  batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        # print(embedded.shape)
        output, _ = self.gru(embedded)
        # print(output.shape)
        last_output = output[:, -1, :]
        # print(last_output.shape)
        output = self.fc(last_output)  # Use only the last output of the sequence
        # print(output.shape)
        output = self.sigmoid(output)  # Apply sigmoid activation function
        # print(output.shape)
        return output

In [185]:
hidden_size = 16
embedding_dim = 32
model = CharacterLevelGRU(vocab_size, hidden_size, embedding_dim)
model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [186]:
num_epochs = 10
total_batches = len(dataloader)
running_loss = 0.0

# Apply tqdm to the epoch loop
for epoch in range(num_epochs):

    # Reset the running loss for each epoch
    epoch_loss = 0.0
    
    # Apply tqdm to the dataloader loop
    batch_progress = tqdm(enumerate(dataloader), total=total_batches, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for i, (data, labels) in batch_progress:
        data = data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.squeeze(), labels)  # Compute loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        epoch_loss += loss.item()

    # Print average loss for the epoch after all batches
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / total_batches:.4f}")

# Print final average loss for all epochs
print(f"Final Average Loss: {running_loss / total_batches:.4f}")

Epoch 1/10:   0%|          | 0/28969 [00:00<?, ?it/s]

                                                                  

Epoch 1/10, Loss: 0.0253


                                                                  

Epoch 2/10, Loss: 0.0189


                                                                  

Epoch 3/10, Loss: 0.0183


                                                                  

Epoch 4/10, Loss: 0.0180


                                                                  

Epoch 5/10, Loss: 0.0177


                                                                 

Epoch 6/10, Loss: 0.0174


                                                                  

Epoch 7/10, Loss: 0.0173


                                                                  

Epoch 8/10, Loss: 0.0172


                                                                  

Epoch 9/10, Loss: 0.0170


                                                                   

Epoch 10/10, Loss: 0.0168
Final Average Loss: 0.1838




In [187]:
import pickle
model.to('cpu')

# Save model state dictionary
torch.save(model.state_dict(), 'model.pkl')

In [119]:
28969%16

9