In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import time
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
import chardet
import codecs
import re


def create_special_chars_set(df):
    """Collect the URL delimiter characters that occur in ``df["URL"]``.

    Every URL is scanned for the characters ``- _ . / : ? = & * + ( )``.
    Each distinct character that was found is passed through ``re.escape``
    so it can be placed in a regular expression as a literal.

    Args:
        df: Object whose ``"URL"`` entry iterates over URL strings
            (the script passes a pandas DataFrame).

    Returns:
        A set of regex-escaped single-character strings; empty when no URL
        contains any of the delimiter characters.
    """
    delimiter_re = re.compile(r'[-_./:?=&*+()]')
    seen = set()
    for url in df["URL"]:
        seen |= set(delimiter_re.findall(url))
    return set(map(re.escape, seen))

class Tokenizer:
    """Split text on delimiter characters and map the pieces to vocabulary ids.

    Attributes:
        wv: Object exposing a ``key_to_index`` mapping from token string to
            integer id (the script passes the ``.wv`` of a Word2Vec model).
        special_chars: Iterable of regex-escaped delimiter strings.
    """

    def __init__(self, wv, special_chars):
        self.wv = wv
        self.special_chars = special_chars

    def tokenize(self, text):
        """Return the vocabulary ids of the tokens found in ``text``.

        The text is split on runs of delimiters. Because the capturing group
        in the pattern is repeated, ``re.split`` keeps only the last
        delimiter of each consecutive run (``"://"`` contributes ``"/"``).
        Tokens missing from ``wv.key_to_index`` are dropped silently.

        Args:
            text: String to tokenize.

        Returns:
            A 1-D ``torch.long`` tensor of token ids; empty (but still
            ``torch.long``) when no token is in the vocabulary.
        """
        tokens = re.split(f'({"|".join(self.special_chars)})+', text)
        # re.split yields '' at the string edges (and None for groups that
        # did not participate); neither is a real token.
        tokens = [token for token in tokens if token != '' and token is not None]
        key_to_index = self.wv.key_to_index
        token_ids = [key_to_index[token] for token in tokens if token in key_to_index]
        # An explicit dtype is required: torch.tensor([]) would otherwise
        # infer float32 for an empty id list.
        return torch.tensor(token_ids, dtype=torch.long)
    

def preprocess_data():
    """Load the training and validation CSV files into DataFrames.

    For each file the raw bytes are first handed to ``chardet.detect``; the
    file is then reopened in text mode with the detected encoding, with
    undecodable bytes replaced, and parsed by ``pandas.read_csv``.

    Returns:
        Tuple ``(df_train, df_valid)`` read from ``Data/dataset1/train.csv``
        and ``Data/dataset1/valid.csv`` respectively.
    """

    def read_with_detected_encoding(csv_path):
        # chardet works on bytes, so the file is read once in binary mode.
        with open(csv_path, "rb") as raw:
            detected = chardet.detect(raw.read())

        with codecs.open(csv_path, "r", encoding=detected['encoding'], errors='replace') as handle:
            return pd.read_csv(handle)

    df_train = read_with_detected_encoding("Data/dataset1/train.csv")
    df_valid = read_with_detected_encoding("Data/dataset1/valid.csv")
    return df_train, df_valid


def build_word2vec(df_train):
    """Train a Word2Vec model on delimiter-split URL tokens.

    Each URL becomes one "sentence" made of the segments between the
    delimiter characters ``- _ . / : ? = & * + ( )`` plus the delimiter
    characters themselves. The previous implementation used ``list(url)``,
    i.e. a vocabulary of single characters, while ``Tokenizer.tokenize``
    looks up delimiter-split segments such as ``"http"`` or ``"www"``; those
    were never in the vocabulary and were silently dropped.

    Args:
        df_train: Object whose ``"URL"`` entry iterates over URL strings.

    Returns:
        The trained ``gensim.models.Word2Vec`` model; ``min_count=1`` keeps
        every token that occurs at least once.
    """
    # Same character class that create_special_chars_set scans for. The
    # capturing group makes re.split return the delimiters as tokens too.
    delimiter_re = re.compile(r'([-_./:?=&*+()])')
    sentences = [
        [piece for piece in delimiter_re.split(url) if piece]
        for url in df_train["URL"]
    ]
    wv = Word2Vec(sentences, min_count=1)
    return wv


class URLDataset(Dataset):
    """Dataset yielding tokenized URLs with a binary label.

    Args:
        data: pandas DataFrame with a ``"URL"`` and a ``"label"`` column
            (rows are accessed through ``.iloc``).
        tokenizer: Object whose ``tokenize(url)`` returns a 1-D tensor of
            token ids.
        min_len: Minimum length of the returned token tensor; shorter
            sequences are right-padded with id 0. The default of 6 is the
            value that was previously hard-coded in ``__getitem__``.
    """

    def __init__(self, data, tokenizer, min_len=6):
        self.data = data
        self.tokenizer = tokenizer
        self.min_len = min_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"tokens": LongTensor, "label": int}`` for row ``idx``.

        The label is 1 when the row's ``"label"`` equals ``"good"`` and 0
        otherwise.
        """
        row = self.data.iloc[idx]
        # Cast before padding so torch.cat never has to combine a non-long
        # tensor (e.g. an empty float tensor) with the long padding.
        tokens = self.tokenizer.tokenize(row["URL"]).to(torch.long)
        shortfall = self.min_len - len(tokens)
        if shortfall > 0:
            tokens = torch.cat([tokens, tokens.new_zeros(shortfall)])
        return {"tokens": tokens, "label": 1 if row["label"] == "good" else 0}


def load_data(df_train, df_valid, tokenizer):
    """Build the training and validation DataLoaders and return the vocab.

    Both DataFrames are wrapped in ``URLDataset`` and materialized once into
    lists of ``(token_tensor, label_tensor)`` pairs. Batching uses the
    module-level ``batch_size`` and ``collate_fn``; only the training loader
    is shuffled.

    Args:
        df_train: Training DataFrame with ``"URL"`` and ``"label"`` columns.
        df_valid: Validation DataFrame with the same columns.
        tokenizer: Tokenizer passed to ``URLDataset``. When it exposes a
            ``wv`` attribute, its ``key_to_index`` is returned as the vocab.

    Returns:
        Tuple ``(train_iterator, valid_iterator, vocab)``.
    """

    def to_pairs(dataset):
        # Single pass: each URL is tokenized once (the previous code walked
        # every dataset twice, once for tokens and once for labels).
        pairs = []
        for index in range(len(dataset)):
            item = dataset[index]
            # clone().detach() copies the tensor without the warning that
            # torch.tensor(existing_tensor) emits.
            pairs.append((item["tokens"].clone().detach(), torch.tensor(item["label"])))
        return pairs

    # Reuse the embeddings the tokenizer already indexes into, rather than
    # training a second Word2Vec model only to read its vocabulary.
    keyed_vectors = getattr(tokenizer, "wv", None)
    if keyed_vectors is None:
        keyed_vectors = build_word2vec(df_train).wv
    vocab = keyed_vectors.key_to_index

    train_data = to_pairs(URLDataset(df_train, tokenizer))
    valid_data = to_pairs(URLDataset(df_valid, tokenizer))

    train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_iterator, valid_iterator, vocab


def collate_fn(batch):
    """Merge ``(tokens, label)`` pairs into padded batch tensors.

    Args:
        batch: Sequence of ``(token_tensor, label_tensor)`` pairs.

    Returns:
        Tuple ``(urls, labels)``: the token tensors right-padded with 0 to
        the longest sequence in the batch (batch dimension first), and the
        labels stacked into one tensor.
    """
    sequences = [seq for seq, _ in batch]
    targets = [target for _, target in batch]
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(targets)


class CNN(nn.Module):
    """Multi-kernel text CNN over pretrained token embeddings.

    One ``Conv2d -> BatchNorm2d -> ReLU`` branch is built per entry of
    ``filter_sizes``. Each branch is max-pooled over the sequence axis, the
    pooled features are concatenated, passed through dropout and projected
    by a linear layer.

    Args:
        pretrained_embeddings: 2-D float tensor ``(vocab_size, embedding_dim)``
            loaded via ``nn.Embedding.from_pretrained`` with its default
            settings (weights are frozen by default).
        n_filters: Number of output channels of every convolution.
        filter_sizes: Sequence of kernel heights (tokens covered per filter).
        output_dim: Size of the final linear layer's output.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, pretrained_embeddings, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        embedding_dim = pretrained_embeddings.shape[1]
        # Shortest sequence every convolution branch can process.
        self.min_seq_len = max(filter_sizes, default=1)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        self.convs = nn.ModuleList(
            [nn.Sequential(
                nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)),
                nn.BatchNorm2d(n_filters),
                nn.ReLU()
            ) for fs in filter_sizes]
        )
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer tensor of token ids with shape ``(batch, seq_len)``.
                Sequences shorter than the largest kernel are right-padded
                with id 0, the same padding value the data pipeline uses, so
                the convolutions never see an input smaller than the kernel.
        """
        shortfall = self.min_seq_len - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, shortfall), value=0)
        # (batch, seq) -> (batch, 1, seq, emb): a single input channel.
        x = self.embedding(x.unsqueeze(1))
        # Each branch already ends in ReLU; width collapses to 1 because the
        # kernel spans the full embedding dimension.
        x = [conv(x).squeeze(3) for conv in self.convs]
        # Global max over the sequence axis for every filter.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits


def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: Tensor of raw logits; a sigmoid is applied and the result is
            rounded with ``torch.round`` to obtain 0/1 predictions.
        y: Tensor of target values compared element-wise to the predictions.

    Returns:
        A scalar tensor: number of matches divided by the size of the first
        dimension.
    """
    hard_predictions = torch.sigmoid(preds).round()
    hits = hard_predictions.eq(y).float()
    return hits.sum() / len(hits)


def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over ``iterator``.

    Tensors are moved to the module-level ``device`` before the forward
    pass. For every batch the gradients are reset, the loss is
    back-propagated and the optimizer takes one step.

    Args:
        model: Network returning logits with a trailing dimension of size 1.
        iterator: Iterable of ``(tokens, labels)`` batches supporting ``len``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, float_labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for tokens, labels in iterator:
        optimizer.zero_grad()

        targets = labels.to(device)
        logits = model(tokens.to(device)).squeeze(1)
        loss = criterion(logits, targets.float())
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def evaluate(model, iterator, criterion):
    """Compute mean loss and accuracy over ``iterator`` without training.

    The model is put in eval mode and gradients are disabled. Tensors are
    moved to the module-level ``device`` before the forward pass.

    Args:
        model: Network returning logits with a trailing dimension of size 1.
        iterator: Iterable of ``(tokens, labels)`` batches supporting ``len``.
        criterion: Loss called as ``criterion(logits, float_labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for tokens, labels in iterator:
            targets = labels.to(device)
            logits = model(tokens.to(device)).squeeze(1)

            loss_total += criterion(logits, targets.float()).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def epoch_time(start_time, end_time):
    """Split the duration between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; both parts are truncated by
        ``int``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds



# ---------------------------------------------------------------------------
# Training script: configuration, data preparation, model setup, training
# loop and checkpointing. Runs at import time.
# ---------------------------------------------------------------------------

# Run configuration. `batch_size` and `device` are read as globals by
# load_data, train, evaluate, plot_roc_curve and evaluate_final.
epochs = 50
batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load the CSV files and train the embedding model on the training URLs.
df_train, df_valid = preprocess_data()
wv = build_word2vec(df_train)


# The tokenizer splits on the delimiters seen in the training URLs and maps
# tokens to ids through the Word2Vec vocabulary (`wv.wv`).
special_chars = create_special_chars_set(df_train)
tokenizer = Tokenizer(wv.wv, special_chars)
train_iterator, valid_iterator, vocab = load_data(df_train, df_valid, tokenizer)

# Model hyperparameters. The embedding matrix is taken from the trained
# Word2Vec vectors; CNN builds one convolution branch per filter size.
pretrained_embeddings = torch.FloatTensor(wv.wv.vectors)
n_filters = 50
filter_sizes = [2, 3, 4, 5, 6]
output_dim = 1
dropout = 0.3

model = CNN(pretrained_embeddings, n_filters, filter_sizes, output_dim, dropout).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)

# The model emits raw logits (binary_accuracy applies the sigmoid itself),
# hence the logits-based binary cross-entropy loss.
criterion = nn.BCEWithLogitsLoss()

# Per-epoch history, plotted at the end of the script.
train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []

print('start');
for epoch in range(epochs):
    start_time = time.time()

    # One optimization pass followed by a validation pass.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracies.append(train_acc)
    valid_accuracies.append(valid_acc)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

    print(
        f'Epoch: {epoch + 1}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc:.3f}')

print('end');


# Persist only the weights (state_dict); reloading requires rebuilding CNN
# with the same constructor arguments first.
model_save_path = "url_detection_model.pth"
torch.save(model.state_dict(), model_save_path)

def plot_roc_curve(model, iterator):
    """Plot the ROC curve of ``model`` over the batches of ``iterator``.

    The model is evaluated without gradients on the module-level ``device``;
    sigmoid probabilities are collected as scores, and the curve with its
    area is drawn with matplotlib next to the chance diagonal.

    Args:
        model: Network returning logits with a trailing dimension of size 1.
        iterator: Iterable of ``(tokens, labels)`` batches.
    """
    y_true, y_score = [], []

    model.eval()
    with torch.no_grad():
        for tokens, labels in iterator:
            logits = model(tokens.to(device)).squeeze(1)
            y_true.extend(labels.cpu().tolist())
            y_score.extend(torch.sigmoid(logits).cpu().tolist())

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()


# Draw the ROC curve of the trained model on the validation batches.
plot_roc_curve(model, valid_iterator)


def evaluate_final(model, iterator, criterion):
    """Print a scikit-learn classification report for ``model``.

    The model is evaluated without gradients on the module-level ``device``.
    Sigmoid probabilities are rounded with ``torch.round`` to obtain the
    predicted class of every sample.

    Args:
        model: Network returning logits with a trailing dimension of size 1.
        iterator: Iterable of ``(tokens, labels)`` batches.
        criterion: Unused; kept so existing callers keep working.

    Returns:
        None. The report is written to standard output.
    """
    y_true = []
    y_pred = []

    model.eval()
    with torch.no_grad():
        for tokens, labels in iterator:
            logits = model(tokens.to(device)).squeeze(1)
            preds = torch.round(torch.sigmoid(logits))
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    print(classification_report(y_true, y_pred))


# Print per-class precision/recall/F1 on the validation set.
evaluate_final(model, valid_iterator, criterion)

# Learning curves: loss on the left, accuracy on the right, one point per
# epoch from the history lists filled in the training loop.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(valid_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(valid_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


# loaded_model = CNN(pretrained_embeddings, n_filters, filter_sizes, output_dim, dropout).to(device)
# loaded_model.load_state_dict(torch.load(model_save_path))
# loaded_model.eval()