# Encoding URLs

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

ROOT_DIR = os.path.dirname(os.path.abspath(""))

file = os.path.join(ROOT_DIR, "data/sitges_access.csv")
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,server_name,IP,logname,authenticate,date,petition,URL,status,bytes,referer,user-agent,level
0,0,sitgesanytime.com,47.76.35.19,-,-,2024-01-22 00:00:00+01:00,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,301,4840,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0
1,1,www.sitgesanytime.com,47.76.35.19,-,-,2024-01-22 00:00:01+01:00,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,200,5223,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0
2,2,www.sitgesanytime.com,40.77.167.53,-,-,2024-01-22 00:00:06+01:00,GET,/ca/noticias/84/sitges-obt%C3%A9-el-certificat...,404,2509,-,"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Ge...",5
3,3,sitgesanytime.com,47.76.35.19,-,-,2024-01-22 00:00:06+01:00,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,301,4840,-,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,0
4,4,www.sitgesanytime.com,47.76.35.19,-,-,2024-01-22 00:00:07+01:00,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,200,5260,-,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,0


In [2]:
counts = pd.Series([str(u).split(" ")[-1] for u in df["URL"].unique()]).value_counts()
{url: count for url, count in counts.items() if count > 1}

{'HTTP/1.1': 69449, 'HTTP/2.0': 25303, 'HTTP/1.0': 158}

In [3]:
len(df[~df["URL"].str.contains("HTTP").astype(bool)])

432

In [4]:
clean_url = df[df["URL"].str.contains("HTTP").astype(bool)]["URL"].dropna()
counts = pd.Series([str(u).split(" ")[-1] for u in clean_url]).value_counts()
{url: f"{count/len(df):.2f}%" for url, count in counts.items()}

{'HTTP/2.0': '0.82%', 'HTTP/1.1': '0.18%', 'HTTP/1.0': '0.00%'}

## Tokenization

In [55]:
corpus = clean_url.to_list()
vocab_init = set(" ".join(corpus))
print(vocab_init)
corpus[:5]

{'A', 'I', 'r', 'Q', 'F', '@', 'W', ')', '1', 'M', 'h', '+', '=', 'V', ':', '}', '>', 'K', 'G', 'y', 'R', 'v', 'q', ']', '\\', 'w', 'i', 'f', '$', 's', 'X', 'H', 'N', 'C', ';', 'o', 'm', 'p', '/', ' ', '9', 'Y', '&', 'x', 't', '(', 'a', '4', '5', 'E', '2', '_', 'S', '*', 'c', "'", 'J', '.', '-', 'P', 'D', 'g', 'O', '?', 'e', '7', 'n', '!', '0', 'd', 'j', 'T', '8', 'z', 'U', 'b', 'Z', '3', 'k', 'u', 'B', '[', 'L', '6', ',', '%', '{', 'l'}


['/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1',
 '/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1',
 '/ca/noticias/84/sitges-obt%C3%A9-el-certificat-de-turisme-responsable-biosphere.html HTTP/2.0',
 '/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1',
 '/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1']

### Byte Pair Encoding (BPE)

In [89]:
from tokenizers import ByteLevelBPETokenizer # https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py
tokenizer = ByteLevelBPETokenizer()

special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
tokenizer.train_from_iterator(corpus, vocab_size=1000, min_frequency=2, special_tokens=special_tokens)






In [84]:
tokenizer.save_model(os.path.join(ROOT_DIR, "models/"), prefix="bpe")

['/media/eric/D/repos/Synthesis-Project-1/models/bpe-vocab.json',
 '/media/eric/D/repos/Synthesis-Project-1/models/bpe-merges.txt']

In [90]:
tokenizer = ByteLevelBPETokenizer(
    vocab = os.path.join(ROOT_DIR, "models/", "bpe-vocab.json"),
    merges = os.path.join(ROOT_DIR, "models/", "bpe-merges.txt")
)

In [91]:
# print 5 longest vocab items
sorted(tokenizer.get_vocab(), key=len, reverse=True)[:5]

['oncssanimationend',
 'pictogramestaules',
 'responsivemenu',
 'cookiemanager',
 'internacional']

In [92]:
print(tokenizer.encode(clean_url.iloc[0]).tokens)
print(tokenizer.encode(clean_url.iloc[-1]).tokens)

['/', 'fr', '/', 'pag', '492', '/', 'explora', '-', 'platges', '-', 'i', '-', 'ports', '-', '2', '/', 'id', '12', '/', 'les', '-', 'an', 'qu', 'in', 'es', '.', 'htm', 'ĠHTTP', '/', '1', '.', '1']
['/', 'plantilles', '/', 'turisme', '/', 'css', '/', 'estils', '-', 'capcalera', '.', 'css', '?', 'v', '=', '3', 'ĠHTTP', '/', '2', '.', '0']


In [88]:
clean_url.iloc[0], clean_url.iloc[-1]

('/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1',
 '/plantilles/turisme/css/estils-capcalera.css?v=3 HTTP/2.0')

### Wordpiece

In [100]:
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer()

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer.train_from_iterator(corpus, vocab_size=1000, min_frequency=2, special_tokens=special_tokens)






In [101]:
tokenizer.save_model(os.path.join(ROOT_DIR, "models/"), prefix="wordpiece")

['/media/eric/D/repos/Synthesis-Project-1/models/wordpiece-vocab.txt']

In [102]:
tokenizer = BertWordPieceTokenizer(
	vocab = os.path.join(ROOT_DIR, "models/", "wordpiece-vocab.txt"),
)

In [103]:
# print 5 longest vocab items
sorted(tokenizer.get_vocab(), key=len, reverse=True)[:5]

['oncssanimationend',
 'pictogramestaules',
 'oncssanimation',
 'responsivemenu',
 '##ssanimation']

In [104]:
print(tokenizer.encode(clean_url.iloc[0]).tokens)
print(tokenizer.encode(clean_url.iloc[-1]).tokens)

['[CLS]', '/', 'fr', '/', 'pag', '##49', '##2', '/', 'explora', '-', 'platges', '-', 'i', '-', 'ports', '-', '2', '/', 'id1', '##2', '/', 'les', '-', 'an', '##qu', '##ine', '##s', '.', 'htm', 'http', '/', '1', '.', '1', '[SEP]']
['[CLS]', '/', 'plantilles', '/', 'turisme', '/', 'css', '/', 'estils', '-', 'capcalera', '.', 'css', '?', 'v', '=', '3', 'http', '/', '2', '.', '0', '[SEP]']
