# Project Information

### Project Description

In this notebook, we train and test a **tokenizer** on **5 samples** of scraped data. The tokenizer is trained to process raw text data, and we evaluate its performance by testing it on these 5 samples.

### Team Involved:
- **Training**: The training was done by **Vinayak Rana** and **Kaloori Shiva Prasad**.
- **Testing**: The testing was done by **Abhyudaya Nair**.

This exercise demonstrates the effectiveness of the tokenizer in handling real-world, unstructured data and how it can be fine-tuned for specific tasks.

---


In [15]:
import os
from tqdm import tqdm
import pandas as pd
import argparse
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast
import argparse
import datetime
import pandas as pd
import re
import numpy as np

In [2]:
!pip install gdown
!pip install tqdm

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: PySocks, gdown
Successfully installed PySocks-1.7.1 gdown-5.2.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# import gdown
# gdown.download_folder('https://drive.google.com/drive/folders/1BvQwl3_9fUOTHr5oqN3XidU5CNIgPKqs?usp=sharing')

In [4]:
def train_tokenizer(data_list, vocab_size=32768, model_name="tokenizer_sample2",
                    min_frequency=5, output_dir="Drive/MyDrive/NLP LLM"):
    """Train a SentencePiece-BPE tokenizer and save it in Hugging Face format.

    Parameters
    ----------
    data_list : iterable of str
        Training texts, handed to ``train_from_iterator``.
    vocab_size : int, default 32768
        Vocabulary size requested from the trainer.
    model_name : str, default "tokenizer_sample2"
        Name of the sub-directory (inside ``output_dir``) the tokenizer is saved to.
    min_frequency : int, default 5
        ``min_frequency`` value forwarded to the trainer.
    output_dir : str, default "Drive/MyDrive/NLP LLM"
        Parent directory of the saved tokenizer.

    Returns
    -------
    PreTrainedTokenizerFast
        The wrapped tokenizer, which has also been written to
        ``{output_dir}/{model_name}``.
    """
    ## Change bos & eos
    bos_tok = "<sos>"
    eos_tok = "<end_of_sen>"
    mask_tok = "<mask>"

    ## Add basic characters to this below list, including numbers & special language characters.
    special_char = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

    # "<mask>" is reserved at training time so that it lives inside the
    # requested vocab_size. If it were only declared on the wrapper below, it
    # would be appended as an extra token beyond vocab_size. It is placed last
    # so the ids of the other special tokens stay where they were.
    special_tokens = (
        ["<pad>", "<unk>", bos_tok, eos_tok, "<user>", "<assistant>"]
        + special_char
        + [mask_tok]
    )

    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        data_list,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=special_tokens,
        show_progress=True,
    )

    ## Don't forget to add special tokens.
    transformer_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token=bos_tok,
        eos_token=eos_tok,
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token=mask_tok,
        padding_side="left",
        truncation_side="right",
        additional_special_tokens=["<user>", "<assistant>"],
        clean_up_tokenization_spaces=False,
    )

    transformer_tokenizer.save_pretrained(f'{output_dir}/{model_name}')

    # Returned so the caller can use the tokenizer without reloading it from disk.
    return transformer_tokenizer

In [8]:
# data_list = []

# base_dir = 'Pdf text'

# for file in os.listdir(base_dir):

#     f_path = os.path.join(base_dir,file)

#     with open(f_path,'r',encoding='utf8') as f:

#         text = f.read()

#     data_list.append(text)

In [10]:
import pandas as pd

# We took 5 samples of different sizes from our collected data, saved each
# sample as a CSV file and uploaded it to Google Drive. Here we load one of
# those samples (downloaded with gdown) to train the tokenizer.

df_text = pd.read_csv('sample_2.csv')

# Training is deferred until the text has been cleaned (see the
# `train_tokenizer(...)` call further down). Calling `train_tokenizer()` here
# without its required `data_list` argument would raise a TypeError.

In [3]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=09913264228e79ee4121e74f1db751965ed254bb33e0fa0588795696acd76b14
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


#### We also tried cleaning methods such as removing non-English words, but this took a lot of time, so we ended up not using it. Since we only collected data from English sources, there is very little chance of any word being in another language.

In [None]:
# from langdetect import detect

# def clean_text(text):

#     if not isinstance(text, str):

#         return text  # If text is not a string, return it as-is



#     # Remove email addresses

#     text = re.sub(r'\S+@\S+\.\S+', ' ', text)



#     # Remove URLs

#     text = re.sub(r'http\S+|www\S+', ' ', text)

#     # Remove numbers

#     text = re.sub(r'\d+', '', text)

#     # Remove non-ASCII characters except punctuations

#     text = re.sub(r'[^\w\s.,!?]', ' ', text)



#     # Replace multiple spaces with a single space

#     text = re.sub(r'\s+', ' ', text)


#     # Function to check if the word is English

#     def is_english(word):

#         try:

#             # Detect the language of the word

#             return detect(word) == 'en'

#         except:

#             return False  # In case language detection fails


#     # Split text into words and filter out non-English words

#     words = text.split()

#     cleaned_words = [word for word in words if is_english(word)]

#     # Rejoin the words back into a single string

#     cleaned_text = " ".join(cleaned_words)

#     return cleaned_text.strip()  # Remove leading and trailing spaces


# df_text['cleaned_text'] = df_text['Content'].apply(clean_text)


In [5]:
# Define the cleaning function
def clean_text(text):
    """Strip e-mail addresses, URLs, digits and non-ASCII characters from ``text``.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (for example the NaN that pandas uses
        for empty CSV cells) are returned unchanged, so a later ``dropna()``
        can discard them instead of ``re.sub`` raising ``TypeError``.

    Returns
    -------
    str or the original object
        The cleaned text with whitespace collapsed to single spaces and
        leading/trailing whitespace removed, or ``text`` itself when it is not
        a string.
    """
    if not isinstance(text, str):
        return text

    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove every non-ASCII character (this covers emojis, but also accented
    # letters and typographic quotes)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove extra spaces, tabs, and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [None]:
# Run clean_text on every entry of the 'Content' column and keep the result in
# a separate 'cleaned_text' column, leaving the raw text untouched.
df_text['cleaned_text'] = df_text['Content'].apply(clean_text)

In [None]:
# Build the cleaned frame from df_text: drop rows containing missing values and
# renumber the index. (df_text_cleaned does not exist before this line, so it
# cannot be its own source.)
df_text_cleaned = df_text.dropna().reset_index(drop=True)

In [15]:
# Train on the cleaned texts; vocab_size and model_name are left at the
# defaults declared by train_tokenizer.
train_tokenizer(df_text_cleaned['cleaned_text'].to_list())






In [6]:
### Testing Training Tokenizer

from transformers import AutoTokenizer
import gdown



In [7]:
# Download the CSV used for testing from Google Drive; the call's return value
# (shown as the cell output) is the name of the downloaded file.
gdown.download('https://drive.google.com/uc?id=1LOOSEwIUedmAj25kGsJAIntQ81haWvaH')

Downloading...
From (original): https://drive.google.com/uc?id=1LOOSEwIUedmAj25kGsJAIntQ81haWvaH
From (redirected): https://drive.google.com/uc?id=1LOOSEwIUedmAj25kGsJAIntQ81haWvaH&confirm=t&uuid=1427c0b3-8ee8-4ace-a060-4bf643103d3a
To: /kaggle/working/propublicadotorg.csv
100%|██████████| 197M/197M [00:01<00:00, 124MB/s]  


'propublicadotorg.csv'

In [8]:
# Load the downloaded test file into a DataFrame.
test_df = pd.read_csv('propublicadotorg.csv')

In [9]:
# Rename the columns positionally.
# NOTE(review): this assumes the CSV has exactly two columns, in the order
# (file name, text) — confirm against the file.
test_df.columns=['filename','content']

In [10]:
# Clean the test texts with the same clean_text function that is applied to the
# training data. The 'content' column is overwritten in place.
test_df['content'] = test_df['content'].apply(clean_text)

In [11]:
# Fetch every trained tokenizer folder from Google Drive so they can be tested

url1 = "https://drive.google.com/drive/folders/1-Ii5eFhtB1TxT28_apSViIz4UcpJ20sT?usp=share_link"
url2 = "https://drive.google.com/drive/folders/1Un-XZ6vUY85P7N8ZdP3m6wellj8DvSpX?usp=drive_link"
url3 = "https://drive.google.com/drive/folders/1CxNtWVa_kbwVFfrPlG8gPA7cbhxSkBB_?usp=drive_link"
url4 = "https://drive.google.com/drive/folders/1-2ObI4bRChu5ZVn3Fh_yKNM2a-QVFzV7?usp=drive_link"
url5 = "https://drive.google.com/drive/folders/1-EOo1G4hZ1enpaVP2vIviwV5x_GhzCl5?usp=drive_link"

# Download the folders one after another, in the order url1 .. url5.
downloaded = [
    gdown.download_folder(folder_url, quiet=False)
    for folder_url in (url1, url2, url3, url4, url5)
]

# Keep the result of the last download as the cell's displayed value.
downloaded[-1]

Retrieving folder contents


Processing file 1-QsNQfVWC2qqNErfn7jetYwUd--A4_rp special_tokens_map.json
Processing file 1-YH-b28bEq_-Wl7dvvJr96DMlGXW0A9P tokenizer_config.json
Processing file 1-PVwVpAmP3SlTl0ub_s5Lap2ZdCBz-D3 tokenizer.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-QsNQfVWC2qqNErfn7jetYwUd--A4_rp
To: /kaggle/working/tokenizer_sample1/special_tokens_map.json
100%|██████████| 202/202 [00:00<00:00, 895kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-YH-b28bEq_-Wl7dvvJr96DMlGXW0A9P
To: /kaggle/working/tokenizer_sample1/tokenizer_config.json
100%|██████████| 3.26k/3.26k [00:00<00:00, 6.81MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-PVwVpAmP3SlTl0ub_s5Lap2ZdCBz-D3
To: /kaggle/working/tokenizer_sample1/tokenizer.json
100%|██████████| 1.42M/1.42M [00:00<00:00, 135MB/s]
Download completed
Retrieving folder contents


Processing file 1yGsDkm9p5W9gRBbJCK10TNDtlSYqm-fP special_tokens_map.json
Processing file 1ae5KiEkXChWdEjusKd-hH0_oAKtNJOYl tokenizer_config.json
Processing file 1FPARh0-vvnMLPmPTh82bjYCT2_zFSvzY tokenizer.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1yGsDkm9p5W9gRBbJCK10TNDtlSYqm-fP
To: /kaggle/working/tokenizer_sample2/special_tokens_map.json
100%|██████████| 202/202 [00:00<00:00, 625kB/s]
Downloading...
From: https://drive.google.com/uc?id=1ae5KiEkXChWdEjusKd-hH0_oAKtNJOYl
To: /kaggle/working/tokenizer_sample2/tokenizer_config.json
100%|██████████| 3.26k/3.26k [00:00<00:00, 6.44MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FPARh0-vvnMLPmPTh82bjYCT2_zFSvzY
To: /kaggle/working/tokenizer_sample2/tokenizer.json
100%|██████████| 2.38M/2.38M [00:00<00:00, 193MB/s]
Download completed
Retrieving folder contents


Processing file 1-B_4pu0xv5NdetiWVlcSGNgLkNmobWml special_tokens_map.json
Processing file 1-Byt5cNtUDuneHYu01xGFG4Nunw3utF- tokenizer_config.json
Processing file 1-929m6VH1INUNk8FCq6dlKfJ3CdQT5PS tokenizer.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-B_4pu0xv5NdetiWVlcSGNgLkNmobWml
To: /kaggle/working/tokenizer_sample/special_tokens_map.json
100%|██████████| 202/202 [00:00<00:00, 709kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-Byt5cNtUDuneHYu01xGFG4Nunw3utF-
To: /kaggle/working/tokenizer_sample/tokenizer_config.json
100%|██████████| 3.26k/3.26k [00:00<00:00, 6.52MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-929m6VH1INUNk8FCq6dlKfJ3CdQT5PS
To: /kaggle/working/tokenizer_sample/tokenizer.json
100%|██████████| 1.48M/1.48M [00:00<00:00, 142MB/s]
Download completed
Retrieving folder contents


Processing file 1-6u1fvIN01CxwiNpSInuQd3EbMe76J30 special_tokens_map.json
Processing file 1-75holrFY-RwhOkjR6qM0tuoO5Q0Yce2 tokenizer_config.json
Processing file 1-44ejtkVRIdDG1FNZTNqaBSKTllGbmD1 tokenizer.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-6u1fvIN01CxwiNpSInuQd3EbMe76J30
To: /kaggle/working/tokenizer_sample5/special_tokens_map.json
100%|██████████| 195/195 [00:00<00:00, 876kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-75holrFY-RwhOkjR6qM0tuoO5Q0Yce2
To: /kaggle/working/tokenizer_sample5/tokenizer_config.json
100%|██████████| 3.24k/3.24k [00:00<00:00, 8.06MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-44ejtkVRIdDG1FNZTNqaBSKTllGbmD1
To: /kaggle/working/tokenizer_sample5/tokenizer.json
100%|██████████| 1.43M/1.43M [00:00<00:00, 142MB/s]
Download completed
Retrieving folder contents


Processing file 1-O3MdvMI5rfkP3blpEp22-3m0DNTcY8Y special_tokens_map.json
Processing file 1-R4w_CGBTxJf38gOnPpSvh2VTD1nGkXF tokenizer_config.json
Processing file 1-K75ZAV-7G4Gs6JV75FlF1zZWWyFLMY9 tokenizer.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-O3MdvMI5rfkP3blpEp22-3m0DNTcY8Y
To: /kaggle/working/tokenizer_sample4/special_tokens_map.json
100%|██████████| 195/195 [00:00<00:00, 953kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-R4w_CGBTxJf38gOnPpSvh2VTD1nGkXF
To: /kaggle/working/tokenizer_sample4/tokenizer_config.json
100%|██████████| 3.24k/3.24k [00:00<00:00, 4.91MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-K75ZAV-7G4Gs6JV75FlF1zZWWyFLMY9
To: /kaggle/working/tokenizer_sample4/tokenizer.json
100%|██████████| 1.43M/1.43M [00:00<00:00, 135MB/s]
Download completed


['/kaggle/working/tokenizer_sample4/special_tokens_map.json',
 '/kaggle/working/tokenizer_sample4/tokenizer_config.json',
 '/kaggle/working/tokenizer_sample4/tokenizer.json']

In [12]:
# Load the five downloaded tokenizers, in order. The third one is read from a
# folder named "tokenizer_sample" (no numeric suffix).
tokenizer1, tokenizer2, tokenizer3, tokenizer4, tokenizer5 = map(
    AutoTokenizer.from_pretrained,
    (
        "/kaggle/working/tokenizer_sample1",
        "/kaggle/working/tokenizer_sample2",
        "/kaggle/working/tokenizer_sample",
        "/kaggle/working/tokenizer_sample4",
        "/kaggle/working/tokenizer_sample5",
    ),
)

In [33]:
# Number of entries in the tokenizer's vocabulary.
# NOTE(review): `tokenizer` is only assigned inside the evaluation loop further
# down, so this cell works only when run after that loop (its execution count
# is 33) — confirm which tokenizer this size refers to.
len(tokenizer.get_vocab())

32769

In [13]:
# Collects one score per tokenizer, keyed by 'tokenizer_<sample name>'.
matrix = dict()

In [16]:
# Testing all tokenizers on a sample dataset of 200MB
#
# The score stored for each tokenizer is the mean, over all test texts, of
# (number of token ids produced) / (number of space-separated words).

tokenizer_by_sample = {
    'sample1': tokenizer1,
    'sample2': tokenizer2,
    'sample': tokenizer3,
    'sample4': tokenizer4,
    'sample5': tokenizer5,
}

for sample_no, tokenizer in tokenizer_by_sample.items():
    # One tokens-per-word ratio for every text in the test set.
    f_score = [
        len(tokenizer.encode(text)) / len(text.split(" "))
        for text in test_df['content'].to_list()
    ]

    matrix[f'tokenizer_{sample_no}'] = np.mean(np.array(f_score))

In [19]:
matrix

{'tokenizer_sample1': 1.2156464655540813,
 'tokenizer_sample2': 1.1974310278241673,
 'tokenizer_sample': 1.184354252015691,
 'tokenizer_sample4': 1.1847581147241528,
 'tokenizer_sample5': 1.3070056610796612}

In [23]:
# Turn the score dictionary into a one-column table: one row per tokenizer,
# with the score in the 'value' column.
df = pd.DataFrame.from_dict(matrix, orient='index', columns=['value'])

In [25]:
# Sizes are assigned positionally, so the list must follow the row order of df.
# NOTE(review): presumably these are the sizes of the training samples in MB —
# the values are hard-coded here; confirm against the actual sample files.
df['Dataset size(MB)'] = [560,450,950,550,560]

### PERFORMANCE OF ALL TRAINED TOKENIZERS

In [26]:
df

Unnamed: 0,value,Dataset size(MB)
tokenizer_sample1,1.215646,560
tokenizer_sample2,1.197431,450
tokenizer_sample,1.184354,950
tokenizer_sample4,1.184758,550
tokenizer_sample5,1.307006,560
