In [1]:
import os

# Folder path
train_folder = r"C:\Users\ASAD SHAH\Desktop\NLP-Assignment-22i-0597\BPE_dataset_Training\dataset"
test_folder = r"C:\Users\ASAD SHAH\Desktop\NLP-Assignment-22i-0597\BPE_dataset_Testing"

def loaded_text(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"): 
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data.extend(f.readlines())  # Read each line
            except UnicodeDecodeError:
                with open(file_path, "r", encoding="ISO-8859-1") as f: 
                    data.extend(f.readlines())
    return data

# Load training and testing datasets
train_data = loaded_text(train_folder)
test_data = loaded_text(test_folder)

print(" Training & Testing Files Loaded Successfully!")
print(f"Total Training Sentences: {len(train_data)}")
print(f"Total Testing Sentences: {len(test_data)}")


print("\n Sample Training Data:", train_data[:3])
print("\n Sample Testing Data:", test_data[:3])


 Training & Testing Files Loaded Successfully!
Total Training Sentences: 1018
Total Testing Sentences: 247

 Sample Training Data: ['1. subha 5 bjhey uthna perha trip thaa jaldi jaldi ready hua aur 0540 ghar saay nikal gaay\n', '2. 0615 bus chal perhee joo kaay first time thaa kaay trip time saay chala\n', '3. nust kaay 3 larkay thy unsaay batain kee phir bluetooth trip coordinator kaay pass thaa tou humm shoor daltay rahay kaay song change krr dain\n']

 Sample Testing Data: ['1. Subah utha, brush kiya aur naha dho kar nashta kiya.\n', '2. Ghar ke kaam kiye aur sabzi lene bazar gaya.\n', '3. Sabzi lekar aya aur khane ki tayari mein madad ki.\n']


In [3]:
import re

# preprocess text
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    return text

train_data_cleaned = [preprocess_text(sentence) for sentence in train_data if sentence.strip()]
test_data_cleaned = [preprocess_text(sentence) for sentence in test_data if sentence.strip()]


print(" Text Preprocessing Completed!")
print("\n Sample Cleaned Training Data:", train_data_cleaned[:3])
print("\n Sample Cleaned Testing Data:", test_data_cleaned[:3])


 Text Preprocessing Completed!

 Sample Cleaned Training Data: ['1 subha 5 bjhey uthna perha trip thaa jaldi jaldi ready hua aur 0540 ghar saay nikal gaay', '2 0615 bus chal perhee joo kaay first time thaa kaay trip time saay chala', '3 nust kaay 3 larkay thy unsaay batain kee phir bluetooth trip coordinator kaay pass thaa tou humm shoor daltay rahay kaay song change krr dain']

 Sample Cleaned Testing Data: ['1 subah utha brush kiya aur naha dho kar nashta kiya', '2 ghar ke kaam kiye aur sabzi lene bazar gaya', '3 sabzi lekar aya aur khane ki tayari mein madad ki']


In [7]:
import pickle

# Load the BPE model from pickle file
with open("bpe_model.pkl", "rb") as f:
    bpe_model = pickle.load(f)

# Extract vocabulary and merges
vocab = bpe_model["vocab"]
merges = bpe_model["merges"]

print(" BPE Model Loaded Successfully!")
print(f"Total Vocabulary Size: {len(vocab)}")
print(f"Total Merge Rules: {len(merges)}")
print("📌 First 20 Merge Rules:", merges[:20])  # Show first 20 merges
print("📌 Total Merge Rules:", len(merges))


✅ BPE Model Loaded Successfully!
Total Vocabulary Size: 2619
Total Merge Rules: 2077
📌 First 20 Merge Rules: [('h', 'a'), ('a', 'y'), ('k', 'a'), ('h', 'i'), ('k', 'i'), ('b', 'a'), ('i', 'n'), ('u', 'r'), ('m', 'a'), ('n', 'a'), ('m', 'e'), ('h', 'o'), ('a', 'r'), ('l', 'a'), ('k', 'e'), ('a', 'ur'), ('n', 'e'), ('y', 'a'), ('a', 'a'), ('s', 'e')]
📌 Total Merge Rules: 2077


In [61]:
import pickle
from collections import Counter

def get_vocab(data):
    vocab = Counter()
    for sentence in data:
        words = sentence.split()
        for word in words:
            vocab[" ".join(word)] += 1 
    return vocab

vocab = get_vocab(train_data_cleaned)

# Set vocabulary size limit
vocab_size = 1000  
merges = [] 


while len(vocab) > vocab_size:  
    pairs = Counter()

    # Count all adjacent character pairs
    for word in vocab:
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += vocab[word]

    if not pairs:
        print(f"⚠ No more pairs to merge. Current vocab size: {len(vocab)}")
        break  


    most_common_pair = max(pairs, key=pairs.get)
    merges.append(most_common_pair)

    # Merge the most common pair in vocabulary
    new_vocab = {}
    for word, freq in vocab.items():
        new_word = word.replace(" ".join(most_common_pair), "".join(most_common_pair))
        new_vocab[new_word] = freq
    vocab = new_vocab  # Update vocabulary

#  **Fix: Ensure Vocabulary Size is Exactly 1000**
if len(vocab) > vocab_size:
    vocab = dict(list(vocab.items())[:vocab_size])  # Keep only 1000 subwords

final_vocab = list(vocab.keys())  # Store only subwords

# Assign unique IDs to each subword
token_to_id = {token: idx for idx, token in enumerate(final_vocab, start=1)}  # Unique IDs
id_to_token = {idx: token for token, idx in token_to_id.items()}  

# Save Updated BPE Model (Final Vocabulary + Merges + ID Mappings)
bpe_model = {
    "vocab": final_vocab,  
    "merges": merges, 
    "token_to_id": token_to_id,  
    "id_to_token": id_to_token
}

with open("bpe_model.pkl", "wb") as f:
    pickle.dump(bpe_model, f)

print("\n BPE Model Trained & Saved Successfully!")
print(f"Final Vocabulary Size: {len(final_vocab)} (Target: 1000)")
print(f"Total Merge Rules: {len(merges)}")
print(f"Total Tokens with IDs: {len(token_to_id)}")


⚠ No more pairs to merge. Current vocab size: 2619

✅ BPE Model Trained & Saved Successfully!
Final Vocabulary Size: 1000 (Target: 1000)
Total Merge Rules: 2077
Total Tokens with IDs: 1000


In [75]:
import pickle

# Load the trained BPE model
with open("bpe_model.pkl", "rb") as f:
    bpe_model = pickle.load(f)

vocab = bpe_model["vocab"]  
merges = bpe_model["merges"]  
token_to_id = bpe_model["token_to_id"]  
id_to_token = bpe_model["id_to_token"]  

print("\n BPE Model Loaded Successfully!")
print(f"Total Tokens: {len(token_to_id)} (Target: 1000)")
print(f"Total Merge Rules: {len(merges)}")
# Print first 50 token-to-ID mappings to check correctness
print("\n Sample Token-to-ID Mapping:")
for i, (token, token_id) in enumerate(token_to_id.items()):
    print(f"{token} → {token_id}")
    if i == 49:  # Show only 50 samples
        break




 BPE Model Loaded Successfully!
Total Tokens: 1000 (Target: 1000)
Total Merge Rules: 2077

 Sample Token-to-ID Mapping:
1 → 1
subha → 2
5 → 3
bjhey → 4
uthna → 5
perha → 6
trip → 7
thaa → 8
jaldi → 9
ready → 10
hua → 11
aur → 12
0540 → 13
ghar → 14
saay → 15
nikal → 16
gaay → 17
2 → 18
0615 → 19
bus → 20
chal → 21
perhee → 22
joo → 23
kaay → 24
first → 25
time → 26
chala → 27
3 → 28
nust → 29
larkay → 30
thy → 31
unsaay → 32
batain → 33
kee → 34
phir → 35
bluetooth → 36
coordinator → 37
pass → 38
tou → 39
humm → 40
shoor → 41
daltay → 42
rahay → 43
song → 44
change → 45
krr → 46
dain → 47
4 → 48
murree → 49
mein → 50


In [73]:
def encode(text, merges, token_to_id):
    words = text.split()
    encoded_ids = []

    for word in words:
        word_tokens = list(word)  

        while True:
            pairs = [(word_tokens[i], word_tokens[i+1]) for i in range(len(word_tokens)-1)]
            merge_found = None

            for pair in pairs:
                if pair in merges:
                    merge_found = pair
                    break

            if not merge_found:
                break  # Stop if no more merges apply

            # Apply the merge everywhere in the word
            new_word_tokens = []
            i = 0
            while i < len(word_tokens):
                if i < len(word_tokens) - 1 and (word_tokens[i], word_tokens[i+1]) == merge_found:
                    new_word_tokens.append("".join(merge_found))  # Merge pair
                    i += 2  # Skip merged pair
                else:
                    new_word_tokens.append(word_tokens[i])
                    i += 1
            word_tokens = new_word_tokens

        # Convert subwords to token IDs
        word_ids = [token_to_id.get(token, 0) for token in word_tokens]  # 0 for unknown tokens
        encoded_ids.extend(word_ids)

    return encoded_ids

# Encode testing dataset
encoded_test_data = [encode(sentence, merges, token_to_id) for sentence in test_data]

print("\n Testing Data Encoded Successfully!")
print(" Sample Encoded Testing Data:", encoded_test_data[:3])  # Print first 3 encoded samples



 Testing Data Encoded Successfully!
 Sample Encoded Testing Data: [[1, 0, 0, 0, 0, 0, 187, 407, 0, 930, 0, 0, 0, 407, 0, 0, 0, 407, 0, 128, 0, 508, 0, 0, 0, 407, 0], [18, 0, 0, 117, 0, 365, 128, 656, 0, 0, 0, 0, 372, 0, 492, 179, 0, 0, 0, 809, 0], [28, 0, 0, 534, 0, 492, 128, 0, 0, 407, 0, 0, 0, 0, 0, 148, 0, 809, 0, 0, 0, 686, 0, 0, 148, 0]]


In [71]:
def decode(encoded_ids, id_to_token):
    decoded_tokens = [id_to_token.get(token_id, "<UNK>") for token_id in encoded_ids]
    return " ".join(decoded_tokens)

# Test decoding
decoded_sentence = decode(encoded_sentence, id_to_token)

print("\n Decoded Sentence:", decoded_sentence)



 Decoded Sentence: <UNK> a <UNK> uth a b <UNK> <UNK> <UNK> a


In [69]:
def encode(text, merges, token_to_id):
    words = text.split()
    encoded_ids = []

    for word in words:
        word_tokens = list(word)  # Start with individual characters

        while True:
            pairs = [(word_tokens[i], word_tokens[i+1]) for i in range(len(word_tokens)-1)]
            merge_found = None

            for pair in pairs:
                if pair in merges:
                    merge_found = pair
                    break

            if not merge_found:
                break  # Stop if no more merges apply

            # Apply the merge everywhere in the word
            new_word_tokens = []
            i = 0
            while i < len(word_tokens):
                if i < len(word_tokens) - 1 and (word_tokens[i], word_tokens[i+1]) == merge_found:
                    new_word_tokens.append("".join(merge_found))  # Merge pair
                    i += 2  # Skip merged pair
                else:
                    new_word_tokens.append(word_tokens[i])
                    i += 1
            word_tokens = new_word_tokens

        # Convert subwords to token IDs
        word_ids = [token_to_id.get(token, 0) for token in word_tokens]  # 0 for unknown tokens
        encoded_ids.extend(word_ids)

    return encoded_ids

encoded_train_data = [encode(sentence, merges, token_to_id) for sentence in train_data]
encoded_test_data = [encode(sentence, merges, token_to_id) for sentence in test_data]

print("\n Training & Testing Data Encoded Successfully!")
print(" Sample Encoded Training Data:", encoded_train_data[:3]) 
print(" Sample Encoded Testing Data:", encoded_test_data[:3]) 



 Training & Testing Data Encoded Successfully!
 Sample Encoded Training Data: [[1, 0, 361, 407, 3, 4, 187, 508, 114, 555, 7, 0, 88, 9, 9, 10, 11, 0, 0, 13, 0, 0, 286, 0, 0, 0, 0, 0], [18, 0, 19, 0, 0, 0, 0, 114, 548, 0, 344, 222, 128, 0, 0, 0, 0, 26, 0, 88, 128, 0, 7, 26, 286, 0, 0, 0, 407], [28, 0, 0, 0, 128, 0, 28, 0, 0, 128, 0, 0, 0, 879, 286, 0, 0, 0, 0, 34, 0, 0, 0, 0, 0, 0, 7, 0, 190, 461, 508, 0, 190, 128, 0, 711, 0, 0, 0, 88, 39, 40, 0, 0, 0, 0, 0, 0, 0, 264, 0, 128, 0, 360, 0, 0, 0, 0, 0, 46, 0, 0]]
 Sample Encoded Testing Data: [[1, 0, 0, 0, 0, 0, 187, 407, 0, 930, 0, 0, 0, 407, 0, 0, 0, 407, 0, 128, 0, 508, 0, 0, 0, 407, 0], [18, 0, 0, 117, 0, 365, 128, 656, 0, 0, 0, 0, 372, 0, 492, 179, 0, 0, 0, 809, 0], [28, 0, 0, 534, 0, 492, 128, 0, 0, 407, 0, 0, 0, 0, 0, 148, 0, 809, 0, 0, 0, 686, 0, 0, 148, 0]]


In [67]:
def decode(encoded_ids, id_to_token):
    decoded_tokens = [id_to_token.get(token_id, "<UNK>") for token_id in encoded_ids]
    return " ".join(decoded_tokens)  # Convert tokens back to text
    
# Decode the encoded test dataset
decoded_test_data = [decode(sentence, id_to_token) for sentence in encoded_test_data]

print("\n Testing Data Decoded Successfully!")
print(" Sample Decoded Testing Data:", decoded_test_data[:3])  # Print first 3 decoded samples




 Testing Data Decoded Successfully!
 Sample Decoded Testing Data: ['1 <UNK> <UNK> <UNK> <UNK> <UNK> uth a <UNK> b <UNK> <UNK> <UNK> a <UNK> <UNK> <UNK> a <UNK> ka <UNK> na <UNK> <UNK> <UNK> a <UNK>', '2 <UNK> <UNK> ha <UNK> ke ka am <UNK> <UNK> <UNK> <UNK> sab <UNK> le ne <UNK> <UNK> <UNK> ya <UNK>', '3 <UNK> <UNK> ab <UNK> le ka <UNK> <UNK> a <UNK> <UNK> <UNK> <UNK> <UNK> ki <UNK> ya <UNK> <UNK> <UNK> ma <UNK> <UNK> ki <UNK>']


In [65]:

def get_original_vocab_size(data):
    vocab = set()
    for sentence in data:
        words = sentence.split()
        vocab.update(words)  
    return len(vocab)

original_vocab_size = get_original_vocab_size(train_data_cleaned)


bpe_vocab_size = len(token_to_id)  

# Calculate reduction percentage
vocab_reduction = ((original_vocab_size - bpe_vocab_size) / original_vocab_size) * 100

print("\n **Vocabulary Reduction Evaluation**")
print(f" Original Vocabulary Size (before BPE): {original_vocab_size}")
print(f" Reduced Vocabulary Size (after BPE): {bpe_vocab_size}")
print(f" Reduction Percentage: {vocab_reduction:.2f}%")



 **Vocabulary Reduction Evaluation**
 Original Vocabulary Size (before BPE): 2619
 Reduced Vocabulary Size (after BPE): 1000
 Reduction Percentage: 61.82%
