<a href="https://colab.research.google.com/github/adityapri/NLP---FastText-Word-Representation/blob/main/FastTEXT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Tokenization**

In [40]:
import re
from collections import defaultdict
import random
import math

In [41]:
CORPUS_PATH = "/content/wmt-news-crawl-hi.txt"

In [42]:
regx_pattern = r"[\u0900-\u0963\u0966-\u097F]+|\d+"


token_pattern = re.compile(regx_pattern,
                           flags = re.UNICODE)

In [43]:

def tokenize(sent):
  return token_pattern.findall(sent)


def load_corpus(corpus_path):

  sentences = []

  with open(corpus_path, "r",encoding='utf-8') as file:
    for raw_line in file:
      cleaned_line = raw_line.strip()
      if cleaned_line == "":
        continue
      token_list = tokenize(cleaned_line)
      sentences.append(token_list)


  return sentences


In [44]:
 # Load Corpus
sentences = load_corpus(CORPUS_PATH)

print("Total sentences read:", len(sentences))
print("Sample tokens from first sentence:", sentences[:50][:50])

Total sentences read: 6714
Sample tokens from first sentence: [['बरेली', 'जोन', 'के', 'एडीजी', 'रमित', 'शर्मा', 'खुद', 'पूरे', 'मामले', 'की', 'निगरानी', 'कर', 'रहे', 'हैं'], ['बता', 'दें', 'कि', 'सीएल', 'गुप्ता', 'एक्सपोर्ट्स', 'की', 'गिनती', 'मुरादाबाद', 'के', 'दिग्गज', 'पीतल', 'कारोबारियों', 'में', 'होती', 'है'], ['किस', 'देवी', 'की', 'पूजा', 'से', 'क्या', 'लाभ', 'मिलते', 'हैं', 'एक', 'साल', 'में', 'कितनी', 'बार', 'आती', 'हैं', 'नवरात्रियां', 'देवी', 'पूजा', 'के', 'साथ', 'व्रत', 'उपवास', 'क्यों', 'करना', 'चाहिए'], ['थाना', 'रसूलाबाद', 'पुलिस', 'ने', 'नाबालिग', 'किशोरी', 'के', 'साथ', 'रेप', 'करने', 'वाले', 'एक', 'अभियुक्त', 'को', 'गिरफ्तार', 'किया', 'है'], ['हमने', 'विभिन्न', 'मंचों', 'पर', 'यह', 'मुद्दा', 'उठाते', 'हुए', 'इस', 'पर', 'बातचीत', 'का', 'प्रयास', 'किया'], ['उन्होंने', 'कहा', 'कि', 'झारखंड', 'में', 'चुनाव', 'को', 'देखते', 'हुए', 'भाजपा', 'नई', 'नई', 'घोषणाएं', 'कर', 'रही', 'है', 'उनसे', 'पूछा', 'जाना', 'चाहिए', 'कि', 'क्या', 'जो', 'सुविधाएं', 'झारखंड', 'में', 'देने', 'की',

**Vocabulary Construction**

In [45]:
def create_vocabulary(tokenized_corpus):

  word_frequency_map = defaultdict(int)
  token_count = 0

  for sentence in tokenized_corpus:
    for token in sentence:
      word_frequency_map[token] += 1
      token_count += 1

  vocab = sorted(word_frequency_map)
  token_to_index = {tok: i for i, tok in enumerate(vocab)}
  index_to_token = vocab

  return token_to_index, index_to_token, word_frequency_map, token_count

In [46]:
token_index, index_token, freq_dict, token_total = create_vocabulary(sentences)

print("Unique vocabulary size:", len(token_index))
print("Token count in corpus:", token_total)


print("\n50 Most Frequent Words: ")
most_common = sorted(
    freq_dict.items(),
    key=lambda pair : pair[1],
    reverse = True
)[:50]

for token, occurrences in most_common:
    print(f"{token} : {occurrences}")

Unique vocabulary size: 13948
Token count in corpus: 103169

50 Most Frequent Words: 
के : 4268
में : 3148
की : 2681
है : 2597
को : 2007
से : 1915
ने : 1639
का : 1457
और : 1416
पर : 1289
कि : 1071
हैं : 927
भी : 839
कर : 626
नहीं : 622
लिए : 613
एक : 597
इस : 581
किया : 571
गया : 555
था : 489
ही : 479
बाद : 458
हो : 452
साथ : 428
करने : 421
कहा : 421
पुलिस : 404
गई : 389
रहे : 336
दिया : 331
तो : 317
रहा : 307
यह : 306
रही : 304
बताया : 299
थी : 286
हुए : 279
जा : 265
उन्होंने : 260
लेकर : 247
दी : 244
थे : 243
अपने : 233
गए : 229
इसके : 228
सिंह : 222
लेकिन : 217
होने : 201
वह : 196


**Subsampling**

In [47]:
THRESHOLD = 1e-5


def normalize_counts(freq_map, token_count):

    prob_map = {}
    inv_total = 1.0 / token_count

    for term in freq_map:
        prob_map[term] = freq_map[term] * inv_total

    return prob_map


def accept_token(token, prob_map, threshold):

    freq = prob_map[token]

    score = math.sqrt(threshold / freq) + (threshold / freq)
    if score > 1.0:
        score = 1.0

    return random.random() <= score


def apply_subsampling(data, freq_map, token_count):

    probability_table = normalize_counts(freq_map, token_count)

    filtered_data = []

    for sent in data:
        retained = []

        for tok in sent:
            if accept_token(tok, probability_table, THRESHOLD):
                retained.append(tok)

        if len(retained) > 0:
            filtered_data.append(retained)

    return filtered_data


In [48]:
random.seed(42)

reduced_data = apply_subsampling(sentences, freq_dict, token_total)

print("* Corpus Before Subsampling :")
for idx, sent in enumerate(sentences):
    if idx == 20:
        break
    print(sent)

print("\n* Corpus After Subsampling :")
for idx, sent in enumerate(reduced_data):
    if idx == 20:
        break
    print(sent)


* Corpus Before Subsampling :
['बरेली', 'जोन', 'के', 'एडीजी', 'रमित', 'शर्मा', 'खुद', 'पूरे', 'मामले', 'की', 'निगरानी', 'कर', 'रहे', 'हैं']
['बता', 'दें', 'कि', 'सीएल', 'गुप्ता', 'एक्सपोर्ट्स', 'की', 'गिनती', 'मुरादाबाद', 'के', 'दिग्गज', 'पीतल', 'कारोबारियों', 'में', 'होती', 'है']
['किस', 'देवी', 'की', 'पूजा', 'से', 'क्या', 'लाभ', 'मिलते', 'हैं', 'एक', 'साल', 'में', 'कितनी', 'बार', 'आती', 'हैं', 'नवरात्रियां', 'देवी', 'पूजा', 'के', 'साथ', 'व्रत', 'उपवास', 'क्यों', 'करना', 'चाहिए']
['थाना', 'रसूलाबाद', 'पुलिस', 'ने', 'नाबालिग', 'किशोरी', 'के', 'साथ', 'रेप', 'करने', 'वाले', 'एक', 'अभियुक्त', 'को', 'गिरफ्तार', 'किया', 'है']
['हमने', 'विभिन्न', 'मंचों', 'पर', 'यह', 'मुद्दा', 'उठाते', 'हुए', 'इस', 'पर', 'बातचीत', 'का', 'प्रयास', 'किया']
['उन्होंने', 'कहा', 'कि', 'झारखंड', 'में', 'चुनाव', 'को', 'देखते', 'हुए', 'भाजपा', 'नई', 'नई', 'घोषणाएं', 'कर', 'रही', 'है', 'उनसे', 'पूछा', 'जाना', 'चाहिए', 'कि', 'क्या', 'जो', 'सुविधाएं', 'झारखंड', 'में', 'देने', 'की', 'बात', 'भाजपा', 'वाले', 'कर', 'रहे', 

**N-Gram Generation**

In [49]:
def extract_char_ngrams(token, low=3, high=5):

    results = []


    marked = "<" + token + ">"
    size = len(marked)


    for n in range(low, high + 1):
        for start in range(size - n + 1):
            gram = marked[start:start + n]
            results.append(gram)


    results.append(marked)

    return results


In [50]:
def prepare_ngram_resources(corpus, vocab_map, low=3, high=5):


    token_to_grams = {}
    unique_grams = set()


    for token in vocab_map:
        grams = extract_char_ngrams(token, low, high)
        token_to_grams[token] = grams

        for g in grams:
            unique_grams.add(g)


    index_to_gram = list(sorted(unique_grams))
    gram_to_index = {}

    for idx, gram in enumerate(index_to_gram):
        gram_to_index[gram] = idx

    return token_to_grams, gram_to_index, index_to_gram


In [51]:
# Build n-gram structures
token_to_grams, gram_to_index, index_to_gram = prepare_ngram_resources(
    corpus=reduced_data,
    vocab_map=token_index
)

print("Vocabulary size:", len(token_to_grams))
print("Total n-grams:", len(gram_to_index))

# Let's examine a Hindi example
sample = "भारत"
print(f"\nCharacter n-grams for '{sample}':")
print(token_to_grams[sample])


# print(gram_to_index)


Vocabulary size: 13948
Total n-grams: 93506

Character n-grams for 'भारत':
['<भा', 'भार', 'ारत', 'रत>', '<भार', 'भारत', 'ारत>', '<भारत', 'भारत>', '<भारत>']


Skip-gram + Negative Sampling (FastText)

1. Setup & Hyperparameters

In [52]:
import numpy as np
import random
import math

# Training configuration
VECTOR_SIZE = 100
CTX_WINDOW = 5
NEG_K = 5
STEP_SIZE = 0.05
NUM_EPOCHS = 2


2. Embedding Initialization

In [53]:
vocab_size = len(token_index)
subword_count = len(gram_to_index)

# Subword (character n-gram) embeddings
subword_matrix = np.random.uniform(
    low=-0.5 / VECTOR_SIZE,
    high=0.5 / VECTOR_SIZE,
    size=(subword_count, VECTOR_SIZE)
)

# Context word embeddings
output_matrix = np.zeros((vocab_size, VECTOR_SIZE))


3. Negative Sampling Table Construction

In [54]:
def create_neg_sampling_pool(freq_dict, token_to_idx):
    exponent = 0.75
    pool = []

    for token, count in freq_dict.items():
        idx = token_to_idx[token]
        repetitions = int(count ** exponent)
        pool += [idx] * repetitions

    return np.asarray(pool)


In [55]:
NEG_TABLE = create_neg_sampling_pool(freq_dict, token_index)

4. Negative Sample Selector

In [56]:
def draw_negative_samples(num_samples):
    return np.random.choice(NEG_TABLE, size=num_samples)


5. Utility Functions

Sigmoid Activation

In [57]:
def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))


Subword-Composed Center Vector

In [58]:
def compose_center_embedding(token):
    gram_ids = [gram_to_index[g] for g in token_to_grams[token]]
    combined_vector = np.sum(subword_matrix[gram_ids], axis=0)
    return combined_vector, gram_ids


6. Core Training Update (Single Pair)

In [59]:
def optimize_pair(focus_word, neighbor_word):
    center_embedding, used_grams = compose_center_embedding(focus_word)
    neighbor_idx = token_index[neighbor_word]

    # Positive interaction
    pos_score = np.dot(output_matrix[neighbor_idx], center_embedding)
    pos_prob = logistic(pos_score)

    grad_accumulator = (pos_prob - 1.0) * output_matrix[neighbor_idx]
    output_matrix[neighbor_idx] -= STEP_SIZE * (pos_prob - 1.0) * center_embedding

    # Negative interactions
    sampled_negatives = draw_negative_samples(NEG_K)

    for neg_idx in sampled_negatives:
        neg_score = np.dot(output_matrix[neg_idx], center_embedding)
        neg_prob = logistic(neg_score)

        grad_accumulator += neg_prob * output_matrix[neg_idx]
        output_matrix[neg_idx] -= STEP_SIZE * neg_prob * center_embedding

    # Update subword embeddings
    for gid in used_grams:
        subword_matrix[gid] -= STEP_SIZE * grad_accumulator


7. Full Training Procedure

In [60]:
def run_training():
    for ep in range(NUM_EPOCHS):
        print(f"\nEpoch {ep + 1}/{NUM_EPOCHS}")
        processed = 0

        for sentence in reduced_data:
            length = len(sentence)

            for idx, target in enumerate(sentence):
                left = max(0, idx - CTX_WINDOW)
                right = min(length, idx + CTX_WINDOW + 1)

                for ctx_pos in range(left, right):
                    if ctx_pos == idx:
                        continue
                    optimize_pair(target, sentence[ctx_pos])
                    processed += 1

        print("Total word-context updates:", processed)


Run Training

In [61]:
run_training()


Epoch 1/2
Total word-context updates: 143832

Epoch 2/2
Total word-context updates: 143832


8. Inference: Word Vector Retrieval (OOV-Safe)

In [62]:
def infer_vector(token):


    grams = extract_char_ngrams(token)

    collected = []
    for g in grams:
        if g in gram_to_index:
            collected.append(subword_matrix[gram_to_index[g]])

    if len(collected) == 0:
        return None

    return np.sum(collected, axis=0)


Validation

In [63]:
print(infer_vector("भारत"))
print(infer_vector("भारतीयता"))   # likely, OOV but still works


[-0.53649783  0.4138684  -0.4282412  -0.19546279 -0.08769107 -0.03322835
  0.52304768 -0.07680185 -0.35063553 -0.39727632 -0.14661153  0.37185727
  0.64457166 -0.19351365 -0.04703597  0.06694297 -0.43062124  0.46660899
  0.48601888 -0.12330212 -0.00945138 -0.07981257 -0.55014665 -0.18605823
 -0.2149811   0.03957449  0.31171552 -0.5424673   0.02623764  0.02167183
  0.16680383  0.3337147   0.33175652 -0.14946112 -0.41682449 -0.89805874
 -0.21262106 -0.25891588 -0.44789433  0.41436025  0.05022845 -0.39335527
 -0.02187473 -0.21032248 -0.22176676 -0.06459911  0.35186481 -0.50518031
  0.0690908  -0.5589887   0.28328477 -0.34007902 -0.28606508  0.00953444
 -0.174977    0.5763924   0.33145229  0.63332627 -0.06530339 -0.20911418
 -0.04725188  0.28932217  0.44937879 -0.00732059  0.28690095  0.13439169
 -0.09598472  0.27054813 -0.11398706 -0.48597777 -0.23275836  0.46558663
 -0.22782956 -0.28766979  0.14259917  0.44851303 -0.14000117 -0.11401588
  0.57852648  0.14240387 -0.17435748 -0.20429265 -0

9. Morphology similarity

In [64]:
v1 = infer_vector("भारत")
v2 = infer_vector("भारतीयता")

cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print("Cosine similarity:", cos_sim)


Cosine similarity: 0.8090814287846109
