# Demonstrating What Text Vectorization Is

In [6]:
import os
import pickle
import string

import tensorflow as tf
from keras.layers import TextVectorization

%run constants.py
def create_textvectorisation(lst):
    """Create a ``TextVectorization`` layer and adapt it to ``lst``.

    The layer is configured to split on whitespace, emit integer token ids,
    limit the vocabulary to ``MAX_VOCAB_SIZE`` entries and use
    ``MAX_SEQUENCE_LENGTH`` as its output sequence length. Raw text is passed
    through ``custom_standardization`` before it is split.

    Args:
        lst: The corpus handed to ``adapt`` to build the vocabulary
            (presumably a flat list of strings -- confirm against the caller).

    Returns:
        The adapted ``TextVectorization`` layer.
    """
    # NOTE(review): MAX_VOCAB_SIZE / MAX_SEQUENCE_LENGTH are not defined in
    # this cell; presumably they come from constants.py -- confirm.
    layer_options = dict(
        standardize=custom_standardization,
        split="whitespace",
        output_mode="int",
        max_tokens=MAX_VOCAB_SIZE,
        output_sequence_length=MAX_SEQUENCE_LENGTH,
    )
    vectorizer: TextVectorization = TextVectorization(**layer_options)
    vectorizer.adapt(lst)
    return vectorizer


# Path of the pickled training corpus, resolved relative to DATA_DIR.
TRAIN_CORPORA: str = os.path.join(DATA_DIR, "train_corpora.pkl")

# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# TRAIN_CORPORA must only ever point at a file produced by a trusted pipeline.
with open(TRAIN_CORPORA, "rb") as f:
    flat_list_train_corpora = pickle.load(f)

# Regex character class matching any single ASCII punctuation character.
# Every character is backslash-escaped so none of them is read as class
# syntax: interpolating string.punctuation verbatim lets its "\" escape the
# "]" that follows it, so literal backslashes would never be matched.
PUNCTUATION_PATTERN = "[" + "".join("\\" + ch for ch in string.punctuation) + "]"


def custom_standardization(input_string):
    """Normalise raw text before it is split into tokens.

    Steps, applied in this order:

    1. Lower-case the text (UTF-8 aware).
    2. Replace every ``*`` with a space.
    3. Delete the literal phrase ``devamını oku``.
    4. Delete the literal HTML line break ``<br />``.
    5. Delete every token that contains at least one digit.
    6. Replace every ASCII punctuation character with a space.

    Args:
        input_string: String tensor (or value convertible to one) to clean.

    Returns:
        The cleaned string tensor.
    """
    lowercased = tf.strings.lower(input_string, encoding='utf-8')
    # Raw strings keep the regex backslashes out of Python's own escape handling.
    no_stars = tf.strings.regex_replace(lowercased, r"\*", " ")
    no_repeats = tf.strings.regex_replace(no_stars, "devamını oku", "")
    no_html = tf.strings.regex_replace(no_repeats, "<br />", "")
    no_digits = tf.strings.regex_replace(no_html, r"\w*\d\w*", "")
    no_punctuations = tf.strings.regex_replace(no_digits, PUNCTUATION_PATTERN, " ")

    return no_punctuations
    

# Build the vectorization layer and fit its vocabulary on the training corpus loaded above.
text_vectorization = create_textvectorisation(flat_list_train_corpora)



In [7]:

# Vectorize a single raw sentence. The call is left as the bare last
# expression so the notebook displays the resulting tensor.
demo_string = "The Quick Brown Fox"
text_vectorization(demo_string)

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([1901, 2081, 1196,    1,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

# Demonstrating What an Embedding Is

In [8]:
import tqdm
import numpy

# Maps each token to its embedding vector (a float32 numpy array).
embeddings_index = {}

# The context manager closes the file even if a line fails to parse.
# The encoding is pinned so the result does not depend on the platform's
# default locale (assumes the GloVe text file is UTF-8 -- confirm).
with open(GLOVE_300D_FILEPATH, encoding="utf-8") as f:
    for line in tqdm.tqdm(f, ncols=100, desc="Loading Glove Embeddings."):
        # Each line is "<token> <coef> <coef> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = numpy.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

# Preview only the first n entries rather than dumping the whole table.
n = 10
for idx, (k, v) in enumerate(embeddings_index.items()):
    if idx == n:
        break
    print((k, v))

Loading Glove Embeddings.: 400000it [00:15, 26012.48it/s]

Found 400000 word vectors.
('the', array([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
        2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
       -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
        2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
       -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
       -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
        9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
       -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
        1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
        2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
        1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
        2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
       -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
       -4.4640e-01,  1.7197e-




# Approach to Noisy Text

In [9]:
import re
# Non-greedy match for anything shaped like a tag, e.g. "</br>" or "<BR>".
CLEANR = re.compile('<.*?>')
original_string = "</br>The Quick Brown Fox 123 * <BR> ?"

demo_string = original_string.lower()
# Drop the tags entirely.
demo_string = CLEANR.sub('', demo_string)
# Turn every non-word character (punctuation and whitespace alike) into a space.
demo_string = re.sub(r'[^\w]', ' ', demo_string)
# Blank out every token that contains a digit.
demo_string = re.sub(r'\w*\d\w*', ' ', demo_string)
# Replace anything that is neither a word character nor whitespace.
# The class must start with an unescaped "[": written as r'\[^\w\s]' the
# pattern would look for a literal "[^" sequence instead of negating the set.
# After the [^\w] pass above nothing is left for this step to match, so it
# acts purely as a safety net.
demo_string = re.sub(r'[^\w\s]', ' ', demo_string)

print(f"{original_string} => {demo_string}")



</br>The Quick Brown Fox 123 * <BR> ? => the quick brown fox       


In [10]:
# Re-print the before/after pair. demo_string is not recomputed in this cell;
# it is whatever value the preceding cleaning cell left in the kernel.
original_string = "</br>The Quick Brown Fox 123 * <BR> ?"
print(f"{original_string} => {demo_string}")

</br>The Quick Brown Fox 123 * <BR> ? => the quick brown fox       


: 