# Text preprocessing

### Text Cleaning
Clean text by removing unnecessary characters and altering the format of words.

In [None]:
# Ordered (regex pattern, replacement) pairs applied by clean_text.
# Order matters: the specific contractions ("won't", "can't") must run before
# the generic "n't" rule, and "n't" must run before the dropped-g rule ("n'").
_CLEAN_TEXT_SUBSTITUTIONS = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),  # dropped g, e.g. "goin'" -> "going"
    (r"'bout", "about"),
    (r"'til", "until"),
    # Finally strip the listed punctuation characters (apostrophes are kept).
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    """Lowercase *text*, expand common English contractions, strip punctuation.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string: lowercased, with the contractions listed in
        ``_CLEAN_TEXT_SUBSTITUTIONS`` expanded and the listed punctuation
        characters removed. Whitespace is left untouched.
    """
    text = text.lower()

    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    return text

### Tokenization
We use Keras’ Tokenizer to vectorize text descriptions:
* We remove all punctuation.
* We turn the texts into space-separated sequences of words in lowercase.
* These sequences are then split into lists of tokens.
* We set `char_level=False`, so each word (rather than each character) is treated as a token.
* The lists of tokens are then indexed and/or vectorized.
* We convert the corpus into sequences of tokens.

In [None]:
# Tokenization
# Module-level Keras Tokenizer; get_sequence_of_tokens below fits it on the
# corpus and uses it to turn text into integer token sequences.
t = Tokenizer(
    # None: no cap on the vocabulary size (per the Keras Tokenizer docs).
    num_words=None, 
    # Characters stripped from the text before splitting (punctuation, tab, newline).
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
    # Lowercase the text before tokenizing.
    lower=True, 
    # Words are separated on single spaces.
    split=' ', 
    # Word-level tokens, not character-level.
    char_level=False, 
    # None: no out-of-vocabulary placeholder token is registered.
    oov_token=None, 
    # NOTE(review): presumably just the initial document counter (the Keras
    # default) -- confirm it is needed at all.
    document_count=0
)

# corpus - list of sentences

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer ``t`` on *corpus* and build n-gram prefixes.

    Each line of *corpus* is encoded to a list of token ids with ``t``; every
    prefix of that list containing at least two tokens becomes one entry of
    the result (``[w1, w2]``, ``[w1, w2, w3]``, ... up to the full line).

    Args:
        corpus: Iterable of text lines. It is iterated twice (fit, then
            encode), so it must be re-iterable, e.g. a list.

    Returns:
        A ``(input_sequences, total_words)`` tuple: the list of token-id
        prefixes, and ``len(t.word_index) + 1`` (presumably the extra slot
        leaves index 0 free for padding -- confirm against the model code).
    """
    t.fit_on_texts(corpus)
    vocabulary_size = 1 + len(t.word_index)

    # Encode lazily, one line at a time, in corpus order.
    encoded_lines = (t.texts_to_sequences([sentence])[0] for sentence in corpus)

    # Prefix lengths run from 2 up to the full length of each encoded line.
    prefixes = [
        encoded[:end]
        for encoded in encoded_lines
        for end in range(2, len(encoded) + 1)
    ]

    return prefixes, vocabulary_size
# Build the n-gram token prefixes and the vocabulary size from `corpus`.
# NOTE(review): `corpus` is not defined in this section -- it must come from
# an earlier cell.
input_sequences, total_words = get_sequence_of_tokens(corpus)

### Pad sequences
* Pads sequences to the same length

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every sequence to exactly MAX_LEN entries.
# NOTE(review): `sequences` and `MAX_LEN` are not defined in this section; the
# previous cell produces `input_sequences` -- confirm which variable is meant.
padded_sequences = pad_sequences(
    sequences, 
    maxlen=MAX_LEN, 
    dtype='int32', 
    # 'post': shorter sequences are padded at the end (per the Keras docs).
    padding='post', 
    # 'post': longer sequences lose values from the end (per the Keras docs).
    truncating='post')


### Word Embedding
* Use a pretrained Word2Vec model
* Create Embedding Matrix from our Vocabulary with size `(word count x embedding size)`
* Create Embedding Layer in the model architecture