In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
print(tf.__version__)

2.3.0


### Keras Tokenizer

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [4]:
# Build a tokenizer and learn its vocabulary from one example sentence.
tok=Tokenizer()
text=["How is your week going its going  good good good"]
# fit_on_texts fills tok.word_index; in the index displayed below the most
# repeated words ('good', 'going') receive the smallest ids and all words are lower-cased.
tok.fit_on_texts(text)

In [5]:
tok.word_index

{'good': 1, 'going': 2, 'how': 3, 'is': 4, 'your': 5, 'week': 6, 'its': 7}

In [6]:
sequences = tok.texts_to_sequences(text)

In [7]:
sequences

[[3, 4, 5, 6, 2, 7, 2, 1, 1, 1]]

In [8]:
# Encode a sentence that the tokenizer was not fitted on.
test_text="My week just started"
# `tok` was created without an oov_token, and the result displayed below contains
# only the id for "week" (6): the words it has never seen are left out.
test_sequences=tok.texts_to_sequences([test_text])

In [9]:
test_sequences[0]

[6]

In [10]:
# Store the raw sentences in a DataFrame and add their token-id sequences.
raw_text=["how is your week going","my week just started","hope it goes good","friday is holiday"]
df_example = pd.DataFrame({'raw_text': raw_text})
# Encode each sentence with `tok` (fitted above, no OOV token), so words outside
# its vocabulary simply do not appear in the resulting sequence.
df_example['train_sequence'] = df_example.raw_text.apply(lambda x: tok.texts_to_sequences([x])[0])
# Only the last expression of a cell is displayed, so the frame is shown once, here.
df_example.head()

Unnamed: 0,raw_text,train_sequence
0,how is your week going,"[3, 4, 5, 6, 2]"
1,my week just started,[6]
2,hope it goes good,[1]
3,friday is holiday,[4]


In [11]:
# OOV (out-of-vocabulary) token: words that are not in the fitted vocabulary are
# encoded as this placeholder, which makes unseen words visible in the sequence.
tok_with_oov=Tokenizer(oov_token="UNK")
text=["How is your week going"]
tok_with_oov.fit_on_texts(text)
# Encode the sentence the tokenizer was fitted on; every word is known here.
sequences_with_oov = tok_with_oov.texts_to_sequences(text)
# In the printed index "UNK" holds id 1 and the real words start at 2.
print(tok_with_oov.word_index)

{'UNK': 1, 'how': 2, 'is': 3, 'your': 4, 'week': 5, 'going': 6}


In [12]:
test_sequences_with_oov=tok_with_oov.texts_to_sequences(["hope it goes good"])

In [13]:
sequences_with_oov # fitting on the train

[[2, 3, 4, 5, 6]]

In [14]:
test_sequences_with_oov

[[1, 1, 1, 1]]

In [15]:
# Same sentences as in the earlier DataFrame example, encoded with the OOV-aware tokenizer.
raw_text=["how is your week going","my week just started","hope it goes good","friday is holiday"]
df_example_with_oov = pd.DataFrame({'raw_text': raw_text})
# Words unknown to `tok_with_oov` are kept as the "UNK" id (1), so every sequence
# has one id per word of its sentence.
df_example_with_oov['sequence'] = df_example_with_oov.raw_text.apply(lambda x: tok_with_oov.texts_to_sequences([x])[0])
# Only the last expression of a cell is displayed, so the frame is shown once, here.
df_example_with_oov.head()

Unnamed: 0,raw_text,sequence
0,how is your week going,"[2, 3, 4, 5, 6]"
1,my week just started,"[1, 5, 1, 1]"
2,hope it goes good,"[1, 1, 1, 1]"
3,friday is holiday,"[1, 3, 1]"


# Padding

In [16]:
# Padding makes all sequences the same length.

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

### pre_padding

In [18]:
# Fit a fresh tokenizer on a 5-word sentence, then pad its id sequence to 10 entries.
# NOTE: this rebinds the global `tok`, replacing the tokenizer fitted earlier in the notebook.
tok=Tokenizer()
text=["How is your week going"] # 5 WORDS
tok.fit_on_texts(text)
sequences = tok.texts_to_sequences(text)
max_length= 10 # length of the sequence # 10 WORDS
# padding='pre' places the fill values before the real ids (see the array below).
# truncating='pre' is not exercised here: the 5-id sequence is shorter than maxlen.
pre_padding = pad_sequences(sequences, maxlen=max_length, 
                                padding='pre',
                                truncating='pre',)  # ADD VALUES TO FRONT OF THE SEQUENCE

In [19]:
pre_padding

array([[0, 0, 0, 0, 0, 1, 2, 3, 4, 5]])

### post_padding

In [20]:
# Same sentence as the pre-padding example, this time padded at the end with an explicit fill value.
tok=Tokenizer()
text=["How is your week going"]
tok.fit_on_texts(text)
sequences = tok.texts_to_sequences(text)
# NOTE(review): 10000 is far larger than the 5-id input and than the 10 used in the
# pre-padding cell. This global is also read by the DataFrame padding cells further
# down, so every padded list stored there gets 10000 entries — confirm this is intended.
max_length=10000# length of the sequence
# padding='post' appends the fill values after the real ids; value=999 is used as
# the fill instead of the 0 seen in the pre-padding output.
post_padding = pad_sequences(sequences, maxlen=max_length, 
                                padding='post',
                                truncating='post',value=999)

In [21]:
post_padding

array([[  1,   2,   3, ..., 999, 999, 999]])

In [22]:
# Pad every row's id sequence at the end up to `max_length` (the global last set in
# the post-padding cell) and store each padded row as a plain Python list.
# No `value` is passed, and the frame displayed below shows 0 as the fill.
df_example_with_oov['post_padded_sequences'] =  pad_sequences(df_example_with_oov["sequence"], maxlen=max_length, 
                                padding='post',
                                truncating='post',).tolist()

In [23]:
# Pad every row's id sequence at the front up to `max_length` (the global last set in
# the post-padding cell) and store each padded row as a plain Python list.
# No `value` is passed, and the frame displayed below shows 0 as the fill.
df_example_with_oov['pre_padded_sequences'] =  pad_sequences(df_example_with_oov["sequence"], maxlen=max_length, 
                                padding='pre',
                                truncating='pre',).tolist()

In [24]:
df_example_with_oov

Unnamed: 0,raw_text,sequence,post_padded_sequences,pre_padded_sequences
0,how is your week going,"[2, 3, 4, 5, 6]","[2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,my week just started,"[1, 5, 1, 1]","[1, 5, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,hope it goes good,"[1, 1, 1, 1]","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,friday is holiday,"[1, 3, 1]","[1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# what is stemming

Stemming is a text normalization technique. It is the process of cutting a word down to its stem (root).


> "Stemming is the process of reducing inflection in words to their root forms, such as mapping a group of words to the same stem even if the stem itself is not a valid word in the language."

### if the word ends in 'ed', remove the 'ed'


### if the word ends in 'ing', remove the 'ing'


### if the word ends in 'ly', remove the 'ly'

In [25]:

from nltk.stem import PorterStemmer
porter = PorterStemmer()
#PorterStemmer uses Suffix Stripping to produce stems

# playing ----> play
played ----> play

player ----> player (left unchanged by the Porter stemmer)

In [26]:
# Stem every whitespace-separated token of a sample sentence with the Porter stemmer.
sentence="today is great greater greatest day.I'm liking it very much playing"
# str.split() splits on whitespace only, so "day.I'm" stays a single token
# (visible in the printed list below) and is stemmed as one unit.
words=sentence.split()
print(words)
for w in words:
    # One stem per line, in the same order as `words`.
    rootWord=porter.stem(w)
    print(rootWord)

['today', 'is', 'great', 'greater', 'greatest', "day.I'm", 'liking', 'it', 'very', 'much', 'playing']
today
is
great
greater
greatest
day.i'm
like
it
veri
much
play


In [27]:
from nltk.stem import PorterStemmer
# Example words for suffix stripping; the output below includes stems that are
# not dictionary words, such as "veri" and "mostli".
# `e_words` is reused by the lemmatizer cell later in the notebook.
e_words= ["being", "waiting","very","tested","mostly","having","cats","stripes"]

for w in e_words:
    rootWord=porter.stem(w)
    print(rootWord)

be
wait
veri
test
mostli
have
cat
stripe


### Lemmatization 

Lemmatization is the algorithmic process of finding the lemma of a word.
It returns the base or dictionary form of the word, which is known as the lemma.

In [30]:
# import nltk
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\colourfulm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [31]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [32]:
# Lemmatize the same example words without a part-of-speech hint. In the output
# below only the plurals change ("cats" -> "cat", "stripes" -> "stripe"); the verb
# forms such as "being" and "waiting" come back unchanged.
# NOTE(review): presumably lemmatize() treats words as nouns when no pos is given —
# confirm against the NLTK documentation.
for w in e_words:
    rootWord=lemmatizer.lemmatize(w)
    print(rootWord)

being
waiting
very
tested
mostly
having
cat
stripe


In [33]:
# With pos="v" the word is looked up as a verb: the result below is 'strip',
# whereas the call without pos in the previous cell returned 'stripe'.
lemmatizer.lemmatize('stripes',pos="v")

'strip'

In [36]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\colourfulm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [34]:

import re  # regular expressions (used by general_clean)
from nltk.corpus import stopwords
# English stop words as a set; clean() tests each token for membership in it.
# NOTE(review): this needs the NLTK 'stopwords' corpus; only 'wordnet' and 'punkt'
# downloads are visible in this notebook — confirm it is installed.
stops = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer
# Stemmer instance used by clean().
ps =PorterStemmer()




from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by clean(); rebinds the global `lemmatizer` created earlier.
lemmatizer = WordNetLemmatizer()

def general_clean(text):
    """Lower-case ``text`` and reduce it to a small, space-normalised alphabet.

    Every character other than ``a-z``, ``0-9``, ``.``, ``-`` and the space is
    replaced by a space, then each run of consecutive spaces is collapsed to a
    single space.

    Args:
        text: Input string.

    Returns:
        The cleaned string. A single leading or trailing space is not stripped.
    """
    text = text.lower()  # LOWER THE CASE
    # Raw string so that "\-" reaches the regex engine as written instead of
    # being an invalid Python string escape.
    text = re.sub(r"[^a-z0-9.\- ]", " ", text)  # keep only alphanumerics, '.', '-' and spaces
    # " {2,}" collapses a run of any length. The earlier pattern matched exactly
    # three spaces, so runs of 2, 4, 5, ... spaces were left partly or wholly intact.
    text = re.sub(r" {2,}", " ", text)
    return text



def clean(text):
    """Lower-case, tokenize and lemmatize ``text``, printing each intermediate stage.

    Stop words are dropped after tokenization. Stems are computed only for the
    printed side-by-side comparison; the returned string is built from the lemmas.

    Args:
        text: Input string.

    Returns:
        The lemmatized, stop-word-free tokens joined by single spaces.
    """
    lowered = text.lower()
    print("lowered text:",lowered)

    # Keep only the tokens that are not stop words.
    kept_tokens = []
    for token in word_tokenize(lowered):
        if token not in stops:
            kept_tokens.append(token)

    # Both normalisations are derived from the same filtered token list.
    stems = list(map(ps.stem, kept_tokens))
    lemmas = list(map(lemmatizer.lemmatize, kept_tokens))

    print("cleaned text:",kept_tokens)
    print("stemmer     :",stems)
    print("lemmatizer  :",lemmas)
    return " ".join(lemmas)

    

In [37]:
clean("The dog is liking the car very much.")

lowered text: the dog is liking the car very much.
cleaned text: ['dog', 'liking', 'car', 'much', '.']
stemmer     : ['dog', 'like', 'car', 'much', '.']
lemmatizer  : ['dog', 'liking', 'car', 'much', '.']


'dog liking car much .'

In [38]:
# NOTE(review): X_train and X_test are not defined anywhere above in this notebook,
# so on a fresh kernel this cell stops with the NameError shown below. They
# presumably hold the raw train/test texts — define or load them before this cell.
tokenizer = Tokenizer(num_words=5000,oov_token="UNK")
# The vocabulary is learned from the training texts only.
tokenizer.fit_on_texts(X_train)

# NOTE(review): X_train / X_test are overwritten with their encoded form, so
# re-running this cell would hand id data, not text, to fit_on_texts.
X_train = tokenizer.texts_to_sequences(X_train) # return the indices
X_test = tokenizer.texts_to_sequences(X_test)



# Fixed sequence length used for both splits.
maxlen = 200

# Pad at the end up to `maxlen`; no `truncating` or `value` argument is passed.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen,)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

NameError: name 'X_train' is not defined