#### Text cleaning techniques
- Normalizing text: case normalization
- tokenize: taking the smallest part of the text
    - word_tokenize()
    - wordpunct_tokenize()
    - tweettokenizer
    - regexp_tokenize
- removing stop words and punctuation
- stemming and lemmetization -> takes the word to its root form


### Case Normalization

In [1]:
txt = "Good but need updates and improvements,Worst mobile i have bought ever"
text = txt.lower()
text

'good but need updates and improvements,worst mobile i have bought ever'

### Tokenization
- words or pair of words
- sentences
- paragraphs

In [7]:
txt = "Text messaging, or texting, is the act of composing and sending electronic messages aren't"
print(txt.split())

['Text', 'messaging,', 'or', 'texting,', 'is', 'the', 'act', 'of', 'composing', 'and', 'sending', 'electronic', 'messages', "aren't"]


In [5]:
from nltk.tokenize import word_tokenize

In [8]:
print(word_tokenize(txt))

['Text', 'messaging', ',', 'or', 'texting', ',', 'is', 'the', 'act', 'of', 'composing', 'and', 'sending', 'electronic', 'messages', 'are', "n't"]


In [10]:
from nltk.tokenize import wordpunct_tokenize

In [12]:
print(wordpunct_tokenize(txt))

['Text', 'messaging', ',', 'or', 'texting', ',', 'is', 'the', 'act', 'of', 'composing', 'and', 'sending', 'electronic', 'messages', 'aren', "'", 't']


In [13]:
txt = "lamo #killing it, hzv L>hfah - > hshs,sjhg kjf. huahuh; @hshak"
print('1',word_tokenize(txt))
print('2',wordpunct_tokenize(txt))

1 ['lamo', '#', 'killing', 'it', ',', 'hzv', 'L', '>', 'hfah', '-', '>', 'hshs', ',', 'sjhg', 'kjf', '.', 'huahuh', ';', '@', 'hshak']
2 ['lamo', '#', 'killing', 'it', ',', 'hzv', 'L', '>', 'hfah', '-', '>', 'hshs', ',', 'sjhg', 'kjf', '.', 'huahuh', ';', '@', 'hshak']


In [14]:
from nltk.tokenize import TweetTokenizer

In [15]:
token = TweetTokenizer()

In [16]:
print(token.tokenize(txt))

['lamo', '#killing', 'it', ',', 'hzv', 'L', '>', 'hfah', '-', '>', 'hshs', ',', 'sjhg', 'kjf', '.', 'huahuh', ';', '@hshak']


In [17]:
txt = "living life king size #chilling # lifegoals #yayyy #wanderlust"


In [18]:
from nltk.tokenize import regexp_tokenize

In [20]:
# extract all hashtags and words
print(regexp_tokenize(txt,'#[\w]+'))
# extract all words of hashtags but not hash
print(regexp_tokenize(txt,'#([\w]+)'))

['#chilling', '#yayyy', '#wanderlust']
['chilling', 'yayyy', 'wanderlust']


In [23]:
from string import punctuation

In [24]:
from nltk.corpus import stopwords

In [25]:
stop_nltk = stopwords.words("english")

In [26]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
print(list(punctuation))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [31]:
txt = "aha akhkhn jajij. ja ahuah, gayuyg"
tok = word_tokenize(txt.lower())

In [32]:
[word for word in tok if word not in stop_nltk and word not in list(punctuation)]

['aha', 'akhkhn', 'jajij', 'ja', 'ahuah', 'gayuyg']

### Stemming
- it takes the word to its root form
- its rule based technique which just removes the suffixes
- the stemmed word might not be part of the dictionary
#### 2 types of stemming
- porter stemmer >> oldest and legacy stemmer developed in 1979
- snowball stemmer >> sophasticated stemmer, faster and supports multiple languages

In [34]:
from nltk.stem import PorterStemmer, SnowballStemmer

In [35]:
stemmer_p = PorterStemmer()

In [36]:
stemmer_p.stem("driving")

'drive'

In [39]:
txt = "Text messaging, or texting, is the act of composing and sending electronic messages, typically consisting of alphabetic and numeric characters, between two or more users of mobile devices, desktops/laptops, or other type of compatible computer."
tok = word_tokenize(txt.lower())

In [40]:
print([stemmer_p.stem(word) for word in tok])

['text', 'messag', ',', 'or', 'text', ',', 'is', 'the', 'act', 'of', 'compos', 'and', 'send', 'electron', 'messag', ',', 'typic', 'consist', 'of', 'alphabet', 'and', 'numer', 'charact', ',', 'between', 'two', 'or', 'more', 'user', 'of', 'mobil', 'devic', ',', 'desktops/laptop', ',', 'or', 'other', 'type', 'of', 'compat', 'comput', '.']


In [58]:
txt = "he is very methodical and orderly in his execution"
tok = word_tokenize(txt.lower())
print([stemmer_p.stem(word) for word in tok]) # it doesn't take in dictionary form, its just chop off

['he', 'is', 'veri', 'method', 'and', 'orderli', 'in', 'hi', 'execut']


In [42]:
stemmer_s = SnowballStemmer("english")

In [43]:
print([stemmer_s.stem(word) for word in tok])

['he', 'is', 'veri', 'method', 'and', 'order', 'in', 'his', 'execut']


In [44]:
txt = "studies studyying cries cry his execute"
tok = word_tokenize(txt.lower())
print('PorterStemmer',[stemmer_p.stem(word) for word in tok])
print('SnowballStemmer',[stemmer_s.stem(word) for word in tok])

PorterStemmer ['studi', 'studi', 'cri', 'cri', 'hi', 'execut']
SnowballStemmer ['studi', 'studyy', 'cri', 'cri', 'his', 'execut']


In [45]:
text = "studies studying cries cry his execute orderly university universal"
token = word_tokenize(text.lower())
print('PorterStemmer',[stemmer_p.stem(word) for word in token])
print('SnowballStemmer',[stemmer_s.stem(word) for word in token])

PorterStemmer ['studi', 'studi', 'cri', 'cri', 'hi', 'execut', 'orderli', 'univers', 'univers']
SnowballStemmer ['studi', 'studi', 'cri', 'cri', 'his', 'execut', 'order', 'univers', 'univers']


#### Lemmatization:
- like stemming, lemmitization takes the word to the root from called as lemma
- it involves resolving words to their dictionary from
- Alemma of a word is its dictionary form or canonical form
- lemmetizer in NLTK uses WordNet data set which comprises a list of synonyms


In [47]:
from nltk.stem import WordNetLemmatizer

In [48]:
lemm= WordNetLemmatizer()

In [51]:
txt = "he is very methodical and orderly in his execution"
tok = word_tokenize(txt.lower())
print(tok)

['he', 'is', 'very', 'methodical', 'and', 'orderly', 'in', 'his', 'execution']


In [53]:
print([lemm.lemmatize(word) for word in tok])

['he', 'is', 'very', 'methodical', 'and', 'orderly', 'in', 'his', 'execution']


- lemmetize is a very aggresive technique in taking the word to the root form
     - if the word to be stemmed is not part of the dictionary , it leaves it as it is
     - emsured that the menaing of the sentence is not altered
     - in most of the scenarios the no. distinct words after lemmetization could be same as before

In [54]:
txt2 = "he is driving and drives the down of the drived vehicle"
print([lemm.lemmatize(word) for word in word_tokenize(txt2.lower())])

['he', 'is', 'driving', 'and', 'drive', 'the', 'down', 'of', 'the', 'drived', 'vehicle']


In [55]:
print([lemm.lemmatize(word, pos='v') for word in word_tokenize(txt2.lower())]) # pos='v' only for verbs

['he', 'be', 'drive', 'and', 'drive', 'the', 'down', 'of', 'the', 'drive', 'vehicle']


In [59]:
import time
start_time = time.time()
txt = "he is very methodical and orderly in his execution"
tok = word_tokenize(txt.lower())
print([stemmer_p.stem(word) for word in tok]) # it doesn't take in dictionary form, its just chop off
print("--- %s seconds ---" % (time.time() - start_time))

['he', 'is', 'veri', 'method', 'and', 'orderli', 'in', 'hi', 'execut']
--- 0.0010440349578857422 seconds ---


In [60]:
import time
start_time = time.time()
txt = "he is very methodical and orderly in his execution"
tok = word_tokenize(txt.lower())
print([stemmer_s.stem(word) for word in tok]) # it doesn't take in dictionary form, its just chop off
print("--- %s seconds ---" % (time.time() - start_time))

['he', 'is', 'veri', 'method', 'and', 'order', 'in', 'his', 'execut']
--- 0.0009970664978027344 seconds ---


In [66]:
import time
start_time = time.time()
txt2 = "he is very methodical and orderly in his execution"
print([lemm.lemmatize(word) for word in word_tokenize(txt2.lower())])
print("--- %s seconds ---" % (time.time() - start_time))

['he', 'is', 'very', 'methodical', 'and', 'orderly', 'in', 'his', 'execution']
--- 0.0 seconds ---


#### conclusion:
- stemmer helps to reduce the number of distinct words in the corpus
- lemmitizer helps to retain the meaning of the words effectively used when the context of word is important
- stemmer is used when I have an intent of getting lesser words which reduces the total number of features

In [None]:
#combine all the steps and create udf
# takes one review as input and returns a cleaned review as a string
def clean_txt(sent):