## Natural Language Processing 

### Tokenization

In [1]:
import nltk
import string 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, MWETokenizer
from nltk.tokenize.util import string_span_tokenize, spans_to_relative
from nltk.tokenize import RegexpTokenizer, regexp_tokenize, LineTokenizer, SpaceTokenizer
from nltk.tokenize import TreebankWordTokenizer, BlanklineTokenizer, WhitespaceTokenizer

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
word_counter = input("")
print('The length of the text is', len(word_tokenize(word_counter)), 'words')

Natural language processing
The length of the text is 3 words


In [5]:
english_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
sentence_1 = "Good morning everyone. I hope that you are all are doing well. Have a wonderful day!"

In [7]:
sent_tokenize(sentence_1)

['Good morning everyone.',
 'I hope that you are all are doing well.',
 'Have a wonderful day!']

In [8]:
french_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')

In [9]:
French_sentence_1 = "Bonjour tout le monde. J'espère que tout le monde va bien. Bonne journée"

In [10]:
sent_tokenize(French_sentence_1)

['Bonjour tout le monde.',
 "J'espère que tout le monde va bien.",
 'Bonne journée']

In [11]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

In [12]:
Spanish_sentence_1 = "Hola a todos. Espero que todos lo estén haciendo bien. Tenga un buen día"

In [13]:
sent_tokenize(Spanish_sentence_1)

['Hola a todos.',
 'Espero que todos lo estén haciendo bien.',
 'Tenga un buen día']

### Treebank Word Tokenizer

* treebank works by performing tokenization based on punctuation and spaces

In [14]:
tree_tokenizer = TreebankWordTokenizer()

In [15]:
tree_tokenizer.tokenize('It is going to rain today. You should bring an umbrella with you')

['It',
 'is',
 'going',
 'to',
 'rain',
 'today.',
 'You',
 'should',
 'bring',
 'an',
 'umbrella',
 'with',
 'you']

## Normilization

#### Necessary for processing in natural language text for the purpose of

* Expanding abbreviations
* Eliminating punctuations
* Converting text into uppercase or lowercase forms
* Canonicalization of text
* Converting numbers into words
* etc

In [16]:
text_to_modify = 'StoP DoinG WHat YOu aRe DOiNg'

In [17]:
print(text_to_modify.lower())
print(text_to_modify.upper())

stop doing what you are doing
STOP DOING WHAT YOU ARE DOING


In [18]:
new_text = ["Oh what a beautiful morning.", " Oh what a beautiful day.", "The sun is shining and the birds are chirping"]

In [19]:
tokenized_text = [word_tokenize(text) for text in new_text]

In [20]:
print(tokenized_text)

[['Oh', 'what', 'a', 'beautiful', 'morning', '.'], ['Oh', 'what', 'a', 'beautiful', 'day', '.'], ['The', 'sun', 'is', 'shining', 'and', 'the', 'birds', 'are', 'chirping']]


In [21]:
A = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_text_no_punctuation = []

for inspection in tokenized_text:
    new_inspection = []
    for token in inspection:
        new_token = A.sub(u'', token)
        if not new_token == u'':
            new_inspection.append(new_token)
            tokenized_text_no_punctuation.append(new_inspection)
            print(tokenized_text_no_punctuation)
    

[['Oh']]
[['Oh', 'what'], ['Oh', 'what']]
[['Oh', 'what', 'a'], ['Oh', 'what', 'a'], ['Oh', 'what', 'a']]
[['Oh', 'what', 'a', 'beautiful'], ['Oh', 'what', 'a', 'beautiful'], ['Oh', 'what', 'a', 'beautiful'], ['Oh', 'what', 'a', 'beautiful']]
[['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning']]
[['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh']]
[['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what', 'a', 'beautiful', 'morning'], ['Oh', 'what'], ['Oh', 'what']]
[['Oh', 'what', 'a', 'beautiful', 'morn

### Stop Words

* Words that need to be filtered out because they do not contribute significant value overall during information retrieval or other natural language processing tasks

In [22]:
# List of stopword supported languages

stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

In [23]:
stop_words = set(stopwords.words('english'))

In [24]:
# List of English stop words
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [25]:
spanish_stop_words = set(stopwords.words('spanish'))

In [26]:
# List of Spanish stop words

spanish_stop_words

{'a',
 'al',
 'algo',
 'algunas',
 'algunos',
 'ante',
 'antes',
 'como',
 'con',
 'contra',
 'cual',
 'cuando',
 'de',
 'del',
 'desde',
 'donde',
 'durante',
 'e',
 'el',
 'ella',
 'ellas',
 'ellos',
 'en',
 'entre',
 'era',
 'erais',
 'eran',
 'eras',
 'eres',
 'es',
 'esa',
 'esas',
 'ese',
 'eso',
 'esos',
 'esta',
 'estaba',
 'estabais',
 'estaban',
 'estabas',
 'estad',
 'estada',
 'estadas',
 'estado',
 'estados',
 'estamos',
 'estando',
 'estar',
 'estaremos',
 'estará',
 'estarán',
 'estarás',
 'estaré',
 'estaréis',
 'estaría',
 'estaríais',
 'estaríamos',
 'estarían',
 'estarías',
 'estas',
 'este',
 'estemos',
 'esto',
 'estos',
 'estoy',
 'estuve',
 'estuviera',
 'estuvierais',
 'estuvieran',
 'estuvieras',
 'estuvieron',
 'estuviese',
 'estuvieseis',
 'estuviesen',
 'estuvieses',
 'estuvimos',
 'estuviste',
 'estuvisteis',
 'estuviéramos',
 'estuviésemos',
 'estuvo',
 'está',
 'estábamos',
 'estáis',
 'están',
 'estás',
 'esté',
 'estéis',
 'estén',
 'estés',
 'fue',
 'f

In [27]:
words = ["I", "didn't", "see", "any", "milk", "in", "the", "fridge"]

In [28]:
[word for word in words if word not in stop_words]

['I', 'see', 'milk', 'fridge']