<img src='https://training.dwit.edu.np/frontend/images/computer-training-institute.png'>
<h1>Data Science and Machine Learning in Python</h1>
<h3>Instructor: <a href='https://www.kaggle.com/atishadhikari'> Atish Adhikari</a></h3>
<hr>

### NLTK

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import nltk

In [23]:
# Sample English text that the rest of the notebook tokenizes, filters,
# lemmatizes and tags. Note the typographic characters (’ and –) in the text.
paragraph = """While natural language processing isn’t a new science, the technology is rapidly advancing thanks to an increased interest in human-to-machine communications, plus an availability of big data, powerful computing and enhanced algorithms. As a human, you may speak and write in English, Spanish or Chinese. But a computer’s native language – known as machine code or machine language – is largely incomprehensible to most people. At your device’s lowest levels, communication occurs not with words but through millions of zeros and ones that produce logical actions. 
"""

In [24]:
# Fetch the "punkt" tokenizer data into the local nltk_data directory.
# The call is safe to repeat: it reports "already up-to-date" when present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Split the paragraph into a list of sentence strings.
tokenized_sentences = nltk.tokenize.sent_tokenize(paragraph)

In [22]:
# Inspect the sentence list.
# NOTE(review): the saved output starts with 'Alas!', which does not occur in
# `paragraph`; this cell's execution count (22) is lower than the paragraph
# cell's (23), so the output looks stale. Re-run top to bottom to refresh it.
tokenized_sentences

['Alas!',
 'While natural language processing isn’t a new science, the technology is rapidly advancing thanks to an increased interest in human-to-machine communications, plus an availability of big data, powerful computing and enhanced algorithms.',
 'As a human, you may speak and write in English, Spanish or Chinese.',
 'But a computer’s native language – known as machine code or machine language – is largely incomprehensible to most people.',
 'At your device’s lowest levels, communication occurs not with words but through millions of zeros and ones that produce logical actions.']

In [26]:
# Word-tokenize each sentence separately, so `words` is a list of token
# lists (one inner list per sentence, in sentence order).
words = [
    nltk.tokenize.word_tokenize(sentence_text)
    for sentence_text in tokenized_sentences
]

In [29]:
from nltk.corpus import stopwords

In [32]:
# Fetch the stopword corpus that `nltk.corpus.stopwords` reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [39]:
# Inspect the per-sentence token lists. In the saved output the curly
# apostrophe is emitted as its own token ("isn’t" -> 'isn', '’', 't').
words

[['While',
  'natural',
  'language',
  'processing',
  'isn',
  '’',
  't',
  'a',
  'new',
  'science',
  ',',
  'the',
  'technology',
  'is',
  'rapidly',
  'advancing',
  'thanks',
  'to',
  'an',
  'increased',
  'interest',
  'in',
  'human-to-machine',
  'communications',
  ',',
  'plus',
  'an',
  'availability',
  'of',
  'big',
  'data',
  ',',
  'powerful',
  'computing',
  'and',
  'enhanced',
  'algorithms',
  '.'],
 ['As',
  'a',
  'human',
  ',',
  'you',
  'may',
  'speak',
  'and',
  'write',
  'in',
  'English',
  ',',
  'Spanish',
  'or',
  'Chinese',
  '.'],
 ['But',
  'a',
  'computer',
  '’',
  's',
  'native',
  'language',
  '–',
  'known',
  'as',
  'machine',
  'code',
  'or',
  'machine',
  'language',
  '–',
  'is',
  'largely',
  'incomprehensible',
  'to',
  'most',
  'people',
  '.'],
 ['At',
  'your',
  'device',
  '’',
  's',
  'lowest',
  'levels',
  ',',
  'communication',
  'occurs',
  'not',
  'with',
  'words',
  'but',
  'through',
  'millions',


In [35]:
# English stopwords with duplicates removed by the round-trip through set();
# as a consequence the resulting list has no meaningful order.
english_common = list(set(stopwords.words("english")))

In [38]:
# Punctuation tokens to drop. Every entry is a single character, so typographic
# marks missing from this list (e.g. ’ and –) must be caught some other way.
punctuations = [",", '"', "'", "?", "!", "@", "-", ".", "_"]

In [40]:
# Flatten the per-sentence token lists into one list, discarding stopwords,
# listed punctuation and any single-character token (which also removes marks
# such as ’ and – that are not in `punctuations`).
# The stopword comparison is case-sensitive: capitalised tokens like 'While',
# 'As', 'But' and 'At' are kept, as the saved output below shows.
processed_words = [
    token
    for sentence_tokens in words
    for token in sentence_tokens
    if len(token) != 1
    and token not in punctuations
    and token not in english_common
]

In [42]:
# Inspect the filtered, flattened token list.
processed_words

['While',
 'natural',
 'language',
 'processing',
 'new',
 'science',
 'technology',
 'rapidly',
 'advancing',
 'thanks',
 'increased',
 'interest',
 'human-to-machine',
 'communications',
 'plus',
 'availability',
 'big',
 'data',
 'powerful',
 'computing',
 'enhanced',
 'algorithms',
 'As',
 'human',
 'may',
 'speak',
 'write',
 'English',
 'Spanish',
 'Chinese',
 'But',
 'computer',
 'native',
 'language',
 'known',
 'machine',
 'code',
 'machine',
 'language',
 'largely',
 'incomprehensible',
 'people',
 'At',
 'device',
 'lowest',
 'levels',
 'communication',
 'occurs',
 'words',
 'millions',
 'zeros',
 'ones',
 'produce',
 'logical',
 'actions']

In [43]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

In [52]:
# Fetch the WordNet corpus, downloaded here ahead of using WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [55]:
# Reduce every kept token to its WordNet lemma.
# "knives" and "trees" are extra demo inputs showing irregular and regular
# plurals ('knife', 'tree' in the saved output). They are concatenated into a
# new list rather than appended to `processed_words`, so re-running this cell
# does not keep growing the earlier cell's list with duplicates.
# lemmatize() is called without a part-of-speech argument; in the saved output
# plural nouns are reduced ('communications' -> 'communication') while verb
# forms such as 'advancing' are left unchanged.
demo_words = ["knives", "trees"]
lemmatizer = WordNetLemmatizer()  # one instance reused for every word
root_words = [
    lemmatizer.lemmatize(word)
    for word in processed_words + demo_words
]

In [56]:
# Inspect the lemmatized tokens.
root_words

['While',
 'natural',
 'language',
 'processing',
 'new',
 'science',
 'technology',
 'rapidly',
 'advancing',
 'thanks',
 'increased',
 'interest',
 'human-to-machine',
 'communication',
 'plus',
 'availability',
 'big',
 'data',
 'powerful',
 'computing',
 'enhanced',
 'algorithm',
 'As',
 'human',
 'may',
 'speak',
 'write',
 'English',
 'Spanish',
 'Chinese',
 'But',
 'computer',
 'native',
 'language',
 'known',
 'machine',
 'code',
 'machine',
 'language',
 'largely',
 'incomprehensible',
 'people',
 'At',
 'device',
 'lowest',
 'level',
 'communication',
 'occurs',
 'word',
 'million',
 'zero',
 'one',
 'produce',
 'logical',
 'action',
 'knife',
 'tree']

In [59]:
# Fetch the averaged-perceptron tagger model, downloaded here ahead of the
# nltk.pos_tag call.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [60]:
# Part-of-speech tag the lemmatized tokens; the result is a list of
# (token, tag) pairs.
# NOTE(review): `root_words` is a flat list with stopwords and punctuation
# already removed, so the tagger sees no sentence context. Tags in the saved
# output such as ('write', 'JJ') may be a symptom; consider tagging the
# tokenized sentences instead. TODO confirm.
tagged_words = nltk.pos_tag(root_words)

In [61]:
# Inspect the (token, POS tag) pairs.
tagged_words

[('While', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'VBG'),
 ('new', 'JJ'),
 ('science', 'NN'),
 ('technology', 'NN'),
 ('rapidly', 'RB'),
 ('advancing', 'VBG'),
 ('thanks', 'NNS'),
 ('increased', 'VBD'),
 ('interest', 'NN'),
 ('human-to-machine', 'NN'),
 ('communication', 'NN'),
 ('plus', 'CC'),
 ('availability', 'NN'),
 ('big', 'JJ'),
 ('data', 'NNS'),
 ('powerful', 'JJ'),
 ('computing', 'VBG'),
 ('enhanced', 'JJ'),
 ('algorithm', 'NN'),
 ('As', 'IN'),
 ('human', 'NN'),
 ('may', 'MD'),
 ('speak', 'VB'),
 ('write', 'JJ'),
 ('English', 'JJ'),
 ('Spanish', 'JJ'),
 ('Chinese', 'NNS'),
 ('But', 'CC'),
 ('computer', 'NN'),
 ('native', 'JJ'),
 ('language', 'NN'),
 ('known', 'VBN'),
 ('machine', 'NN'),
 ('code', 'NN'),
 ('machine', 'NN'),
 ('language', 'NN'),
 ('largely', 'RB'),
 ('incomprehensible', 'JJ'),
 ('people', 'NNS'),
 ('At', 'IN'),
 ('device', 'NN'),
 ('lowest', 'JJS'),
 ('level', 'NN'),
 ('communication', 'NN'),
 ('occurs', 'VBZ'),
 ('word', 'NN'),
 ('millio

In [65]:
# Fetch the named-entity chunker model and the "words" corpus, downloaded here
# ahead of the nltk.chunk.ne_chunk call.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [66]:
# Run NLTK's named-entity chunker over the (token, POS tag) pairs.
ner_tagged = nltk.chunk.ne_chunk(tagged_words)

In [68]:
# Print the chunk result in bracketed text form: plain tokens appear as
# token/TAG, and recognised entities as nested nodes, e.g. (GPE Spanish/JJ).
print(ner_tagged)

(S
  While/IN
  natural/JJ
  language/NN
  processing/VBG
  new/JJ
  science/NN
  technology/NN
  rapidly/RB
  advancing/VBG
  thanks/NNS
  increased/VBD
  interest/NN
  human-to-machine/NN
  communication/NN
  plus/CC
  availability/NN
  big/JJ
  data/NNS
  powerful/JJ
  computing/VBG
  enhanced/JJ
  algorithm/NN
  As/IN
  human/NN
  may/MD
  speak/VB
  write/JJ
  English/JJ
  (GPE Spanish/JJ)
  Chinese/NNS
  But/CC
  computer/NN
  native/JJ
  language/NN
  known/VBN
  machine/NN
  code/NN
  machine/NN
  language/NN
  largely/RB
  incomprehensible/JJ
  people/NNS
  At/IN
  device/NN
  lowest/JJS
  level/NN
  communication/NN
  occurs/VBZ
  word/NN
  million/CD
  zero/NN
  one/CD
  produce/NN
  logical/JJ
  action/NN
  knife/VBD
  tree/NN)
