## Text Analytics
- Extract Sample document and apply following document preprocessing methods: Tokenization, Part of Speech (POS) Tagging, stop words removal, Stemming and Lemmatization.
- Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

In [1]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [2]:
!pip install textblob

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import textblob
from textblob import TextBlob

In [4]:
text = "Hello everyone! Welcome to my blog post on Medium. We are studying Natural Language Processing."

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ihack-pc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ihack-pc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ihack-
[nltk_data]     pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
TextBlob(text).words

WordList(['Hello', 'everyone', 'Welcome', 'to', 'my', 'blog', 'post', 'on', 'Medium', 'We', 'are', 'studying', 'Natural', 'Language', 'Processing'])

#### Tokenization

In [9]:
tokens_sents = nltk.sent_tokenize(text)
print(tokens_sents)

['Hello everyone!', 'Welcome to my blog post on Medium.', 'We are studying Natural Language Processing.']


In [10]:
tokens_words = nltk.word_tokenize(text)
print(tokens_words)

['Hello', 'everyone', '!', 'Welcome', 'to', 'my', 'blog', 'post', 'on', 'Medium', '.', 'We', 'are', 'studying', 'Natural', 'Language', 'Processing', '.']


#### Part of Speech (POS) Tagging

In [11]:
pos = nltk.pos_tag(tokens_words)
print(pos)

[('Hello', 'NNP'), ('everyone', 'NN'), ('!', '.'), ('Welcome', 'UH'), ('to', 'TO'), ('my', 'PRP$'), ('blog', 'NN'), ('post', 'NN'), ('on', 'IN'), ('Medium', 'NNP'), ('.', '.'), ('We', 'PRP'), ('are', 'VBP'), ('studying', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('.', '.')]


#### Stop Words Removal

In [12]:
!pip install stop-words

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable


In [13]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

#### Stemming and Lemmatization

In [15]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [16]:
#create an object of class PorterStemmer
porter = PorterStemmer()
lancaster=LancasterStemmer()
#proide a word to be stemmed
print("Porter Stemmer")
print(porter.stem("cats"))
print(porter.stem("trouble"))
print(porter.stem("troubling"))
print(porter.stem("troubled"))
print("Lancaster Stemmer")
print(lancaster.stem("cats"))
print(lancaster.stem("trouble"))
print(lancaster.stem("troubling"))
print(lancaster.stem("troubled"))

Porter Stemmer
cat
troubl
troubl
troubl
Lancaster Stemmer
cat
troubl
troubl
troubl


### Create representation of document by calculating Term Frequency and Inverse Document Frequency.

In [17]:
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data' ]

In [18]:
words_set = set()

for doc in  corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 14
The words in the corpus: 
 {'most', 'analyze', 'the', 'data', 'of', 'one', 'is', 'important', 'best', 'courses', 'fields', 'scientists', 'this', 'science'}


In [21]:
import pandas as pd
import numpy as np
n_docs = len(corpus)         #·Number of documents in the corpus
n_words_set = len(words_set) #·Number of unique words in the 

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
df_tf

Unnamed: 0,most,analyze,the,data,of,one,is,important,best,courses,fields,scientists,this,science
0,0.090909,0.0,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.0,0.0,0.090909,0.0,0.0,0.181818
1,0.0,0.0,0.111111,0.111111,0.111111,0.111111,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.111111
2,0.0,0.25,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
