In [1]:
import nltk
# Fetch NLTK data. Called with no arguments, this launches NLTK's interactive
# downloader instead of fetching a fixed set of resources.
# NOTE(review): consider passing explicit resource ids so the notebook can
# run unattended - confirm which ids the installed NLTK version needs.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## 2.1 Noise Removal

Any piece of text that is not relevant to the context of the data or to the end output can be treated as noise.

A general approach to noise removal is to prepare a dictionary of noisy entities and iterate over the text token by token (or word by word), eliminating the tokens that are present in the noise dictionary.

In [8]:
noise_list = ["is", "a", "this", "..."]

def _remove_noise(input_text):
    words = input_text.split()
    noise_free_words = [word for word in words if word not in noise_list]
    noise_free_text = " ".join(noise_free_words)
    return noise_free_text
    
_remove_noise("this is my text")

'my text'

Another approach is to use regular expressions when dealing with special patterns of noise.

In [15]:
import re

def _remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text)
    for i in urls:
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

regex_pattern = "#[\w]*"

_remove_regex("remove this #hashtag from the sentence", regex_pattern)


'remove this  from the sentence'

## 2.2 Lexicon Normalization

1. Stemming: a rudimentary, rule-based process of stripping suffixes (“ing”, “ly”, “es”, “s”, etc.) from a word.

2. Lemmatization: an organized, step-by-step procedure for obtaining the root form of a word. It makes use of vocabulary (the dictionary meaning of words) and morphological analysis (word structure and grammatical relations).

In [17]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()

# Example word shared by the lemmatizer call here and the stemmer call in the next cell.
word = "multiplying"
# Lemmatize the word, passing "v" as the part-of-speech argument (verb).
lem.lemmatize(word, "v")

'multiply'

In [18]:
# Stem the same word with the Porter stemmer, for comparison with the lemma above.
stem.stem(word)

'multipli'

##  2.3 Object Standardization

Text data often contains words or phrases which are not present in any standard lexical dictionaries. These pieces are not recognized by search engines and models.

In [23]:
lookup_dict = {'rt':'Retweet', 'dm':'direct message', 'awsm':'awesome',
              'luv':'love'}
def _lookup_words(input_text):
    words = input_text.split()
    new_words = []
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word)
        new_text = " ".join(new_words)
    return new_text

_lookup_words("RT this is a retweeted tweet by Amit Kumar")
        

'Retweet this is a retweeted tweet by Amit Kumar'

## 3.1 POS Tagging

Part-of-speech (POS) tags define the usage and function of a word in a sentence.

In [24]:
from nltk import word_tokenize, pos_tag

text = "I am learning Natural Language Processing in Data-Science"
# Split the sentence into tokens, then print each token paired with its POS tag.
tokens = word_tokenize(text)
print(pos_tag(tokens))

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('in', 'IN'), ('Data-Science', 'NNP')]


## 3.2 Entity Extraction (Entities as features)

### Topic modelling

In [25]:
# Three toy documents that make up the corpus for topic modelling.
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
# Whitespace tokenisation only: case and punctuation are left untouched.
doc_clean = list(map(str.split, doc_complete))

In [32]:
import gensim
from gensim import corpora

In [36]:
# Build the term dictionary for the corpus: every unique token is assigned an integer id.

dictionary = corpora.Dictionary(doc_clean)

# Convert each tokenised document into its bag-of-words form using the dictionary
# prepared above; the resulting list is the document-term matrix.

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [37]:
# Alias gensim's LDA model class (no model is instantiated or trained here).
Lda = gensim.models.ldamodel.LdaModel

In [38]:
# Train the LDA model on the document-term matrix.
# random_state is fixed so that re-running the notebook reproduces the same topics;
# LDA training is stochastic and otherwise differs from run to run.
ldamodel = Lda(doc_term_matrix, num_topics = 3, 
              id2word = dictionary, passes=50, random_state=42)

In [39]:
# Results: print each learned topic as a weighted combination of its top terms.

print(ldamodel.print_topics())

[(0, '0.060*"driving" + 0.060*"pressure." + 0.060*"that" + 0.060*"Doctors" + 0.060*"and" + 0.060*"may" + 0.060*"stress" + 0.060*"suggest" + 0.060*"increased" + 0.060*"cause"'), (1, '0.089*"to" + 0.051*"my" + 0.051*"sister" + 0.051*"My" + 0.051*"not" + 0.051*"likes" + 0.051*"have" + 0.051*"bad" + 0.051*"consume." + 0.051*"is"'), (2, '0.053*"driving" + 0.053*"sister" + 0.053*"My" + 0.053*"my" + 0.053*"father" + 0.053*"around" + 0.053*"a" + 0.053*"lot" + 0.053*"time" + 0.053*"spends"')]


### N-Grams as Features

In [40]:
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive words in ``text``.

    The text is split on whitespace and a window of width ``n`` is slid over
    the words one position at a time; each window is returned as a list of
    words. Text with fewer than ``n`` words yields an empty list.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]


generate_ngrams('This is a simple text', 2)

[['This', 'is'], ['is', 'a'], ['a', 'simple'], ['simple', 'text']]

## 3.3 TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
obj = TfidfVectorizer()
corpus = ['This is sample document.', 'another random document.', 'third sample document text']
# Fit the vectorizer on the corpus and transform it into a TF-IDF matrix in one step.
X = obj.fit_transform(corpus)
# Printing the matrix lists its (row, column) entries together with their weights.
print(X)

  (0, 1)	0.34520501686496574
  (0, 4)	0.444514311537431
  (0, 2)	0.5844829010200651
  (0, 7)	0.5844829010200651
  (1, 3)	0.652490884512534
  (1, 0)	0.652490884512534
  (1, 1)	0.3853716274664007
  (2, 5)	0.5844829010200651
  (2, 6)	0.5844829010200651
  (2, 1)	0.34520501686496574
  (2, 4)	0.444514311537431


## 3.4 Word Embeddings

In [45]:
from gensim.models import Word2Vec
# Toy corpus: each inner list is one already-tokenised sentence.
sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'],
            ['machine', 'learning'], ['deep', 'learning']]

# Train the model on your corpus
# min_count=1 keeps words that occur only once, which this tiny corpus needs.
model = Word2Vec(sentences, min_count=1)

In [46]:
# Cosine similarity between two word vectors. Accessed through ``model.wv``:
# calling ``similarity`` directly on the model is deprecated and removed in gensim 4.
model.wv.similarity('data', 'science')

  """Entry point for launching an IPython kernel.


-0.10650278

In [47]:
# Embedding vector for a single word. Accessed through ``model.wv``:
# indexing the model directly is deprecated and removed in gensim 4.
model.wv['learning']

  """Entry point for launching an IPython kernel.


array([ 4.7074642e-04, -4.0449761e-03,  4.4476958e-03,  4.9279034e-03,
        3.3356827e-03,  4.9486686e-03,  2.4012255e-03, -1.4502240e-03,
        3.8642315e-03, -5.9284244e-05, -9.9866789e-05,  3.2409059e-03,
        4.7122361e-03,  3.4542959e-03, -3.9117397e-03,  3.2164054e-03,
        2.6465210e-03,  2.2529254e-03, -4.7050547e-03, -4.4319904e-03,
        2.3893011e-03, -2.5542651e-03, -9.5354568e-04, -1.5621400e-03,
       -4.7711013e-03,  2.1777414e-03,  4.7961813e-03,  4.1071028e-03,
        3.7132013e-03,  7.4373034e-04,  4.1487932e-04, -1.4916781e-03,
       -6.3188130e-04, -1.1342270e-03, -2.5184678e-03, -3.2168392e-03,
        3.9951233e-04, -1.6535937e-03, -2.1824467e-03, -6.7424844e-04,
       -3.9499383e-03, -4.3253209e-03, -3.1940225e-03,  3.4526286e-03,
        4.3719881e-03, -3.1065263e-03,  2.4647971e-03, -5.3021009e-04,
       -1.8466379e-03,  4.2802030e-03,  3.3262454e-03,  3.3525510e-03,
       -1.4499974e-03, -2.1693400e-04,  4.3362179e-03, -4.3266122e-03,
      