#### Removing words using a pre-defined list


In [1]:
# Pre-defined list of words to strip from text; later cells filter tokens against it.
noise_removal = [
    'a', 'the', 'i', 'he', 'she',
    'am', 'is', 'this',
]

In [2]:
# Inspect the container type of the noise-word collection.
type(noise_removal)

list

In [3]:
# Filtering demo: keep every noise word except the ones listed in the exclusion tuple.
[word for word in noise_removal if word not in ('a',)]

['the', 'i', 'he', 'she', 'am', 'is', 'this']

In [4]:
# Joining demo: glue the list entries back into one space-separated string.
" ".join(noise_removal)

'a the i he she am is this'

In [5]:
def f_noise_removal(text1, noise_words=None):
    """Drop noise words from ``text1`` and return the remaining words.

    The text is split on whitespace, every token whose lower-cased form is a
    noise word is discarded, and the surviving tokens are joined with single
    spaces.

    Parameters
    ----------
    text1 : str
        Text to clean.
    noise_words : iterable of str, optional
        Words to remove. Tokens are lower-cased before the comparison, so the
        entries should be lower-case. Defaults to the notebook-level
        ``noise_removal`` list.

    Returns
    -------
    str
        The cleaned text; an empty string when no token is left.
    """
    if noise_words is None:
        # Looked up at call time so later edits to the notebook-level list apply.
        noise_words = noise_removal
    # Set membership avoids re-scanning a list for every token.
    blocked = set(noise_words)
    return ' '.join(tok for tok in text1.split() if tok.lower() not in blocked)
    

In [6]:
# Sample sentence reused by later cells; split() shows its whitespace-separated tokens.
t='This is a #sample text'
t.split()

['This', 'is', 'a', '#sample', 'text']

In [7]:
# Apply the noise-word filter to the sample sentence.
f_noise_removal(t)

'#sample text'

#### Removing words based on pattern

In [8]:
import re

In [9]:
def def_remove_regex(pattern,text):
    """Remove every substring of ``text`` that matches ``pattern``.

    Parameters
    ----------
    pattern : str or compiled pattern
        Regular expression describing the fragments to delete.
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with all matches deleted; ``text`` unchanged when nothing
        matches.
    """
    # Substitute with the pattern itself. Re-using the matched text as a new
    # pattern would misinterpret regex metacharacters in it, and returning from
    # inside a loop over matches would only handle the first one.
    return re.sub(pattern, '', text)

In [21]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
def_remove_regex(r'#[\w]*',t)

'This is a  text'

#### Stemming and Lemmatization

In [42]:
from nltk.stem.wordnet import WordNetLemmatizer
# Build a WordNet lemmatizer and look up the lemma of a word, treating it as a verb.
lem = WordNetLemmatizer()

lem.lemmatize('playing', pos='v')

'play'

In [2]:
from nltk.stem.porter import PorterStemmer
# Stem a sample word with the Porter stemmer.
stem = PorterStemmer()
stem.stem('bravely')

'brave'

In [3]:
from nltk.stem.lancaster import LancasterStemmer
# Stem the same sample word with the Lancaster stemmer for comparison.
lstem = LancasterStemmer()
lstem.stem('bravely')

'brav'

In [10]:
from nltk.stem.snowball import SnowballStemmer
# Stem the same sample word with the English Snowball stemmer for comparison.
sbstem = SnowballStemmer("english")
sbstem.stem('bravely')

'brave'

#### Object standardization

In [35]:
lookup_dict = {'awsm':'awesome', 'rt':'Retweet', 'dm':'direct message','luv' :'love'}

def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) 
        new_text = ' '.join(new_words) 
    return new_text
    
             

In [36]:
# Try the abbreviation lookup on a short sample sentence.
_lookup_words('She is awsm rt he')

'She is awesome Retweet he'

In [24]:
# Abbreviation -> expansion table (same entries as the table defined next to _lookup_words).
lookup_dict = {
    'awsm': 'awesome',
    'rt': 'Retweet',
    'dm': 'direct message',
    'luv': 'love',
}

In [33]:
# Start from an explicit list so the cell also runs on a fresh kernel; it
# previously relied on a `new_words` value left over from earlier execution.
new_words = ['Hi']
new_words.append('there')

In [34]:
# Display the list after the append.
new_words

['Hi', 'there']

#### Escaping HTML characters

Data obtained from the web usually contains many HTML entities such as &lt;, &gt; and &amp; that get embedded in the original text. It is therefore necessary to get rid of these entities. One approach is to remove them directly with specific regular expressions. Another approach is to use an appropriate package or module (for example Python's HTMLParser), which converts these entities back to the characters they stand for. For example, &lt; is converted to “<” and &amp; is converted to “&”.

In [57]:
import html
from html.parser import HTMLParser

original_tweet = "I luv my &lt;3 iphone &amp; you’re awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com"

# Kept only so that any cell still referring to `html_parser` keeps working.
html_parser= HTMLParser()

# html.unescape() is the supported way to decode HTML entities; the old
# HTMLParser.unescape() method was removed in Python 3.9.
tweet = html.unescape(original_tweet)

  


In [58]:
# Show the tweet after its HTML entities were unescaped.
print(tweet)

I luv my <3 iphone & you’re awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com


#### Decoding data

This is the process of transforming information from complex symbols into simpler, easier-to-understand characters. Text data may arrive in different encodings, such as “Latin” or “UTF-8”. Therefore, for better analysis, it is necessary to keep all of the data in one standard encoding. UTF-8 is widely accepted and is the recommended choice.

In [71]:
# Encode the original (still HTML-escaped) tweet as UTF-8, skipping anything that cannot be encoded.
# NOTE(review): str.encode returns bytes and this rebinds `tweet`, replacing the
# unescaped text assigned earlier — confirm that is intended.
tweet = original_tweet.encode('utf-8','ignore')

In [69]:
# Display the current value of `tweet`.
# NOTE(review): the saved output is a str repr (no b'' prefix) and the execution
# counts around this cell are out of order, so the output looks stale — re-run to confirm.
tweet

'I luv my &lt;3 iphone &amp; you’re awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com'

#### Part of speech tagging

Benefits of POS tagging:
1. Word sense disambiguation
2. Improving word-based features
3. Normalization and Lemmatization
4. Efficient stopword removal

In [13]:
from nltk import word_tokenize, pos_tag
# Tokenize the sample sentence, then attach a part-of-speech tag to every token.
text= "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)
pos_tokens = pos_tag(tokens)

In [14]:
# Display the (token, tag) pairs produced by the tagger.
pos_tokens

[('I', 'PRP'),
 ('am', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('on', 'IN'),
 ('Analytics', 'NNP'),
 ('Vidhya', 'NNP')]

### Entity Extraction
#### 1. Named Entity Recognition (NER)

In [27]:
import nltk
# Sample paragraph for the named-entity demo.
# NOTE(review): several words are fused without a space ("scientist.He",
# "company'sArtificial", "co-founderand", "bornin"), which affects tokenization —
# confirm whether the missing spaces are intentional.
doc = '''Andrew Yan-Tak Ng is a Chinese American computer scientist.He is the former chief scientist at Baidu, where 
he led the company'sArtificial Intelligence Group. He is an adjunct professor (formerly associate professor) 
at Stanford University. Ng is also the co-founderand chairman at Coursera, an online education platform. Andrew 
was bornin the UK in 1976. His parents were both from Hong Kong.'''

# Pipeline: tokenize -> POS-tag -> chunk named entities.
tokens = nltk.word_tokenize(doc)
pos_tokens = nltk.pos_tag(tokens)
ne_chunks = nltk.ne_chunk(pos_tokens)

# Collect (entity text, entity category) pairs. Only children of the chunk
# result that expose a `label` attribute are kept; the others are skipped.
named_entities = [
    (' '.join(leaf[0] for leaf in subtree.leaves()), subtree.label())
    for subtree in ne_chunks
    if hasattr(subtree, 'label')
]
print(named_entities)

[('Andrew', 'PERSON'), ('Chinese', 'GPE'), ('American', 'GPE'), ('Baidu', 'ORGANIZATION'), ('Intelligence Group', 'ORGANIZATION'), ('Stanford University', 'ORGANIZATION'), ('Coursera', 'ORGANIZATION'), ('Andrew', 'PERSON'), ('Hong Kong', 'GPE')]


In [28]:
# Display the chunked result produced by nltk.ne_chunk.
ne_chunks

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('PERSON', [('Andrew', 'NNP')]), ('Yan-Tak', 'NNP'), ('Ng', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), Tree('GPE', [('Chinese', 'JJ')]), Tree('GPE', [('American', 'JJ')]), ('computer', 'NN'), ('scientist.He', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('former', 'JJ'), ('chief', 'JJ'), ('scientist', 'NN'), ('at', 'IN'), Tree('ORGANIZATION', [('Baidu', 'NNP')]), (',', ','), ('where', 'WRB'), ('he', 'PRP'), ('led', 'VBD'), ('the', 'DT'), ("company'sArtificial", 'JJ'), Tree('ORGANIZATION', [('Intelligence', 'NNP'), ('Group', 'NNP')]), ('.', '.'), ('He', 'PRP'), ('is', 'VBZ'), ('an', 'DT'), ('adjunct', 'NN'), ('professor', 'NN'), ('(', '('), ('formerly', 'RB'), ('associate', 'NN'), ('professor', 'NN'), (')', ')'), ('at', 'IN'), Tree('ORGANIZATION', [('Stanford', 'NNP'), ('University', 'NNP')]), ('.', '.'), ('Ng', 'NNP'), ('is', 'VBZ'), ('also', 'RB'), ('the', 'DT'), ('co-founderand', 'NN'), ('chairman', 'NN'), ('at', 'IN'), Tree('ORGANIZATION', [('Coursera', 'NNP')]), (',', ','), ('an', 

#### Topic Modelling

In [45]:
 # NOTE(review): repeats the ne_chunk call from the NER cell and sits under the
 # "Topic Modelling" heading — looks like a leftover; confirm before keeping.
 nltk.ne_chunk(pos_tokens)

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('PERSON', [('Andrew', 'NNP')]), ('Yan-Tak', 'NNP'), ('Ng', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), Tree('GPE', [('Chinese', 'JJ')]), Tree('GPE', [('American', 'JJ')]), ('computer', 'NN'), ('scientist.He', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('former', 'JJ'), ('chief', 'JJ'), ('scientist', 'NN'), ('at', 'IN'), Tree('ORGANIZATION', [('Baidu', 'NNP')]), (',', ','), ('where', 'WRB'), ('he', 'PRP'), ('led', 'VBD'), ('the', 'DT'), ("company'sArtificial", 'JJ'), Tree('ORGANIZATION', [('Intelligence', 'NNP'), ('Group', 'NNP')]), ('.', '.'), ('He', 'PRP'), ('is', 'VBZ'), ('an', 'DT'), ('adjunct', 'NN'), ('professor', 'NN'), ('(', '('), ('formerly', 'RB'), ('associate', 'NN'), ('professor', 'NN'), (')', ')'), ('at', 'IN'), Tree('ORGANIZATION', [('Stanford', 'NNP'), ('University', 'NNP')]), ('.', '.'), ('Ng', 'NNP'), ('is', 'VBZ'), ('also', 'RB'), ('the', 'DT'), ('co-founderand', 'NN'), ('chairman', 'NN'), ('at', 'IN'), Tree('ORGANIZATION', [('Coursera', 'NNP')]), (',', ','), ('an', 