In [1]:
import nltk

# noise removal


In [2]:
"""A general approach for noise removal is to prepare a dictionary of noisy entities, and iterate the text object by tokens (or by words), eliminating those tokens which are present in the noise dictionary."""
noise_list = ["is", "a", "this", "..."] 
def _remove_noise(input_text):
    words = input_text.split() 
    noise_free_words = [word for word in words if word not in noise_list] 
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text

_remove_noise("this is a sample text")


'sample text'

In [3]:
# Another approach is to use regular expressions when dealing with special patterns of noise.
import re 

def _remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text) 
    for i in urls: 
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

regex_pattern = "#[\w]*"  

_remove_regex("remove this #hashtag from here", regex_pattern)

'remove this  from here'

# LEXICAL Normalization

In [4]:
"""Stemming:  Stemming is a rudimentary rule-based process of stripping the 
suffixes (“ing”, “ly”, “es”, “s” etc) from a word."""
"""Lemmatization: Lemmatization, on the other hand, is an organized 
& step by step procedure of obtaining the root form of the word, it 
makes use of a vocabulary (a dictionary of known words)
and morphological analysis (word structure and grammar relations)."""

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

lem = WordNetLemmatizer()
stem = PorterStemmer()

word = "multiplying"

# A notebook cell only displays the value of its final expression, so a bare
# lem.lemmatize(...) call in the middle of the cell is silently discarded.
# Collect both results into one labelled value so the lemma and the stem can
# be compared side by side. "v" tells the lemmatizer to treat the word as a verb.
{"lemma": lem.lemmatize(word, "v"), "stem": stem.stem(word)}

{'lemma': 'multiply', 'stem': 'multipli'}

In [5]:
# Stemming demo. The NLTK helpers are imported again here so the cell can be
# run on its own.
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

lem, stem = WordNetLemmatizer(), PorterStemmer()
word = "multiplying"

# A single print with sep='\n' writes exactly what two consecutive print calls
# would: the padded heading, a newline, then the Porter stem of `word`.
print('\n\nStemming\n\n', stem.stem(word), sep='\n')



Stemming


multipli


#  Object Standardization

In [6]:
"""Text data often contains words or phrases which are not present in any standard lexical dictionaries. 
These pieces are not recognized by search engines and models.
"""
"""Some of the examples are – acronyms, hashtags with attached words, and colloquial slangs.
With the help of regular expressions and manually prepared data dictionaries, 
this type of noise can be fixed. The code below uses a dictionary lookup to
replace social media slang in a text.
"""

lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}
def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) 
        new_text = " ".join(new_words) 
        return new_text

_lookup_words("RT this is a retweeted tweet by Shivam Bansal")

'Retweet'

# Text to Features (Feature Engineering on text data)

In [7]:
"""To analyse a preprocessed data, it needs to be converted into features. 
Depending upon the usage, text features can be constructed using assorted techniques – 
Syntactical Parsing, Entities / N-grams / word-based features, Statistical features, 
and word embeddings.
"""

#NLTK performs pos tagging annotation on input text.
from nltk import pos_tag, word_tokenize

text = "I am learning Natural Language Processing on BPEC GLOBAL"

# Split the sentence into tokens, then print each token paired with its
# part-of-speech tag.
tokens = word_tokenize(text)
print(pos_tag(tokens))

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('on', 'IN'), ('BPEC', 'NNP'), ('GLOBAL', 'NNP')]


# Entity Extraction (Entities as features)

# Named Entity Recognition (NER)
"""Noun phrase identification: This step deals with extracting all the noun phrases from a text using dependency parsing 
and part of speech tagging.
"""
"""Phrase classification: This is the classification step in which all the extracted noun phrases are 
classified into respective categories (locations, names etc)
"""
"""Entity disambiguation: Sometimes it is possible that entities are misclassified, hence creating a validation layer on 
top of the results is useful. 
"""

#Topic Modeling
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
# Whitespace tokenization only: punctuation and letter case are left as-is.
doc_clean = [doc.split() for doc in doc_complete]

# `corpora` is a submodule of gensim, so it has to be imported from the package.
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)

# Results
print(ldamodel.print_topics())



In [9]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
# Whitespace tokenization only: punctuation and letter case are left as-is.
doc_clean = [doc.split() for doc in doc_complete]

import gensim
# `corpora` is not a top-level module; it lives inside the gensim package, so a
# bare `import corpora` raises ModuleNotFoundError.
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)

# Results
print(ldamodel.print_topics())

# N-Grams as Features

In [10]:
# A sequence of N consecutive words is called an N-gram.
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words in ``text``.

    Each n-gram is a list of words, and the n-grams are ordered by the
    position of their first word.
    """
    tokens = text.split()
    last_start = len(tokens) - n
    return [tokens[start:start + n] for start in range(last_start + 1)]


generate_ngrams('this is a sample text', 2)

[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]

# Term Frequency – Inverse Document Frequency (TF – IDF)

In [11]:
#Term Frequency (TF) – TF for a term “t” is defined as the count of a term “t” in a document “D”
#Inverse Document Frequency (IDF) – IDF for a term “t” is defined as the logarithm of the ratio of the total number of documents in the corpus to the number of documents containing the term “t”.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is sample document.',
    'another random document.',
    'third sample document text',
]
obj = TfidfVectorizer()

# Fit the vectorizer on the corpus and get back its TF-IDF matrix in one step;
# printing it lists the non-zero (document, term) entries with their weights.
X = obj.fit_transform(corpus)
print(X)

  (0, 7)	0.5844829010200651
  (0, 2)	0.5844829010200651
  (0, 4)	0.444514311537431
  (0, 1)	0.34520501686496574
  (1, 1)	0.3853716274664007
  (1, 0)	0.652490884512534
  (1, 3)	0.652490884512534
  (2, 4)	0.444514311537431
  (2, 1)	0.34520501686496574
  (2, 6)	0.5844829010200651
  (2, 5)	0.5844829010200651


#  Word Embedding (text vectors)

In [12]:
"""Word embedding is the modern way of representing words as vectors. The aim of word embedding is to 
redefine the high dimensional word features into low dimensional feature 
vectors by preserving the contextual similarity in the corpus. 
They are widely used in deep learning models such as Convolutional Neural Networks and Recurrent Neural Networks.
"""

from gensim.models import Word2Vec

sentences = [
    ['data', 'science'],
    ['vidhya', 'science', 'data', 'analytics'],
    ['machine', 'learning'],
    ['deep', 'learning'],
]

# train the model on your corpus
# min_count=1 keeps every word, even those that occur only once.
model = Word2Vec(sentences, min_count=1)

# The trained word vectors live on `model.wv`. Calling similarity() or indexing
# on the model object itself is deprecated and was removed in gensim 4.
print(model.wv.similarity('data', 'science'))

print(model.wv['learning'])


0.0060849637
[-4.36116382e-03  8.72070959e-04 -4.74658003e-03  2.03363621e-03
  2.08381331e-03  1.36481761e-03 -7.59526447e-04  1.52456958e-03
  1.41860789e-03  1.36948380e-04  3.29496688e-03  4.22716932e-03
 -3.21365288e-03 -4.87517193e-03 -1.53432611e-05 -4.15516831e-03
  2.97219609e-03 -6.24293403e-04 -2.06238218e-03  2.89041572e-03
 -3.54294991e-03 -3.43783665e-03  4.83840052e-03  2.48028850e-03
  4.11147298e-03 -2.28869799e-03  4.47443360e-03  4.48022457e-03
 -1.67458528e-03 -4.57673846e-03  1.41022343e-03  4.72342409e-03
  4.62310127e-04 -4.34963638e-03  1.58302649e-03 -3.92860780e-03
 -2.54442007e-03 -2.80732312e-03  3.67403356e-03 -3.60862864e-03
  1.36086077e-03  1.73131598e-03  6.89856941e-04 -4.01573908e-03
 -3.57138319e-03  3.96361761e-03 -4.93294152e-04  4.48956201e-03
  1.25909678e-03  2.01720581e-03  4.57006600e-03  2.34440691e-03
  8.52089433e-04  1.61166734e-03  3.96562088e-03 -9.41561128e-04
  5.96790574e-04 -1.46789337e-03  4.14076174e-04  3.09072714e-03
  3.79930343

  "C extension not loaded, training will be slow. "
  del sys.path[0]
  app.launch_new_instance()


# Text Classification

In [13]:
"""Text classification is one of the classical problem of NLP. Notorious examples include – 
Email Spam Identification, topic classification of news, sentiment classification and organization of
web pages by search engines.
"""

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier as NBC

# (sentence, label) pairs the classifier is trained on.
training_corpus = [
    ('I am exhausted of this work.', 'Class_B'),
    ("I can't cooperate with this", 'Class_B'),
    ('He is my badest enemy!', 'Class_B'),
    ('My management is poor.', 'Class_B'),
    ('I love this burger.', 'Class_A'),
    ('This is an brilliant place!', 'Class_A'),
    ('I feel very good about these dates.', 'Class_A'),
    ('This is my best work.', 'Class_A'),
    ("What an awesome view", 'Class_A'),
    ('I do not like this dish', 'Class_B'),
]

# (sentence, label) pairs used only to score the trained classifier.
test_corpus = [
    ("I am not feeling well today.", 'Class_B'),
    ("I feel brilliant!", 'Class_A'),
    ('Gary is a friend of mine.', 'Class_A'),
    ("I can't believe I'm doing this.", 'Class_B'),
    ('The date was good.', 'Class_A'),
    ('I do not enjoy my job', 'Class_B'),
]

model = NBC(training_corpus)

# Classify two unseen sentences, one predicted label per output line.
print(
    *(
        model.classify(sentence)
        for sentence in (
            "Their codes are amazing.",
            "I don't like their computer.",
        )
    ),
    sep='\n',
)

# Fraction of test_corpus sentences whose predicted label matches the given one.
print(model.accuracy(test_corpus))


Class_A
Class_B
0.8333333333333334


# scikit-learn also provides a pipeline framework for text classification


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# TfidfVectorizer is the class this cell actually instantiates; import it here
# so the cell does not depend on another cell having imported it first.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm

# preparing data for SVM model (using the same training_corpus, test_corpus from naive bayes example)
# Each corpus entry is a (sentence, label) pair; split them into parallel lists.
train_data = [sentence for sentence, _label in training_corpus]
train_labels = [label for _sentence, label in training_corpus]

test_data = [sentence for sentence, _label in test_corpus]
test_labels = [label for _sentence, label in test_corpus]

# Create feature vectors
# min_df=4 keeps only terms found in at least 4 training sentences;
# max_df=0.9 drops terms found in more than 90% of them.
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
# Train the feature vectors
train_vectors = vectorizer.fit_transform(train_data)
# Apply model on test data (reuses the vocabulary learned from the training data)
test_vectors = vectorizer.transform(test_data)

# Perform classification with SVM, kernel=linear
model = svm.SVC(kernel='linear')
model.fit(train_vectors, train_labels)
prediction = model.predict(test_vectors)


print(classification_report(test_labels, prediction))

              precision    recall  f1-score   support

     Class_A       0.50      0.67      0.57         3
     Class_B       0.50      0.33      0.40         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6

