### 17/03/2023

In [1]:
# Import Necessary Libraries

import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

import spacy

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora

import textblob
from textblob import TextBlob

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
print("nltk version: ", nltk.__version__)
print("textblob version: ", textblob.__version__)
print("sklearn version: ", sklearn.__version__)
print("gensim version: ", gensim.__version__)
print("spacy version: ", spacy.__version__)

# nltk version:  3.8.1
# textblob version:  0.17.1
# sklearn version:  1.2.1
# gensim version:  4.3.1
# spacy version:  3.5.1

nltk version:  3.8.1
textblob version:  0.17.1
sklearn version:  1.2.1
gensim version:  4.3.1
spacy version:  3.5.1


## Extracting Noun Phrases

In [3]:
text = "John is learning natural language processing"

blob = TextBlob(text=text)

In [4]:
for np in blob.noun_phrases:
    print(np)

john
natural language processing


## Finding Similarity Between Texts

In [5]:
documents = (
        "I like NLP",
        "I am exploring NLP",
        "I am a beginner in NLP",
        "I want to learn NLP",
        "I like advanced NLP")

In [6]:
# Create a TfidfVectorizer object

tf_idf_vectorizer = TfidfVectorizer()

In [7]:
tf_idf_matrix = tf_idf_vectorizer.fit_transform(raw_documents=documents)

In [8]:
# All the unique words and their unique keys

tf_idf_vectorizer.vocabulary_

{'like': 6,
 'nlp': 7,
 'am': 1,
 'exploring': 3,
 'beginner': 2,
 'in': 4,
 'want': 9,
 'to': 8,
 'learn': 5,
 'advanced': 0}

In [9]:
len(tf_idf_vectorizer.vocabulary_)

10

In [10]:
# All the unique words

tf_idf_vectorizer.get_feature_names_out()

array(['advanced', 'am', 'beginner', 'exploring', 'in', 'learn', 'like',
       'nlp', 'to', 'want'], dtype=object)

In [11]:
# sparse matrix

tf_idf_matrix

<5x10 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [12]:
# (5,10) means total sentence is 5 and total unique words are in sentences is 10
# 10 represent the features

tf_idf_matrix.shape

(5, 10)

In [13]:
tf_idf_matrix.toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.861037  , 0.50854232, 0.        , 0.        ])

In [14]:
tf_idf_matrix[0:1].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.861037  , 0.50854232, 0.        , 0.        ]])

In [15]:
# compute similarity for first sentence with rest of the sentences

# If we clearly observe, the first sentence and last sentence have higher
# similarity compared to the rest of the sentences.

cosine_similarity(X=tf_idf_matrix[0:1], Y=tf_idf_matrix)

array([[1.        , 0.17682765, 0.14284054, 0.13489366, 0.68374784]])

In [16]:
# same result

cosine_similarity(X=tf_idf_matrix[0:1].toarray(), Y=tf_idf_matrix.toarray())

array([[1.        , 0.17682765, 0.14284054, 0.13489366, 0.68374784]])

## By Phonetic matching

In [17]:
# I will do it later

## Tagging Part of Speech

In [18]:
Text = "I love NLP and I will learn NLP in 2 month"

In [19]:
all_stopwords = set(stopwords.words('english'))

len(all_stopwords)

179

In [20]:
# Tokenize the text into sentence

sent_tokens = sent_tokenize(text=Text, language="english")

sent_tokens

['I love NLP and I will learn NLP in 2 month']

In [21]:
for word in sent_tokens:
    words = word_tokenize(text=word, language="english")
    words = [word for word in words if word not in all_stopwords]
    # POS tagger
    tags = nltk.pos_tag(tokens=words)


print(tags)

[('I', 'PRP'), ('love', 'VBP'), ('NLP', 'NNP'), ('I', 'PRP'), ('learn', 'VBP'), ('NLP', 'RB'), ('2', 'CD'), ('month', 'NN')]


## Extract Entities from Text

In [22]:
sent = "John is studying at Stanford University in California"

In [23]:
# make word tokens

tokens = word_tokenize(text=sent)
tokens

['John', 'is', 'studying', 'at', 'Stanford', 'University', 'in', 'California']

In [24]:
# make pos tag using word tokens

tagged_tokens = nltk.pos_tag(tokens=tokens)
tagged_tokens

[('John', 'NNP'),
 ('is', 'VBZ'),
 ('studying', 'VBG'),
 ('at', 'IN'),
 ('Stanford', 'NNP'),
 ('University', 'NNP'),
 ('in', 'IN'),
 ('California', 'NNP')]

In [25]:
nltk.ne_chunk(tagged_tokens=tagged_tokens, binary=False)

ModuleNotFoundError: No module named 'svgling'

Tree('S', [Tree('PERSON', [('John', 'NNP')]), ('is', 'VBZ'), ('studying', 'VBG'), ('at', 'IN'), Tree('ORGANIZATION', [('Stanford', 'NNP'), ('University', 'NNP')]), ('in', 'IN'), Tree('GPE', [('California', 'NNP')])])

### Using SpaCy

In [26]:
nlp = spacy.load("en_core_web_sm")

In [27]:
# Read/create a sentence

doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')

In [28]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
10000 42 47 MONEY
New york 51 59 GPE


## Extracting Topics from Text

In [29]:
doc1 = "I am learning NLP, it is very interesting and exciting.it includes machine learning and deep learning"
doc2 = "My father is a data scientist and he is nlp expert"
doc3 = "My sister has good exposure into android development"

In [30]:
doc1

'I am learning NLP, it is very interesting and exciting.it includes machine learning and deep learning'

In [31]:
doc_list = [doc1, doc2, doc3]

doc_list

['I am learning NLP, it is very interesting and exciting.it includes machine learning and deep learning',
 'My father is a data scientist and he is nlp expert',
 'My sister has good exposure into android development']

In [32]:
# Cleaning and preprocessing

def text_cleaner(docs):
    preprocess_text = []
    for doc in docs:
        # Simple text preprocessing like punctuation, number, make lower case etc
        text = simple_preprocess(doc=doc)
        # Remove the stopwords
        text = [word for word in text if word not in all_stopwords]
        preprocess_text.append(text)
        
    return preprocess_text

In [33]:
result = text_cleaner(docs=doc_list)

In [34]:
result

[['learning',
  'nlp',
  'interesting',
  'exciting',
  'includes',
  'machine',
  'learning',
  'deep',
  'learning'],
 ['father', 'data', 'scientist', 'nlp', 'expert'],
 ['sister', 'good', 'exposure', 'android', 'development']]

#### Preparing document term matrix

In [35]:
# Creating the term dictionary of our corpus, where every
# unique term is assigned an index.

dictionary = corpora.Dictionary(documents=result)

In [36]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x266cd71aa90>

In [37]:
# Converting a list of documents (corpus) into Document-Term
# Matrix using dictionary prepared above.

doc_term_matrix = [dictionary.doc2bow(document=doc) for doc in result]

doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]

#### LDA model

In [38]:
# Creating the object for LDA model using gensim library

lda = gensim.models.ldamodel.LdaModel

In [39]:
# Running and Training LDA model on the document term matrix for 3 topics.

lda_model = lda(corpus=doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)

In [40]:
print(lda_model.print_topics())

[(0, '0.233*"learning" + 0.093*"exciting" + 0.093*"deep" + 0.093*"machine" + 0.093*"includes" + 0.093*"interesting" + 0.093*"nlp" + 0.023*"father" + 0.023*"data" + 0.023*"expert"'), (1, '0.129*"android" + 0.129*"sister" + 0.129*"good" + 0.129*"exposure" + 0.129*"development" + 0.032*"nlp" + 0.032*"father" + 0.032*"scientist" + 0.032*"data" + 0.032*"expert"'), (2, '0.129*"nlp" + 0.129*"father" + 0.129*"data" + 0.129*"scientist" + 0.129*"expert" + 0.032*"good" + 0.032*"exposure" + 0.032*"development" + 0.032*"android" + 0.032*"sister"')]


In [41]:
print(lda_model.print_topics()[0])

(0, '0.233*"learning" + 0.093*"exciting" + 0.093*"deep" + 0.093*"machine" + 0.093*"includes" + 0.093*"interesting" + 0.093*"nlp" + 0.023*"father" + 0.023*"data" + 0.023*"expert"')


`All the weights associated with the topics from the sentence seem
almost similar. You can perform this on huge data to extract significant
topics. The whole idea to implement this on sample data is to make you
familiar with it, and you can use the same code snippet to perform on the
huge data for significant results and insights.`