<hr style="border:2px solid gray">


## **STEP: 0/4** - Install prerequisites and Import libraries

# Requirements

In [None]:
# download spacy module

%%time

!python -m spacy download en_core_web_sm 
# If working on Colab, restart runtime after this step or else Colab won't find spacy

In [4]:
# Install Prerequesties
!pip install -r requirements.txt 


# Import Libraries and Load Packages

In [25]:
%%time
"""
Import Statements
"""

# Base
import re
import pandas as pd
import numpy as np

# NLP Libraries
import spacy

import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


CPU times: user 27 µs, sys: 6 µs, total: 33 µs
Wall time: 33.9 µs


In [20]:
# Update the DATA_PATH variable

import sys

if 'google.colab' in sys.modules:
  # If you're on Colab:
  DATA_PATH = 'https://raw.githubusercontent.com/bloominstituteoftechnology/ds_code_along_unit_4/main/data/COVID-19-Twitter-India/'
else:
  # If you're working locally:
  DATA_PATH = '..../data/'

<hr style="border:2px solid gray">

## **STEP: 1/4** - Tokenization and stop words

In [26]:
text = "Natural language processing (NLP) is a field " + \
       "of computer science, artificial intelligence " + \
       "and computational linguistics concerned with " + \
       "the interactions between computers and human " + \
       "(natural) languages, and, in particular, " + \
       "concerned with programming computers to " + \
       "fruitfully process large natural language " + \
       "corpora. Challenges in natural language " + \
       "processing frequently involve natural " + \
       "language understanding, natural language" + \
       "generation frequently from formal, machine" + \
       "-readable logical forms, connecting language " + \
       "and machine perception, managing human-" + \
       "computer dialog systems, or some combination " + \
       "thereof."

In [27]:
text

'Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural languagegeneration frequently from formal, machine-readable logical forms, connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.'

# Tokenization using using python (case normalization and regex)

In [28]:
sample_text = re.sub('[,().]','',text)

In [29]:
sample_text= re.sub('-',' ',sample_text)

In [30]:
sample_text

'Natural language processing NLP is a field of computer science artificial intelligence and computational linguistics concerned with the interactions between computers and human natural languages and in particular concerned with programming computers to fruitfully process large natural language corpora Challenges in natural language processing frequently involve natural language understanding natural languagegeneration frequently from formal machine readable logical forms connecting language and machine perception managing human computer dialog systems or some combination thereof'

In [31]:
sample_tokens = sample_text.lower().split()

In [32]:
sample_tokens


['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'a',
 'field',
 'of',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'and',
 'computational',
 'linguistics',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'natural',
 'languages',
 'and',
 'in',
 'particular',
 'concerned',
 'with',
 'programming',
 'computers',
 'to',
 'fruitfully',
 'process',
 'large',
 'natural',
 'language',
 'corpora',
 'challenges',
 'in',
 'natural',
 'language',
 'processing',
 'frequently',
 'involve',
 'natural',
 'language',
 'understanding',
 'natural',
 'languagegeneration',
 'frequently',
 'from',
 'formal',
 'machine',
 'readable',
 'logical',
 'forms',
 'connecting',
 'language',
 'and',
 'machine',
 'perception',
 'managing',
 'human',
 'computer',
 'dialog',
 'systems',
 'or',
 'some',
 'combination',
 'thereof']

# Tokenization using nltk
Here we will see how to tokenize the text i.e. divide whole text into smaller chunks either chunks of sentences or chunks of words. There are two ways of tokenization i.e. sentence tokenization and word tokenization.

In [None]:
#download punkt to help with tokenization

nltk.download('punkt')

In [33]:
# Here we are considering sample corpus for sentence tokenization

tokenized_sents = sent_tokenize(text)
tokenized_words = word_tokenize(text)

print(tokenized_sents,'\n',tokenized_words)

['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.', 'Challenges in natural language processing frequently involve natural language understanding, natural languagegeneration frequently from formal, machine-readable logical forms, connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.'] 
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to

# Remove stop words from a list of tokens
Stopwords are the words which doesn't make that much sense when we play with sentences. For e.g the, has, have etc. So Let's remove it from list of tokens we generated above to cut down training time of building of machine learning model for downstream tasks.

Let's import stopwords using nltk library and filter from list of word tokens.

In [None]:
nltk.download('stopwords')

In [34]:
#list of stop words
stop_words = stopwords.words('English')

In [35]:
# list of punctuations
punctuation_list = list(string.punctuation)

In [36]:
useless_words = stop_words + punctuation_list

In [37]:
filtered_tokenized_words = [word.lower() for word in tokenized_words if not word.lower() in useless_words]
print(filtered_tokenized_words) 

['natural', 'language', 'processing', 'nlp', 'field', 'computer', 'science', 'artificial', 'intelligence', 'computational', 'linguistics', 'concerned', 'interactions', 'computers', 'human', 'natural', 'languages', 'particular', 'concerned', 'programming', 'computers', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', 'challenges', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'languagegeneration', 'frequently', 'formal', 'machine-readable', 'logical', 'forms', 'connecting', 'language', 'machine', 'perception', 'managing', 'human-computer', 'dialog', 'systems', 'combination', 'thereof']


<hr style="border:2px solid gray">

## **STEP: 2/4** - Lemmatization and Vectorization


# Stemming and Lemmatization
Stemming or lemmatization means generation of root form of inflected words. For e.g charge is derived by stemming or lemmatization of charging,charges etc.

* There is little theoritical difference between these two i.e stemming may not result actual word but lemmatization will result actual word as root. So we can say that lemmatization is specialized form of stemming.

In [None]:
nltk.download('wordnet') # WordNet is a large word database of English Nouns, Adjectives, Adverbs and Verbs. 
nltk.download('omw-1.4') # Multilingual WordNet data

In [38]:
# Stemming using NLTK

ps = PorterStemmer()
words = ['Programs','Programming','Charging','Studying','Coding']
for w in words:
    print(ps.stem(w))

program
program
charg
studi
code


In [39]:
# Lemmatization using NLTK

wnl = WordNetLemmatizer()

words_ = ['rocks','programs','guests','games']
for w in words_:
    print(wnl.lemmatize(w))


rock
program
guest
game


# Vectorization

* Count Vectorizer 

In [40]:
count_vectorizer = CountVectorizer(stop_words='english')
vectors = count_vectorizer.fit(tokenized_sents)
dtm=vectors.transform(tokenized_sents)


print('After vectorization')
print(dtm.toarray())

After vectorization
[[1 0 0 1 1 2 2 0 1 0 1 0 0 0 1 1 1 1 0 2 0 1 1 1 0 0 0 3 1 1 0 1 1 1 0 1
  0 0 0]
 [0 1 1 0 1 0 0 1 0 1 0 1 1 2 0 1 0 0 1 3 1 0 0 0 1 2 1 3 0 0 1 0 1 0 1 0
  1 1 1]]


In [41]:
# print vocabulary
sorted(vectors.vocabulary_.keys())

['artificial',
 'challenges',
 'combination',
 'computational',
 'computer',
 'computers',
 'concerned',
 'connecting',
 'corpora',
 'dialog',
 'field',
 'formal',
 'forms',
 'frequently',
 'fruitfully',
 'human',
 'intelligence',
 'interactions',
 'involve',
 'language',
 'languagegeneration',
 'languages',
 'large',
 'linguistics',
 'logical',
 'machine',
 'managing',
 'natural',
 'nlp',
 'particular',
 'perception',
 'process',
 'processing',
 'programming',
 'readable',
 'science',
 'systems',
 'thereof',
 'understanding']

In [None]:
vectors.get_feature_names()

In [43]:
dtm_df= pd.DataFrame(dtm.todense(), columns=vectors.get_feature_names_out()) # remember to convert sparse matrix to dense

In [44]:
dtm_df


Unnamed: 0,artificial,challenges,combination,computational,computer,computers,concerned,connecting,corpora,dialog,...,particular,perception,process,processing,programming,readable,science,systems,thereof,understanding
0,1,0,0,1,1,2,2,0,1,0,...,1,0,1,1,1,0,1,0,0,0
1,0,1,1,0,1,0,0,1,0,1,...,0,1,0,1,0,1,0,1,1,1


* TF-IDF Vectorizer

In [45]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',use_idf = True, analyzer = 'word',max_df=0.98, min_df=0.2) # try ngram_range=(1,2)
tfidf_vectors = tfidf_vectorizer.fit_transform(tokenized_sents)

In [46]:
dtm_tfid_df = pd.DataFrame(tfidf_vectors.todense(), columns=tfidf_vectorizer.get_feature_names_out())

In [47]:
dtm_tfid_df

Unnamed: 0,artificial,challenges,combination,computational,computers,concerned,connecting,corpora,dialog,field,...,nlp,particular,perception,process,programming,readable,science,systems,thereof,understanding
0,0.208514,0.0,0.0,0.208514,0.417029,0.417029,0.0,0.208514,0.0,0.208514,...,0.208514,0.208514,0.0,0.208514,0.208514,0.0,0.208514,0.0,0.0,0.0
1,0.0,0.208514,0.208514,0.0,0.0,0.0,0.208514,0.0,0.208514,0.0,...,0.0,0.0,0.208514,0.0,0.0,0.208514,0.0,0.208514,0.208514,0.208514


In [48]:
print('Features names', tfidf_vectorizer.get_feature_names_out())

Features names ['artificial' 'challenges' 'combination' 'computational' 'computers'
 'concerned' 'connecting' 'corpora' 'dialog' 'field' 'formal' 'forms'
 'frequently' 'fruitfully' 'intelligence' 'interactions' 'involve'
 'languagegeneration' 'languages' 'large' 'linguistics' 'logical'
 'machine' 'managing' 'nlp' 'particular' 'perception' 'process'
 'programming' 'readable' 'science' 'systems' 'thereof' 'understanding']


<hr style="border:2px solid gray">

## **STEP: 3/4** - Working with documents


# Working with documents

In [52]:
# Let's use spacy pretrained word embeddings models trained on english wikipedia text
nlp = spacy.load('en_core_web_sm') # here en_core_web_sm is small pretrained word embedding model, en_core_web_md -> medium size pretrained model and en_core_web_lg -> large size pretrained model

In [49]:
# Let's work with all the tweet texts from the dataset 
data = pd.read_csv(DATA_PATH + 'tweets_2020-05-29-20.csv')
docs = list(data['full_retweet_text'])

docs = [str(tweet) for tweet in docs]

In [50]:
# we have 333 tweets to work with

len(docs)

333

In [None]:
# Use Spacy to Tokenize the tweets: Remove the stop words and the punctuation

spacy_stop_words = nlp.Defaults.stop_words

# Initialize a list to hold the tokens
all_doc_tokens = []

# Loop over each tweet in the document (doc)
for tweet in nlp.pipe(docs):
    for token in tweet:
        if (not token.is_punct) & (token.text.lower() not in spacy_stop_words):
            all_doc_tokens.append(token.lemma_.lower())

    
print(all_doc_tokens)

# Represent document as vector
Let's say we have bunch of sentences in a document and we wanna do classify texts then how we can fed input to the machine learning model. As system understands numeric values, we need to convert text into numeric as feature vectors representation so that it can be fed to model.

* Count Vectorizer

In [54]:
# Initialize vectorizer
count_vectorizer = CountVectorizer(stop_words='english')
vectors = count_vectorizer.fit(docs)
dtm = vectors.transform(docs)


print('After vectorization')
print(dtm.toarray())

After vectorization
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


* TF-IDF Vectorizer

In [55]:
tfidf_vectorizer = TfidfVectorizer(use_idf = True, analyzer = 'word', ngram_range=(1,2))
tfidf_vectorizer.fit(docs)
tfidf_vectors = tfidf_vectorizer.fit_transform(docs)
print('TF-IDF Vectors', tfidf_vectors.toarray())
print('Features names', tfidf_vectorizer.get_feature_names_out())

TF-IDF Vectors [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Features names ['000' '000 american' '000 cured' ... 'रष bjp' 'शक' 'शक र_शर']



<hr style="border:2px solid gray">

## **STEP: 4/4** - Query Similar Documents and apply Word Embeddings Model


# Query documents by similarity

In [59]:

# Let's consider first sentence in docs as query sentence
query = tfidf_vectors[0:1] # you can try selecting all tweets too
similarity_matrix = cosine_similarity(query,tfidf_vectors.toarray())

In [60]:
df = pd.DataFrame(similarity_matrix)

In [61]:
df 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,323,324,325,326,327,328,329,330,331,332
0,1.0,0.0,0.008562,0.022453,0.009981,0.035421,0.01643,0.019815,0.003262,0.023725,...,0.0,0.01228,0.0,0.018053,0.030093,0.042061,0.0,0.020359,0.006193,0.023989


# Apply Word Embeddings Model to create Document Vectors

Let's explore another method to create document vectors i.e. pretrained word embeddings models to get contextual features from given text sequences. These document vectors can help to classify sentences and many other downstream tasks further.

In [62]:
# Here for each word in sentence particular value is being assigned as per spacy pretrained word embedding model
embed_vectors = [nlp(tweet).vector for tweet in docs]
print(embed_vectors)


[array([ 0.1781972 ,  0.03810461,  0.02440613, -0.3426875 ,  0.1372305 ,
        0.10989691, -0.47072077,  0.20923725,  0.19090615,  0.1750239 ,
        0.29537416, -0.03672197,  0.50558084,  0.204298  ,  0.1838027 ,
       -0.43580192, -0.1882704 , -0.03998079,  0.13696824,  0.4407724 ,
       -0.22326969,  0.01215597, -0.45642847, -0.03065534,  0.08043876,
        0.19559655, -0.3935933 ,  0.21441841, -0.21127547,  0.72149235,
       -0.5664713 ,  0.42360315, -0.544156  , -0.29419294,  0.36287487,
       -0.5464101 , -0.17700328, -0.03893398, -0.35411596,  0.06650092,
       -0.2559975 ,  0.16325662, -0.3731788 , -0.5642181 ,  0.18970577,
       -0.2450711 ,  0.3713584 ,  0.39287716,  0.00518529, -0.5599252 ,
        0.21150525,  0.18735416, -0.1235215 , -0.32119626, -0.0785481 ,
       -0.22418298, -0.08879691,  0.19711718,  0.41132307, -0.16733766,
       -0.0305415 , -0.2930893 , -0.0321517 ,  0.00598675,  0.39748073,
        0.02183857,  0.25851294,  0.19148788,  0.17034008,  0.0

In [64]:
df_embed = pd.DataFrame(embed_vectors)

In [65]:
df_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.178197,0.038105,0.024406,-0.342687,0.137231,0.109897,-0.470721,0.209237,0.190906,0.175024,...,0.206177,-0.074054,-0.624755,0.336991,-0.202258,-0.094049,-0.237105,0.344203,0.324530,0.345032
1,2.698226,0.182291,0.196163,-1.254865,-1.533015,0.206064,-0.537408,1.261437,1.196564,-0.431594,...,-0.785855,-0.521321,-0.729125,0.086701,-0.109811,0.991540,0.884877,2.728298,0.104550,1.418898
2,0.768992,-0.068758,-0.103599,-0.010683,0.115504,0.171835,-0.651004,-0.131217,0.298662,0.163337,...,-0.319197,-0.433490,-0.599921,-0.234984,0.108900,-0.132587,0.022678,0.369493,0.096096,-0.263887
3,0.167787,-0.070213,0.061516,0.067428,-0.025000,0.063575,-0.366887,0.046630,0.264854,0.086271,...,0.113987,0.006707,-0.262098,0.158845,-0.210786,-0.346910,0.045913,-0.128698,0.146041,0.126022
4,0.184401,0.286167,0.370524,-0.259738,-0.294947,0.514672,-0.428766,-0.160979,0.845089,0.372120,...,0.034547,0.167993,-0.241829,-0.317188,0.159859,-0.433067,0.054420,0.024135,0.573659,-0.369331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,-0.059690,0.303378,0.240624,0.018851,-0.261259,-0.114480,-0.374483,-0.003789,0.303777,0.228572,...,0.598605,-0.113086,-0.142512,-0.024820,-0.241463,-0.467789,-0.195620,-0.031800,0.183066,-0.035960
329,2.698226,0.182291,0.196163,-1.254865,-1.533015,0.206064,-0.537408,1.261437,1.196564,-0.431594,...,-0.785855,-0.521321,-0.729125,0.086701,-0.109811,0.991540,0.884877,2.728298,0.104550,1.418898
330,0.001256,0.029498,0.121657,0.042041,-0.123596,-0.249053,-0.221475,-0.204886,0.197770,0.226347,...,0.601223,-0.149356,-0.121632,0.187688,-0.081365,-0.429411,0.065073,-0.199160,0.093544,-0.148340
331,0.345819,-0.031832,0.125749,-0.301722,-0.319915,0.271491,0.004907,-0.033300,0.011422,-0.066301,...,-0.057659,0.159972,-0.264241,0.258170,0.117200,-0.377605,0.090091,0.028164,0.407784,-0.060219
