In [1]:
#TF:(term frequency) frequency of a word in sentence / total number of words in the sentence
#IDF(inverse document frequency): log of (total number of sentences / number of sentences containing the word)
#TF-IDF of values: product of TF and IDF for the values

In [2]:
#bag of words in python
!pip install bs4
!pip install nltk
import nltk
import numpy as np
import random
import string
import bs4 as bs
import re
import urllib.request
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import spacy



In [3]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/COVID-19_pandemic_in_India')  
raw_html = raw_html.read()

In [4]:
article_html = bs.BeautifulSoup(raw_html)
article_paragraphs = article_html.find_all('p')
article_text = ''

In [5]:
for para in article_paragraphs:  
    article_text += para.text

In [6]:
nltk.download('punkt')
corpus = nltk.sent_tokenize(article_text)

[nltk_data] Downloading package punkt to /home/bavanya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
for i in range(len(corpus )):
    corpus [i] = corpus [i].lower()
    corpus [i] = re.sub(r'\W',' ',corpus [i])
    corpus [i] = re.sub(r'\s+',' ',corpus [i])

In [8]:
print(len(corpus))

332


In [9]:
print(corpus[1])

the first case of covid 19 in india which originated from china was reported on 30 january 2020 


In [10]:
wordfreq = {}
for sentence in corpus:
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1

In [11]:
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

In [12]:
sentence_vectors = []
for sentence in corpus:
    sentence_tokens = nltk.word_tokenize(sentence)
    sent_vec = []
    for token in most_freq:
        if token in sentence_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

In [13]:
sentence_vectors = np.asarray(sentence_vectors)

In [14]:
#Word to vector using Gensim library: Word2Vec embedding approach uses 
#neural networks and deep learning to convert words to corresponding
#vectors in such a way that the semantically similar vectors are close to each other
#in N dimensional space 

In [15]:
#1. Skip Gram model
#2. Continuous bag of words model
# in the word2vec model the dimensions of word is very less compared to bag of words 
#or TF_IDF models as the dimension represents aspect of the word in this model
#so semantic information of the word is not lost here unlike in other models

In [16]:
all_words = [nltk.word_tokenize(sent) for sent in corpus]

In [17]:
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in nltk.corpus.stopwords.words('english')]

In [18]:
word2vec = Word2Vec(all_words, min_count=2)#min count removes uncommon words
#with occurrence less than 2 in all_words

In [19]:
vocabulary = word2vec.wv.vocab
print(vocabulary)

{'covid': <gensim.models.keyedvectors.Vocab object at 0x7ff957030b00>, '19': <gensim.models.keyedvectors.Vocab object at 0x7ff957030b70>, 'pandemic': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4320>, 'india': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4be0>, 'part': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4b38>, 'coronavirus': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4dd8>, 'disease': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4b70>, '2019': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4eb8>, 'caused': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4f60>, 'severe': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4ac8>, 'acute': <gensim.models.keyedvectors.Vocab object at 0x7ff93a6c4cf8>, 'respiratory': <gensim.models.keyedvectors.Vocab object at 0x7ff93a72e780>, '2': <gensim.models.keyedvectors.Vocab object at 0x7ff93a72e320>, 'sars': <gensim.models.keyedvectors.Vocab object at 0x7ff93a72e748>, 'cov': <ge

In [20]:
v1 = word2vec.wv['covid']

In [21]:
print(v1)

[ 1.2976769e-03 -5.1747584e-03 -3.9950406e-04 -2.6983595e-03
  2.5110752e-03 -5.7730349e-03  3.1711319e-03  3.1592313e-03
  4.6426216e-03 -5.0211716e-03  1.1250640e-03  5.9835934e-03
 -2.3522126e-03 -7.4082543e-04  1.0398029e-03  4.0857173e-03
 -9.0996438e-04 -2.1599000e-03 -3.2720268e-03 -7.7306456e-04
  1.0918799e-03 -1.5140178e-03  2.4075103e-03  2.3662290e-04
  2.1754182e-03 -3.2091916e-03  1.1629497e-03 -3.4771576e-03
 -5.3478748e-04 -3.3749745e-03  2.0628801e-04 -3.6910886e-03
 -7.6705188e-04 -5.7392876e-04 -1.3933936e-03 -2.3743552e-03
 -1.4813132e-04  4.0441528e-03  3.0693684e-03  2.7743443e-03
  2.8050181e-03  3.3243261e-03 -3.0142777e-03 -2.1121630e-03
  3.2214369e-03  4.2142076e-03  3.8554322e-03 -1.7996834e-03
  2.5294649e-03  8.5355954e-05  4.3402524e-03 -1.4267372e-03
  1.4653469e-03 -5.7587842e-04 -1.4257132e-03  5.1422669e-03
 -3.9166049e-03  3.4902999e-03  1.6070030e-03  1.2476518e-03
  1.5714469e-04  4.8480644e-03  8.7638659e-04  1.1975091e-03
 -4.1402597e-03  2.13339

In [22]:
sim_words = word2vec.wv.most_similar('covid')

In [23]:
print(sim_words)

[('shared', 0.3834906220436096), ('per', 0.3379613161087036), ('2', 0.32632824778556824), ('nationwide', 0.3199097812175751), ('march', 0.31761130690574646), ('three', 0.3151320517063141), ('month', 0.3115532398223877), ('york', 0.30023646354675293), ('may', 0.2909044623374939), ('reported', 0.2820582389831543)]


In [24]:
###spacy library and lemmatization and stemming

In [25]:
import en_core_web_sm
nlp = en_core_web_sm.load()
sp  = spacy.load('en_core_web_sm')

In [26]:
sentence = sp(u'The cave you fear to enter holds the treasure you seek')

In [27]:
for word in sentence:
    print(word.text)

The
cave
you
fear
to
enter
holds
the
treasure
you
seek


In [28]:
for word in sentence:
    print(word.text,  word.pos_)

The DET
cave NOUN
you PRON
fear VERB
to PART
enter VERB
holds VERB
the DET
treasure NOUN
you PRON
seek VERB


In [29]:
# dependency parsing using dep_
for word in sentence:
    print(word.dep_)

det
nsubj
nsubj
relcl
aux
xcomp
ROOT
det
dobj
nsubj
relcl


In [None]:
##named entity recognition
