In [1]:
# Notebook banner.
banner = "Word Embedding in NLP"
print(banner)

Word Embedding in NLP


# Bag of Words

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy corpus: four short sentences, each paired with a binary label.
texts = [
    "Rahul is a good boy",
    "Vinit is a bad boy",
    "Vinit watching Pushpa Movie",
    "War is a flop movie",
]
labels = [1, 0, 1, 0]
df = pd.DataFrame({"Text": texts, "Label": labels})
df

Unnamed: 0,Text,Label
0,Rahul is a good boy,1
1,Vinit is a bad boy,0
2,Vinit watching Pushpa Movie,1
3,War is a flop movie,0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn the vocabulary from the sentences and count each token,
# producing a sparse document-term matrix.
sentences = df['Text']
c_vector = CountVectorizer()
cv_df = c_vector.fit_transform(sentences)

In [5]:
# Show the sparse-matrix summary (shape and stored-element count), not the dense counts.
cv_df

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [6]:
# vocabulary_ maps each lowercased token to its column index in the count matrix.
# The single-character token "a" is absent from the printed vocabulary.
print(c_vector.vocabulary_)

{'rahul': 7, 'is': 4, 'good': 3, 'boy': 1, 'vinit': 8, 'bad': 0, 'watching': 10, 'pushpa': 6, 'movie': 5, 'war': 9, 'flop': 2}


In [7]:
# Densify for inspection: one row per sentence, one column per vocabulary index.
cv_df.toarray()

array([[0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [8]:
# Encode an unseen sentence with the vocabulary learned above (no re-fitting).
unseen_text = ['bad boy Pushpa']
c_vector.transform(unseen_text).toarray()

array([[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

# N-gram Technique

In [9]:
# Same toy corpus as in the Bag of Words section, rebuilt from (text, label) rows.
rows = [
    ("Rahul is a good boy", 1),
    ("Vinit is a bad boy", 0),
    ("Vinit watching Pushpa Movie", 1),
    ("War is a flop movie", 0),
]
df = pd.DataFrame(rows, columns=["Text", "Label"])

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only counts: ngram_range=(2, 2) keeps just pairs of adjacent tokens.
bigram_range = (2, 2)
ng_vector = CountVectorizer(ngram_range=bigram_range)
ng_df = ng_vector.fit_transform(df['Text'])

In [11]:
# Bigram vocabulary: each key is a pair of adjacent tokens, mapped to its column index.
print(ng_vector.vocabulary_)

{'rahul is': 7, 'is good': 5, 'good boy': 2, 'vinit is': 8, 'is bad': 3, 'bad boy': 0, 'vinit watching': 9, 'watching pushpa': 11, 'pushpa movie': 6, 'war is': 10, 'is flop': 4, 'flop movie': 1}


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigram-only counts: both ends of ngram_range are 3.
n = 3
tri_vector = CountVectorizer(ngram_range=(n, n))
tri_df = tri_vector.fit_transform(df['Text'])

In [13]:
# Trigram vocabulary: three-token sequences mapped to their column indices.
tri_vector.vocabulary_

{'rahul is good': 3,
 'is good boy': 2,
 'vinit is bad': 4,
 'is bad boy': 0,
 'vinit watching pushpa': 5,
 'watching pushpa movie': 7,
 'war is flop': 6,
 'is flop movie': 1}

# TF-IDF (Term Frequency-Inverse Document Frequency)

In [14]:
# Toy corpus for the TF-IDF section: sentences plus their binary labels.
corpus = [
    "Rahul is a good boy",
    "Vinit is a bad boy",
    "Vinit watching Pushpa Movie",
    "War is a flop movie",
]
sentiment = [1, 0, 1, 0]
df = pd.DataFrame.from_dict({"Text": corpus, "Label": sentiment})

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the sentences, then densify the sparse result for inspection.
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(df['Text'])
arr = tfidf_sparse.toarray()

In [16]:
# Dense TF-IDF matrix: rows follow the order of df['Text'].
arr

array([[0.        , 0.4530051 , 0.        , 0.57457953, 0.36674667,
        0.        , 0.        , 0.57457953, 0.        , 0.        ,
        0.        ],
       [0.61422608, 0.4842629 , 0.        , 0.        , 0.39205255,
        0.        , 0.        , 0.        , 0.4842629 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43779123, 0.55528266, 0.        , 0.43779123, 0.        ,
        0.55528266],
       [0.        , 0.        , 0.57457953, 0.        , 0.36674667,
        0.4530051 , 0.        , 0.        , 0.        , 0.57457953,
        0.        ]])

In [17]:
# TF-IDF weights of the first sentence (row 0 of the dense matrix).
arr[0]

array([0.        , 0.4530051 , 0.        , 0.57457953, 0.36674667,
       0.        , 0.        , 0.57457953, 0.        , 0.        ,
       0.        ])

In [18]:
import pandas as pd

# Load the labelled sentences; the relative path is resolved against the
# current working directory.
csv_path = 'Atul.csv'
sports = pd.read_csv(csv_path)
sports

Unnamed: 0.1,Unnamed: 0,id,category,text
0,0,1,sports,the batsman hit a six over midwicket
1,1,2,sports,the striker scored two goals in the match
2,2,3,sports,the bowler took a wicket and fans cheered
3,3,4,food,i love eating fresh apples and bananas
4,4,5,food,the chef cooked pasta with tomato sauce
5,5,6,food,the bakery sells bread and delicious cakes


In [19]:
# Remove the 'id' column and the 'Unnamed: 0' column (presumably an index that
# was written into the CSV; its values mirror the row index in the display).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
sports = sports.drop(columns=['id', 'Unnamed: 0'], errors='ignore')

In [20]:
# Confirm that 'id' and 'Unnamed: 0' are gone after the drop.
sports

Unnamed: 0,category,text
0,sports,the batsman hit a six over midwicket
1,sports,the striker scored two goals in the match
2,sports,the bowler took a wicket and fans cheered
3,food,i love eating fresh apples and bananas
4,food,the chef cooked pasta with tomato sauce
5,food,the bakery sells bread and delicious cakes


In [21]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer data; the call returns True when the package is available.
# NOTE(review): newer NLTK releases may load 'punkt_tab' instead. Confirm which
# resource the installed version of word_tokenize needs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GrowTech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
def _tokenize_lower(sentence):
    """Lowercase a sentence and split it into word tokens with NLTK."""
    return word_tokenize(sentence.lower())


# Add a column holding the token list of every sentence.
sports['token_text'] = sports['text'].apply(_tokenize_lower)

In [23]:
# Inspect the new token_text column next to the raw text.
sports

Unnamed: 0,category,text,token_text
0,sports,the batsman hit a six over midwicket,"[the, batsman, hit, a, six, over, midwicket]"
1,sports,the striker scored two goals in the match,"[the, striker, scored, two, goals, in, the, ma..."
2,sports,the bowler took a wicket and fans cheered,"[the, bowler, took, a, wicket, and, fans, chee..."
3,food,i love eating fresh apples and bananas,"[i, love, eating, fresh, apples, and, bananas]"
4,food,the chef cooked pasta with tomato sauce,"[the, chef, cooked, pasta, with, tomato, sauce]"
5,food,the bakery sells bread and delicious cakes,"[the, bakery, sells, bread, and, delicious, ca..."


In [24]:
# The raw sentence is no longer needed once token_text exists.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: running it twice no longer raises KeyError for the missing 'text'.
sports = sports.drop(columns='text', errors='ignore')

In [25]:
# Only category and token_text should remain after dropping the raw text.
sports

Unnamed: 0,category,token_text
0,sports,"[the, batsman, hit, a, six, over, midwicket]"
1,sports,"[the, striker, scored, two, goals, in, the, ma..."
2,sports,"[the, bowler, took, a, wicket, and, fans, chee..."
3,food,"[i, love, eating, fresh, apples, and, bananas]"
4,food,"[the, chef, cooked, pasta, with, tomato, sauce]"
5,food,"[the, bakery, sells, bread, and, delicious, ca..."


In [26]:
import gensim

In [27]:
from scipy.linalg import triu

In [34]:
from gensim.models import Word2Vec

# Train skip-gram Word2Vec on the tokenized sentences.
# seed + workers=1 make the training repeatable across Restart & Run All, so the
# similarity scores printed below stop changing between runs. Per the gensim
# docs, reproducibility across interpreter launches also needs a fixed
# PYTHONHASHSEED.
model = Word2Vec(sentences=sports['token_text'].to_list(),
                 vector_size = 100,   # dimensionality of each word vector
                 sg = 1,              # 1 selects skip-gram rather than CBOW
                 min_count=1,         # keep every token; the corpus is tiny
                 epochs=100,
                 seed=42,
                 workers=1)

In [35]:
# Shows only the object repr; the learned vectors are reached through model.wv.
model

<gensim.models.word2vec.Word2Vec at 0x2c21bf49990>

In [38]:
# Nearest neighbours by cosine similarity for a few probe words.
word_vectors = model.wv
batsman_neighbours = word_vectors.most_similar("batsman")
bakery_neighbours = word_vectors.most_similar("bakery", topn = 3)
goals_neighbours = word_vectors.most_similar("goals")

print("Batsman Similarity", batsman_neighbours)
print("bakery Similarity", bakery_neighbours)
print("goals Similarity", goals_neighbours)



Batsman Similarity [('bowler', 0.18579623103141785), ('six', 0.14559486508369446), ('and', 0.09101609885692596), ('love', 0.08407751470804214), ('striker', 0.08013216406106949), ('over', 0.0645677000284195), ('scored', 0.06405312567949295), ('goals', 0.05997705087065697), ('chef', 0.058389995247125626), ('sells', 0.052152711898088455)]
bakery Similarity [('bowler', 0.18673427402973175), ('bread', 0.17629645764827728), ('match', 0.17092013359069824)]
goals Similarity [('a', 0.22319838404655457), ('bananas', 0.21723012626171112), ('six', 0.2105235755443573), ('fresh', 0.1951323300600052), ('the', 0.18303881585597992), ('two', 0.18024171888828278), ('cakes', 0.1656389832496643), ('midwicket', 0.14453235268592834), ('pasta', 0.13958428800106049), ('scored', 0.12176245450973511)]


In [None]:
# Cosine similarity for selected word pairs, printed one per line.
word_pairs = [
    ('batsman', 'striker'),
    ('batsman', 'bowler'),
    ('bowler', 'wicket'),
]
for first_word, second_word in word_pairs:
    print(model.wv.similarity(first_word, second_word))

0.08013214
0.18579623
0.09681683


In [39]:
# Fetch the normalized vectors for the whole vocabulary as one array and display it.
keyed_vectors = model.wv
vec = keyed_vectors.get_normed_vectors()
vec

array([[-0.0032119 ,  0.00955157,  0.09004027, ..., -0.12114186,
         0.02412272,  0.12011483],
       [-0.13753872,  0.06439656,  0.0884329 , ..., -0.04589748,
        -0.15153791,  0.08663685],
       [ 0.00544391,  0.05763754, -0.10900236, ...,  0.00158385,
         0.14231272, -0.10410504],
       ...,
       [ 0.08372267, -0.06036152, -0.07101382, ..., -0.12738474,
         0.16614445, -0.02127205],
       [-0.02318393, -0.06870817, -0.0757602 , ..., -0.066775  ,
        -0.04577483,  0.10464286],
       [ 0.14328322, -0.11054358,  0.1055327 , ...,  0.14548272,
        -0.06118113, -0.16829276]], dtype=float32)

In [None]:
import numpy as np
import face_detection
# List the detector names the face_detection package exposes.
print(face_detection.available_detectors)
detector = face_detection.build_detector(
  "DSFDDetector", confidence_threshold=.5, nms_iou_threshold=.3)
# [batch size, height, width, 3]
images_dummy = np.zeros((2, 512, 512, 3))

# Run detection on the dummy batch built above. The previous argument `im`
# was never defined, so the call raised NameError.
detections = detector.batched_detect(images_dummy)