## Spacy Embeddings & similarity check

In [9]:
import spacy

In [10]:
nlp = spacy.blank("en")

In [11]:
doc = nlp("Test Experiment sentence for tokenization")

In [12]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Test  
Experiment  
sentence  
for  
tokenization  


In [13]:
token = doc[1]
token.text

'Experiment'

In [14]:
span = doc[1:3]  ## token slice, but not string slice
print(span.text)

Experiment sentence


In [15]:
## Boolean checking of characteristics of tokens

print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])
print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4]
Text:     ['Test', 'Experiment', 'sentence', 'for', 'tokenization']
is_alpha: [True, True, True, True, True]
is_punct: [False, False, False, False, False]
like_num: [False, False, False, False, False]


In [16]:
nlp = spacy.load('en_core_web_md')

In [17]:
corpus = nlp(u"Apple and Samsung are mobile brands")

In [18]:
Apple = corpus[0]
Samsung = corpus[2]
mobile = corpus[4]
brands = corpus[5]

In [22]:
emb = Apple.vector

print(emb)
print("Embedding dimension: ", len(emb))  ## embedding dimension

[-3.6391e-01  4.3771e-01 -2.0447e-01 -2.2889e-01 -1.4227e-01  2.7396e-01
 -1.1435e-02 -1.8578e-01  3.7361e-01  7.5339e-01 -3.0591e-01  2.3741e-02
 -7.7876e-01 -1.3802e-01  6.6992e-02 -6.4303e-02 -4.0024e-01  1.5309e+00
 -1.3897e-02 -1.5657e-01  2.5366e-01  2.1610e-01 -3.2720e-01  3.4974e-01
 -6.4845e-02 -2.9501e-01 -6.3923e-01 -6.2017e-02  2.4559e-01 -6.9334e-02
 -3.9967e-01  3.0925e-02  4.9033e-01  6.7524e-01  1.9481e-01  5.1488e-01
 -3.1149e-01 -7.9939e-02 -6.2096e-01 -5.3277e-03 -1.1264e-01  8.3528e-02
 -7.6947e-03 -1.0788e-01  1.6628e-01  4.2273e-01 -1.9009e-01 -2.9035e-01
  4.5630e-02  1.0120e-01 -4.0855e-01 -3.5000e-01 -3.6175e-01 -4.1396e-01
  5.9485e-01 -1.1524e+00  3.2424e-02  3.4364e-01 -1.9209e-01  4.3255e-02
  4.9227e-02 -5.4258e-01  9.1275e-01  2.9576e-01  2.3658e-02 -6.8737e-01
 -1.9503e-01 -1.1059e-01 -2.2567e-01  2.4180e-01 -3.1230e-01  4.2700e-01
  8.3952e-02  2.2703e-01  3.0581e-01 -1.7276e-01  3.2536e-01  5.4696e-03
 -3.2745e-01  1.9439e-01  2.2616e-01  7.4742e-02  2

#### POS Tagging

In [25]:
for token in corpus:
    print(token.text, "---> ", token.pos_)

Apple --->  PROPN
and --->  CCONJ
Samsung --->  PROPN
are --->  AUX
mobile --->  ADJ
brands --->  NOUN


#### Linguistic tagging (NP -> Noun Phrase, VB -> verb phrase .. etc)

In [29]:
## Syntactic tagging also represents relation between tokens

for token in corpus:
    print(token.text, "---> ", token.pos_, "\tTags --> ", token.tag_, "\tSyntactic dependency: " , token.dep_)

Apple --->  PROPN 	Tags -->  NNP 	Syntactic dependency:  nsubj
and --->  CCONJ 	Tags -->  CC 	Syntactic dependency:  cc
Samsung --->  PROPN 	Tags -->  NNP 	Syntactic dependency:  conj
are --->  AUX 	Tags -->  VBP 	Syntactic dependency:  ROOT
mobile --->  ADJ 	Tags -->  JJ 	Syntactic dependency:  amod
brands --->  NOUN 	Tags -->  NNS 	Syntactic dependency:  attr


In [30]:
## Visualizing the dependency parsing

from spacy import displacy

displacy.serve(corpus, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#### NER / Label Tagging

In [42]:
for ent in corpus.ents:
    print(ent.text,  ent.label_, "\tstart_pos" , ent.start_char, "end_pos", ent.end_char,)

Apple ORG 	start_pos 0 end_pos 5
Samsung ORG 	start_pos 10 end_pos 17


In [45]:
displacy.serve(corpus, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


##### Checking words if they are out of vocabulary

In [46]:
for token in corpus:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

Apple True 7.1346846 False
and True 4.6577983 False
Samsung True 7.77001 False
are True 5.41568 False
mobile True 6.9603257 False
brands True 6.6324863 False


#### Similary between two text segments

In [49]:
text1 = nlp("Dell Laptop for sale at Rs 49999, i3, 1TB Memory")
text2 = nlp("i3 processor laptop for sale at Rs 48000 with 1 Tera Byte Harddrive")

In [51]:
print(text1.similarity(text2))  ## similarity score range(0-1)

0.8624262310396199


In [74]:
t2 = nlp("Sofa furniture size 24x12 pale blue color")

In [75]:
print(text1.similarity(t2))

0.4332451634463822


In [82]:
tokens = []

for ent in text1.ents:
    print(ent.text,  ent.label_, "\tstart_pos" , ent.start_char, "end_pos", ent.end_char)
    #tokens.append(text1[ent.start_char:ent.end_char])
    
displacy.serve(text1, style="ent")

Dell Laptop PERSON 	start_pos 0 end_pos 11
Rs 49999 PRODUCT 	start_pos 24 end_pos 32
i3 PRODUCT 	start_pos 34 end_pos 36
1 CARDINAL 	start_pos 38 end_pos 39


  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#### token to token comparision

### Reading e-commerce data

In [85]:
import json

data_ = []
for line in open('12147_final_df.json', 'r'):
    data_.append(json.loads(line))

In [88]:
len(data_)

385052

In [89]:
data_[0]

{'index': 9.0,
 'match_available_price': 40.95,
 'match_source': 'Amazon-US',
 'match_stock': 'In Stock',
 'match_crawl_time': 20210228175,
 'match_raw_subcategory': 'Sandals',
 'match_seed_urlh': '17abcf03e89b3beea1ad99928603caa04648fc62',
 'match_mrp': 40.95,
 'match_title': 'Salt Water Sandals by Hoy Shoe The Original Sandal Shiny Yellow 1 Little Kid  ',
 'match_raw_brand': 'Salt Water Sandals',
 'match_urlh': '17b4e6f1e1dc6bcc50d90a19698dfc6eed3359b0',
 'match_sku': 'B005BFPO3W',
 'match_raw_category': 'Clothing, Shoes & Jewelry',
 'match_color': 'Shiny Yellow',
 'match_crawl_type': 'product',
 'match_crawl_date': 20210228,
 'match_status': 0.0,
 'match_url': 'https://www.amazon.com/Salt-Water-Sandals-Original-Toddler/dp/B005BFPO3W',
 'match_datasource_s': '',
 'match_country': 'usa',
 'match_size': '1 Little Kid',
 'match_upc': '',
 'match_seed_url': 'https://www.amazon.com/Salt-Water-Sandals-Original-Sandal/dp/B0067GKH3Q',
 'match_thumbnail': 'https://m.media-amazon.com/images/I/

## FastText

## Word2Vec

In [1]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

In [3]:
corpus = api.load('text8')



In [4]:
model = Word2Vec(corpus)  ## training on corpus

In [5]:
print(model.wv.most_similar('tree'))

[('trees', 0.677410900592804), ('bark', 0.6747909784317017), ('leaf', 0.6383562684059143), ('flower', 0.6171472072601318), ('bird', 0.6062579154968262), ('fruit', 0.6017009019851685), ('avl', 0.5758088231086731), ('cave', 0.571187436580658), ('cactus', 0.5711280107498169), ('leaves', 0.5696697235107422)]


In [None]:
## Available pre-trained models


In [3]:
# print(api.load('glove-wiki-gigaword-50', return_path=True))
model = api.load("glove-wiki-gigaword-100")



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [5]:
model.wv.most_similar("glass") ## no difference in using .wv method for models

  """Entry point for launching an IPython kernel.


[('ceramic', 0.6910325288772583),
 ('stained', 0.6856706142425537),
 ('walls', 0.6817072629928589),
 ('plastic', 0.6806964874267578),
 ('window', 0.6744905710220337),
 ('metal', 0.6706480979919434),
 ('stone', 0.6591024398803711),
 ('piece', 0.6523524522781372),
 ('pieces', 0.6489548683166504),
 ('shards', 0.6478654742240906)]

In [6]:
model.most_similar("glass")

[('ceramic', 0.6910325288772583),
 ('stained', 0.6856706142425537),
 ('walls', 0.6817072629928589),
 ('plastic', 0.6806964874267578),
 ('window', 0.6744905710220337),
 ('metal', 0.6706480979919434),
 ('stone', 0.6591024398803711),
 ('piece', 0.6523524522781372),
 ('pieces', 0.6489548683166504),
 ('shards', 0.6478654742240906)]

#### Bi-gram model

In [None]:
from gensim.test.utils import common_texts

In [None]:
## training a bigram detector
bigram_transformer = Phrases(common_texts

In [None]:
# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)

## Glove 

In [1]:
import os
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
root_folder='.'
data_folder_name='data'
glove_filename='glove.6B.100d.txt'
word2vec_output_file = glove_filename+'.word2vec'

# glove_path = os.path.abspath(os.path.join(DATA_PATH, glove_filename))
# glove2word2vec(glove_path, word2vec_output_file)

#### Available glove models
 glove.6B_tokens - (50-300 dimension) <br>
 glove.42B_tokens <br>
 glove.840B_tokens <br>
 glove.27B_tokens_twitter

In [4]:
import numpy as np

embeddings_index = {}
f = open('./glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [8]:
embeddings_index['laptop']

array([-0.12612 ,  0.15564 ,  0.73378 , -0.20459 ,  0.44796 ,  0.55318 ,
        0.74986 ,  0.044664,  0.70504 ,  0.5721  ,  0.68294 ,  0.404   ,
       -0.30313 ,  0.35006 , -0.37475 ,  0.035355, -0.060059,  0.45026 ,
        0.45652 , -0.10797 , -0.5212  , -0.5381  ,  0.52531 ,  0.26234 ,
        0.38583 , -0.27634 , -0.2725  , -0.3148  , -0.29824 ,  0.34577 ,
        0.47973 ,  0.61487 , -0.21743 ,  0.56996 ,  0.84115 ,  0.59942 ,
       -0.21932 ,  0.11974 ,  1.196   , -0.27857 ,  0.47654 , -0.057031,
        0.56029 , -0.45929 , -1.1889  ,  0.098238,  0.3118  , -0.16599 ,
        0.59446 ,  0.19975 ,  0.3876  ,  0.19775 , -0.30316 , -0.08063 ,
        0.27396 , -0.40205 , -0.55974 , -0.21332 ,  1.3266  , -0.39926 ,
       -0.031734,  0.58554 , -0.38878 ,  0.51607 , -0.57022 ,  0.047489,
        0.54649 , -0.30561 ,  0.45699 ,  0.70653 ,  0.77859 ,  0.22167 ,
        0.88696 , -0.6587  , -0.72107 ,  0.93184 ,  0.34877 ,  0.35567 ,
       -0.11896 ,  0.47235 ,  0.81035 , -0.083377, 

## TF-IFD Embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)



In [None]:
print(tfidf.tf_)   ## ID for all words in vocabulary
print(tfidf.idf_)   ## IDF for all words in the vocabulary


In [None]:
temp = tfidf.transform(['sentence of your choice'])
print(temp.to_array())