### Word2Vec

##### Pre-trained 

In [2]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from numpy import dot
from numpy.linalg import norm

In [6]:
#1
model = api.load("glove-wiki-gigaword-100") ## 128 MB

In [628]:
#2
model = api.load("glove-wiki-gigaword-300") ## 376 MB



In [None]:
#3
model = api.load("glove-twitter-25") ## 104 MB

In [None]:
#4
model = api.load("glove-twitter-100") ## 387 MB

In [None]:
#5
model = api.load("word2vec-google-news-300")  ## 1662 MB

#### sentence similarity

In [814]:
DIM = 300

In [1039]:
s1 = "Jo Malone London™ Dark Amber & Ginger Lily Scented Home Candle NO COLOR  7 oz"
s2 = "Jo Malone London Dark Amber & Ginger Lily Home Candle"

In [1045]:
e1 = sen_emb(s1) 
e2 = sen_emb(s2)

In [1046]:
e1, e2 = pad_emb(e1, e2, DIM)

In [1047]:
cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.5476768247725139


In [1048]:
e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)

In [1049]:
cos_sim = dot(e1_, e2_)/(norm(e1_)*norm(e2_))
print(cos_sim)

1.0


###### word similarity

In [1054]:
model.similarity("jacket", "coat")

0.8291835

##### alpha-numeric words

In [1059]:
a1 = "3 1/2 Pillow" 
a2 = "3.5 Pillow"

e1 = sen_emb(a1) 
e2 = sen_emb(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  1.0
By average:  1.0


##### spell mistakes

In [None]:
model['lapttp']

##### Effect of prefix/suffix

In [1064]:
model.similarity("shirt", "shirtless")

0.44347695

##### Singular / Plural 

In [1070]:
model.similarity("child", "children")

0.8452649

#### Context / Ambiguity check

In [1080]:
s1 = "I bought a Apple PC on sale"
s2 = "I bought a Apple fruit on sale"

e1 = sen_emb(s1) 
e2 = sen_emb(s2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.9072405258660791
By average:  1.0


#### Phrases detection

In [None]:
model.similarity("wireless charger", "new")

## Glove

##### pre-trained

In [1083]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [1240]:
embeddings_index = {}
f = open('./glove.42B.300d.txt', encoding="utf8")
for line in f:
    values = line.split(' ')
    word = values[0]           ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32')   ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [1241]:
DIM = 300

In [1243]:
len(embeddings_index)

1917495

##### sentence similarity

In [1330]:
s1 = "Jo Malone London™ Dark Amber & Ginger Lily Scented Home Candle NO COLOR  7 oz"
s2 = "Jo Malone London Dark Amber & Ginger Lily Home Candle"

In [1336]:
e1 = sen_emb_g(s1) 
e2 = sen_emb_g(s2)

In [1337]:
e1, e2 = pad_emb(e1, e2, DIM)

In [1338]:
cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.4925424632614757


In [1339]:
e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)

In [1340]:
cos_sim = dot(e1_, e2_)/(norm(e1_)*norm(e2_))
print(cos_sim)

0.99999994


##### word similarity

In [1345]:
e1 = embeddings_index['see']
e2 = embeddings_index['sea']

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.42877197


##### Alpha numeric

In [1353]:
a1 = "2M size shirt" 
a2 = "shirt"

e1 = sen_emb_g(a1) 
e2 = sen_emb_g(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.0788578400714062
By average:  0.99999994


##### spell-mistakes

In [None]:
embeddings_index['jecket']

##### prefix/suffix

In [1364]:
e1 = embeddings_index['shirt']
e2 = embeddings_index['shirtless']

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.2853798


##### singular / plural

In [1370]:
e1 = embeddings_index['child']
e2 = embeddings_index['children']

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.8334375


##### Context check

In [1378]:
a1 = "I bought a Apple PC on sale" 
a2 = "I bought a Apple fruit on sale"

e1 = sen_emb_g(a1) 
e2 = sen_emb_g(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.8943847240157649
By average:  0.99999994


##### Phrases detection  

In [None]:
embeddings_index['water resistant']

### Glove - custom train on e-comm data

In [None]:
from glove import Corpus, Glove

In [1389]:
corpus = pd.read_csv('processed_Dtrain_4f.csv')

corpus['title'] = [ '' if x is np.NaN else x for x in corpus['title'] ]
corpus['match_title'] = [ '' if x is np.NaN else x for x in corpus['match_title'] ]
corpus['description'] = [ '' if x is np.NaN else x for x in corpus['description'] ]
corpus['match_description'] = [ '' if x is np.NaN else x for x in corpus['match_description'] ]
corpus['color'] = [ '' if x is np.NaN else x for x in corpus['color'] ]

In [1390]:
import nltk
from nltk.tokenize import word_tokenize

corpus['title'] = corpus.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
corpus['match_title'] = corpus.apply(lambda row: nltk.word_tokenize(row['match_title']), axis=1)
corpus['description'] = corpus.apply(lambda row: nltk.word_tokenize(row['description']), axis=1)
corpus['match_description'] = corpus.apply(lambda row: nltk.word_tokenize(row['match_description']), axis=1)
corpus['color'] = corpus.apply(lambda row: nltk.word_tokenize(row['color']), axis=1)

In [1391]:
sentences = []
for row in corpus['title']:
    sentences.append(list(row))
    
for row in corpus['match_title']:
    sentences.append(list(row))

for row in corpus['description']:
    sentences.append(list(row))
    
for row in corpus['match_description']:
    sentences.append(list(row))
    
for row in corpus['color']:
    sentences.append(list(row))

In [1392]:
len(sentences)

1540205

In [1394]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [1395]:
glove_file = './glove.6B.100d.txt'

In [1397]:
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

In [1398]:
# build a word2vec model on your dataset 

base_model = Word2Vec(size=200, min_count=1)
base_model.build_vocab(sentences)  ## our custom dataset
total_examples = base_model.corpus_count

In [1400]:
# add GloVe's vocabulary & weights
base_model.build_vocab([list(glove_vectors.vocab.keys())], update=True)

# train on your data
base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs)
base_model_wv = base_model.wv

In [1401]:
len(base_model_wv.vocab)

451771

In [1415]:
DIM = 200

In [1512]:
s1 = "Salt Water Sandals by Hoy Original Sandal (Baby, Walker, Toddler, Little Kid & Big Kid) SHINY YELLOW Little Kid 2 M"
s2 = "Salt Water Sandals by Hoy Shoe The Original Sandal Shiny Yellow 1 Little Kid"

In [1523]:
e1 = sen_emb_g(s1) 
e2 = sen_emb_g(s2)

In [1524]:
e1, e2 = pad_emb(e1, e2, DIM)

In [1525]:
cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.2848395999882002


In [1531]:
e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)

In [1532]:
cos_sim = dot(e1_, e2_)/(norm(e1_)*norm(e2_))
print(cos_sim)

1.0000001


In [1546]:
base_model_wv.similarity('jacket', 'coat')

0.49098107

In [1561]:
a1 = "2m size shirt" 
a2 = "shirt"

e1 = sen_emb_g(a1) 
e2 = sen_emb_g(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  -2.8504344761133525e-06
By average:  1.0000001


In [1574]:
base_model_wv.similarity('shoe', 'shoes')

0.61495805

In [1578]:
a1 = "river bank holds surprises for people" 
a2 = "Federal bank holds surprise for people"

e1 = sen_emb_g(a1) 
e2 = sen_emb_g(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb_g(s1, DIM)
e2_ = avg_emb_g(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.9780601484327588
By average:  1.0000001


In [None]:
base_model_wv['wireless charger']

### Wor2Vec custom trained 

In [1589]:
model = Word2Vec(sentences, min_count=1, size=200, workers=10, window=9, iter=30)
model.train(sentences, total_examples=len(sentences), epochs=30)

(830790695, 982184550)

In [1590]:
model.save("word2vec_200d_w9_m1.model")
len(model.wv.vocab)

89813

In [1733]:
model = Word2Vec.load("word2vec_100d_w9_m1.model")

In [1734]:
model['laptop'].shape

  """Entry point for launching an IPython kernel.


(100,)

In [1735]:
DIM = 100

##### sentence similarity

In [1809]:
s1 = "Jo Malone London™ Dark Amber & Ginger Lily Scented Home Candle NO COLOR  7 oz"
s2 = "Jo Malone London Dark Amber & Ginger Lily Home Candle"

In [1810]:
e1 = sen_emb(s1) 
e2 = sen_emb(s2)



In [1811]:
e1, e2 = pad_emb(e1, e2, DIM)

In [1812]:
cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.5253544213039529


In [1813]:
e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)

  del sys.path[0]


In [1814]:
cos_sim = dot(e1_, e2_)/(norm(e1_)*norm(e2_))
print(cos_sim)

1.0000001


##### word similarity

In [1819]:
model.similarity("see", "sea")

  """Entry point for launching an IPython kernel.


0.0066966154

##### alpha-numeric words

In [1823]:
a1 = "2M size shirt" 
a2 = "shirt"

e1 = sen_emb(a1) 
e2 = sen_emb(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  -0.006502402036809093
By average:  1.0


  del sys.path[0]


##### spell mistakes

In [None]:
model['jecket']

##### prefix / suffix 

In [None]:
model.similarity("shirt", "shirtless")

##### singular / plural

In [1832]:
model.similarity("child", "children")

  """Entry point for launching an IPython kernel.


0.224303

##### Context check

In [1834]:
s1 = "I bought a Apple PC on sale"
s2 = "I bought a Apple fruit on sale"

e1 = sen_emb(s1) 
e2 = sen_emb(s2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.8781490852586474
By average:  1.0


  del sys.path[0]


##### phrases detection

In [None]:
model["new york"]

#### Bi-gram / tri-gram

In [1837]:
from gensim.test.utils import common_texts
from gensim.models.phrases import Phraser, Phrases

In [1838]:
common_terms = ["of", "with", "without", "and", "or", "the", "a"]

In [1839]:
phrases = Phrases(sentences, common_terms=common_terms)

In [1840]:
bigram = Phraser(phrases)
# Applying the Phraser to transform the sentences

all_sentences = list(bigram[sentences])

In [1924]:
len(all_sentences)

1540205

In [1841]:
print(all_sentences[0:4])

[['theory', 'silk', 'shell', 'top', 'ivory', 'small'], ['chantelle_lingerie', 'c_magnifique', 'seamless_unlined', 'minimizer_bra', 'ivory', 'd'], ['rag_and_bone', 'rag_and_bone', 'shiloh_combat', 'boot', 'women', 'black', 'us_eu'], ['bixbee', 'kitty', 'water_resistant', 'lunchbox', 'kids', 'pink', 'little', 'girl', 'one', 'size']]


In [1842]:
model = Word2Vec(all_sentences, min_count=1, size=100, workers=10, window=9, iter=30)

In [1843]:
len(model.wv.vocab)

155890

In [1844]:
model['laptop'].shape

  """Entry point for launching an IPython kernel.


(100,)

In [1845]:
DIM = 100

In [1885]:
s1 = "Salt Water Sandals by Hoy Original Sandal (Baby, Walker, Toddler, Little Kid & Big Kid) SHINY YELLOW Little Kid 2 M"
s2 = "Salt Water Sandals by Hoy Shoe The Original Sandal Shiny Yellow 1 Little Kid"

In [1886]:
e1 = sen_emb(s1) 
e2 = sen_emb(s2)



In [1887]:
e1, e2 = pad_emb(e1, e2, DIM)

In [1888]:
cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print(cos_sim)

0.30279343493163263


In [1889]:
e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)

  del sys.path[0]


In [1890]:
cos_sim = dot(e1_, e2_)/(norm(e1_)*norm(e2_))
print(cos_sim)

0.9999999


In [1895]:
model.similarity("sneaker", "shoe")

  """Entry point for launching an IPython kernel.


0.6956452

In [1897]:
a1 = "3 ½ Pillow" 
a2 = "3.5 Pillow"

e1 = sen_emb(a1) 
e2 = sen_emb(a2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.01699504935929986
By average:  0.9999999


  del sys.path[0]


In [None]:
model['jecket']

In [1904]:
model.similarity("clean", "cleaner")

  """Entry point for launching an IPython kernel.


0.04212213

In [1908]:
model.similarity("shoe", "shoes")

  """Entry point for launching an IPython kernel.


0.6647383

In [1910]:
s1 = "river bank holds surprises for people"
s2 = "Federal bank holds surprise for people"

e1 = sen_emb(s1) 
e2 = sen_emb(s2)

e1, e2 = pad_emb(e1, e2, DIM)

cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
print("By concatenation: ", cos_sim)

e1_ = avg_emb(s1, DIM)
e2_ = avg_emb(s2, DIM)
print("By average: ", dot(e1_, e2_)/(norm(e1_)*norm(e2_)))

By concatenation:  0.9065869478833567
By average:  0.9934628


  del sys.path[0]


In [1921]:
model.similarity("wireless_charger", "charger")

  """Entry point for launching an IPython kernel.


0.5979278

In [1409]:
def sen_emb_g(sen):
    # pre-process the sentence
    sen = sen.replace('-',' ')
    sen = sen.replace('/',' ')
    sen = remove_special_chars(sen)
    sen = sen.replace('&','and')
    sen = sen.lower()
    
    ## split the sen to list of words
    words = sen.split()
    
    emb = []
    emb = np.array(emb)
        

    for word in words:
        try:
            vec = base_model_wv[word]
        
        except KeyError:
            continue
    
        emb = np.concatenate([emb, vec])
        
    return emb    

In [1410]:
def avg_emb_g(sen, dim):
    sen = sen.replace('-',' ')
    sen = sen.replace('/',' ')
    sen = remove_special_chars(sen)
    sen = sen.replace('&','and')
    sen = sen.lower()
    words = sen.split()
    
    vec = np.zeros(dim)
    
    for word in words:
        try:
            v = base_model_wv[word]
        except KeyError:
            continue
            
        v.setflags(write=1) 
        vec.setflags(write=1)
        
        if vec[0] == 0:
            vec = v
            
        for i in range(dim):
            vec[i] = (v[i] + vec[i])/2
            
    return vec   

In [180]:
PUNCT_TO_REMOVE = string.punctuation
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace("-","")
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace(".","")
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace("&","")

def remove_special_chars(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))


def sen_emb(sen):
    # pre-process the sentence
    sen = sen.replace('-',' ')
    sen = sen.replace('/',' ')
    sen = remove_special_chars(sen)
    sen = sen.replace('&','and')
    sen = sen.lower()
    
    ## split the sen to list of words
    words = sen.split()
    
    emb = []
    emb = np.array(emb)
        

    for word in words:
        try:
            vec = model[word]
        
        except KeyError:
            continue
    
        emb = np.concatenate([emb, vec])
        
    return emb    
    

In [802]:
def pad_emb(e1, e2, dim):
    s_ = e1.shape
    size_1 = s_[0]
    
    s_ = e2.shape
    size_2 = s_[0]
    
    if size_1 == size_2:
        return [e1, e2]
    
    elif size_1 < size_2:
        for _ in range(size_2 - size_1):
            e1 = np.append(e1, 0.0)
        
    elif size_1 > size_2:
        for _ in range(size_1 - size_2):
            e2 = np.append(e2, 0.0)
            
    return [e1, e2]

In [269]:
def avg_emb(sen, dim):
    sen = sen.replace('-',' ')
    sen = sen.replace('/',' ')
    sen = remove_special_chars(sen)
    sen = sen.replace('&','and')
    sen = sen.lower()
    words = sen.split()
    
    vec = np.zeros(dim)
    
    for word in words:
        try:
            v = model[word]
        except KeyError:
            continue
            
        v.setflags(write=1) 
        vec.setflags(write=1)
        
        if vec[0] == 0:
            vec = v
            
        for i in range(dim):
            vec[i] = (v[i] + vec[i])/2
            
    return vec   