In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import numpy as np
import pandas as pd

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [None]:
## installation: https://github.com/UKPLab/sentence-transformers
# https://www.sbert.net/

In [2]:
model = SentenceTransformer('all-mpnet-base-v2')

## Other available models: https://www.sbert.net/docs/pretrained_models.html
# all-roberta-large-v1
# all-distilroberta-v1 
# all-MiniLM-L6-v2
# distiluse-base-multilingual-cased-v1
# paraphrase-TinyBERT-L6-v2

In [49]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

##### sentence similarity  

In [46]:
emb1 = model.encode("Jo Malone London™ Dark Amber & Ginger Lily Scented Home Candle NO COLOR  7 oz—")
emb2 = model.encode("Jo Malone London Dark Amber & Ginger Lily Home Candle")

In [47]:
cos = np.dot(emb1, emb2)
cos = cos / (np.linalg.norm(emb1)* np.linalg.norm(emb2))
print("Cosine similarity: ", cos)

Cosine similarity:  0.92585653


In [48]:
cos_sim = util.cos_sim(emb1, emb2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.9259]])


##### word similarity

In [71]:
w1 = model.encode("jacket")
w2 = model.encode("raincoat")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.7672]])


##### alpha-numeric

In [77]:
w1 = model.encode("3 ½ Pillow")
w2 = model.encode("3.5 Pillow")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.9193]])


###### spell-mistakes

In [84]:
w1 = model.encode("weterproof")
w2 = model.encode("water proof")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.7792]])


###### prefix / suffix

In [88]:
w1 = model.encode("clean")
w2 = model.encode("cleaner")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.6858]])


###### singular / plural

In [92]:
w1 = model.encode("shoe")
w2 = model.encode("shoes")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.8166]])


##### Context check

In [96]:
w1 = model.encode("River bank holds surprises for people")
w2 = model.encode("‘Federal bank holds surprise for people")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.5274]])


##### Phrase detection

In [121]:
w1 = model.encode("water proof")
w2 = model.encode("resistant")

cos_sim = util.cos_sim(w1, w2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.4101]])


##### plotting histograms

In [122]:
corpus = pd.read_csv('processed_Dtrain_4f.csv')

In [123]:
titles = []
for row in corpus['title']:
    titles.append((row))
    
match_titles = []
for row in corpus['match_title']:
    match_titles.append((row))

In [124]:
len(titles)

308041

In [126]:
scores = []

for i in range(len(titles) - 208041):
    s1 = titles[i]
    s2 = match_titles[i]
    
    if s1 is np.NaN or s2 is np.NaN:
        continue
    if len(s1) == 0 or len(s2) == 0:
        continue
    
    w1 = model.encode(s1)
    w2 = model.encode(s2)
    cos_sim = util.cos_sim(w1, w2)
    if cos_sim > 0:
        scores.append(cos_sim)
    if (i % 10000 == 0):
        print(i/10000)

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0


In [127]:
scores = np.array(scores)
dataset = pd.DataFrame(scores)
dataset.to_csv('SBERT_allmpnet_v2.csv')

### Plotting other models histograms

In [144]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from gensim.scripts.glove2word2vec import glove2word2vec

In [148]:
embeddings_index = {}
f = open('./glove.42B.300d.txt', encoding="utf8")
for line in f:
    values = line.split(' ')
    word = values[0]           ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32')   ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [149]:
DIM = 300

In [150]:
scores = []

for i in range(len(titles) - 208041):
    s1 = titles[i]
    s2 = match_titles[i]
    
    if s1 is np.NaN or s2 is np.NaN:
        continue
    if len(s1) == 0 or len(s2) == 0:
        continue
    
    e1 = sen_emb(s1) 
    e2 = sen_emb(s2)
    e1, e2 = pad_emb(e1, e2, DIM)
    cos_sim = dot(e1, e2)/(norm(e1)*norm(e2))
    
    if cos_sim > 0:
        scores.append(cos_sim)
    if (i % 1000 == 0):
        print(i/1000)

0.0


  from ipykernel import kernelapp as app


1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0


In [153]:
scores = np.array(scores)
dataset = pd.DataFrame(scores)
dataset.to_csv('scores_glove_42B_300d.csv')

In [146]:
PUNCT_TO_REMOVE = string.punctuation
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace("-","")
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace(".","")
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace("&","")

def remove_special_chars(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))


def sen_emb(sen):
    # pre-process the sentence
    sen = sen.replace('-',' ')
    sen = sen.replace('/',' ')
    sen = remove_special_chars(sen)
    sen = sen.replace('&','and')
    sen = sen.lower()
    
    ## split the sen to list of words
    words = sen.split()
    
    emb = []
    emb = np.array(emb)
        

    for word in words:
        try:
            vec = embeddings_index[word]
        
        except KeyError:
            continue
    
        emb = np.concatenate([emb, vec])
        
    return emb    

In [147]:
def pad_emb(e1, e2, dim):
    s_ = e1.shape
    size_1 = s_[0]
    
    s_ = e2.shape
    size_2 = s_[0]
    
    if size_1 == size_2:
        return [e1, e2]
    
    elif size_1 < size_2:
        for _ in range(size_2 - size_1):
            e1 = np.append(e1, 0.0)
        
    elif size_1 > size_2:
        for _ in range(size_1 - size_2):
            e2 = np.append(e2, 0.0)
            
    return [e1, e2]

#### custom build and fine-tuning

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd

In [20]:
model = SentenceTransformer('all-mpnet-base-v2')

In [5]:
corpus = pd.read_csv('processed_Dtrain_4f.csv')

titles = []
for row in corpus['title']:
    titles.append((row))
    
match_titles = []
for row in corpus['match_title']:
    match_titles.append((row))

In [6]:
train_samples = []

for i in range(len(titles)):
    
    inp_example = InputExample(texts=[titles[i], match_titles[i]], label=1.0)
    train_samples.append(inp_example)

In [17]:
num_epochs = 1
train_batch_size = 16
model_save_path = './sbert_finetuned'

In [21]:

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [22]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

In [23]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          show_progress_bar = True)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19253 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [36]:
emb1 = model.encode("Jo Malone London™ Dark Amber & Ginger Lily Scented Home Candle NO COLOR  7 oz—")
emb2 = model.encode("")

In [37]:
cos_sim = util.cos_sim(emb1, emb2)
print("Cos similarity: ", cos_sim)

Cos similarity:  tensor([[0.9204]])
