## **Connecting with Drive for Data Extraction**

In [None]:
# Mount Google Drive at /content/drive so the dataset folder stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the dataset folder the working directory; the result files written further down use relative names.
%cd /content/drive/MyDrive/sts2016-english-with-gs-v1.0

/content/drive/MyDrive/sts2016-english-with-gs-v1.0


## **Data Extraction**

In [None]:
""" 
We need to change the file name and file location for the file we need to do sentence embedding. 
"""

# Path of the input file whose sentence pairs will be embedded; edit it to run
# the notebook on a different file.
# The encoding is stated explicitly so the text is decoded the same way on
# every machine instead of depending on the locale default.
f = open("/content/drive/MyDrive/sts2016-english-with-gs-v1.0/STS2016.input.question-question.txt", "r", encoding="utf-8")

In [None]:
#Reading lines from file
# Close the handle once the content is in memory (even if reading fails) so it
# is not left open for the rest of the session.
try:
    lines = f.readlines()
finally:
    f.close()

In [None]:
# Separate each line on tabs and keep the first two columns (the sentence pair)
sen1 = []
sen2 = []
for line in lines:
    fields = line.split("\t")  # split once and reuse both columns
    sen1.append(fields[0])
    sen2.append(fields[1])
print(len(sen1))
# Unpack the slice so sep="\n" takes effect and each sentence is printed on its
# own line; passing the list as a single argument makes print ignore sep.
print(*sen1[:10], sep="\n")
print()
print(len(sen2))
print(*sen2[:10], sep="\n")

1555
['Should I drink water during my workout?', 'How can I put something in book format without "publishing" it?', 'How do I stop my dog from jumping on me?', "What's the best way to store asparagus?", 'How do I make a height adjustable desk?', 'What is the best time and temperature for taste when cooking a salmon fillet in the oven?', 'How can I find out why my washing machine trips the outlet?', 'What should I look for in a jump rope?', 'Should I use IRA money to pay down my student loans?', 'Do I need a UK transit visa for flying to the Canary Islands?']

1555
['How can I get my toddler to drink more water?', 'How can I "time-stamp" my data without publishing it?', 'How do I make my dog forget a command?', "What's the correct way to store fats?", 'How can I build a wall mounted adjustable height desk?', 'What is the best oil to use when cooking in a wok?', 'How can I figure out why my washing machine is tripping the GFCI receptacle?', 'What should I look for in a running shoe?', 'S

## **Doc2Vec**

In [None]:
#Importing necessary libraries
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' resource through NLTK's downloader; presumably it is what word_tokenize relies on — confirm.
nltk.download('punkt')

In [None]:
# Lower-case and tokenize every sentence of the first list
tokenized_doc2 = []  # starts empty here; populated by the cell that tokenizes the second list
tokenized_doc1 = [word_tokenize(sentence.lower()) for sentence in sen1]
# Show the first five token lists, one per line
print(*tokenized_doc1[:5], sep="\n")

In [None]:
# Wrap each token list in a gensim TaggedDocument, tagged with its position in the list
doc2vectagged_data = []
for position, tokens in enumerate(tokenized_doc1):
    doc2vectagged_data.append(TaggedDocument(tokens, [position]))
print(*doc2vectagged_data[:5], sep="\n")

In [None]:
## Training doc2vec model
doc2vecmodel = Doc2Vec(doc2vectagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Saving trained doc2vec model
doc2vecmodel.save("test_doc2vec.model")
## Loading saved doc2vec model
doc2vecmodel= Doc2Vec.load("test_doc2vec.model")
## Printing doc2vec model vocabulary
# gensim 4 replaced `wv.vocab` with `wv.key_to_index`, and reading `wv.vocab`
# there raises an error. Use the new attribute when it exists and fall back to
# the old one on gensim 3.
doc2vecmodel.wv.key_to_index if hasattr(doc2vecmodel.wv, "key_to_index") else doc2vecmodel.wv.vocab

In [None]:
# Tokenization of second document
# Build the list from scratch instead of appending to a list created in an
# earlier cell, so re-running this cell does not duplicate entries.
tokenized_doc2 = [word_tokenize(d.lower()) for d in sen2]
print(*tokenized_doc2[:5], sep="\n")

In [None]:
#Calculating Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
doc2vecCosine=[]
doc2vecCosineList=[]
for idx, first_tokens in enumerate(tokenized_doc1):
  # Infer a vector for each side of the pair with the trained model
  first_vec = doc2vecmodel.infer_vector(first_tokens)
  second_vec = doc2vecmodel.infer_vector(tokenized_doc2[idx])
  # spatial.distance.cosine is a distance, not a similarity, so it is subtracted from 1
  doc2vecCosine = 1 - spatial.distance.cosine(first_vec, second_vec)
  doc2vecCosineList.append(doc2vecCosine)
  print(doc2vecCosine, sen1[idx], sen2[idx])

In [None]:
# Multiply every cosine similarity by 5 and collect the results in a list
doc2vecresult = [similarity * 5 for similarity in doc2vecCosineList]
print(doc2vecresult)

In [None]:
# Write the scaled similarities to a text file, one value per line
with open(r'Doc2vec_question.txt', 'w') as fp:
    fp.writelines("%s\n" % score for score in doc2vecresult)
    print('Done')

## **SBert**

In [None]:
#Installing Sentence Bert
# -U upgrades to the newest release and no version is pinned, so the installed version depends on when the cell is run.
!pip install -U sentence-transformers

In [None]:
# Load the pre-trained 'all-MiniLM-L6-v2' model through SentenceTransformer.
# `util` is imported for the cos_sim helper used when the similarities are computed.
from sentence_transformers import SentenceTransformer, util
sbertmodel = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode the first list, then the second, requesting tensor output from encode()
sbertembed1, sbertembed2 = (
    sbertmodel.encode(sentences, convert_to_tensor=True) for sentences in (sen1, sen2)
)

In [None]:
#Compute cosine-similarities using embeddings
sbertCosine = util.cos_sim(sbertembed1, sbertembed2)
SbertCosineList=[]
for k, first in enumerate(sen1):
  # The score of the k-th sentence pair is read from position [k][k] of the result
  pair_score = sbertCosine[k][k]
  SbertCosineList.append("{:.4f}".format(pair_score))
  print("Score: {:.4f} \t\t {} \t\t {}".format(pair_score, first, sen2[k]))

In [None]:
# Parse the formatted scores back to floats and multiply each by 5
Sbertresult = [float(text) * 5 for text in SbertCosineList]
print(Sbertresult)

In [None]:
# Write the scaled similarities to a text file, one value per line
with open(r'Sbert_question.txt', 'w') as fp:
    fp.writelines("%s\n" % score for score in Sbertresult)
    print('Done')

## **Universal Sentence Encoder**

In [None]:
# Installing the TF and TF-Hub.
# The separate `tensorflow-gpu` distribution has been retired on PyPI and
# installing it now fails; the plain `tensorflow` package is the one to install.
!pip3 install --upgrade tensorflow
!pip3 install tensorflow-hub

In [None]:
#Importing necessary libraries
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [None]:
#Loading the model
# The trailing "#@param [...]" annotation on the next line lists the two module URLs that can be selected.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
UniversalSentenceEncodermodel = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  """Return the result of calling the loaded Universal Sentence Encoder on `input`."""
  return UniversalSentenceEncodermodel(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
# Run the encoder once on each full sentence list (first list, then second)
useembeddings1, useembeddings2 = map(UniversalSentenceEncodermodel, (sen1, sen2))

In [None]:
#Compute cosine-similarities using embeddings
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
# Reuse the batch embeddings computed in the previous cell (useembeddings1 /
# useembeddings2) instead of calling the encoder again for every single
# sentence, which cost two extra model calls per pair.
# NOTE(review): scores should match the per-sentence calls up to
# floating-point noise; they are rounded to 4 decimals below.
use_matrix1 = np.asarray(useembeddings1)
use_matrix2 = np.asarray(useembeddings2)
UseCosineList=[]
for i in range (len(sen1)):
  use_vector1 = use_matrix1[i]
  use_vector2 = use_matrix2[i]
  # cosine() is a distance, so it is subtracted from 1 to obtain the similarity
  usecosines = (1-cosine(use_vector1, use_vector2))
  UseCosineList.append("{:.4f}".format(usecosines))
  print(sen1[i], sen2[i], usecosines)

In [None]:
# Parse the formatted scores back to floats and multiply each by 5
Useresult = [float(text) * 5 for text in UseCosineList]
print(Useresult)

[2.321, 3.1159999999999997, 3.5715000000000003, 2.6215, 4.0915, 2.232, 4.225, 3.154, 4.46, 3.8495, 3.7755, 3.4010000000000002, 3.383, 3.371, 2.42, 2.1765, 2.5475, 3.8615, 2.36, 2.601, 2.419, 2.4690000000000003, 2.056, 4.3205, 4.1735, 3.0935, 2.7634999999999996, 3.7800000000000002, 2.225, 3.0940000000000003, 3.1145, 2.9465000000000003, 4.3715, 3.189, 2.9395, 3.4535, 2.6895000000000002, 1.226, 3.991, 2.8325, 3.382, 2.4655, 2.2235, 2.537, 1.7415, 1.5484999999999998, 1.9184999999999999, 3.4175, 3.8715, 3.246, 3.365, 3.1435000000000004, 1.9220000000000002, 3.2779999999999996, 4.1375, 4.2615, 3.7524999999999995, 3.859, 2.681, 2.62, 1.7005000000000001, 3.7264999999999997, 1.8895, 4.202500000000001, 2.579, 2.4485, 2.7150000000000003, 2.9320000000000004, 2.476, 2.8105, 3.0435, 1.6070000000000002, 1.543, 2.9145, 1.874, 2.626, 2.799, 3.8495, 2.2864999999999998, 3.165, 3.8975, 2.0615, 2.662, 4.2405, 3.769, 3.6630000000000003, 3.479, 4.164, 3.7615, 3.209, 2.0780000000000003, 2.5415, 3.2984999999999

In [None]:
# Write the scaled similarities to a text file, one value per line
with open(r'UniSenEn_question.txt', 'w') as fp:
    fp.writelines("%s\n" % score for score in Useresult)
    print('Done')

Done


## **SIMCSE**

In [None]:
# NOTE(review): the following cell loads the SimCSE checkpoint through `transformers` and never imports `simcse`; presumably this install is kept for the packages it pulls in — confirm.
!pip install simcse

In [None]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and the encoder from the same pre-trained checkpoint name;
# the checkpoint files are downloaded by from_pretrained (see the progress output below).
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
simcsemodel = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
#Compute cosine-similarities using embeddings

SimcseCosineList=[]
for idx, first in enumerate(sen1):
  # Tokenize both sentences of the pair in one call, with padding and truncation enabled
  texts = [first, sen2[idx]]
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  # Forward pass without gradient tracking; the pooled output serves as the sentence embeddings
  with torch.no_grad():
      simcseembeddings = simcsemodel(**inputs, output_hidden_states=True, return_dict=True).pooler_output
  # cosine() is a distance, so it is subtracted from 1 to obtain the similarity
  pair_distance = cosine(simcseembeddings[0],simcseembeddings[1])
  simcse_cosine = 1 - pair_distance
  SimcseCosineList.append("{:.4f}".format(simcse_cosine))
  print("Cosine similarity: %.3f between \"%s\" and \"%s\" " % (simcse_cosine, texts[0], texts[1]))

In [None]:
# Parse the formatted scores back to floats and multiply each by 5
SimCseresult = [float(text) * 5 for text in SimcseCosineList]
print(SimCseresult)

[2.8865000000000003, 3.3355, 3.877, 2.9915000000000003, 4.305, 3.0185, 4.268, 2.9745, 4.797000000000001, 3.7645, 3.5385, 3.798, 3.62, 3.205, 2.9865000000000004, 1.7625, 2.6265, 3.9795000000000003, 2.9735, 2.861, 2.9535, 2.768, 2.4995000000000003, 4.733, 4.649, 3.2975, 2.6185, 3.3965, 2.2815, 2.8315, 3.7314999999999996, 3.8395, 4.4535, 3.1995, 2.698, 3.35, 3.0980000000000003, 2.2975000000000003, 3.7119999999999997, 3.4905000000000004, 3.9895000000000005, 2.59, 2.1695, 3.274, 2.0125, 1.6105, 1.9295, 3.409, 4.062, 3.361, 3.6015, 2.968, 2.152, 3.587, 4.1785, 4.5225, 3.548, 4.077, 2.5424999999999995, 2.293, 2.537, 4.093999999999999, 1.911, 4.24, 2.945, 1.7954999999999999, 2.6055, 2.7479999999999998, 2.7760000000000002, 3.2035, 3.1605, 2.0250000000000004, 2.0535, 2.9320000000000004, 2.372, 3.5335, 2.644, 3.6725000000000003, 1.749, 3.1535, 4.241, 3.2315, 1.6345, 3.874, 3.8195, 3.411, 3.715, 3.2634999999999996, 4.1635, 3.754, 1.9520000000000002, 3.154, 3.8240000000000003, 3.4844999999999997, 1

In [None]:
# Write the scaled similarities to a text file, one value per line
with open(r'SimCse_question.txt', 'w') as fp:
    fp.writelines("%s\n" % score for score in SimCseresult)
    print('Done')

Done


## **InferSent**

In [None]:
# Download both InferSent encoder files and the GloVe vectors used by version 1.
# -p makes mkdir succeed when the directory already exists, so the cell can be re-run.
! mkdir -p encoder
! curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl
! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

! mkdir -p GloVe
! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
# -n never overwrites files that were already extracted, so a re-run does not stop at unzip's overwrite prompt.
! unzip -n GloVe/glove.840B.300d.zip -d GloVe/

In [None]:
# Loading the model
import torch
import torchvision
# NOTE(review): no cell in this notebook creates a `models` module; presumably models.py from the InferSent repository has to be importable from the working directory — confirm.
from models import InferSent
# Selects which downloaded encoder file is loaded and is also passed to the model as 'version'.
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
# Settings handed to the InferSent constructor.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infersentmodel = InferSent(params_model) ####################
# NOTE(review): torch.load unpickles the downloaded file; only load checkpoints from a trusted source.
infersentmodel.load_state_dict(torch.load(MODEL_PATH))

In [None]:
# Same import and 'punkt' download as in the Doc2Vec section; presumably repeated so this section can be run on its own — confirm.
import nltk
nltk.download('punkt')

In [None]:
# Choose the word-vector file for the selected InferSent version: the GloVe file for version 1, the fastText file otherwise.
# NOTE(review): this notebook only downloads the GloVe archive; the fastText file would have to be fetched separately before switching versions.
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
infersentmodel.set_w2v_path(W2V_PATH)

In [None]:
# Rebuild both sentence lists and also collect every sentence into one flat
# list (first column, then second column, line by line).
sen1 = []
sen2 = []
totallist = []
for line in lines:
  columns = line.split("\t")
  sen1.append(columns[0])
  sen2.append(columns[1])
  totallist.extend((columns[0], columns[1]))
print(len(totallist))

2542


In [None]:
# Build the encoder vocabulary from every sentence of both columns; the output below reports how many words have a vector in the w2v file.
infersentmodel.build_vocab(totallist, tokenize=True)

Found 947(/963) words with w2v vectors
Vocab size : 947


In [None]:
#Compute cosine-similarities using embeddings
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
infersentList=[]
for idx, first in enumerate(sen1):
  # Encode each sentence of the pair on its own and keep the single returned vector
  first_vec = infersentmodel.encode([first])[0]
  second_vec = infersentmodel.encode([sen2[idx]])[0]
  # cosine() is a distance, so it is subtracted from 1 to obtain the similarity
  infersent_cosine = 1 - cosine(first_vec, second_vec)
  infersentList.append("{:.4f}".format(infersent_cosine))
  print("Cosine similarity: %.3f between \"%s\" and \"%s\" " % (infersent_cosine, first, sen2[idx]))

In [None]:
# Parse the formatted scores back to floats and multiply each by 5
Infersentresult = [float(text) * 5 for text in infersentList]
print(Infersentresult)

In [None]:
# Write the scaled similarities to a text file, one value per line
with open(r'Infercent_question.txt', 'w') as fp:
    fp.writelines("%s\n" % score for score in Infersentresult)
    print('Done')