# Defining SBERT

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
from torch import nn
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertConfig, BertTokenizer
from scipy import spatial

# QUESTION: 
## SHOULD WE TRAIN FROM SCRATCH OR CAN WE START FROM bert-base-uncased or any other starting point?
##### DONE: BertConfig() starts from random weights

## TODO: Add validation set and test on testing set for SBERT classification
##### keep the model running for a while and then test .....

## TODO: After finishing training for classification, save the model please to stop training everytime <3
##### sure.. but we can just run the model 

## TODO: After training on the classification data, check how to "fine-tune" on the STS regression data (maybe only the head without the bert itself?)
## Check my question please <3 https://docs.google.com/document/d/1YeohuAr55fKF2nI1RiCgpq_Wa3Yn-CLAwfPoBmViNIM/edit

class SBERT(nn.Module):
    def __init__(self):
        super(SBERT, self).__init__()
        
        # self.model = BertModel.from_pretrained("bert-base-uncased")
        configuration = BertConfig()
        self.model = BertModel(configuration)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
        self.pooling = nn.AvgPool1d(kernel_size=3,stride=1)
        # 768 / 3 -> 256
        self.linear = nn.Linear(in_features=2298, out_features=3) # 2298=(768-2)*3; 153 is the embedding dimension after pooling and stuff..
        self.softmax = nn.Softmax(dim=1)

    def forward(self, sent1, sent2=None, objective=None):
        
        if objective=="embedding":
            encoded_input1 = self.tokenizer(sent1, padding=True, truncation=True, return_tensors='pt')
            output1 = self.model(**encoded_input1)
            output1 = self.pooling(output1["pooler_output"])
            return output1

        encoded_input2 = self.tokenizer(sent2, padding=True, truncation=True, return_tensors='pt')
        output2 = self.model(**encoded_input2)
        output2 = self.pooling(output2["pooler_output"])
                        
        if objective == "regression":
            return torch.cosine_similarity(output1, output2)

        if objective == "classification":
            diff = abs(torch.subtract(output1,output2))
            concat = torch.cat([output1,output2,diff],axis=1)            
            result = self.linear(concat)
            out = self.softmax(result)
            return out

sbert = SBERT()

In [4]:
sbert

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

# Regression Objective Evaluation

In [None]:
# False in ex1
# True in ex2
comparing_models = False

In [None]:
if comparing_models: 
    PATH = "models/classification_regression.pt"
    sbert = SBERT()
    sbert.load_state_dict(torch.load(PATH))
    sbert.eval()

In [23]:
import pandas as pd

df_test = pd.read_csv("Stsbenchmark/sts-test.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])

df_test = df_test[['score','sentence1','sentence2']]
df_test.head()

Unnamed: 0,score,sentence1,sentence2
0,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
1,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
2,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
3,1.5,A man is playing a harp.,A man is playing a keyboard.
4,1.8,A woman is cutting onions.,A woman is cutting tofu.


In [24]:
ls = list(df_test.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(0.0, 5.0)

In [25]:
def map_score(value, leftMin=0, leftMax=5, rightMin=-1, rightMax=1):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin

    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)

    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

df_test['score'] = df_test['score'].apply(map_score)

In [26]:
ls = list(df_test.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(-1.0, 1.0)

In [27]:
df_test.isna().sum()

score        0
sentence1    0
sentence2    5
dtype: int64

In [28]:
df_test.dropna(inplace=True)

In [29]:
df_test.isna().sum()

score        0
sentence1    0
sentence2    0
dtype: int64

In [43]:
from torch.utils.data import Dataset,DataLoader

class STS_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['score']
        return (sent1, sent2), label

In [None]:
training_data_regression = STS_Dataset(df_test)
train_dataloader_regression = DataLoader(training_data_regression, batch_size=1, shuffle=False)

losses = []
criterion = nn.MSELoss()
sbert.eval()
for i,((sent1,sent2),label) in enumerate(train_dataloader_regression):
    
    output = sbert(sentence1,sentence2,objective="regression")

    loss = criterion(output,label)
    
    print(f"iteration {i} mse_loss={loss}")
    
    losses.append(loss)

iteration 0 mse_loss=0.3016123210005845
iteration 1 mse_loss=0.00011680365134125736
iteration 2 mse_loss=0.09559995642233725
iteration 3 mse_loss=1.9298555970244489
iteration 4 mse_loss=1.6108494147353256
iteration 5 mse_loss=0.347147715096959
iteration 6 mse_loss=1.2303078383498274
iteration 7 mse_loss=1.2303078383498274
iteration 8 mse_loss=1.6993532136582077
iteration 9 mse_loss=1.6993532136582077
iteration 10 mse_loss=0.00011680365134125736
iteration 11 mse_loss=3.05967414389182
iteration 12 mse_loss=0.052529168229588225
iteration 13 mse_loss=1.4141786265425764
iteration 14 mse_loss=1.6108494147353256
iteration 15 mse_loss=0.052529168229588225
iteration 16 mse_loss=0.3016123210005845
iteration 17 mse_loss=0.3016123210005845
iteration 18 mse_loss=2.277661779313573
iteration 19 mse_loss=1.0592370501570787
iteration 20 mse_loss=3.6450157202773177
iteration 21 mse_loss=0.09559995642233725
iteration 22 mse_loss=0.052529168229588225
iteration 23 mse_loss=1.1863401413016406
iteration 24 m

iteration 198 mse_loss=0.6228246855788315
iteration 199 mse_loss=3.3459449320845684
iteration 200 mse_loss=3.3459449320845684
iteration 201 mse_loss=3.56904802322913
iteration 202 mse_loss=3.56904802322913
iteration 203 mse_loss=3.3459449320845684
iteration 204 mse_loss=3.3459449320845684
iteration 205 mse_loss=0.5029538973860824
iteration 206 mse_loss=3.956886508470067
iteration 207 mse_loss=1.2303078383498274
iteration 208 mse_loss=2.7862033556990706
iteration 209 mse_loss=2.7862033556990706
iteration 210 mse_loss=3.493880326180943
iteration 211 mse_loss=3.956886508470067
iteration 212 mse_loss=2.8533710527472578
iteration 213 mse_loss=1.1151200042016494
iteration 214 mse_loss=2.1199139451653943
iteration 215 mse_loss=2.277661779313573
iteration 216 mse_loss=0.09559995642233725
iteration 217 mse_loss=0.22014153280783538
iteration 218 mse_loss=0.6228246855788315
iteration 219 mse_loss=3.2012095379881944
iteration 220 mse_loss=3.956886508470067
iteration 221 mse_loss=3.4447078861291396

iteration 394 mse_loss=3.3459449320845684
iteration 395 mse_loss=3.05967414389182
iteration 396 mse_loss=0.004787591844090291
iteration 397 mse_loss=0.7554954737715807
iteration 398 mse_loss=1.4141786265425764
iteration 399 mse_loss=0.9009662619643295
iteration 400 mse_loss=3.956886508470067
iteration 401 mse_loss=1.0592370501570787
iteration 402 mse_loss=0.1514707446150863
iteration 403 mse_loss=0.1514707446150863
iteration 404 mse_loss=0.1514707446150863
iteration 405 mse_loss=3.956886508470067
iteration 406 mse_loss=0.5029538973860824
iteration 407 mse_loss=2.0425909911208233
iteration 408 mse_loss=0.3016123210005845
iteration 409 mse_loss=0.052529168229588225
iteration 410 mse_loss=0.00011680365134125736
iteration 411 mse_loss=1.6108494147353256
iteration 412 mse_loss=0.004787591844090291
iteration 413 mse_loss=3.05967414389182
iteration 414 mse_loss=2.0425909911208233
iteration 415 mse_loss=3.6450157202773177
iteration 416 mse_loss=0.09559995642233725
iteration 417 mse_loss=0.0222

iteration 589 mse_loss=1.6108494147353256
iteration 590 mse_loss=0.9009662619643295
iteration 591 mse_loss=0.22014153280783538
iteration 592 mse_loss=2.277661779313573
iteration 593 mse_loss=3.6450157202773177
iteration 594 mse_loss=0.22014153280783538
iteration 595 mse_loss=1.0592370501570787
iteration 596 mse_loss=3.3459449320845684
iteration 597 mse_loss=0.3958831091933336
iteration 598 mse_loss=0.09559995642233725
iteration 599 mse_loss=1.0592370501570787
iteration 600 mse_loss=2.7862033556990706
iteration 601 mse_loss=0.052529168229588225
iteration 602 mse_loss=0.6228246855788315
iteration 603 mse_loss=1.4141786265425764
iteration 604 mse_loss=0.6228246855788315
iteration 605 mse_loss=1.2303078383498274
iteration 606 mse_loss=1.6108494147353256
iteration 607 mse_loss=0.7554954737715807
iteration 608 mse_loss=0.7554954737715807
iteration 609 mse_loss=0.00011680365134125736
iteration 610 mse_loss=1.2303078383498274
iteration 611 mse_loss=2.277661779313573
iteration 612 mse_loss=1.82

In [3]:
# from sklearn.metrics import classification_report
# import json

# report = classification_report(trues, preds,output_dict=True)
# print(json.dumps(report, sort_keys=True, indent=4))

# Classification Objective Evaluation

In [7]:
import json
json_list = list(open("snli_1.0/snli_1.0_train.jsonl","r"))

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

In [10]:
import pandas as pd

data = {'sentence1': [], 'sentence2': [], 'gold_label': []}
for json_str in json_list:
    try:
        result = json.loads(json_str)
        result['gold_label']=label2int[result['gold_label']]
        for key in data:
            data[key].append(result[key])
    except:
        pass
df_train = pd.DataFrame.from_dict(data).head()

In [11]:
df_train.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,2
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
3,Children smiling and waving at camera,They are smiling at their parents,2
4,Children smiling and waving at camera,There are children present,1


In [12]:
from torch.utils.data import Dataset,DataLoader

class SNLI_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['gold_label']
        return (sent1, sent2), label

In [13]:
training_data = SNLI_Dataset(df_train)
train_dataloader = DataLoader(training_data, batch_size=8, shuffle=False)

In [14]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(sbert.parameters(), lr=0.001, momentum=0.9)

- outputs of shape (N,C) where N is the batch size C is the number of classes
- target of shape (N)

In [15]:
training_data = SNLI_Dataset(df_train)
train_dataloader = DataLoader(training_data, batch_size=16, shuffle=False)

n_epochs = 100
sbert.train()
losses = []
for epoch in range(n_epochs):

    for i,((sent1,sent2),label) in enumerate(train_dataloader):
        
        output = sbert(sent1,sent2,objective="classification")
        optimizer.zero_grad()
        
        loss = criterion(output, label)
        
        losses.append(loss)
        
        print(f"epoch={epoch+1} iteration={i+1}/{len(train_dataloader)} loss={loss.detach().numpy()}")
        
        loss.backward()
        optimizer.step()
        
        # break
print('Finished Training')

epoch=1 iteration=1/1 loss=1.1248085498809814
epoch=2 iteration=1/1 loss=1.1213394403457642
epoch=3 iteration=1/1 loss=1.116319179534912
epoch=4 iteration=1/1 loss=1.1178715229034424
epoch=5 iteration=1/1 loss=1.1034400463104248
epoch=6 iteration=1/1 loss=1.0994272232055664
epoch=7 iteration=1/1 loss=1.1001434326171875
epoch=8 iteration=1/1 loss=1.0792795419692993
epoch=9 iteration=1/1 loss=1.086229920387268
epoch=10 iteration=1/1 loss=1.0764697790145874
epoch=11 iteration=1/1 loss=1.0764305591583252
epoch=12 iteration=1/1 loss=1.080661416053772
epoch=13 iteration=1/1 loss=1.0829098224639893
epoch=14 iteration=1/1 loss=1.0621845722198486
epoch=15 iteration=1/1 loss=1.0699632167816162
epoch=16 iteration=1/1 loss=1.0708560943603516
epoch=17 iteration=1/1 loss=1.0716842412948608
epoch=18 iteration=1/1 loss=1.0721025466918945
epoch=19 iteration=1/1 loss=1.067765712738037
epoch=20 iteration=1/1 loss=1.060341477394104
epoch=21 iteration=1/1 loss=1.0495812892913818
epoch=22 iteration=1/1 loss

In [16]:
for idx in range(5):
    pred = torch.argmax(sbert(df_train.iloc[idx].sentence1,df_train.iloc[idx].sentence2,"classification")).numpy().item()
    true = df_train.iloc[idx]['gold_label']
    print(pred==true)

True
False
True
True
True


In [18]:
PATH = "models/classification.pt"
torch.save(sbert.state_dict(), PATH)

# fine-tuning using the regression objective

In [19]:
PATH = "models/classification.pt"

sbert = SBERT()
sbert.load_state_dict(torch.load(PATH))
sbert.train()

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

In [20]:
import pandas as pd

df_train = pd.read_csv("Stsbenchmark/sts-train.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])
df_test = pd.read_csv("Stsbenchmark/sts-test.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])

df_train = df_train[['score','sentence1','sentence2']]
df_test = df_test[['score','sentence1','sentence2']]

df_train.head()
# df_test.head()

Unnamed: 0,score,sentence1,sentence2
0,3.8,A man is playing a large flute.,A man is playing a flute.
1,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
2,2.6,Three men are playing chess.,Two men are playing chess.
3,4.25,A man is playing the cello.,A man seated is playing the cello.
4,4.25,Some men are fighting.,Two men are fighting.


In [21]:
ls = list(df_train.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(0.0, 5.0)

In [22]:
def map_score(value, leftMin=0, leftMax=5, rightMin=-1, rightMax=1):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin

    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)

    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

df_train['score'] = df_train['score'].apply(map_score)

In [23]:
ls = list(df_train.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(-1.0, 1.0)

In [24]:
from torch.utils.data import Dataset,DataLoader

class STS_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['score']
        return (sent1, sent2), label

In [25]:
sbert("I love pizza","I love burger",objective="regression")

tensor([0.6167], grad_fn=<DivBackward0>)

In [26]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.SGD(sbert.parameters(), lr=0.001, momentum=0.9)

In [28]:
training_data_regression = STS_Dataset(df_train)
train_dataloader_regression = DataLoader(training_data_regression, batch_size=64, shuffle=False)

In [None]:
n_epochs = 1

losses = []
for epoch in range(n_epochs):

    for i,((sent1,sent2),label) in enumerate(train_dataloader_regression):
        
        output = sbert(sent1,sent2,objective="regression")
        optimizer.zero_grad()
        
        label = label.float()
        loss = criterion(output, label)
        
        losses.append(loss)
        
        print(f"epoch={epoch+1} iteration={i+1}/{len(train_dataloader_regression)} loss={loss.detach().numpy()}")
        
        loss.backward()
        optimizer.step()
        
        # break
print('Finished Training')

epoch=1 iteration=1/62 loss=0.3414393365383148
epoch=1 iteration=2/62 loss=0.5225234627723694
epoch=1 iteration=3/62 loss=0.5447043776512146
epoch=1 iteration=4/62 loss=0.9775974750518799
epoch=1 iteration=5/62 loss=0.8285077214241028
epoch=1 iteration=6/62 loss=0.9737005829811096
epoch=1 iteration=7/62 loss=0.7615697383880615
epoch=1 iteration=8/62 loss=0.7637732625007629
epoch=1 iteration=9/62 loss=0.27583539485931396
epoch=1 iteration=10/62 loss=0.4577804207801819
epoch=1 iteration=11/62 loss=0.42774084210395813
epoch=1 iteration=12/62 loss=0.6824129223823547
epoch=1 iteration=13/62 loss=0.5674241185188293
epoch=1 iteration=14/62 loss=0.6364807486534119
epoch=1 iteration=15/62 loss=0.7143394947052002
epoch=1 iteration=16/62 loss=0.5169401168823242
epoch=1 iteration=17/62 loss=0.39554280042648315
epoch=1 iteration=18/62 loss=0.48140937089920044
epoch=1 iteration=19/62 loss=0.4177089333534241
epoch=1 iteration=20/62 loss=0.4531996548175812
epoch=1 iteration=21/62 loss=0.32224404811859

In [8]:
PATH = "models/classification_regression.pt"
torch.save(sbert.state_dict(), PATH)

# Use your best fine-tuned model and create a small semantic search system

In [5]:
PATH = "models/classification_regression.pt"
sbert = SBERT()
sbert.load_state_dict(torch.load(PATH))
sbert.eval()

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

In [6]:
import json
list_ = []
path = "datasets/News_Category_Dataset_v2.json"
with open(path) as files:
    for file in files:
        list_.append(json.loads(file))

In [7]:
import pandas as pd

df_news = pd.DataFrame(list_)
df_news.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [8]:
sent1 = list(df_news.headline)
print("Length before splitting:",len(sent1))
sent2 = [x for sentence in sent1 for x in sentence.split(".")]
sent2 = [x for x in sent2 if x]
print("Length after splitting:",len(sent2))
print(f"Data increased by {len(sent2)-len(sent1)} sentences")

Length before splitting: 200853
Length after splitting: 215043
Data increased by 14190 sentences


In [9]:
sent2[:5]

['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
 "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
 'Hugh Grant Marries For The First Time At Age 57',
 "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork",
 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog']

In [10]:
input_sentence = "Sweden is a very safe place."

In [11]:
def create_embeddings(sentences):
    embeddings = []
    for i,sentence in enumerate(sentences):
        print(f"Finished iteration {i}/{len(sentences)}",end="\r")
        
        embeddings.append(sbert(sentence,objective="embedding"))

    embeddings = torch.FloatTensor(embeddings)
    torch.save(x, 'datasets/embeddings.pt')

In [None]:
create_embeddings(sent2)

Finished iteration 1061/215043

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

In [None]:
def search_engine():
    