# Defining SBERT

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
from torch import nn
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertConfig, BertTokenizer
from scipy import spatial

# QUESTION: 
## SHOULD WE TRAIN FROM SCRATCH OR CAN WE START FROM bert-base-uncased or any other starting point?
##### DONE: BertConfig() starts from random weights

## TODO: Add validation set and test on testing set for SBERT classification
##### keep the model running for a while and then test .....

## TODO: After finishing training for classification, save the model please to stop training everytime <3
##### sure.. but we can just run the model 

## TODO: After training on the classification data, check how to "fine-tune" on the STS regression data (maybe only the head without the bert itself?)
## Check my question please <3 https://docs.google.com/document/d/1YeohuAr55fKF2nI1RiCgpq_Wa3Yn-CLAwfPoBmViNIM/edit

class SBERT(nn.Module):
    def __init__(self):
        super(SBERT, self).__init__()
        
        # self.model = BertModel.from_pretrained("bert-base-uncased")
        configuration = BertConfig()
        self.model = BertModel(configuration)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
        self.pooling = nn.AvgPool1d(kernel_size=3,stride=1)
        # 768 / 3 -> 256
        self.linear = nn.Linear(in_features=2298, out_features=3) # 2298=(768-2)*3; 153 is the embedding dimension after pooling and stuff..
        self.softmax = nn.Softmax(dim=1)

    def forward(self, sent1, sent2=None, objective="embedding"):
        encoded_input1 = self.tokenizer(sent1, padding=True, truncation=True, return_tensors='pt')
        output1 = self.model(**encoded_input1)
        output1 = self.pooling(output1["pooler_output"])
        
        if objective=="embedding":
            return output1

        encoded_input2 = self.tokenizer(sent2, padding=True, truncation=True, return_tensors='pt')
        output2 = self.model(**encoded_input2)
        output2 = self.pooling(output2["pooler_output"])
                        
        if objective == "regression":
            return torch.cosine_similarity(output1, output2)

        if objective == "classification":
            diff = abs(torch.subtract(output1,output2))
            concat = torch.cat([output1,output2,diff],axis=1)            
            result = self.linear(concat)
            out = self.softmax(result)
            return out

sbert = SBERT()

In [4]:
# sbert

# Regression Objective Evaluation

In [5]:
# False in ex1
# True in ex2
comparing_models = False

In [6]:
if comparing_models: 
    PATH = "models/classification_regression.pt"
    sbert = SBERT()
    sbert.load_state_dict(torch.load(PATH))
    sbert.eval()

In [7]:
import pandas as pd

df_test = pd.read_csv("datasets/Stsbenchmark/sts-test.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])

df_test = df_test[['score','sentence1','sentence2']]
df_test.head()

Unnamed: 0,score,sentence1,sentence2
0,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
1,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
2,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
3,1.5,A man is playing a harp.,A man is playing a keyboard.
4,1.8,A woman is cutting onions.,A woman is cutting tofu.


In [8]:
ls = list(df_test.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(0.0, 5.0)

In [9]:
def map_score(value, leftMin=0, leftMax=5, rightMin=-1, rightMax=1):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin

    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)

    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

df_test['score'] = df_test['score'].apply(map_score)

In [10]:
ls = list(df_test.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(-1.0, 1.0)

In [11]:
df_test.isna().sum()

score        0
sentence1    0
sentence2    5
dtype: int64

In [12]:
df_test.dropna(inplace=True)
df_test.reset_index(inplace=True)

In [13]:
df_test.isna().sum()

index        0
score        0
sentence1    0
sentence2    0
dtype: int64

In [48]:
cosine_scores = []
for i,row in df_test.iterrows():
    print(f"Finished {i}/{len(df_test)}",end="\r")
    score = sbert(row.sentence1,row.sentence2,"regression").detach().numpy()[0]
    cosine_scores.append(score)

Finished 978/975

In [52]:
labels = df_test.score.values.tolist()

In [53]:
from scipy.stats import spearmanr
spearmanr(labels,cosine_scores)

SpearmanrResult(correlation=0.057250966842595334, pvalue=0.07396443024495716)

# Classification Objective Evaluation

In [5]:
import json
json_list_train = list(open("datasets/snli_1.0/snli_1.0_train.jsonl","r"))
json_list_val = list(open("datasets/snli_1.0/snli_1.0_dev.jsonl","r"))

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

In [6]:
import pandas as pd

data_train = {'sentence1': [], 'sentence2': [], 'gold_label': []}
for json_str in json_list_train:
    try:
        result = json.loads(json_str)
        result['gold_label']=label2int[result['gold_label']]
        for key in data_train:
            data_train[key].append(result[key])
    except:
        pass
df_train = pd.DataFrame.from_dict(data_train)#.head()

data_val = {'sentence1': [], 'sentence2': [], 'gold_label': []}
for json_str in json_list_val:
    try:
        result = json.loads(json_str)
        result['gold_label']=label2int[result['gold_label']]
        for key in data_val:
            data_val[key].append(result[key])
    except:
        pass
df_val = pd.DataFrame.from_dict(data_val)#.head()

print("Training data:",len(df_train))
print("Testing data:",len(df_val))

Training data: 549367
Testing data: 9842


In [7]:
df_train.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,2
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
3,Children smiling and waving at camera,They are smiling at their parents,2
4,Children smiling and waving at camera,There are children present,1


In [8]:
from torch.utils.data import Dataset,DataLoader

class SNLI_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['gold_label']
        return (sent1, sent2), label

- outputs of shape (N,C) where N is the batch size C is the number of classes
- target of shape (N)

In [None]:
import torch.optim as optim
import time

# from torchsample.callbacks import EarlyStopping
# callbacks = [EarlyStopping(monitor='val_loss', patience=5)]
# model.set_callbacks(callbacks)


n_epochs = 10
n_batch = 16
lr=0.001


training_data = SNLI_Dataset(df_train)
train_dataloader = DataLoader(training_data, batch_size=n_batch, shuffle=False)

validation_data = SNLI_Dataset(df_val)
val_dataloader = DataLoader(validation_data, batch_size=n_batch, shuffle=True)

criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(sbert.parameters(), lr=lr, momentum=0.9)
optimizer = optim.Adam(sbert.parameters(), lr=lr)

sbert.train()
train_losses = []
val_losses = []
for epoch in range(n_epochs):

    for i,((sent1,sent2),label) in enumerate(train_dataloader):
        
        start = time.time()
        
        optimizer.zero_grad()
        
        output = sbert(sent1,sent2,objective="classification")
        loss = criterion(output, label)
        train_losses.append(loss)
        
#         (sent1_val,sent2_val),label_val = next(iter(val_dataloader))
#         output_val = sbert(sent1_val,sent2_val,"classification")
#         val_loss = criterion(output_val,label_val)
#         val_losses.append(val_loss)
        
        message = "epoch={}/{} iteration={}/{} train_loss={:.4f} took {:.4f} secs" \
            .format(epoch+1,n_epochs,i+1,len(train_dataloader),loss.detach().numpy(),time.time()-start)
        
        print(message)
        
        loss.backward()
        optimizer.step()
        
        
print('Finished Training')

epoch=1/10 iteration=1/34336 train_loss=1.1059 took 0.7070 secs
epoch=1/10 iteration=2/34336 train_loss=1.1709 took 0.4261 secs
epoch=1/10 iteration=3/34336 train_loss=1.1881 took 0.3999 secs
epoch=1/10 iteration=4/34336 train_loss=1.1084 took 0.4236 secs
epoch=1/10 iteration=5/34336 train_loss=1.1597 took 0.6614 secs
epoch=1/10 iteration=6/34336 train_loss=1.1511 took 0.7325 secs
epoch=1/10 iteration=7/34336 train_loss=1.2218 took 0.5861 secs
epoch=1/10 iteration=8/34336 train_loss=1.1032 took 0.3301 secs
epoch=1/10 iteration=9/34336 train_loss=1.1626 took 0.4545 secs
epoch=1/10 iteration=10/34336 train_loss=1.1426 took 0.3818 secs
epoch=1/10 iteration=11/34336 train_loss=1.1036 took 0.3551 secs
epoch=1/10 iteration=12/34336 train_loss=1.1771 took 0.3191 secs
epoch=1/10 iteration=13/34336 train_loss=1.1839 took 0.3191 secs
epoch=1/10 iteration=14/34336 train_loss=1.1054 took 0.4677 secs
epoch=1/10 iteration=15/34336 train_loss=1.1008 took 0.4597 secs
epoch=1/10 iteration=16/34336 trai

epoch=1/10 iteration=127/34336 train_loss=1.0977 took 0.3690 secs
epoch=1/10 iteration=128/34336 train_loss=1.1001 took 0.3332 secs
epoch=1/10 iteration=129/34336 train_loss=1.1000 took 0.3713 secs
epoch=1/10 iteration=130/34336 train_loss=1.1171 took 0.4331 secs
epoch=1/10 iteration=131/34336 train_loss=1.1060 took 0.3523 secs
epoch=1/10 iteration=132/34336 train_loss=1.0820 took 0.3680 secs
epoch=1/10 iteration=133/34336 train_loss=1.1085 took 0.3665 secs
epoch=1/10 iteration=134/34336 train_loss=1.0986 took 0.4781 secs
epoch=1/10 iteration=135/34336 train_loss=1.1000 took 0.4283 secs
epoch=1/10 iteration=136/34336 train_loss=1.1067 took 0.3226 secs
epoch=1/10 iteration=137/34336 train_loss=1.1103 took 0.3258 secs
epoch=1/10 iteration=138/34336 train_loss=1.0962 took 0.4284 secs
epoch=1/10 iteration=139/34336 train_loss=1.1038 took 0.3075 secs
epoch=1/10 iteration=140/34336 train_loss=1.0924 took 0.5742 secs
epoch=1/10 iteration=141/34336 train_loss=1.0968 took 0.3720 secs
epoch=1/10

epoch=1/10 iteration=252/34336 train_loss=1.0936 took 0.3439 secs
epoch=1/10 iteration=253/34336 train_loss=1.1012 took 0.3577 secs
epoch=1/10 iteration=254/34336 train_loss=1.0926 took 0.3493 secs
epoch=1/10 iteration=255/34336 train_loss=1.1052 took 0.4273 secs
epoch=1/10 iteration=256/34336 train_loss=1.1002 took 0.4254 secs
epoch=1/10 iteration=257/34336 train_loss=1.1050 took 0.5002 secs
epoch=1/10 iteration=258/34336 train_loss=1.0996 took 0.5125 secs
epoch=1/10 iteration=259/34336 train_loss=1.0950 took 0.4493 secs
epoch=1/10 iteration=260/34336 train_loss=1.1405 took 0.4415 secs
epoch=1/10 iteration=261/34336 train_loss=1.0980 took 0.4050 secs
epoch=1/10 iteration=262/34336 train_loss=1.0996 took 0.4618 secs
epoch=1/10 iteration=263/34336 train_loss=1.1056 took 0.4704 secs
epoch=1/10 iteration=264/34336 train_loss=1.0934 took 0.3020 secs
epoch=1/10 iteration=265/34336 train_loss=1.0994 took 0.4040 secs
epoch=1/10 iteration=266/34336 train_loss=1.0997 took 0.3608 secs
epoch=1/10

epoch=1/10 iteration=377/34336 train_loss=1.1098 took 0.4717 secs
epoch=1/10 iteration=378/34336 train_loss=1.0992 took 0.3339 secs
epoch=1/10 iteration=379/34336 train_loss=1.1069 took 0.4697 secs
epoch=1/10 iteration=380/34336 train_loss=1.0999 took 0.3949 secs
epoch=1/10 iteration=381/34336 train_loss=1.0997 took 0.3452 secs
epoch=1/10 iteration=382/34336 train_loss=1.0939 took 0.3781 secs
epoch=1/10 iteration=383/34336 train_loss=1.1031 took 0.3377 secs
epoch=1/10 iteration=384/34336 train_loss=1.1020 took 0.3799 secs
epoch=1/10 iteration=385/34336 train_loss=1.1027 took 0.2858 secs
epoch=1/10 iteration=386/34336 train_loss=1.1038 took 0.3774 secs
epoch=1/10 iteration=387/34336 train_loss=1.1078 took 0.3635 secs
epoch=1/10 iteration=388/34336 train_loss=1.0969 took 0.3824 secs
epoch=1/10 iteration=389/34336 train_loss=1.1022 took 0.4305 secs
epoch=1/10 iteration=390/34336 train_loss=1.1000 took 0.5288 secs
epoch=1/10 iteration=391/34336 train_loss=1.1014 took 0.4507 secs
epoch=1/10

epoch=1/10 iteration=502/34336 train_loss=1.1015 took 0.3570 secs
epoch=1/10 iteration=503/34336 train_loss=1.1027 took 0.3654 secs
epoch=1/10 iteration=504/34336 train_loss=1.0992 took 0.3958 secs
epoch=1/10 iteration=505/34336 train_loss=1.1005 took 0.5075 secs
epoch=1/10 iteration=506/34336 train_loss=1.0999 took 0.4457 secs
epoch=1/10 iteration=507/34336 train_loss=1.1068 took 0.4602 secs
epoch=1/10 iteration=508/34336 train_loss=1.1041 took 0.3417 secs
epoch=1/10 iteration=509/34336 train_loss=1.0911 took 0.5211 secs
epoch=1/10 iteration=510/34336 train_loss=1.1035 took 0.3986 secs
epoch=1/10 iteration=511/34336 train_loss=1.1097 took 0.3953 secs
epoch=1/10 iteration=512/34336 train_loss=1.0980 took 0.3953 secs
epoch=1/10 iteration=513/34336 train_loss=1.0989 took 0.3885 secs
epoch=1/10 iteration=514/34336 train_loss=1.1009 took 0.4470 secs
epoch=1/10 iteration=515/34336 train_loss=1.1113 took 0.4314 secs
epoch=1/10 iteration=516/34336 train_loss=1.0976 took 0.4414 secs
epoch=1/10

epoch=1/10 iteration=627/34336 train_loss=1.1021 took 0.3714 secs
epoch=1/10 iteration=628/34336 train_loss=1.0923 took 0.3818 secs
epoch=1/10 iteration=629/34336 train_loss=1.1048 took 0.3221 secs
epoch=1/10 iteration=630/34336 train_loss=1.1056 took 0.3449 secs
epoch=1/10 iteration=631/34336 train_loss=1.1040 took 0.3434 secs
epoch=1/10 iteration=632/34336 train_loss=1.1053 took 0.3193 secs
epoch=1/10 iteration=633/34336 train_loss=1.0978 took 0.4865 secs
epoch=1/10 iteration=634/34336 train_loss=1.0998 took 0.5117 secs
epoch=1/10 iteration=635/34336 train_loss=1.0987 took 0.6334 secs
epoch=1/10 iteration=636/34336 train_loss=1.0984 took 0.4702 secs
epoch=1/10 iteration=637/34336 train_loss=1.1033 took 0.4426 secs
epoch=1/10 iteration=638/34336 train_loss=1.0990 took 0.4426 secs
epoch=1/10 iteration=639/34336 train_loss=1.0990 took 0.4420 secs
epoch=1/10 iteration=640/34336 train_loss=1.0969 took 0.3982 secs
epoch=1/10 iteration=641/34336 train_loss=1.1014 took 0.4450 secs
epoch=1/10

epoch=1/10 iteration=752/34336 train_loss=1.1046 took 0.4105 secs
epoch=1/10 iteration=753/34336 train_loss=1.1029 took 0.3185 secs
epoch=1/10 iteration=754/34336 train_loss=1.0971 took 0.3870 secs
epoch=1/10 iteration=755/34336 train_loss=1.0991 took 0.4435 secs
epoch=1/10 iteration=756/34336 train_loss=1.0963 took 0.4850 secs
epoch=1/10 iteration=757/34336 train_loss=1.0956 took 0.4180 secs
epoch=1/10 iteration=758/34336 train_loss=1.1058 took 0.6471 secs
epoch=1/10 iteration=759/34336 train_loss=1.1098 took 0.4610 secs
epoch=1/10 iteration=760/34336 train_loss=1.0951 took 0.4518 secs
epoch=1/10 iteration=761/34336 train_loss=1.1016 took 0.3948 secs
epoch=1/10 iteration=762/34336 train_loss=1.1044 took 0.4513 secs
epoch=1/10 iteration=763/34336 train_loss=1.1009 took 0.3583 secs
epoch=1/10 iteration=764/34336 train_loss=1.1001 took 0.3250 secs
epoch=1/10 iteration=765/34336 train_loss=1.1039 took 0.4266 secs
epoch=1/10 iteration=766/34336 train_loss=1.1027 took 0.3304 secs
epoch=1/10

In [2]:
train_losses

NameError: name 'train_losses' is not defined

In [16]:
for idx in range(5):
    pred = torch.argmax(sbert(df_train.iloc[idx].sentence1,df_train.iloc[idx].sentence2,"classification")).numpy().item()
    true = df_train.iloc[idx]['gold_label']
    print(pred==true)

True
False
True
True
True


In [18]:
PATH = "models/classification.pt"
torch.save(sbert.state_dict(), PATH)

# fine-tuning using the regression objective

In [19]:
PATH = "models/classification.pt"

sbert = SBERT()
sbert.load_state_dict(torch.load(PATH))
sbert.train()

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

In [20]:
import pandas as pd

df_train = pd.read_csv("Stsbenchmark/sts-train.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])
df_test = pd.read_csv("Stsbenchmark/sts-test.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])

df_train = df_train[['score','sentence1','sentence2']]
df_test = df_test[['score','sentence1','sentence2']]

df_train.head()
# df_test.head()

Unnamed: 0,score,sentence1,sentence2
0,3.8,A man is playing a large flute.,A man is playing a flute.
1,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
2,2.6,Three men are playing chess.,Two men are playing chess.
3,4.25,A man is playing the cello.,A man seated is playing the cello.
4,4.25,Some men are fighting.,Two men are fighting.


In [21]:
ls = list(df_train.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(0.0, 5.0)

In [22]:
def map_score(value, leftMin=0, leftMax=5, rightMin=-1, rightMax=1):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin

    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)

    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

df_train['score'] = df_train['score'].apply(map_score)

In [23]:
ls = list(df_train.score)
minn = min(ls)
maxx = max(ls)
(minn,maxx)

(-1.0, 1.0)

In [24]:
from torch.utils.data import Dataset,DataLoader

class STS_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['score']
        return (sent1, sent2), label

In [25]:
sbert("I love pizza","I love burger",objective="regression")

tensor([0.6167], grad_fn=<DivBackward0>)

In [26]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.SGD(sbert.parameters(), lr=0.001, momentum=0.9)

In [28]:
training_data_regression = STS_Dataset(df_train)
train_dataloader_regression = DataLoader(training_data_regression, batch_size=64, shuffle=False)

In [None]:
n_epochs = 1

losses = []
for epoch in range(n_epochs):

    for i,((sent1,sent2),label) in enumerate(train_dataloader_regression):
        
        output = sbert(sent1,sent2,objective="regression")
        optimizer.zero_grad()
        
        label = label.float()
        loss = criterion(output, label)
        
        losses.append(loss)
        
        print(f"epoch={epoch+1} iteration={i+1}/{len(train_dataloader_regression)} loss={loss.detach().numpy()}")
        
        loss.backward()
        optimizer.step()
        
        # break
print('Finished Training')

epoch=1 iteration=1/62 loss=0.3414393365383148
epoch=1 iteration=2/62 loss=0.5225234627723694
epoch=1 iteration=3/62 loss=0.5447043776512146
epoch=1 iteration=4/62 loss=0.9775974750518799
epoch=1 iteration=5/62 loss=0.8285077214241028
epoch=1 iteration=6/62 loss=0.9737005829811096
epoch=1 iteration=7/62 loss=0.7615697383880615
epoch=1 iteration=8/62 loss=0.7637732625007629
epoch=1 iteration=9/62 loss=0.27583539485931396
epoch=1 iteration=10/62 loss=0.4577804207801819
epoch=1 iteration=11/62 loss=0.42774084210395813
epoch=1 iteration=12/62 loss=0.6824129223823547
epoch=1 iteration=13/62 loss=0.5674241185188293
epoch=1 iteration=14/62 loss=0.6364807486534119
epoch=1 iteration=15/62 loss=0.7143394947052002
epoch=1 iteration=16/62 loss=0.5169401168823242
epoch=1 iteration=17/62 loss=0.39554280042648315
epoch=1 iteration=18/62 loss=0.48140937089920044
epoch=1 iteration=19/62 loss=0.4177089333534241
epoch=1 iteration=20/62 loss=0.4531996548175812
epoch=1 iteration=21/62 loss=0.32224404811859

In [8]:
PATH = "models/classification_regression.pt"
torch.save(sbert.state_dict(), PATH)

# Use your best fine-tuned model and create a small semantic search system

In [5]:
PATH = "models/classification_regression.pt"
sbert = SBERT()
sbert.load_state_dict(torch.load(PATH))
sbert.eval()

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

In [6]:
import json
list_ = []
path = "datasets/News_Category_Dataset_v2.json"
with open(path) as files:
    for file in files:
        list_.append(json.loads(file))

In [7]:
import pandas as pd

df_news = pd.DataFrame(list_)
df_news.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [8]:
sent1 = list(df_news.headline)
print("Length before splitting:",len(sent1))
sent2 = [x for sentence in sent1 for x in sentence.split(".")]
sent2 = [x for x in sent2 if x]
print("Length after splitting:",len(sent2))
print(f"Data increased by {len(sent2)-len(sent1)} sentences")

Length before splitting: 200853
Length after splitting: 215043
Data increased by 14190 sentences


In [9]:
sent2[:5]

['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
 "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
 'Hugh Grant Marries For The First Time At Age 57',
 "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork",
 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog']

In [10]:
input_sentence = "Sweden is a very safe place."

In [11]:
def create_embeddings(sentences):
    embeddings = []
    for i,sentence in enumerate(sentences):
        print(f"Finished iteration {i}/{len(sentences)}",end="\r")
        
        embeddings.append(sbert(sentence,objective="embedding"))

    embeddings = torch.FloatTensor(embeddings)
    torch.save(x, 'datasets/embeddings.pt')

In [None]:
create_embeddings(sent2)

Finished iteration 1061/215043

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_89163/908439655.py", line 1, in <module>
    create_embeddings(sent2)
  File "/tmp/ipykernel_89163/686597705.py", line 6, in create_embeddings
    embeddings.append(sbert(sentence,objective="embedding"))
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_89163/255337989.py", line 37, in forward
    output1 = self.model(**encoded_input1)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 991, in forwar

In [None]:
def search_engine():
    