In [1]:
import datasets

stsb = datasets.load_dataset('glue', 'stsb', split='train')
stsb_dev = datasets.load_dataset('glue', 'stsb', split='validation')
stsb, stsb_dev

(Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 5749
 }),
 Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 1500
 }))

In [4]:
stsb = stsb.map(lambda x: {'label': x['label']/5.0})

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

In [5]:
stsb_dev = stsb_dev.map(lambda x: {'label': x['label']/5.0})

In [7]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

samples = [InputExample(texts=[row['sentence1'], row['sentence2']]) for row in stsb]
batch_size = 16
loader = DataLoader(samples, batch_size=batch_size, shuffle=True)

In [8]:
from sentence_transformers.cross_encoder import CrossEncoder

model = CrossEncoder('bert-base-uncased', num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
num_epochs = 1
warmup_steps = int(len(loader)*0.4*num_epochs)
model.fit(
    train_dataloader=loader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='bert-stsb-cross-encoder'
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [25]:
model.save('bert-stsb-cross-encoder')

In [38]:
import pandas as pd

gold = datasets.load_dataset('glue', 'stsb', split='train')
gold_df = pd.DataFrame({
    'sentence1': gold['sentence1'],
    'sentence2': gold['sentence2'],
    'label': gold['label']
})
gold_df['label'] = gold_df['label'] / 5.0
gold_df.head()

Unnamed: 0,sentence1,sentence2,label
0,A plane is taking off.,An air plane is taking off.,1.0
1,A man is playing a large flute.,A man is playing a flute.,0.76
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,0.76
3,Three men are playing chess.,Two men are playing chess.,0.52
4,A man is playing the cello.,A man seated is playing the cello.,0.85


In [29]:
from tqdm import tqdm

silver = []
for sentence1 in list(set(gold_df['sentence1'])):
    sampled = gold_df[gold_df['sentence1'] != sentence1].sample(5)
    sampled = sampled['sentence2'].tolist()
    for sentence2 in sampled:
        silver.append({'sentence1': sentence1, 'sentence2': sentence2})
silver_df = pd.DataFrame(silver)
print(silver_df.shape)
silver_df.head()

(27180, 2)


Unnamed: 0,sentence1,sentence2
0,Former Zambian president arrested,"Senate, House on collision course on border money"
1,Former Zambian president arrested,Two girls are walking by a tree in front of a ...
2,Former Zambian president arrested,A man follows his black and white dog as it ta...
3,Former Zambian president arrested,Did the transaction take place?
4,Former Zambian president arrested,Madoff's brother to plead guilty to fraud


In [31]:
# pairs_df = pd.concat([gold_df, silver_df])
pairs_df = silver_df
print(pairs_df.shape)
pairs_df = pairs_df.drop_duplicates()
print(pairs_df.shape)

(27180, 2)
(27177, 2)


In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
cross_encoder = CrossEncoder('bert-stsb-cross-encoder')

In [33]:
silver = list(zip(pairs_df['sentence1'], pairs_df['sentence2']))
silver[:2]

[('Former Zambian president arrested',
  'Senate, House on collision course on border money'),
 ('Former Zambian president arrested',
  'Two girls are walking by a tree in front of a brick building.')]

In [34]:
scores = cross_encoder.predict(silver)

In [35]:
pairs_df['label'] = scores.tolist()
pairs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs_df['label'] = scores.tolist()


Unnamed: 0,sentence1,sentence2,label
0,Former Zambian president arrested,"Senate, House on collision course on border money",0.00047
1,Former Zambian president arrested,Two girls are walking by a tree in front of a ...,0.000471
2,Former Zambian president arrested,A man follows his black and white dog as it ta...,0.000472
3,Former Zambian president arrested,Did the transaction take place?,0.000481
4,Former Zambian president arrested,Madoff's brother to plead guilty to fraud,0.000475


In [39]:
all_data_df = pd.concat([gold_df, pairs_df])
print(all_data_df.shape)

(32926, 3)


In [40]:
samples = [InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['label']))\
           for _, row in all_data_df.iterrows()]
batch_size = 16
loader = DataLoader(samples, batch_size=batch_size, shuffle=True)

In [41]:
len(loader)

2058

In [42]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('bert-base-uncased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)
model = SentenceTransformer(modules=[bert, pooler])

In [44]:
from sentence_transformers import losses
loss = losses.CosineSimilarityLoss(model=model)

In [45]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.15)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='bert-stsb-aug'
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2058 [00:00<?, ?it/s]