In [1]:
import datasets

# Load the 'train' split of the 'squad_v2' dataset; the bare name on the last line
# displays the dataset summary (features and row count).
squad = datasets.load_dataset('squad_v2', split='train')
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})

In [2]:
# Inspect a single record; the 'question' and 'context' fields are the ones used below.
squad[0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [3]:
from sentence_transformers import InputExample
from tqdm import tqdm

# Wrap every SQuAD row as a (question, context) text pair; tqdm reports progress.
samples = [
    InputExample(texts=[row['question'], row['context']])
    for row in tqdm(squad)
]
len(samples)

100%|██████████| 130319/130319 [00:08<00:00, 16132.63it/s]


130319

In [4]:
from sentence_transformers.datasets import NoDuplicatesDataLoader

batch_size = 16
# NOTE(review): only the first 100 samples are passed to the loader (100 samples at
# batch size 16), not the full training set — confirm whether this slice is a
# leftover from quick testing.
loader = NoDuplicatesDataLoader(samples[:100], batch_size=batch_size)
# Number of batches the loader reports.
len(loader)

6

In [5]:
from sentence_transformers import models, SentenceTransformer

# Assemble the sentence encoder: a transformer module followed by a pooling
# module configured for mean-token pooling.
bert = models.Transformer('microsoft/mpnet-base')
embedding_dim = bert.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)
model = SentenceTransformer(modules=[bert, pooling])
model

Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
from sentence_transformers import losses

# Loss object constructed around `model`; it is paired with `loader` in the fit call.
loss = losses.MultipleNegativesRankingLoss(model)

In [9]:
epochs = 1
# warmup_steps is 10% of the total batch count (len(loader) * epochs), truncated
# to an int — with a very small loader this truncates to 0.
warmup_steps = int(len(loader)*epochs*0.1)
model.fit(
    train_objectives=[(loader, loss)],
    epochs = epochs,
    warmup_steps=warmup_steps,
    # output_path is left disabled, so no output directory is passed to fit() here
    # output_path='mpnet-mnr-squad2',
    show_progress_bar=True
)

In [8]:
# model.save('mpnet-mnr-squad2')

In [10]:
# Load the 'validation' split of 'squad_v2'; it is the source of the evaluation
# queries and corpus built in the following cells.
squad_dev = datasets.load_dataset('squad_v2', split='validation')
squad_dev

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [17]:
import pandas as pd
from tqdm import tqdm

# Keep only the three fields needed for evaluation, one record per validation row,
# then build the DataFrame in a single step.
records = [
    {
        'question': row['question'],
        'context': row['context'],
        'id': row['id']
    }
    for row in tqdm(squad_dev)
]
squad_df = pd.DataFrame(records)
squad_df.head()

100%|██████████| 11873/11873 [00:00<00:00, 16320.33it/s]


Unnamed: 0,question,context,id
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628
1,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629
2,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a
3,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b
4,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c


In [18]:
# (rows, columns) of the validation frame.
squad_df.shape

(11873, 3)

In [23]:
# One row per distinct context. Each context keeps the id of the first question
# that referenced it, with a 'con' suffix appended so it differs from question ids.
no_dup_df = (
    squad_df
    .drop_duplicates(subset='context', keep='first')
    .drop(columns=['question'])
    .assign(id=lambda frame: frame['id'] + 'con')
)
print(no_dup_df.shape)
no_dup_df.head()

(1204, 2)


Unnamed: 0,context,id
0,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628con
9,"The Norman dynasty had a major political, cult...",56dddf4066d3e219004dad5fcon
17,"The English name ""Normans"" comes from the Fren...",56dde0379a695914005b9636con
21,"In the course of the 10th century, the initial...",56dde0ba66d3e219004dad75con
28,"Before Rollo's arrival, its populations did no...",56dde1d966d3e219004dad8dcon


In [24]:
# Join every question to its deduplicated context row. Both frames have an 'id'
# column, so the merge suffixes them: id_x is the question id, id_y the context id.
squad_df2 = pd.merge(
    left=squad_df,
    right=no_dup_df,
    on='context',
    how='inner',
)
print(squad_df2.shape)
squad_df2.head()

(11873, 4)


Unnamed: 0,question,context,id_x,id_y
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,56ddde6b9a695914005b9628con
1,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,56ddde6b9a695914005b9628con
2,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,56ddde6b9a695914005b9628con
3,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,56ddde6b9a695914005b9628con
4,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,56ddde6b9a695914005b9628con


In [27]:
# Lookup tables for the retrieval evaluator, built column-wise rather than by
# iterating the frame row by row.
# question id -> question text
ir_queries = dict(zip(squad_df2['id_x'], squad_df2['question']))
# context id -> context text (rows sharing a context id collapse into one entry)
ir_corpus = dict(zip(squad_df2['id_y'], squad_df2['context']))

In [42]:
# question id -> set of context ids considered relevant to that question.
# Keys are inserted in order of first appearance in squad_df2.
ir_relevant_docs = {}
for query_id, context_id in zip(squad_df2['id_x'], squad_df2['id_y']):
    ir_relevant_docs.setdefault(query_id, set()).add(context_id)


In [44]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# NOTE: the name `eval` shadows Python's builtin eval() from here on; it is kept
# because later cells call eval(model) and eval(qa).
eval = InformationRetrievalEvaluator(
    queries=ir_queries,
    corpus=ir_corpus,
    relevant_docs=ir_relevant_docs,
)

In [45]:
# NOTE(review): this rebinds `model` to a checkpoint loaded by name/path. The
# model.save(...) call and the output_path argument earlier in the notebook are
# commented out, so confirm that 'mpnet-mnr-squad2' already exists before running
# from a fresh kernel.
model = SentenceTransformer('mpnet-mnr-squad2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [46]:
# Run the retrieval evaluator on the loaded model; the returned score is displayed.
eval(model)

0.2383023453093362

In [None]:
# Load a second model by name and score it with the same evaluator for comparison.
qa = SentenceTransformer('multi-qa-mpnet-base-cos-v1')

eval(qa)