# Step 1: Get our retriever

Use positive data to finetune a pre-trained sentence transformer model. Sentence transformer model will automatically use in-batch negative sampling.

1. model: paraphrase-multilingual-mpnet-base-v2

2. data features: only title

In [2]:
!pip -qqq install sentence-transformers
!pip -qqq install datasets
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold

In [3]:
topics = pd.read_csv("topics.csv")
content = pd.read_csv("content.csv")
correlations = pd.read_csv("correlations.csv")

In [4]:
def cv_split(train, n_folds, seed):
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for num, (train_index, val_index) in enumerate(kfold.split(train)):
        train.loc[val_index, 'fold'] = int(num)
    train['fold'] = train['fold'].astype(int)
    return train

In [5]:
kfolds = cv_split(correlations, 5, 1006)
correlations = kfolds[kfolds.fold!=0]

In [6]:
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [7]:
correlations["content_id"] = correlations["content_ids"].str.split(" ")
corr = correlations.explode("content_id").drop(columns=["content_ids"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correlations["content_id"] = correlations["content_ids"].str.split(" ")


In [8]:
corr = corr.merge(topics, how="left", on="topic_id")
corr = corr.merge(content, how="left", on="content_id")

In [9]:
corr["set"] = corr[["topic_title", "content_title"]].values.tolist()  # use only title here
train_df = pd.DataFrame(corr["set"])

In [10]:
dataset = Dataset.from_pandas(train_df)

In [11]:
train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for i in range(n_examples):
    example = train_data[i]
    if example[0] == None: #remove None
        continue        
    train_examples.append(InputExample(texts=[str(example[0]), str(example[1])]))

In [12]:
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [13]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 10
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

In [14]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          save_best_model = True,
          output_path='autodl-tmp/paraphrase-multilingual-mpnet-base-v2-exp19_fold0_epochs10',
          warmup_steps=warmup_steps)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3523 [00:00<?, ?it/s]

Save kfold for the next step, since we use fold=0 for validation.

In [20]:
kfolds.to_csv('kfold_correlations_exp19.csv', index=0)