This is a simple baseline built on SBERT.

It has no training phase: it only computes the cosine similarity between each topic and every content item.

The most similar content items are chosen as the predicted output.

# Code

**Version 3**:
* Embed the first available field: `text`, then `description`, then `title` for `content`; `description`, then `title` for `topic`
* Choose only those contents with the same `language` field as the topic
* Select the TOP-5 `content` items per topic, since the average number of correlations per topic is 4.6
* Use GPU P100

In [1]:
import numpy as np
import pandas as pd
import torch
import cupy
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
from cuml.metrics import pairwise_distances

In [2]:
# config

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Input locations: competition CSVs, the pretrained encoder, saved embeddings.
DATA_PATH = "/kaggle/input/learning-equality-curriculum-recommendations/"
MODEL_PATH = "/kaggle/input/sbert-models/paraphrase-multilingual-mpnet-base-v2"
VEC_PATH = '/kaggle/input/lecr-baseline-vectors/'

# Maximum number of token ids kept per input sentence.
MAX_LEN = 512
# Number of content ids emitted for each topic.
TOP_N = 5
# False: load the saved embeddings from VEC_PATH.
# True: recompute the embeddings with the model.
DEBUG = False

In [3]:
# read data
# Load the four competition tables from DATA_PATH in one pass; the unpacking
# order matches the order of the file names.
content, topics, correlations, submission = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in (
        'content.csv',
        'topics.csv',
        'correlations.csv',
        'sample_submission.csv',
    )
)

In [4]:
# tokenizer and model, both loaded from the same checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# The model is only used for inference: put it in eval mode, then move it to
# the selected device.
model = AutoModel.from_pretrained(MODEL_PATH).eval().to(device)

In [5]:
# embedding content
# Produces `emb_content`: one embedding row per row of `content`.
if not DEBUG:
    # Reuse the saved vectors instead of re-encoding every content row.
    emb_content = torch.tensor(np.load(VEC_PATH + 'vecs_content.npy'))
else:
    vecs_content = []

    for _, row in tqdm(content.iterrows(), total=len(content)):
        # input sentence: first non-missing field among text, description and
        # title; an empty string when all three are missing, so the tokenizer
        # never receives a NaN.
        sentence = next(
            (
                row[field]
                for field in ('text', 'description', 'title')
                if not pd.isna(row[field])
            ),
            '',
        )

        # tokenize
        # Let the tokenizer truncate: slicing the id list afterwards would cut
        # off the closing special token of long inputs.
        tok = tokenizer(
            sentence,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors='pt',
        )
        tok = {k: v.to(device) for k, v in tok.items()}

        # embedded vector: mean of the last hidden states over the tokens
        with torch.no_grad():
            output = model(**tok)
        vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
        vecs_content.append(vec)

    # embedded content
    emb_content = torch.stack(vecs_content)

In [6]:
# topic dataframes we need to predict
submission_topic_ids = submission['topic_id'].tolist()
# Filter with a boolean mask instead of interpolating the id list into a query
# string. The selected rows keep the order they have in `topics`.
submission_topics = topics[topics['id'].isin(submission_topic_ids)].reset_index(drop=True)

In [7]:
# embedding topics
# Produces `emb_topics`: one embedding row per row of `submission_topics`.
if not DEBUG:
    # Reuse the saved vectors instead of re-encoding every topic.
    emb_topics = torch.tensor(np.load(VEC_PATH + 'vecs_topics.npy'))
else:
    vecs_topics = []

    for _, row in tqdm(submission_topics.iterrows(), total=len(submission_topics)):
        # input sentence: description when present, otherwise title; an empty
        # string when both are missing, so the tokenizer never receives a NaN.
        sentence = next(
            (
                row[field]
                for field in ('description', 'title')
                if not pd.isna(row[field])
            ),
            '',
        )

        # tokenize
        # Let the tokenizer truncate: slicing the id list afterwards would cut
        # off the closing special token of long inputs.
        tok = tokenizer(
            sentence,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors='pt',
        )
        tok = {k: v.to(device) for k, v in tok.items()}

        # embedded vector: mean of the last hidden states over the tokens
        with torch.no_grad():
            output = model(**tok)
        vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
        vecs_topics.append(vec)

    # embedded topics
    emb_topics = torch.stack(vecs_topics)

In [8]:
# save embeddings as dataset
# NOTE(review): left disabled; presumably run once after a DEBUG = True pass to
# create the vector files that are later loaded from VEC_PATH — confirm.
# import cupy
# cupy.save('vecs_topics', vecs_topics)
# cupy.save('vecs_content', vecs_content)

In [9]:
# predict
# For every topic, rank all content rows by cosine distance and keep the
# TOP_N closest ones written in the topic's language.
vecs_content = cupy.asarray(emb_content)
vecs_topics = cupy.asarray(emb_topics)

# Per-row language and id of the content table, extracted once so the loop
# below needs neither a DataFrame query nor a list membership scan per topic.
# Row i of `vecs_content` is assumed to describe row i of `content`.
content_languages = content['language'].to_numpy()
content_ids = content['id'].to_numpy()

predicts = []
for index, vec in enumerate(vecs_topics):
    # cosine distance from this topic to every content row; the metric is a
    # distance, so the ascending argsort below is read as closest first
    cosine_dists = pairwise_distances(vec.reshape(1, len(vec)), vecs_content, metric='cosine')
    ranked = cosine_dists.argsort(1)[0].get()

    # choose only those with the same language, preserving the ranking
    language = submission_topics.loc[index, 'language']
    same_language = ranked[content_languages[ranked] == language]

    # Only select TOP-N (fewer when the language has fewer content rows)
    res = same_language[:TOP_N]

    # combine all the selected results with space
    pred = " ".join(content_ids[res])
    predicts.append(pred)

In [10]:
# submission file
# `predicts` follows the row order of `submission_topics`, which is the order
# of the topics table and not necessarily that of the sample submission, so
# attach each prediction by topic id rather than by position.
pred_by_topic = dict(zip(submission_topics['id'], predicts))
submission['content_ids'] = submission['topic_id'].map(pred_by_topic)

# A topic id without a prediction would otherwise be written as an empty cell.
unmatched = submission.loc[submission['content_ids'].isna(), 'topic_id'].tolist()
if unmatched:
    raise ValueError(f"no prediction available for topic ids: {unmatched}")

print(submission)

submission.to_csv('submission.csv', index=False)

         topic_id                                        content_ids
0  t_00004da3a1b2  c_ea45fb870cbe c_8768f474805d c_7d756190bda7 c...
1  t_00068291e9a4  c_9a106c2655ac c_24d97713000d c_0809a3357679 c...
2  t_00069b63a70a  c_a6db0765d460 c_ac592c567023 c_f37ecf29f9c9 c...
3  t_0006d41a73a8  c_2e165cfa1a05 c_33699a2b161f c_6c44cf34950b c...
4  t_4054df11a74e  c_f6416e74f9bb c_11a1dc0bfb99 c_e3fc621f753a c...


# What to do next?


* Balance the semantics of `title`, `description` and `text`