# LECR Simple Unsupervised Baseline

This code predicts similar contents without training. It calculates the cosine distance between each topic and each content item using sentence vectors from a multilingual model.

The single most similar content is used as the predicted result, but you can also select multiple content results by using a similarity threshold.


### Update:

#### [Version 5](https://www.kaggle.com/code/takamichitoda/lecr-simple-unsupervised-baseline?scriptVersionId=113958919): Public Score=0.145
- baseline

#### [Version 8](https://www.kaggle.com/code/takamichitoda/lecr-simple-unsupervised-baseline?scriptVersionId=114180880): Public Score=0.196
- Filter by language
- Fixed `SELECT_TOP_N` (1 -> 5)
- Pre-calculated the content and topic vectors for debugging and added them as a [dataset](https://www.kaggle.com/datasets/takamichitoda/lecr-mdistilbert-sentence-vector).

#### [Version 17](https://www.kaggle.com/code/takamichitoda/lecr-simple-unsupervised-baseline?scriptVersionId=114273565): Public Score=0.143
- Join text with `[SEP]` tokens
- Use [pre-calculated vectors](https://www.kaggle.com/datasets/takamichitoda/lecr-mdistilbert-sentence-vector) to reduce computation time
- Improved efficiency of language filter

#### [Version 19](https://www.kaggle.com/code/takamichitoda/lecr-simple-unsupervised-baseline?scriptVersionId=114362087): Public Score=0.174
- Content that has a title or a description does not use the text column. (If neither exists, the text column is used.)

#### [Version 20](https://www.kaggle.com/code/takamichitoda/lecr-simple-unsupervised-baseline?scriptVersionId=114375844): Public Score=0.182
- Consider topics and content with the same title. (ref: [my discussion](https://www.kaggle.com/competitions/learning-equality-curriculum-recommendations/discussion/373350))

#### Version 32: latest
- Text vectorization uses only the title column
- `paraphrase-multilingual-mpnet-base-v2`
- Change language filter logic

In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch import nn

from transformers import AutoTokenizer, AutoModel

import cupy as cp
from cuml.metrics import pairwise_distances

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# IPython magic: set the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [2]:
class CFG:
    """Notebook-wide settings: input locations and inference parameters."""

    # --- input locations ---
    # Competition CSV files (content, topics, correlations, sample submission).
    INPUT = "/kaggle/input/learning-equality-curriculum-recommendations"
    # Local copy of the transformer checkpoint (weights and tokenizer).
    MODEL = "/kaggle/input/sbert-models/paraphrase-multilingual-mpnet-base-v2"
    # Directory holding the pre-calculated vectors and their id -> row lookups.
    PRE_CALC_VECTORS = "/kaggle/input/lecr-mdistilbert-sentence-vector/"

    # --- inference parameters ---
    # Token lists are cut to this many entries before being fed to the model.
    MAX_LEN = 512
    # Number of nearest contents taken per topic.
    SELECT_TOP_N = 5

In [3]:
# Read the four competition tables, in the same order as their target names.
content_df, correlations_df, topics_df, sub_df = (
    pd.read_csv(f'{CFG.INPUT}/{stem}.csv')
    for stem in ('content', 'correlations', 'topics', 'sample_submission')
)

In [4]:
# Flag every row whose title occurs more than once within its own table
# (keep=False marks all members of a duplicate group, not just the repeats).
for _frame in (content_df, topics_df):
    _frame['title_duplicated'] = _frame['title'].duplicated(keep=False)

In [5]:
# Load the tokenizer and the encoder from the same local checkpoint; the
# encoder is moved to the selected device and switched to eval mode.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)
model = AutoModel.from_pretrained(CFG.MODEL).to(device).eval()

In [6]:
# Pre-calculated sentence vectors, one row per content / topic.
content_vecs = np.load(f"{CFG.PRE_CALC_VECTORS}/content_mpnet_avg.npy")
topic_vecs = np.load(f"{CFG.PRE_CALC_VECTORS}/topic_mpnet_avg.npy")


def _read_json(path):
    """Return the decoded contents of the JSON file at *path*."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookups from an id to its row position in the matching vector matrix.
content_id2idx = _read_json(f"{CFG.PRE_CALC_VECTORS}/content_id2idx.json")
topic_id2idx = _read_json(f"{CFG.PRE_CALC_VECTORS}/topic_id2idx.json")

In [7]:
def _encode_content_title(title):
    """Embed one content title with the transformer.

    An empty title is replaced by the placeholder "no title". Each token list
    is cut to CFG.MAX_LEN entries, and the returned CPU tensor is the mean of
    the model's last hidden state over the token axis.
    """
    if title == "":
        title = "no title"
    encoded = {
        name: torch.tensor(ids[:CFG.MAX_LEN]).to(device).unsqueeze(0)
        for name, ids in tokenizer(title).items()
    }
    with torch.no_grad():
        output = model(**encoded)
    return output.last_hidden_state.squeeze(0).mean(0).cpu()


# One vector per content row, in content_df order: reuse the pre-calculated
# vector when the id is known, otherwise encode the title on the fly.
vecs = []
_content_id_title = content_df[['id', 'title']].fillna("")
_content_pairs = zip(_content_id_title['id'], _content_id_title['title'])
for content_id, content_title in tqdm(_content_pairs, total=len(content_df)):
    if content_id in content_id2idx:
        vec = torch.tensor(content_vecs[content_id2idx[content_id], :])
    else:
        vec = _encode_content_title(content_title)
    vecs.append(vec)

vecs1 = torch.stack(vecs)

  0%|          | 0/154047 [00:00<?, ?it/s]

In [8]:
# Keep only the topics listed in the submission file.
#
# The rows are put in the same order as the submission file because the
# predictions computed from `_topics_df` are later assigned to `sub_df`
# positionally; relying on topics.csv and the submission file sharing an
# order would silently misalign them otherwise.
sub_topic_ids = sub_df['topic_id'].tolist()
_submission_rank = {topic_id: pos for pos, topic_id in enumerate(sub_topic_ids)}

# `isin` replaces interpolating the whole id list into a query string.
_selected_topics = topics_df[topics_df['id'].isin(sub_topic_ids)]
_submission_order = (
    _selected_topics['id'].map(_submission_rank).to_numpy().argsort(kind='stable')
)
_topics_df = _selected_topics.iloc[_submission_order].reset_index(drop=True)

In [9]:
def _encode_topic_title(title):
    """Embed one topic title with the transformer.

    An empty title is replaced by the placeholder "no title". Each token list
    is cut to CFG.MAX_LEN entries, and the returned CPU tensor is the mean of
    the model's last hidden state over the token axis.
    """
    if title == '':
        title = "no title"
    encoded = {
        name: torch.tensor(ids[:CFG.MAX_LEN]).to(device).unsqueeze(0)
        for name, ids in tokenizer(title).items()
    }
    with torch.no_grad():
        output = model(**encoded)
    return output.last_hidden_state.squeeze(0).mean(0).cpu()


# One vector per submission topic, in _topics_df order: reuse the
# pre-calculated vector when the id is known, otherwise encode the title.
vecs = []
_topic_id_title = _topics_df[['id', 'title']].fillna('')
_topic_pairs = zip(_topic_id_title['id'], _topic_id_title['title'])
for topic_id, topic_title in tqdm(_topic_pairs, total=len(_topics_df)):
    if topic_id in topic_id2idx:
        vec = torch.tensor(topic_vecs[topic_id2idx[topic_id], :])
    else:
        vec = _encode_topic_title(topic_title)
    vecs.append(vec)

vecs2 = torch.stack(vecs)

  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Convert both embedding matrices to CuPy arrays for the distance computation.
vecs1, vecs2 = (cp.asarray(matrix) for matrix in (vecs1, vecs2))

In [11]:
# Number of topics per language code, most frequent first.
topics_df.language.value_counts()

en     36161
es     13910
pt      4177
ar      3701
fr      3701
bg      2867
sw      2860
gu      2320
bn      2176
hi      1786
it       866
zh       862
mr       300
fil      247
as       167
my       135
km       121
kn       119
te        93
or        70
ur        66
ta        60
pnb       51
pl        43
tr        40
swa       35
ru        34
mul        4
Name: language, dtype: int64

In [12]:
# Sanity check: one row per submission topic, one column per embedding dimension.
vecs2.shape

(5, 768)

In [13]:
# Build one space-separated string of content ids per submission topic.
#
# For each topic the candidates are the contents written in the topic's
# language. Contents whose title equals the topic title come first; the list
# is then filled up to CFG.SELECT_TOP_N entries with the nearest contents by
# cosine distance.
predicts = []

# language code -> (content rows in that language, their row labels).
# The subset depends only on the language, so it is computed once per language
# instead of once per topic.
_content_by_lang = {}

for i2, v2 in enumerate(vecs2):
    lang = _topics_df.loc[i2, "language"]

    # Topics in these two languages get an empty prediction.
    if lang in ("en", "es"):
        predicts.append("")
        continue

    if lang not in _content_by_lang:
        _lang_content = content_df[content_df['language'] == lang]
        _content_by_lang[lang] = (_lang_content, _lang_content.index.tolist())
    _content_df, lang_i = _content_by_lang[lang]

    # No content in this language: nothing to rank and no title can match,
    # so avoid handing an empty matrix to pairwise_distances.
    if not lang_i:
        predicts.append("")
        continue

    # `lang_i` holds row labels of content_df; they are used as row positions
    # of vecs1, which was stacked in content_df row order.
    sim = pairwise_distances(v2.reshape(1, -1), vecs1[lang_i, :], metric='cosine')
    nearest_first = sim.argsort(1)[0].get()
    res = np.array(lang_i)[nearest_first][:CFG.SELECT_TOP_N]
    p1 = [content_df.loc[s, 'id'] for s in res]

    # Row i2 of _topics_df is the topic whose vector is v2, so its title can
    # be read directly instead of re-querying the frame by id.
    title = _topics_df.loc[i2, "title"]

    # Exact title matches are all kept, even if there are more than
    # CFG.SELECT_TOP_N of them.
    p2 = _content_df[_content_df['title'] == title]['id'].tolist()

    # Top up with nearest neighbours, skipping ids that are already present.
    for _p in p1:
        if len(p2) >= CFG.SELECT_TOP_N:
            break
        if _p in p2:
            continue
        p2.append(_p)

    predicts.append(" ".join(p2))

In [14]:
# Store the predicted content-id strings in the submission frame
# (row i receives predicts[i]) and preview the first rows.
sub_df = sub_df.assign(content_ids=predicts)
sub_df.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_e1e8557d7c61 c_6f1dcaa3ca40 c_80e7e76ff4db c...
1,t_00068291e9a4,c_e88be716634d c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,
3,t_0006d41a73a8,c_e24eca6ed18a c_62191f77b582 c_5e4c167bfeba c...
4,t_4054df11a74e,


In [15]:
# Write the submission file without the row index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
sub_df.to_csv('submission.csv', index=False)

In [16]:
# Shell escape: list the working directory to confirm submission.csv was written.
!ls

__notebook__.ipynb  submission.csv
