# Examples of validation methods correlated with Public Score

Content items correlated with training topics can also be correct answers for test topics, but the test set additionally contains content that is not connected to any training topic.
Therefore, I split the content so that items connected only to validation (test-like) topics are not used for training.
(All content items are used during inference.)

> The full test set includes an additional 10,000 topics (none present in the training set) and a large number of additional content items. The additional content items are only correlated to test set topics.

More information is available in [Discussion](https://www.kaggle.com/competitions/learning-equality-curriculum-recommendations/discussion/372875)

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from cuml import NearestNeighbors

In [None]:
# Directory holding the competition CSV files.
INPUT_DIR = Path('/kaggle/input') / 'learning-equality-curriculum-recommendations'
# Number of cross-validation folds.
N_SPLITS = 6
# Candidate counts (top-k nearest contents) evaluated for every fold.
N_NEIGHBORS_LIST = [1, 2, 3, 8, 12]
# Output dimensionality of the TruncatedSVD title embedding.
N_COMPONENTS = 128

In [None]:
# Load the three competition tables.
topic_df = pd.read_csv(INPUT_DIR / 'topics.csv')
content_df = pd.read_csv(INPUT_DIR / 'content.csv')
corr_df = pd.read_csv(INPUT_DIR / 'correlations.csv')

# Attach the correlation columns to each topic. The left join keeps topics
# that have no row in correlations.csv; the join key `topic_id` duplicates
# `id`, so it is dropped afterwards.
topic_df = (
    topic_df
    .merge(corr_df, how='left', left_on='id', right_on='topic_id')
    .drop(columns=['topic_id'])
)

# Split

In [None]:
# Stratification key of the form "<category is not source>_<has description>",
# e.g. "True_False".
# The comparison must be parenthesised and cast to str before concatenation:
# `+` binds tighter than `!=`, so without the parentheses `category` would be
# compared against the string "source_<bool>", which never matches, and the
# key would collapse into a single constant class.
topic_df['stratify_category'] = (
    (topic_df['category'] != 'source').astype(str) + '_' +
    topic_df['description'].notnull().astype(str)
)

In [None]:
# Split topics into N_SPLITS folds, stratified on `stratify_category` and
# grouped by `channel`.
kf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
folds = list(
    kf.split(
        topic_df,
        topic_df['stratify_category'],
        groups=topic_df["channel"],
    )
)
topic_df['fold'] = np.nan
content_df['fold'] = np.nan

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df.loc[val_idx, "fold"] = fold

    # Topics of later folds still carry NaN here, so they count as training
    # topics for the current fold.
    in_val_fold = topic_df["fold"] == fold
    train_topic_df = topic_df[~in_val_fold].reset_index(drop=True)
    val_topic_df = topic_df[in_val_fold].reset_index(drop=True)

    # Content ids referenced by the training / validation topics.
    train_id_lists = train_topic_df["content_ids"].str.split()
    val_id_lists = val_topic_df["content_ids"].str.split()
    train_content_ids = set(train_id_lists.explode().to_list())
    val_content_ids = set(val_id_lists.explode().to_list())

    # Content referenced only by this fold's validation topics is tagged with
    # the fold number so it can be excluded when fitting for that fold.
    only_val_content_ids = val_content_ids.difference(train_content_ids)
    val_only_rows = content_df['id'].isin(only_val_content_ids)
    content_df.loc[val_only_rows, 'fold'] = fold

In [None]:
# Show how many topics were assigned to each fold.
fold_sizes = topic_df['fold'].value_counts()
display(fold_sizes)

# Cross Validation

In [None]:
# ref: https://www.kaggle.com/code/columbia2131/lecr-example-of-f2-score
def fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """Return the mean per-row F-beta score of predicted ids against true ids.

    Rows are paired by position, not by index label.

    Args:
        y_true_ids: Series of whitespace-separated ground-truth ids.
        y_pred_ids: Series of whitespace-separated predicted ids.
        beta: Weight of recall relative to precision (2 gives the F2 score).
        eps: Small constant added to the denominator so a row with zero
            precision and zero recall scores 0 instead of dividing by zero.

    Returns:
        The average of the per-row scores as a float. Rows whose prediction
        or target is empty or missing score 0, and empty input returns 0.0.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids.tolist(), pred_ids.tolist()):
        # `.str.split()` leaves missing values as non-list scalars (NaN/NA);
        # treat them as "no ids" rather than failing inside set().
        if not isinstance(true, list):
            true = []
        if not isinstance(pred, list):
            pred = []
        n_hits = len(set(true) & set(pred))
        # Guard the divisions: an empty id list yields precision/recall of 0.
        precision = n_hits / len(pred) if pred else 0.0
        recall = n_hits / len(true) if true else 0.0
        fbeta = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(fbeta)
    if not score_list:
        return 0.0
    return sum(score_list) / len(score_list)

In [None]:
# Per-fold evaluation of a TF-IDF + SVD title retriever.
# The embedding is fitted on content whose `fold` differs from the current
# fold, while the nearest-neighbour index is built over ALL content, so
# content tagged with the current fold is retrievable but never fitted on.

# Loop-invariant: titles of every content item, used to build the index.
all_content_titles = content_df["title"].fillna('')

for fold in range(N_SPLITS):
    val_topic_df = topic_df.query(f"fold == {fold}").reset_index(drop=True)
    train_content_df = content_df.query(f"fold != {fold}").reset_index(drop=True)

    # embedding
    # random_state is fixed so the SVD (and therefore the scores) is
    # reproducible between runs, matching the seeded fold split.
    tfidf_svd = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("svd", TruncatedSVD(n_components=N_COMPONENTS, random_state=42)),
    ])
    tfidf_svd.fit(train_content_df["title"].fillna(''))
    val_content_vec = tfidf_svd.transform(all_content_titles)
    val_topic_vec = tfidf_svd.transform(val_topic_df["title"].fillna(''))

    # candidate
    # Query once with the largest k; smaller k values reuse a column prefix.
    # NOTE(review): this relies on kneighbors returning neighbours sorted
    # nearest-first - confirm for the cuml implementation.
    nn = NearestNeighbors(n_neighbors=max(N_NEIGHBORS_LIST))
    nn.fit(val_content_vec)
    _, indices = nn.kneighbors(val_topic_vec)

    # Only topics that have content and a non-source category are scored;
    # the mask does not depend on n_neighbors, so build it once per fold.
    targets = val_topic_df['content_ids']
    has_content = val_topic_df['has_content']
    valid_category = (
        (val_topic_df['category'] == 'aligned').astype(bool) |
        (val_topic_df['category'] == 'supplemental').astype(bool)
    )
    valid_mask = has_content & valid_category

    for n_neighbors in N_NEIGHBORS_LIST:
        # prediction
        preds = pd.Series(content_df["id"].values[indices[:, :n_neighbors]].tolist())
        preds = preds.apply(lambda x: " ".join(x))
        # Keep the out-of-fold predictions for the whole-data score.
        topic_df.loc[topic_df['fold']==fold, f'pred_content_ids_{n_neighbors}'] = preds.values

        # evaluation
        score = fbeta_score(targets[valid_mask], preds[valid_mask])
        print(f'fold {fold}: {score:.4f}, n_neighbors={n_neighbors}')
    print('----------------------------------------')

# whole score
# Score the out-of-fold predictions over all topics at once. The set of scored
# topics (has content, category aligned or supplemental) is the same for every
# n_neighbors, so the mask is built a single time.
targets = topic_df['content_ids']
has_content = topic_df['has_content']
valid_category = topic_df['category'].isin(['aligned', 'supplemental'])
valid_mask = has_content & valid_category

for n_neighbors in N_NEIGHBORS_LIST:
    preds = topic_df[f'pred_content_ids_{n_neighbors}']
    score = fbeta_score(targets[valid_mask], preds[valid_mask])
    print(f'whole score : {score:.4f}, n_neighbor={n_neighbors}')