# Examples of validation methods correlated with Public Score

The content of train could be the answer to test, but there is content in test that is not connected to train at all.
Therefore, I have split the content in such a way that the content that is only connected to test is not used in the training.
(All contents are used during inference).

> The full test set includes an additional 10,000 topics (none present in the training set) and a large number of additional content items. The additional content items are only correlated to test set topics.

More information is available in [Discussion](https://www.kaggle.com/competitions/learning-equality-curriculum-recommendations/discussion/372875)

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
# from cuml import NearestNeighbors

In [3]:
INPUT_DIR = Path('../../input/')
N_SPLITS = 6
N_NEIGHBORS_LIST = [1, 2, 3, 8, 12]
N_COMPONENTS = 128

In [4]:
topic_df = pd.read_csv(INPUT_DIR / 'topics.csv')
content_df = pd.read_csv(INPUT_DIR / 'content.csv')
corr_df = pd.read_csv(INPUT_DIR / 'correlations.csv')
topic_df = topic_df.merge(
    corr_df, left_on='id', right_on='topic_id', how='left'
)
topic_df = topic_df.drop(columns=['topic_id'])

# Split

In [5]:
topic_df['stratify_category'] = (
    topic_df['category'] != 'source' + '_' +
    topic_df['description'].notnull().astype(str)
#     topic_df['has_content'].astype(str)
)

In [6]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
folds = list(kf.split(topic_df, topic_df['stratify_category'], groups=topic_df["channel"]))
topic_df['fold'] = np.nan
content_df['fold'] = np.nan

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df.loc[val_idx, "fold"] = fold
    train_topic_df = topic_df.query(f"fold != {fold}").reset_index(drop=True)
    val_topic_df = topic_df.query(f"fold == {fold}").reset_index(drop=True)

    train_content_ids = set(train_topic_df["content_ids"].str.split().explode().to_list())
    val_content_ids = set(val_topic_df["content_ids"].str.split().explode().to_list())
    only_val_content_ids = val_content_ids - train_content_ids
    content_df.loc[content_df['id'].isin(only_val_content_ids), 'fold'] = fold

In [7]:
display(topic_df['fold'].value_counts())

3.0    15133
1.0    14599
0.0    14348
4.0    13072
2.0    11126
5.0     8694
Name: fold, dtype: int64

In [14]:
content_df.fold.value_counts(dropna=False)

4.0    29161
3.0    27354
NaN    25292
1.0    20403
0.0    19446
5.0    17532
2.0    14859
Name: fold, dtype: int64

In [15]:
topic_df.fold.value_counts(dropna=False)

3.0    15133
1.0    14599
0.0    14348
4.0    13072
2.0    11126
5.0     8694
Name: fold, dtype: int64

In [18]:
topic_df.fold

0        0.0
1        4.0
2        4.0
3        5.0
4        0.0
        ... 
76967    5.0
76968    2.0
76969    1.0
76970    4.0
76971    1.0
Name: fold, Length: 76972, dtype: float64

In [19]:
content_df.fold

0         NaN
1         5.0
2         3.0
3         1.0
4         2.0
         ... 
154042    3.0
154043    5.0
154044    4.0
154045    4.0
154046    0.0
Name: fold, Length: 154047, dtype: float64

# Combining with train.csv

In [59]:
train = pd.read_csv("../../input/train_context.csv")

In [60]:
train = train.merge(topic_df[["id", "channel", "category", "language", "fold"]], how="left", left_on="topics_ids", right_on="id").drop(columns=["id"])
train = train.rename(columns={"fold": "topic_fold"})

In [63]:
train.shape

(615170, 16)

In [64]:
train = train.merge(content_df[["id", "kind", "language", "fold"]], how="left", left_on="content_ids", right_on="id").drop(columns=["id"])
train = train.rename(columns={"fold": "content_fold"})

In [68]:
train = train.rename(columns={"category": "topic_category", 
                              "language_x": "topic_language", 
                              "kind": "content_kind",
                              "language_y": "content_language"})

In [74]:
train.columns

Index(['topics_ids', 'content_ids', 'topic_title', 'topic_description',
       'content_title', 'content_description', 'content_text',
       'topic_parent_title', 'topic_parent_description', 'topic_child_title',
       'topic_child_description', 'target', 'channel', 'topic_category',
       'topic_language', 'topic_fold', 'content_kind', 'content_language',
       'content_fold'],
      dtype='object')

In [77]:
new_cols = [
    'topics_ids', 'content_ids', 'channel', 'topic_title', 'topic_description', 
    'topic_parent_title', 'topic_parent_description', 'topic_child_title', 'topic_child_description',
    'topic_category', 'topic_language',
    'content_title', 'content_description', 'content_text',
    'content_kind', 'content_language', 
    'target',
    'topic_fold', 'content_fold'
]
assert len(set(new_cols)) == len(set(train.columns))

train = train.loc[:, new_cols]

In [79]:
train.to_csv("../../input/train_context_5fold.csv", index=False)