Ref: https://www.kaggle.com/code/theoviel/modeling-oriented-eda-building-a-good-cv-split

In [40]:
import os
import json
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedGroupKFold


In [88]:
from utils import read_data

In [2]:
DATA_PATH = "../../input/"

topics = pd.read_csv(DATA_PATH + "topics.csv")
content = pd.read_csv(DATA_PATH + "content.csv")
correlations = pd.read_csv(DATA_PATH + "correlations.csv")

In [19]:
train = pd.read_csv(DATA_PATH + "train.csv")

In [5]:
topics_val = topics[topics['category'] != "source"][["channel", "id"]]
# topics_val = topics[["channel", "id"]]

topics_val = topics_val.merge(correlations, left_on='id', right_on='topic_id')

channel_val = topics_val.groupby("channel").agg(list).reset_index()
channel_val['content_ids'] = channel_val['content_ids'].apply(lambda x: list(np.unique(np.concatenate([x_.split(' ') for x_ in x]))))

In [22]:
topic_channels_category = train.merge(topics, how="left", left_on="topics_ids", right_on="id")[["channel", "category"]]
train.insert(4, "channel", topic_channels_category["channel"])
train.insert(5, "category", topic_channels_category["category"])

In [32]:
train.insert(7, "fold", np.full(len(train), -1))

In [33]:
train

Unnamed: 0,topics_ids,content_ids,title1,title2,channel,category,target,fold
0,t_3d9ad9931021,c_efb73ad83f4b,,,ebc86c,supplemental,0,-1
1,t_3d9ad9931021,c_77574ef20c1f,,,ebc86c,supplemental,0,-1
2,t_3d9ad9931021,c_200ae87baf4d,,,ebc86c,supplemental,0,-1
3,t_3d9ad9931021,c_87e171afe50b,,,ebc86c,supplemental,0,-1
4,t_3d9ad9931021,c_3c070b63a944,,,ebc86c,supplemental,0,-1
...,...,...,...,...,...,...,...,...
615165,t_70da08637930,c_eb6448437b5f,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 2: Describe the formation of ionic bond ...,c7ca13,aligned,0,-1
615166,t_70da08637930,c_07c1da15995b,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 3: Describe the formation of ionic bond ...,c7ca13,aligned,0,-1
615167,t_70da08637930,c_17ff16d31106,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 1: Describe the formation of ionic bond ...,c7ca13,aligned,0,-1
615168,t_70da08637930,c_7cb9a57f2219,8.1.5 Use dot (.) and cross (x) diagrams to il...,"Ligações iônicas, covalentes e metálicas",c7ca13,aligned,0,-1


In [42]:
n_folds = 5
seed = 42

train_wo_source = train[train.category!="source"].reset_index()

kfold = StratifiedGroupKFold(n_splits = n_folds, shuffle = True, random_state = seed)
for num, (_, val_index) in enumerate(kfold.split(train_wo_source, train_wo_source['target'], train_wo_source['channel'])):
    train_wo_source.loc[val_index, 'fold'] = int(num)
train_wo_source['fold'] = train_wo_source['fold'].astype(int)

In [73]:
train

Unnamed: 0,topics_ids,content_ids,title1,title2,channel,category,target,fold
0,t_3d9ad9931021,c_efb73ad83f4b,,,ebc86c,supplemental,0,4.0
1,t_3d9ad9931021,c_77574ef20c1f,,,ebc86c,supplemental,0,4.0
2,t_3d9ad9931021,c_200ae87baf4d,,,ebc86c,supplemental,0,4.0
3,t_3d9ad9931021,c_87e171afe50b,,,ebc86c,supplemental,0,4.0
4,t_3d9ad9931021,c_3c070b63a944,,,ebc86c,supplemental,0,4.0
...,...,...,...,...,...,...,...,...
615165,t_70da08637930,c_eb6448437b5f,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 2: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615166,t_70da08637930,c_07c1da15995b,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 3: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615167,t_70da08637930,c_17ff16d31106,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 1: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615168,t_70da08637930,c_7cb9a57f2219,8.1.5 Use dot (.) and cross (x) diagrams to il...,"Ligações iônicas, covalentes e metálicas",c7ca13,aligned,0,4.0


In [58]:
channel_fold_map = {}

for c in train_wo_source.channel.unique():
    channel_fold_map[c] = train_wo_source[train_wo_source.channel==c].iloc[0].fold

In [67]:
train["fold"] = train["channel"].map(channel_fold_map)

In [78]:
train.loc[train["category"] == "source", "fold"] =-1

In [80]:
train.fold.unique()

array([ 4., -1.,  2.,  3.,  0.,  1.])

In [81]:
train.fold.value_counts()

-1.0    365140
 2.0     85180
 3.0     46450
 4.0     44500
 1.0     44150
 0.0     29750
Name: fold, dtype: int64

In [82]:
train_5fold = pd.read_csv("../../input/train_5fold.csv")

In [83]:
train_5fold.fold.value_counts()

4    123160
1    123130
2    123060
0    122950
3    122870
Name: fold, dtype: int64

In [84]:
train

Unnamed: 0,topics_ids,content_ids,title1,title2,channel,category,target,fold
0,t_3d9ad9931021,c_efb73ad83f4b,,,ebc86c,supplemental,0,4.0
1,t_3d9ad9931021,c_77574ef20c1f,,,ebc86c,supplemental,0,4.0
2,t_3d9ad9931021,c_200ae87baf4d,,,ebc86c,supplemental,0,4.0
3,t_3d9ad9931021,c_87e171afe50b,,,ebc86c,supplemental,0,4.0
4,t_3d9ad9931021,c_3c070b63a944,,,ebc86c,supplemental,0,4.0
...,...,...,...,...,...,...,...,...
615165,t_70da08637930,c_eb6448437b5f,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 2: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615166,t_70da08637930,c_07c1da15995b,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 3: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615167,t_70da08637930,c_17ff16d31106,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 1: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0
615168,t_70da08637930,c_7cb9a57f2219,8.1.5 Use dot (.) and cross (x) diagrams to il...,"Ligações iônicas, covalentes e metálicas",c7ca13,aligned,0,4.0


In [89]:
train_better_cv_5fold = read_data(train)

In [90]:
train_better_cv_5fold

Unnamed: 0,topics_ids,content_ids,title1,title2,channel,category,target,fold,text
0,t_3d9ad9931021,c_efb73ad83f4b,Title does not exist,Title does not exist,ebc86c,supplemental,0,4.0,Title does not exist [SEP] Title does not exist
1,t_3d9ad9931021,c_77574ef20c1f,Title does not exist,Title does not exist,ebc86c,supplemental,0,4.0,Title does not exist [SEP] Title does not exist
2,t_3d9ad9931021,c_200ae87baf4d,Title does not exist,Title does not exist,ebc86c,supplemental,0,4.0,Title does not exist [SEP] Title does not exist
3,t_3d9ad9931021,c_87e171afe50b,Title does not exist,Title does not exist,ebc86c,supplemental,0,4.0,Title does not exist [SEP] Title does not exist
4,t_3d9ad9931021,c_3c070b63a944,Title does not exist,Title does not exist,ebc86c,supplemental,0,4.0,Title does not exist [SEP] Title does not exist
...,...,...,...,...,...,...,...,...,...
615165,t_70da08637930,c_eb6448437b5f,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 2: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0,8.1.5 Use dot (.) and cross (x) diagrams to il...
615166,t_70da08637930,c_07c1da15995b,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 3: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0,8.1.5 Use dot (.) and cross (x) diagrams to il...
615167,t_70da08637930,c_17ff16d31106,8.1.5 Use dot (.) and cross (x) diagrams to il...,Level 1: Describe the formation of ionic bond ...,c7ca13,aligned,0,4.0,8.1.5 Use dot (.) and cross (x) diagrams to il...
615168,t_70da08637930,c_7cb9a57f2219,8.1.5 Use dot (.) and cross (x) diagrams to il...,"Ligações iônicas, covalentes e metálicas",c7ca13,aligned,0,4.0,8.1.5 Use dot (.) and cross (x) diagrams to il...


In [91]:
train_better_cv_5fold.to_csv("../../input/train_better_cv_5fold.csv", index=False)