# Libraries

In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.model_selection import GroupKFold

# Config

In [2]:
# Notebook-wide settings; `n_splits` is the number of folds passed to
# GroupKFold and is also used as the prefix of the output file name.
CFG = {"n_splits": 5}
CFG

{'n_splits': 5}

# Data loading

In [3]:
# Directory holding the input CSV files.
base_path = "../input/nbme-score-clinical-patient-notes"

# Load the three tables that are joined in the preprocessing step:
# the labelled rows, the feature lookup, and the note texts.
train_data = pd.read_csv(f"{base_path}/train.csv")
features = pd.read_csv(f"{base_path}/features.csv")
patient_notes = pd.read_csv(f"{base_path}/patient_notes.csv")

# Preprocessing

In [4]:
# Enrich each training row in two left joins: first with the patient note
# (keyed on case_num + pn_num), then with the feature lookup
# (keyed on feature_num + case_num).
train_merge = (
    train_data
    .merge(patient_notes, how='left', on=['case_num', 'pn_num'])
    .merge(features, how='left', on=['feature_num', 'case_num'])
)

# Left joins on unique keys must not add rows; fail fast if they did.
assert len(train_merge) == len(train_data)

print(f"Shape train_merge = {train_merge.shape}")
train_merge.sample(3)

Shape train_merge = (14300, 8)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
12799,90766_912,9,90766,912,"['mom: migraines', 'FH migraines']","['587 601', '560 562;592 601']",20YO female \r\nCC: Headache\r\nHPI: pt report...,Family-history-of-migraines
14244,95228_912,9,95228,912,[],[],"20 F no PMH, lives w/ roommate in apartment ha...",Family-history-of-migraines
4201,22081_203,2,22081,203,['Sexually monogamous'],['693 712'],CC: Irregular peroids\r\n\r\nHPI: 44 year olf ...,Sexually-active


# Split data

In [5]:
# Start every row at -1 so that a row no split ever touches stays recognisable.
train_merge["fold"] = -1

# Rows are grouped by pn_num, so all rows sharing a pn_num are placed in the
# same validation fold.
skf = GroupKFold(CFG['n_splits'])
groups = train_merge['pn_num'].values

# split() yields *positional* row indices. Write them with .iloc (position
# based) instead of .loc (label based): .loc only happens to work while the
# frame has a default 0..n-1 index and would mislabel rows or raise a KeyError
# after any filtering, sorting or set_index.
fold_col = train_merge.columns.get_loc("fold")
for fold, (_, valid_idx) in enumerate(skf.split(train_merge["id"], train_merge["location"], groups)):
    train_merge.iloc[valid_idx, fold_col] = fold

In [6]:
# Group sizes always add up to the row count, so that check alone cannot detect
# a row left at the -1 placeholder. Verify the assignment explicitly instead.
assert (train_merge["fold"] >= 0).all(), "some rows were never assigned to a fold"

# Each pn_num must live in exactly one fold, otherwise the grouping is broken.
assert train_merge.groupby("pn_num")["fold"].nunique().eq(1).all(), "a pn_num is split across folds"

# Original sanity check, kept for continuity, plus the per-fold row counts.
fold_sizes = train_merge.groupby('fold', as_index=False).size()
assert fold_sizes['size'].sum() == train_merge.shape[0]
fold_sizes

Unnamed: 0,fold,size
0,0,2860
1,1,2860
2,2,2860
3,3,2860
4,4,2860


# Save

In [7]:
# The file name is prefixed with the configured fold count
# (e.g. n_splits=5 gives "5_folds_data.csv"); the index is not written.
output_name = f'{CFG["n_splits"]}_folds_data.csv'
train_merge.to_csv(output_name, index=False)