# Setup

In [None]:
!pip install numpy
!pip install pandas
!pip install sklearn

In [None]:
import numpy as np
import pandas as pd
import math
import itertools
import random
import os
import gzip
import json
from sklearn.model_selection import StratifiedKFold
import shutil

In [None]:
# Mount Google Drive so the files stored there can be read from this notebook.
# The mount point 'gdrive' is a relative path, i.e. it is resolved against the
# current working directory of the runtime.

from google.colab import drive
drive.mount('gdrive')

# Constants

In [None]:
### GLOBAL VARIABLES ###

# Root folder of the project files on the mounted Google Drive; the data paths used by this notebook are derived from it.
BASE_PATH = 'gdrive/MyDrive/colabNotebooks/commonLitReadabilityPrize/firstPlace_CodeFiles'

In [None]:
# Input: the original training CSV.
TRAIN_PATH = os.path.join(BASE_PATH, 'data/training/original/train.csv')
# Output folders: cross-validation splits (cv) and bootstrap splits (bs).
CV_OUT_PATH = os.path.join(BASE_PATH, 'data/training/cv')
BS_OUT_PATH = os.path.join(BASE_PATH, 'data/training/bs')

# SEED was never assigned in the original version of this notebook. The value 28 is
# the one assigned in other files of the same project.
# NOTE(review): confirm that 28 is the seed the original authors intended here.
SEED = 28

# Functions

In [None]:
def prepare_bootstrap(df, n_bags, save_path, seed=None):
  """Write `n_bags` bootstrap train/validation splits of `df` as CSV files.

  For each bag `i`, `len(df)` rows are drawn from `df` with replacement and
  saved as `train_fold_<i>.csv`; the rows whose `id` was never drawn
  (the out-of-bag rows) are saved as `val_fold_<i>.csv`.

  Args:
    df: DataFrame with an `id` column identifying each row.
    n_bags: number of bootstrap splits to generate.
    save_path: directory receiving the CSV files; created if it is missing.
    seed: optional integer base seed. When given, bag `i` is sampled with
      `random_state=seed + i`, so the output is reproducible. When None
      (default) the sampling is unseeded, as before.
  """
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(save_path, exist_ok=True)
  for i in range(n_bags):
    # A distinct seed per bag keeps runs reproducible without making all bags identical.
    bag_seed = None if seed is None else seed + i
    bag = df.sample(n=len(df), replace=True, random_state=bag_seed)
    # Out-of-bag rows: every id that does not appear in this bag.
    bag_val = df[~df.id.isin(bag.id)]
    out_train = os.path.join(save_path, 'train_fold_' + str(i) + '.csv')
    out_val = os.path.join(save_path, 'val_fold_' + str(i) + '.csv')
    bag.to_csv(out_train)
    bag_val.to_csv(out_val)

In [None]:
def make_cv_data(df, out_path, kfolds=6):
  """Split `df` into `kfolds` stratified folds and write each split as CSV.

  `get_bin_stratified` is called on `df` first; it adds the `bin` and `fold`
  columns to `df` in place, so the caller's frame is modified. For every fold
  `k`, rows with `fold != k` are written to `train_fold_<k>.csv` and rows with
  `fold == k` to `val_fold_<k>.csv`.

  Args:
    df: training DataFrame (needs the columns used by `get_bin_stratified`).
    out_path: directory receiving the CSV files; created if it is missing.
    kfolds: number of folds.
  """
  get_bin_stratified(df, n_splits=kfolds)
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(out_path, exist_ok=True)
  for fold in range(kfolds):
    print('Fold:', fold)
    train_df = df.loc[df.fold != fold].reset_index(drop=True)
    val_df = df.loc[df.fold == fold].reset_index(drop=True)
    # os.path.join keeps path building consistent with prepare_bootstrap.
    train_df.to_csv(os.path.join(out_path, 'train_fold_' + str(fold) + '.csv'))
    val_df.to_csv(os.path.join(out_path, 'val_fold_' + str(fold) + '.csv'))

In [None]:
def get_bin_stratified(df, n_bins=20, n_splits=5):
    """Assign every row of `df` to a cross-validation fold, stratified on binned `target`.

    Modifies `df` in place by adding two columns:
      * `bin`  - index (0 .. n_bins-1) of the equal-width `target` bin the row falls in.
      * `fold` - int8 fold number (0 .. n_splits-1) in which the row is a validation row.

    Args:
        df: DataFrame with `id` and `target` columns.
        n_bins: number of equal-width bins used to discretise `target`.
        n_splits: number of folds.
    """
    df['bin'] = pd.cut(df.target, n_bins, labels=list(range(n_bins)))

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # split() yields *positional* row indices. Collect the fold ids in a plain
    # array and assign it as a whole, so the result is also correct when `df`
    # does not have a default 0..n-1 index (label-based .loc would mis-assign).
    folds = np.full(len(df), -1, dtype='int8')
    for fold, (_, idx_val) in enumerate(skf.split(df.id, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds

# Prepare train splits

In [None]:
# Read the training CSV and write the stratified cross-validation splits to CV_OUT_PATH.
# Note that make_cv_data adds the helper columns 'bin' and 'fold' to train_df in place.
train_df = pd.read_csv(TRAIN_PATH)
make_cv_data(df=train_df, out_path=CV_OUT_PATH)


In [None]:
# Re-reading the CSV gives a frame without the 'bin'/'fold' columns that the
# cross-validation cell added to train_df; 6 bootstrap splits are then written to BS_OUT_PATH.
# NOTE(review): this call draws the bags without a fixed random seed, so re-running the cell
# produces different splits.
train_df = pd.read_csv(TRAIN_PATH)
prepare_bootstrap(df=train_df, n_bags=6, save_path=BS_OUT_PATH)