In [None]:
import sys
if 'google.colab' in sys.modules:  # colab特有の処理_2回目以降
  # Google Driveのマウント
  from google.colab import drive
  drive.mount('/content/drive')

  # ライブラリのパス指定
  sys.path.append('/content/drive/MyDrive/Colab_Files/kaggle/commonlit/XX_modules')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# データセットをDriveから取得
INPUT_SRC = '/content/drive/MyDrive/Colab_Files/kaggle/commonlit/00_input/commonlitreadabilityprize/'

!mkdir -p 'input'
!mkdir -p 'clrp-pre-trained'

!cp -r {INPUT_SRC} '/content/input'

In [None]:
from pathlib import Path

# input
if 'kaggle_web_client' in sys.modules:  # kaggle環境
    DATA_DIR = Path('../input/commonlitreadabilityprize/')

elif 'google.colab' in sys.modules: # Colab環境
    DATA_DIR = Path('/content/input/commonlitreadabilityprize')

else:
    DATA_DIR = Path('../00_input/commonlitreadabilityprize/')

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW # optimizer
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup # scheduler

from sklearn.model_selection import KFold, StratifiedKFold

import gc
gc.enable()

In [None]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True# cudnnによる最適化で結果が変わらないためのおまじない 

In [None]:
# train, testを読む
train_df = pd.read_csv(DATA_DIR/"train.csv")

# Remove incomplete entries if any.
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv(DATA_DIR/"test.csv")
submission_df = pd.read_csv(DATA_DIR/"sample_submission.csv")

In [None]:
# https://www.kaggle.com/abhishek/step-1-create-folds
def create_folds(data, num_splits, SEED, return_df=False):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    # bin targets
    data.loc[:, "bins_tg"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    ).map(lambda x: str(x))

    # bin standard_error
    data.loc[:, "bins_std"] = pd.cut(
        data["standard_error"], bins=num_bins, labels=False
    )

    # bins
    data.loc[:, "bins"] = data['bins_tg'].map(lambda x: str(x)) + data['bins_std'].map(lambda x: str(x))

    # initiate the kfold class from model_selection module
    kf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)

    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
      data.loc[v_, 'kfold'] = f
    return data

In [None]:
SEED = 1000
set_random_seed(SEED)
kfold_df = create_folds(train_df, 5, SEED=SEED) # binsで切る場合



In [None]:
kfold_df.to_csv(f'{INPUT_SRC}/train_kfold.csv', index=False)

In [None]:
kfold_df_load = pd.read_csv(f'{INPUT_SRC}/train_kfold.csv')

In [None]:
kfold_df_load[kfold_df_load.fillna(0) != kfold_df.fillna(0)]

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold,bins_tg,bins_std,bins
0,,,,,,,,7,,72
1,,,,,,0.499267,,8,,83
2,,,,,,,,5,,52
3,,,,,-1.199957,,,5,,52
4,,,,,-0.956118,,,6,,64
...,...,...,...,...,...,...,...,...,...,...
2828,,,,,,0.462924,,7,,71
2829,,,,,-1.188881,,,5,,54
2830,,,,,,,,1,,17
2831,,,,,,,,3,,33


In [28]:
kfold_df.iloc[1]

id                                                        fa84dbf46
url_legal                                                       NaN
license                                                         NaN
excerpt           With trembling hands the lad took the shavings...
target                                                    0.0881482
standard_error                                             0.499267
kfold                                                             1
bins_tg                                                           8
bins_std                                                          3
bins                                                             83
Name: 1, dtype: object

In [29]:
kfold_df_load.iloc[1]

id                                                        fa84dbf46
url_legal                                                       NaN
license                                                         NaN
excerpt           With trembling hands the lad took the shavings...
target                                                    0.0881482
standard_error                                             0.499267
kfold                                                             1
bins_tg                                                           8
bins_std                                                          3
bins                                                             83
Name: 1, dtype: object