# Commonlit Fasthugs

Copied from https://www.kaggle.com/aruthart/transformers-fastai-starter-submission
and modified to run on Colab before being ported back to Kaggle.

Competition dataset: commonlitreadabilityprize

Additional input datasets:
* fasthugs
* distilroberta-base

## Package Installation and Environment Setup

In [1]:
!nvidia-smi

In [2]:
import os
def check_environment():
    """Record whether the notebook runs on Colab or Kaggle.

    The result is published twice: as the module-level global ``ENV`` for
    Python code, and as the ``IPYTHON_ENV`` environment variable so that
    ``!`` shell lines can branch on it. Anything that is not detected as
    Colab is treated as Kaggle, where ``TOKENIZERS_PARALLELISM`` is also
    set to ``'false'``.
    """
    global ENV
    # Colab is detected by looking for 'google.colab' in the string form of
    # the IPython shell object.
    on_colab = 'google.colab' in str(get_ipython())
    ENV = 'COLAB' if on_colab else 'KAGGLE'
    os.environ['IPYTHON_ENV'] = ENV
    if not on_colab:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
check_environment()

In [3]:
# colab: mount Google Drive at /content/drive (skipped on Kaggle).
if ENV == 'COLAB':
    from google.colab import drive

    drive.mount('/content/drive')

In [4]:
# colab: install the libraries this notebook needs with pip.
# Each `!` line runs in a shell; the `[ ... ] &&` guard lets the install run
# only when IPYTHON_ENV (exported by check_environment) equals 'COLAB'.
![ $IPYTHON_ENV = 'COLAB' ] && pip install -qqq fastai --upgrade
![ $IPYTHON_ENV = 'COLAB' ] && pip install -qqq transformers 
![ $IPYTHON_ENV = 'COLAB' ] && pip install -qqq fasthugs
![ $IPYTHON_ENV = 'COLAB' ] && pip install -qqq scikit-learn

In [5]:
# kaggle: install fasthugs from the local ../input/fasthugs directory.
# Runs only when IPYTHON_ENV equals 'KAGGLE'; --no-deps tells pip not to
# install the package's dependencies.
![ $IPYTHON_ENV = 'KAGGLE' ] && pip install -q --no-deps ../input/fasthugs

In [6]:
# colab
![ $IPYTHON_ENV = 'COLAB' ] && mkdir -p /root/.fastai/data 
![ $IPYTHON_ENV = 'COLAB' ] && ln -s /root/.fastai/data data
![ $IPYTHON_ENV = 'COLAB' ] && mkdir -p data/commonlit
![ $IPYTHON_ENV = 'COLAB' ] && cp /content/drive/MyDrive/kaggle/commonlitreadabilityprize.zip data/commonlit/.
![ $IPYTHON_ENV = 'COLAB' ] && unzip -d data/commonlit -o data/commonlit/commonlitreadabilityprize.zip

## Imports and function declarations

In [7]:
from fastai.text.all import *
from fasthugs.data import TransformersTextBlock, TextGetter
from fasthugs.learner import TransLearner

from transformers import AutoModelForSequenceClassification
# from sklearn.model_selection import StratifiedKFold
# import gc

In [8]:
import os
def check_environment():
    """Record whether the notebook runs on Colab or Kaggle.

    Sets the module-level global ``ENV`` and the ``IPYTHON_ENV`` environment
    variable to ``'COLAB'`` or ``'KAGGLE'``. Anything that is not detected
    as Colab is treated as Kaggle, where ``TOKENIZERS_PARALLELISM`` is also
    set to ``'false'``.
    """
    global ENV
    # Colab is detected by looking for 'google.colab' in the string form of
    # the IPython shell object.
    on_colab = 'google.colab' in str(get_ipython())
    ENV = 'COLAB' if on_colab else 'KAGGLE'
    os.environ['IPYTHON_ENV'] = ENV
    if not on_colab:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
check_environment()

In [9]:
def build_datablock(env, model_name, splitter):
    """Assemble the DataBlock that maps `excerpt` text to a `target` value.

    Args:
        env: accepted for signature symmetry with the other helpers; it is
            not used in this function.
        model_name: passed to `TransformersTextBlock` as
            `pretrained_model_name`.
        splitter: train/validation splitter handed to the DataBlock
            (for example `RandomSplitter(seed=42)`).

    Returns:
        The configured `DataBlock`.
    """
    text_block = TransformersTextBlock(pretrained_model_name=model_name)
    return DataBlock(
        blocks=[text_block, RegressionBlock()],
        get_x=TextGetter('excerpt'),
        get_y=ItemGetter('target'),
        splitter=splitter,
    )

In [10]:
def build_dataloaders(dblock, train_df, bs=16, val_bs=None, num_workers=2):
    """Create dataloaders from `dblock` over `train_df`.

    Args:
        dblock: object exposing a `dataloaders(...)` method.
        train_df: data source handed to `dblock.dataloaders`.
        bs: training batch size.
        val_bs: validation batch size; when None it defaults to `2 * bs`.
        num_workers: forwarded unchanged to `dblock.dataloaders`.

    Returns:
        Whatever `dblock.dataloaders` returns.
    """
    if val_bs is None:
        val_bs = bs * 2
    return dblock.dataloaders(
        train_df,
        bs=bs,
        val_bs=val_bs,
        num_workers=num_workers,
    )

In [11]:
def build_learner(dls, model_name, env, path=None, metrics=[rmse], opt_func=Adam, model_args=None, learner_args=None):
    """Load a single-output sequence-classification model and wrap it in a TransLearner.

    Args:
        dls: dataloaders passed to `TransLearner`.
        model_name: name or path given to
            `AutoModelForSequenceClassification.from_pretrained`.
        env: accepted for signature symmetry with the other helpers; it is
            not used in this function.
        path: forwarded to `TransLearner` as `path`.
        metrics: forwarded to `TransLearner`; defaults to `[rmse]`. The
            default is left as-is so that an explicit `metrics=None` keeps
            its previous meaning.
        opt_func: optimizer factory forwarded to `TransLearner`.
        model_args: optional extra keyword arguments for `from_pretrained`.
        learner_args: optional extra keyword arguments for `TransLearner`.

    Returns:
        The constructed `TransLearner`.
    """
    # None instead of `{}` defaults: avoids sharing one dict across calls.
    model_args = {} if model_args is None else model_args
    learner_args = {} if learner_args is None else learner_args
    # num_labels=1 gives the model a single output, used here for regression.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, **model_args)
    return TransLearner(dls, model, metrics=metrics, path=path, opt_func=opt_func, **learner_args)

In [12]:
def get_commonlit_path(env):
    """Return the directory holding the competition CSV files for `env`.

    Args:
        env: `'COLAB'` or `'KAGGLE'`.

    Returns:
        `Path('data/commonlit')` on Colab,
        `Path('../input/commonlitreadabilityprize')` on Kaggle.

    Raises:
        ValueError: if `env` is neither of the two known environments
            (previously this surfaced as an UnboundLocalError).
    """
    if env == 'COLAB':
        return Path('data/commonlit')
    if env == 'KAGGLE':
        return Path('../input/commonlitreadabilityprize')
    raise ValueError(f"unknown environment {env!r}; expected 'COLAB' or 'KAGGLE'")


In [13]:
def config_learner(env, model_name, model_args, bs=16, train_sz=0.01, valid_sz=0.01, seed=42):
    """Read train.csv and build a ready-to-train learner for `env`.

    Args:
        env: `'COLAB'` or `'KAGGLE'`; selects the data directory and, on
            Kaggle, a local model directory.
        model_name: model identifier; on Kaggle it is prefixed with
            `../input/transformers/`.
        model_args: extra keyword arguments forwarded to the model loader
            through `build_learner`.
        bs: training batch size.
        train_sz: `train_sz` argument of `RandomSubsetSplitter`.
        valid_sz: `valid_sz` argument of `RandomSubsetSplitter`.
        seed: seed for the random subset split.

    Returns:
        The learner produced by `build_learner`; its output path is the
        current directory.
    """
    path = get_commonlit_path(env)
    # NOTE(review): the notebook header lists a `distilroberta-base` input
    # dataset, while this prefix expects the weights under
    # ../input/transformers/ — confirm the Kaggle dataset layout.
    if env == 'KAGGLE':
        model_name = '../input/transformers/' + model_name
    output_path = Path('.')
    train_df = pd.read_csv(path/'train.csv')
    # NOTE(review): the default 0.01/0.01 split sizes look like a quick
    # smoke-test setting — pass larger values for a real training run.
    splitter = RandomSubsetSplitter(train_sz=train_sz, valid_sz=valid_sz, seed=seed)
    dblock = build_datablock(env, model_name, splitter=splitter)
    dls = build_dataloaders(dblock, train_df, bs=bs)
    return build_learner(dls, model_name, env, output_path, model_args=model_args)

In [14]:
def run_training(learn, lr, wd, epochs=8):
    """Train `learn` with `fit_one_cycle` while checkpointing on `_rmse`.

    Args:
        learn: learner to train.
        lr: learning rate passed to `fit_one_cycle`.
        wd: weight decay passed to `fit_one_cycle`.
        epochs: number of epochs.
    """
    # Checkpoint under the name 'model'; np.less means a lower monitored
    # `_rmse` counts as an improvement.
    checkpoint = SaveModelCallback(
        monitor='_rmse',
        fname='model',
        comp=np.less,
        reset_on_fit=False,
    )
    learn.fit_one_cycle(epochs, lr, wd=wd, cbs=[checkpoint])

In [15]:
def make_submission(env, learn):
    """Predict on test.csv and write the result to `submission.csv`.

    Args:
        env: `'COLAB'` or `'KAGGLE'`; selects the data directory.
        learn: trained learner used to build the test dataloader and
            produce predictions.
    """
    data_dir = get_commonlit_path(env)
    test_dl = learn.dls.test_dl(pd.read_csv(data_dir/'test.csv'))
    preds, _ = learn.get_preds(dl=test_dl)
    submission = pd.read_csv(data_dir/'sample_submission.csv', index_col='id')
    # NOTE(review): predictions are assigned by position, so this assumes
    # test.csv and sample_submission.csv list the ids in the same order.
    submission['target'] = preds.squeeze()
    submission.to_csv('submission.csv')

## Training 

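Define the model name, model arguments and batch size, then build the learner. The data path and output path are set inside `config_learner`.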

In [16]:
# Model identifier handed to config_learner, which prefixes it with
# '../input/transformers/' when running on Kaggle.
model_name = 'distilroberta-base'

In [17]:
# Extra keyword arguments that build_learner forwards to
# AutoModelForSequenceClassification.from_pretrained.
model_args = {'hidden_dropout_prob': 0.1}


In [18]:
# Training batch size; build_dataloaders uses twice this for validation
# unless val_bs is given.
bs = 16

In [19]:
# Build the dataloaders and the learner for the detected environment.
learn = config_learner(ENV, model_name, model_args, bs=bs)

In [20]:
# Run the learner's learning-rate finder to help pick `lr` below.
learn.lr_find()

In [21]:
# Learning rate for fit_one_cycle — presumably read off the lr_find plot
# above; confirm before reusing with another model.
lr = 7.6e-5
# Weight decay, given as a list of three values.
# NOTE(review): presumably one value per parameter group of the learner —
# confirm the learner really has three groups.
wd = [0.05, 0.05, 0.05]

In [22]:
%time
run_training(learn, lr, wd)

`SaveModelCallback` stores the best-performing model (lowest monitored `_rmse`) during training and loads it back when training ends. Plot the recorded losses:

In [23]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

In [24]:
# (output_path/'models').ls()

In [25]:
# Evaluate the current model; presumably on the validation split, reporting
# the loss and the configured metrics — confirm against fastai's Learner.validate.
learn.validate()

## Submission

In [26]:
# Predict on test.csv and write submission.csv to the working directory.
make_submission(ENV,learn)