In [1]:
cd ../src

/workspace/Script/NLP/CommonLit - Evaluate Student Summaries/src


In [2]:
import os
import gc
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

# Widen pandas' notebook display limits (rows, columns, line width) in a single
# call -- set_option accepts alternating (option, value) pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)
# Render every float with exactly four decimal places.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

import warnings
warnings.filterwarnings("ignore")

In [3]:
from datetime import date

# Run date formatted as 'YYYY-MM-DD'; the bare name on the last line displays it.
TODAY = date.today().strftime('%Y-%m-%d')
TODAY

'2023-08-12'

In [4]:
def get_version(start=0):
    """Return the next value of a counter stored on the function itself.

    The counter lives in the ``get_version.counter`` attribute. The first call
    seeds it with ``start``; every later call ignores ``start`` and continues
    from the stored value.

    Args:
        start: Value returned by the very first call.

    Returns:
        The counter value before it is incremented by one.
    """
    # Fall back to `start` only while no counter has been stored yet.
    current = getattr(get_version, 'counter', start)
    get_version.counter = current + 1
    return current

In [5]:
from train_utils import kfold

In [6]:
!nvidia-smi

Sat Aug 12 18:03:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.199.02   Driver Version: 470.199.02   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:2D:00.0  On |                  Off |
| 30%   46C    P8    33W / 300W |    556MiB / 48677MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:99:00.0 Off |                  Off |
| 30%   39C    P8    15W / 300W |     10MiB / 48685MiB |      0%      Default |
|       

In [7]:
# Dataset and checkpoint roots. The defaults are the original absolute
# locations; set COMMONLIT_DATA_PATH / COMMONLIT_CHECKPOINT_PATH in the
# environment to run the notebook on another machine without editing it.
DATA_PATH = Path(os.environ.get("COMMONLIT_DATA_PATH", r"/database/kaggle/Commontlit/data"))
CHECKPOINT_PATH = Path(os.environ.get("COMMONLIT_CHECKPOINT_PATH", r"/database/kaggle/Commontlit/checkpoint"))

# Show which files are available under the data root.
os.listdir(DATA_PATH)

['summaries_train.csv',
 'prompts_train.csv',
 'CLEAR_corpus_final.xlsx',
 'summaries_test.csv',
 'prompts_test.csv',
 'sample_submission.csv']

# Data

In [8]:
# Load the training summaries table from the data root and display its
# (rows, columns) shape.
df = pd.read_csv(DATA_PATH/'summaries_train.csv')
df.shape

(7165, 5)

In [9]:
df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.2057,0.3805
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.5483,0.5068
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.1289,4.2312
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.2106,-0.4714
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.2729,3.2198


In [10]:
df.prompt_id.unique()

array(['814d6b', 'ebad26', '3b9047', '39c16e'], dtype=object)

# CV

In [11]:
# from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [12]:
# train_df = df[df.data=="train"].reset_index(drop=True)
# valid_df = df[df.data=='valid'].reset_index(drop=True)

In [13]:
# seeds = [42]
# folds_names = []
# for K in [5,10]:  
#     for seed in seeds:
#         mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
#         name = f"fold_sk_{K}_seed_{seed}"
#         train_df[name] = -1
#         for fold, (trn_, val_) in enumerate(mskf.split(train_df,train_df['year'])):
#             train_df.loc[val_, name] = fold+1
#         valid_df[name] = 0

In [14]:
# One fold id per prompt, so all summaries of a prompt share the same fold.
PROMPT_TO_FOLD = {"814d6b":0,"ebad26":1,"3b9047":2,"39c16e":3}
df['fold'] = df['prompt_id'].map(PROMPT_TO_FOLD)

# Series.map yields NaN for any prompt_id missing from the mapping, which would
# silently leave those rows without a fold -- fail loudly instead.
assert df['fold'].notna().all(), (
    "prompt_id values without a fold: "
    f"{df.loc[df['fold'].isna(), 'prompt_id'].unique().tolist()}"
)

In [15]:
df["fold"].value_counts()

fold
3    2057
2    2009
1    1996
0    1103
Name: count, dtype: int64

In [16]:
FOLD_NAME = "fold"

In [19]:
# Experiment configuration, written as a plain class used as a namespace.
# It is handed unchanged to `kfold(args, df)` from train_utils; how each key is
# interpreted is decided there and is not visible in this notebook.
class args:
    seed = 2022
    
    # Model
    model_name = 'microsoft/deberta-v3-large' # NOTE(review): the old note here read "smp.UnetPlusPlus smp.Unet" -- looks like a leftover from another project
    project_name = "CommonLit"
    # CV
    kfold_name = FOLD_NAME  # name of the dataframe column holding the fold ids
    selected_folds = [0]
    not_include_folds = []
    # Paths
    # If model_name contains a '.', keep only the second dot-separated piece;
    # otherwise use the full name (which may contain '/', giving nested folders).
    name = model_name.split('.')[1] if '.' in model_name else model_name
    # get_version() runs every time this class body is executed, so re-running
    # this cell bumps the version suffix (the recorded output shows "v1").
    exp_name = f"{TODAY}--v{get_version(start=0)}-test"  
    checkpoints_path = str(CHECKPOINT_PATH/Path(fr'{kfold_name}/{name}/{exp_name}'))  
    
    dataset = "CommonLitDataset"
    data = {
            'mask_pct':0
           
           }
    model = {
            "model_params":{"model_name":model_name,
                            'num_labels':2,  # presumably one output each for `content` and `wording` -- TODO confirm
                            "use_gradient_checkpointing":False,
                            "config_path":None ,
                            "pooling_params":{"pooling_name":"CLS"}
                           },
        
            'loss':"FeedbackLoss",
            "loss_params":{"weights":[0.5,0.5], 
                          },

            "pretrained_weights":None, 
             
            }
    
    optimizer = {
            "name":"optim.AdamW",
            'params':{"encoder_lr":2e-5,
                      "decoder_lr":5e-4,
                     "weight_decay": 0.0,
                     },            
            }

    scheduler = {
            "name":"cosine",
            'params':{
                      "lr_end":1e-7,# default 1e-7
                      "power":3
                     },
            "warmup":0.,            
            }
    
    train_loader = {
            "batch_size":32,
            'drop_last':True,
            "num_workers":16,
            "pin_memory":False, 
            "shuffle":True,
            }
    
    
    val_loader = {
            "batch_size":128,
            'drop_last':False,
            "num_workers":16,
            "pin_memory":False,
            "shuffle":False
            }
    
    trainer = {"use_amp":False,
                'epochs':10,
                "sample":False,
                "train_all_data":False,
                "use_awp":False,
                "start_epoch_awp":1,
                "adv_lr":0.0003,
                "adv_eps":0.001,
                "grad_clip":False,
                "max_norm":10
              }
    

    # NOTE(review): "valid_dice" with mode "max" looks carried over from a
    # segmentation setup; confirm the trainer logs a metric under this name
    # for this two-target regression task and that "max" is the right direction.
    callbacks = {'save':True,"es":False,"patience":10,
                 'verbose_eval':1,"epoch_pct_eval":1/1,"epoch_eval_dist":"uniforme",#uniforme
                 "metric_track":"valid_dice","mode":"max",'top_k':1,"start_eval_epoch":0,
                 "save_last_k":0
                }
    
    device = 1  # presumably a GPU index (nvidia-smi above lists GPUs 0 and 1) -- TODO confirm
    
    
# Create the run's checkpoint directory (including any missing parents; an
# already existing directory is fine) and echo the location.
os.makedirs(args.checkpoints_path, exist_ok=True)
print(args.checkpoints_path)

/database/kaggle/Commontlit/checkpoint/fold/microsoft/deberta-v3-large/2023-08-12--v1-test


In [20]:
# Start training through train_utils.kfold with the configuration class and the
# fold-annotated dataframe.
# NOTE(review): the recorded run of this cell stopped with
# "TypeError: string indices must be integers" right after the model summary
# was printed; the traceback is not included, so the failing lookup has to be
# located inside train_utils.
kfold(args,df)

----------- fold ---------

-------------   Fold 1 / 4  -------------



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666950341671812, max=1.0)…

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/6062 [00:00<?, ?it/s]

  0%|          | 0/1103 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pooling: CLS
    -> 434014210 trainable parameters



TypeError: string indices must be integers