In [1]:
cd ../src

/workspace/Script/NLP/Shovel Ready/src


In [2]:
import os
import gc
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: show up to 500 rows and 500 columns,
# allow 1000-character-wide output, and render floats with four decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.options.display.float_format = lambda value: '%.4f' % value

import warnings
# Blanket-silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [3]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


In [4]:
from datetime import date

# Run date as a 'YYYY-MM-DD' string; it is embedded in the experiment name.
TODAY = f"{date.today():%Y-%m-%d}"
TODAY

'2023-08-22'

In [5]:
def get_version(start=0):
    """Return the next value of a counter stored on the function object.

    The first call seeds the counter with ``start``. Every call returns the
    current counter value and then advances it by one, so ``start`` is
    ignored once the counter exists.
    """
    current = getattr(get_version, 'counter', start)
    get_version.counter = current + 1
    return current

In [6]:
from train_utils import kfold

In [7]:
!nvidia-smi

Tue Aug 22 11:22:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.199.02   Driver Version: 470.199.02   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:2D:00.0  On |                  Off |
| 67%   84C    P2   240W / 300W |  12735MiB / 48651MiB |     63%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:99:00.0 Off |                  Off |
| 30%   43C    P8    21W / 300W |     13MiB / 48685MiB |      0%      Default |
|       

In [8]:
# Folder that holds the input CSV read by this notebook.
DATA_PATH = Path(r"/database/kaggle/Shovel Ready/data")
# Root folder under which the experiment's checkpoint directory is created.
# NOTE(review): this root sits under "Commontlit" while the data lives under
# "Shovel Ready" — confirm the location (and its spelling) is intentional.
CHECKPOINT_PATH = Path(r"/database/kaggle/Commontlit/checkpoint")

# Show which files are available in the data folder.
os.listdir(DATA_PATH)

['persuade_corpus.csv']

# Data

In [9]:
# Read the corpus CSV into a DataFrame and display its (rows, columns) shape.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# saved index — confirm whether it should be kept as a regular column.
df = pd.read_csv(DATA_PATH/'persuade_corpus.csv')
df.shape

(285391, 30)

In [10]:
df.head()

Unnamed: 0,essay_id,essay_id_comp,competition_set,full_text,holistic_essay_score,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,discourse_effectiveness,hierarchical_id,hierarchical_text,hierarchical_label,provider,task,source_text,prompt_name,assignment,gender,grade_level,ell_status,race_ethnicity,economically_disadvantaged,student_disability_status,essay_word_count,in_feedback2.0,test_split_feedback_1,test_split_feedback_2
0,5408891152126,423A1CA112E2,train,Phones\n\nModern humans today are always on th...,3,1622627660525.0,0,7,Phones\n\n,Unannotated,Unannotated 1,,,,,Georgia Virtual,Independent,,Phones and driving,Today the majority of humans own and operate c...,M,,,Black/African American,,,378,1,,
1,5408891152126,423A1CA112E2,train,Phones\n\nModern humans today are always on th...,3,1622627660524.0,8,229,Modern humans today are always on their phone....,Lead,Lead 1,Adequate,,,,Georgia Virtual,Independent,,Phones and driving,Today the majority of humans own and operate c...,M,,,Black/African American,,,378,1,,
2,5408891152126,423A1CA112E2,train,Phones\n\nModern humans today are always on th...,3,1622627653021.0,230,312,They are some really bad consequences when stu...,Position,Position 1,Adequate,,,,Georgia Virtual,Independent,,Phones and driving,Today the majority of humans own and operate c...,M,,,Black/African American,,,378,1,,
3,5408891152126,423A1CA112E2,train,Phones\n\nModern humans today are always on th...,3,1622627671020.0,313,400,Some certain areas in the United States ban ph...,Evidence,Evidence 1,Adequate,1622627653021.0,They are some really bad consequences when stu...,Position,Georgia Virtual,Independent,,Phones and driving,Today the majority of humans own and operate c...,M,,,Black/African American,,,378,1,,
4,5408891152126,423A1CA112E2,train,Phones\n\nModern humans today are always on th...,3,1622627696365.0,401,756,"When people have phones, they know about certa...",Evidence,Evidence 2,Adequate,1622627653021.0,They are some really bad consequences when stu...,Position,Georgia Virtual,Independent,,Phones and driving,Today the majority of humans own and operate c...,M,,,Black/African American,,,378,1,,


In [11]:
# Effectiveness class names in label-id order, plus the reverse name -> id lookup.
LABEL2EFFEC = ('Adequate', 'Effective', 'Ineffective')
EFFEC2LABEL = dict(zip(LABEL2EFFEC, range(len(LABEL2EFFEC))))

In [12]:
# Keep only rows from the 'train' competition set whose effectiveness value
# is one of the known classes (rows with a missing label are dropped).
df = df.loc[
    df.competition_set.eq('train')
    & df.discourse_effectiveness.isin(LABEL2EFFEC)
]
df.shape

(144289, 30)

# CV

In [13]:
# from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [14]:
# train_df = df[df.data=="train"].reset_index(drop=True)
# valid_df = df[df.data=='valid'].reset_index(drop=True)

In [15]:
# seeds = [42]
# folds_names = []
# for K in [5,10]:  
#     for seed in seeds:
#         mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
#         name = f"fold_sk_{K}_seed_{seed}"
#         train_df[name] = -1
#         for fold, (trn_, val_) in enumerate(mskf.split(train_df,train_df['year'])):
#             train_df.loc[val_, name] = fold+1
#         valid_df[name] = 0

In [16]:
# Put every row in fold 0. `df` comes from boolean filtering, so item
# assignment on it raises SettingWithCopyWarning (currently hidden by the
# blanket warnings filter); `assign` builds a fresh frame with the same
# column instead.
df = df.assign(fold=0)

In [17]:
df["fold"].value_counts()

fold
0    144289
Name: count, dtype: int64

In [18]:
FOLD_NAME = "fold"

In [19]:
# Experiment configuration, passed as a class object to `kfold` (train_utils).
# Only the values are defined here; how each entry is interpreted is decided
# inside train_utils, which is not visible from this notebook.
# The class body runs top to bottom once, when this cell executes.
class args:
    seed = 2022
    
    # Model
    model_name = 'microsoft/deberta-v3-large' # other candidates: microsoft/deberta-xlarge, 'microsoft/deberta-v3-base'
    project_name = "Shovel"
    # CV
    # Name of the fold column (FOLD_NAME is "fold", the column set on `df`).
    kfold_name = FOLD_NAME
    selected_folds = [0]
    not_include_folds = []
    # Paths
    # Second '.'-separated piece of model_name when it contains a '.',
    # otherwise model_name unchanged (the case for the current value).
    name = model_name.split('.')[1] if '.' in model_name else model_name
    # get_version() is called while the class body executes, so each run of
    # this cell consumes the next version number.
    exp_name = f"{TODAY}--v{get_version(start=0)}-test2"  
    # CHECKPOINT_PATH/<kfold_name>/<name>/<exp_name>; the '/' inside `name`
    # adds an extra directory level (see the printed path).
    checkpoints_path = str(CHECKPOINT_PATH/Path(fr'{kfold_name}/{name}/{exp_name}'))  
    
    dataset = "FeedbackDataset"
    # Per-split dataset parameters; masking is set to 0.0 for both splits.
    data = {"params_train":{'mask_prob':0.0,"mask_ratio":0.0},
            "params_valid":{'mask_prob':0.0,"mask_ratio":0.0},
           
           }
    model = {
            "model_params":{"model_name":model_name,
                            # NOTE(review): 7 discourse types is not derivable from this notebook — confirm.
                            "num_label_discourse_type":7,
                            # Matches the three entries of LABEL2EFFEC.
                            "num_label_effectiveness":3,
                            "use_dropout":True,
                            "use_gradient_checkpointing":True,
                            "config_path":None ,
                           },
        
            "pretrained_weights":None, 
             
            }
    
    optimizer = {
            "name":"optim.AdamW",
            'params':{"lr":2e-5,
                     "weight_decay": 0.01,
                     },            
            }

    scheduler = {
            "name":"poly",
            'params':{
                      "lr_end":1e-7,# default 1e-7
                      "power":3
                     },
            "warmup":0.04,            
            }
    
    train_loader = {
            "batch_size":1,
            'drop_last':True,
            "num_workers":2,
            "pin_memory":False, 
            "shuffle":True,
            }
    
    
    val_loader = {
            "batch_size":1,
            'drop_last':False,
            "num_workers":2,
            "pin_memory":False,
            "shuffle":False
            }
    
    trainer = {"use_amp":True,
                'epochs':5,
                "sample":False,
                "train_all_data":True,
                "use_awp":False,
                "start_epoch_awp":1,
                "adv_lr":0.0003,
                "adv_eps":0.001,
                "grad_clip":False,
                "max_norm":10
              }
    

    # NOTE(review): "epoch_pct_eval" evaluates to 1.0 (1/1). Confirm that the
    # spelling "uniforme" and the tracked metric name "valid_rmse" are what
    # train_utils expects.
    callbacks = {'save':True,"es":False,"patience":10,
                 'verbose_eval':1,"epoch_pct_eval":1/1,"epoch_eval_dist":"uniforme",# spelled "uniforme" on purpose? see note above
                 "metric_track":"valid_rmse","mode":"min",'top_k':1,"start_eval_epoch":0,
                 "save_last_k":0
                }
    
    device = 0
    
# Create the checkpoint directory (including parents) if it does not exist yet,
# then print its location.
Path(args.checkpoints_path).mkdir(parents=True,exist_ok=True)
print(args.checkpoints_path)

/database/kaggle/Commontlit/checkpoint/fold/microsoft/deberta-v3-large/2023-08-22--v0-test2


In [None]:
# Start the run: `kfold` (imported from train_utils) receives the config class
# and the filtered DataFrame.
kfold(args,df)

----------- fold ---------

-------------   Fold 1 / 1  -------------



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mamedprof[0m. Use [1m`wandb login --relogin`[0m to force relogin


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded 15594 samples.
Loaded 15594 samples.


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


    -> 434025485 trainable parameters



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using Amp


  0%|          | 0/15594 [00:00<?, ?it/s]