In [1]:
!nvidia-smi

Wed Oct  4 20:00:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4000    Off  | 00000000:00:05.0 Off |                  Off |
| 41%   40C    P8    24W / 140W |      1MiB / 16376MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Install Dependencies

In [2]:
!pip install transformers==4.21.3 torch==1.12.1+cu116 -f https://download.pytorch.org/whl/torch_stable.html -q

[0m

## Clone Repos containing Data

In [3]:
!git clone https://github.com/masakhane-io/masakhane-pos.git
!git clone https://github.com/masakhane-io/lacuna_pos_ner.git


fatal: destination path 'masakhane-pos' already exists and is not an empty directory.
fatal: destination path 'lacuna_pos_ner' already exists and is not an empty directory.


## Imports

In [4]:
from src.utils import *
from src.model import *
from src.dataset import *
from src.train import *
from src.infer import *
from src.postprocess import *

## Prepare Data

In [5]:
!mkdir -p models data

In [6]:
# NOTE: place Test.csv (downloaded from Zindi) in the data/ folder before running the next cell.

In [7]:
# Build the two working frames: `test` is the raw data/Test.csv passed through
# preprocess_test, `train` is whatever preprocess_data() returns.
import pandas as pd

test = preprocess_test(pd.read_csv('data/Test.csv'))
train = preprocess_data()

In [8]:
# Peek at the first two training rows (columns shown: text, target, lang).
train.head(2)

Unnamed: 0,text,target,lang
0,Hukumar wasan motsa jiki ta duniya WA ta cire...,"[NOUN, NOUN, VERB, NOUN, SCONJ, NOUN, PROPN, A...",hau
1,Hukumar ta duniya ta dauki wannan matakin ne ...,"[NOUN, SCONJ, NOUN, AUX, VERB, PRON, NOUN, PAR...",hau


In [9]:
# Peek at the first two preprocessed test rows (only a `text` column is shown).
test.head(2)

Unnamed: 0,text
0,Ne otim penj e kind Februar tarik 9 gi Februar...
1,Sifuna ne ojiwo jonyuol kod joma moko marito n...


In [10]:
# Load the unlabelled monolingual frames (`luo`, `tsn`). They carry no `target`
# column yet; later cells fill it in with model predictions (pseudo-tagging).
luo = create_luo_data()
tsn = create_tsn_data()

In [11]:
# Peek at the first two monolingual rows (only a `text` column is shown).
tsn.head(2)

Unnamed: 0,text
0,"Le fa ba lebagane le dikgwetlho di le dintsi, ..."
1,Ramolotja ke mongwe wa ba ba fetang dimilione ...


## Create Folds

In [12]:
# Languages kept for training. Every language appears exactly once, so
# len(selected_languages) equals the number of languages actually kept (11).
selected_languages = ["zul", "xho", "sna", "lug", "kin", "swa", "nya", "bbj", "wol", "ibo", "pcm"]  # disabled: "eng-ron-wol"
# Drop every row whose language is not selected and renumber the index from 0.
train = train[train.lang.isin(selected_languages)].reset_index(drop=True)

In [13]:
# Number of distinct languages left after the filter above.
train.lang.nunique()

11

In [14]:
# Language -> fold id. Each inner tuple lists the languages that share a fold;
# the fold id is the tuple's position in the outer list.
folds_mapper = {
    lang: fold_id
    for fold_id, fold_langs in enumerate([
        ("ibo", "nya"),
        ("kin", "wol"),
        ("lug", "swa", "zul"),
        ("pcm", "bbj"),
        ("xho", "sna"),
    ])
    for lang in fold_langs
}

In [15]:
# Attach each row's fold id by looking its language up in folds_mapper
# (a language missing from the mapper would get NaN here).
train["fold"] = train.lang.map(folds_mapper)

In [16]:
# The 17 POS tags; a tag's position in this list is its integer id.
labels = [
    'ADP', 'AUX', 'VERB', 'PART', 'PRON', 'NOUN', 'PROPN', 'ADJ', 'PUNCT',
    'X', 'NUM', 'ADV', 'DET', 'CCONJ', 'INTJ', 'SCONJ', 'SYM',
]
# Lookup tables in both directions: id -> tag and tag -> id.
ids_to_labels = dict(enumerate(labels))
labels_to_ids = {tag: idx for idx, tag in ids_to_labels.items()}

In [17]:
# Display the filtered training frame, now including the `fold` column.
train

Unnamed: 0,text,target,lang,fold
0,Ndị uweojii na steeti Edo ejidela ma kpọchie ...,"[NOUN, NOUN, ADP, NOUN, PROPN, VERB, CCONJ, VE...",ibo,0
1,Ihe ọdachi a bụ nke dapụtàrà n' obodo Ama dị ...,"[NOUN, NOUN, DET, VERB, NOUN, VERB, ADP, NOUN,...",ibo,0
2,Dịka ozi a kụpụrụ site n' aka onye na- ahụ ma...,"[ADP, NOUN, PRON, VERB, ADP, ADP, NOUN, NOUN, ...",ibo,0
3,Ọ gara n' ihu kọwaa na ndị nta ahụ sòrò ndị b...,"[PRON, VERB, ADP, NOUN, VERB, SCONJ, NOUN, NOU...",ibo,0
4,"O kwuru na oge e ruzịrị n' obodo ahụ , na ndị...","[PRON, VERB, SCONJ, NOUN, PRON, VERB, ADP, NOU...",ibo,0
...,...,...,...,...
16478,Dembare iri kutarisirawo zvekare kuti Ngezi P...,"[PROPN, AUX, VERB, ADJ, DET, PROPN, PROPN, PRO...",sna,4
16479,Asi chikwata cha Tonderayi Ndiraya ichi chino...,"[SCONJ, NOUN, ADP, PROPN, PROPN, DET, VERB, AD...",sna,4
16480,Asi mutambo mukuru svondo rino uchange uri ku...,"[SCONJ, NOUN, ADJ, NOUN, DET, AUX, AUX, ADP, P...",sna,4
16481,Pa mwedzi miviri yapfuura Bosso yakunda mu mu...,"[ADP, NOUN, NUM, ADJ, PROPN, VERB, ADP, NOUN, ...",sna,4


In [18]:
# Sanity check: list which languages ended up in each fold.
train.groupby('fold').lang.unique()

fold
0         [ibo, nya]
1         [kin, wol]
2    [lug, swa, zul]
3         [pcm, bbj]
4         [xho, sna]
Name: lang, dtype: object

## CFG

In [19]:
# --- Model ---
PRETRAINED_MODEL = "Davlan/afro-xlmr-large-61L"
MAX_SEQ_LENGTH = 200

# --- Optimisation ---
NUM_EPOCHS = 4
BATCH_SIZE = 8
GRAD_ACCUM_STEPS = 2
LR = 1e-5
# Learning-rate list handed to train_folds as `LRs`; the recorded training logs
# print one entry per epoch, in list order.
LRs = [1e-5, 7e-6, 3e-6, 1e-6, 2.5e-7]

# --- Runtime settings (not referenced by any call visible in this notebook) ---
NUM_CORES = 4
USE_FP16 = True
USE_AMP = True

## Round 1: Train Baseline Model

In [18]:
!mkdir -p models/exp5
# Baseline run: train on the labelled frame only (no luo/tsn frames are passed).
# "exp5" is the experiment name handed to train_folds; the matching directory
# models/exp5 is created by the shell line above.
train_folds(
    train, "exp5",labels_to_ids, PRETRAINED_MODEL, LRs=LRs, BATCH_SIZE=BATCH_SIZE, LR=LR, NUM_EPOCHS=NUM_EPOCHS, GRAD_ACCUM_STEPS=GRAD_ACCUM_STEPS, label_smooth=0.1,
    grad_norm=0
)

-------------------------------------------------------------
Fold 0
-------------------------------------------------------------
FULL Dataset: (16483, 4)
TRAIN Dataset: (13423, 4)
TEST Dataset: (3060, 4)


Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training epoch: 1
### LR = 1e-05



100%|██████████| 1678/1678 [05:45<00:00,  4.85it/s, train_loss=1.2] 

Training loss epoch: 1.2027424631087515
### Evaluating epoch: 1



100%|██████████| 765/765 [00:29<00:00, 26.21it/s]


Validation ACC:  0.7845581312751827
### Training epoch: 2
### LR = 7e-06



100%|██████████| 1678/1678 [05:46<00:00,  4.84it/s, train_loss=0.922]

Training loss epoch: 0.9215961636107357
### Evaluating epoch: 2



100%|██████████| 765/765 [00:29<00:00, 26.06it/s]


Validation ACC:  0.7896637467388844
### Training epoch: 3
### LR = 3e-06



100%|██████████| 1678/1678 [05:46<00:00,  4.85it/s, train_loss=0.878]

Training loss epoch: 0.877718549590287
### Evaluating epoch: 3



100%|██████████| 765/765 [00:29<00:00, 26.13it/s]


Validation ACC:  0.7906613268628924
### Training epoch: 4
### LR = 1e-06



100%|██████████| 1678/1678 [05:46<00:00,  4.85it/s, train_loss=0.861]

Training loss epoch: 0.8608500977025697
### Evaluating epoch: 4



100%|██████████| 765/765 [00:29<00:00, 26.22it/s]


Validation ACC:  0.7914010734420178
-------------------------------------------------------------
Fold 1
-------------------------------------------------------------
FULL Dataset: (16483, 4)
TRAIN Dataset: (13408, 4)
TEST Dataset: (3075, 4)


Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training epoch: 1
### LR = 1e-05



100%|██████████| 1676/1676 [05:45<00:00,  4.85it/s, train_loss=1.22]

Training loss epoch: 1.2150700338771634
### Evaluating epoch: 1



100%|██████████| 769/769 [00:29<00:00, 26.06it/s]


Validation ACC:  0.7759401056210022
### Training epoch: 2
### LR = 7e-06



100%|██████████| 1676/1676 [05:46<00:00,  4.84it/s, train_loss=0.958]

Training loss epoch: 0.9575566037031233
### Evaluating epoch: 2



100%|██████████| 769/769 [00:29<00:00, 26.07it/s]


Validation ACC:  0.7624295094299401
### Training epoch: 3
### LR = 3e-06



100%|██████████| 1676/1676 [05:46<00:00,  4.84it/s, train_loss=0.913]

Training loss epoch: 0.9127573213031013
### Evaluating epoch: 3



100%|██████████| 769/769 [00:29<00:00, 26.15it/s]


Validation ACC:  0.7673558249804945
### Training epoch: 4
### LR = 1e-06



100%|██████████| 1676/1676 [05:46<00:00,  4.84it/s, train_loss=0.898]

Training loss epoch: 0.897671022427964
### Evaluating epoch: 4



100%|██████████| 769/769 [00:29<00:00, 26.16it/s]


Validation ACC:  0.7663374355736212
-------------------------------------------------------------
Fold 2
-------------------------------------------------------------
FULL Dataset: (16483, 4)
TRAIN Dataset: (12130, 4)
TEST Dataset: (4353, 4)


Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training epoch: 1
### LR = 1e-05



100%|██████████| 1517/1517 [05:12<00:00,  4.85it/s, train_loss=1.2] 

Training loss epoch: 1.202941944273821
### Evaluating epoch: 1



100%|██████████| 1089/1089 [00:41<00:00, 26.08it/s]


Validation ACC:  0.7593399394422695
### Training epoch: 2
### LR = 7e-06



100%|██████████| 1517/1517 [05:13<00:00,  4.84it/s, train_loss=0.938]

Training loss epoch: 0.9379383739312385
### Evaluating epoch: 2



100%|██████████| 1089/1089 [00:41<00:00, 26.21it/s]


Validation ACC:  0.7562860690945726
### Training epoch: 3
### LR = 3e-06



100%|██████████| 1517/1517 [05:13<00:00,  4.84it/s, train_loss=0.893]

Training loss epoch: 0.8928392900002969
### Evaluating epoch: 3



100%|██████████| 1089/1089 [00:41<00:00, 26.02it/s]


Validation ACC:  0.7516767326610561
### Training epoch: 4
### LR = 1e-06



100%|██████████| 1517/1517 [05:13<00:00,  4.84it/s, train_loss=0.879]

Training loss epoch: 0.8790543639274828
### Evaluating epoch: 4



100%|██████████| 1089/1089 [00:41<00:00, 26.04it/s]


Validation ACC:  0.7542504858105699
-------------------------------------------------------------
Fold 3
-------------------------------------------------------------
FULL Dataset: (16483, 4)
TRAIN Dataset: (13483, 4)
TEST Dataset: (3000, 4)


Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training epoch: 1
### LR = 1e-05



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=1.13]

Training loss epoch: 1.1340323978966524
### Evaluating epoch: 1



100%|██████████| 750/750 [00:28<00:00, 25.99it/s]


Validation ACC:  0.6186567749933481
### Training epoch: 2
### LR = 7e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.894]

Training loss epoch: 0.8935935080829182
### Evaluating epoch: 2



100%|██████████| 750/750 [00:28<00:00, 26.04it/s]


Validation ACC:  0.62037869740936
### Training epoch: 3
### LR = 3e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.856]

Training loss epoch: 0.8564739056541402
### Evaluating epoch: 3



100%|██████████| 750/750 [00:28<00:00, 26.07it/s]


Validation ACC:  0.6225008331012589
### Training epoch: 4
### LR = 1e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.845]

Training loss epoch: 0.8453518718940649
### Evaluating epoch: 4



100%|██████████| 750/750 [00:29<00:00, 25.85it/s]


Validation ACC:  0.6259063653563263
-------------------------------------------------------------
Fold 4
-------------------------------------------------------------
FULL Dataset: (16483, 4)
TRAIN Dataset: (13488, 4)
TEST Dataset: (2995, 4)


Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training epoch: 1
### LR = 1e-05



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=1.17]

Training loss epoch: 1.171622664690866
### Evaluating epoch: 1



100%|██████████| 749/749 [00:28<00:00, 26.05it/s]


Validation ACC:  0.7866081877683705
### Training epoch: 2
### LR = 7e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.915]

Training loss epoch: 0.9150980274377486
### Evaluating epoch: 2



100%|██████████| 749/749 [00:29<00:00, 25.79it/s]


Validation ACC:  0.7899412427406663
### Training epoch: 3
### LR = 3e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.872]

Training loss epoch: 0.8723491904715768
### Evaluating epoch: 3



100%|██████████| 749/749 [00:28<00:00, 26.13it/s]


Validation ACC:  0.7903840736718573
### Training epoch: 4
### LR = 1e-06



100%|██████████| 1686/1686 [05:48<00:00,  4.84it/s, train_loss=0.856]

Training loss epoch: 0.8563931371888515
### Evaluating epoch: 4



100%|██████████| 749/749 [00:28<00:00, 26.08it/s]


Validation ACC:  0.790448740059496


### Create pseudo-labels

In [20]:
import torch
import numpy as np

def create_pseudos(lang, df, exp, n_folds=5, max_len=200):
    """Tag every row of ``df`` with the fold-averaged predictions of ``exp``.

    The work happens in two stages:

    1. ``inference`` is called once per fold on a loader with batch size 4.
       Its return value is not used here; the per-fold arrays are read back
       from ``{lang}/TEST_fold{k}.npy`` afterwards, so ``inference`` is
       presumably what writes those files (NOTE(review): confirm in
       src/infer.py).
    2. The per-fold arrays are averaged and decoded by ``folds_inference`` on
       a loader with batch size 1.

    Args:
        lang: Directory that holds the ``TEST_fold{k}.npy`` files; also
            forwarded to ``inference`` as its last argument.
        df: Frame to tag. It is modified in place: a ``target`` column is
            added or overwritten.
        exp: Experiment name forwarded to ``inference``.
        n_folds: Number of fold models to run and average (default 5).
        max_len: Length argument handed to ``CustomDataset`` (default 200,
            the same value as the ``MAX_SEQ_LENGTH`` config constant).

    Returns:
        The same ``df`` object, now carrying the ``target`` column.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.

    Relies on the notebook globals ``PRETRAINED_MODEL``, ``labels_to_ids`` and
    ``ids_to_labels``.
    """
    from itertools import chain

    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

    def _make_loader(batch_size):
        # Build a fresh dataset + loader; shuffle is off so row order is kept.
        dataset = CustomDataset(df, tokenizer, labels_to_ids, max_len, True)
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
        )

    # Stage 1: run every fold model over the data.
    batched_loader = _make_loader(4)
    for fold in range(n_folds):
        inference(batched_loader, fold, exp, ids_to_labels, PRETRAINED_MODEL, lang)

    # Stage 2: average the per-fold arrays. Summing left to right and dividing
    # once keeps the arithmetic identical to (fold0 + ... + foldK) / n_folds.
    fold_outputs = [np.load(f"{lang}/TEST_fold{fold}.npy") for fold in range(n_folds)]
    ensemble = sum(fold_outputs[1:], fold_outputs[0]) / n_folds

    final_preds = folds_inference(_make_loader(1), ensemble, ids_to_labels)

    # The folds_inference result is flattened by one level before being stored.
    df["target"] = list(chain.from_iterable(final_preds))
    return df

In [17]:
!mkdir -p luo tsn
# Pseudo-label both monolingual frames with the exp5 fold models and save each
# tagged frame inside the directory that also holds its per-fold .npy files.
luo = create_pseudos('luo', luo, 'exp5')
luo.to_csv('luo/luo.csv', index=False)
tsn = create_pseudos('tsn', tsn, 'exp5')
tsn.to_csv('tsn/tsn.csv', index=False)

Downloading config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

## Round 1: Train with Pseudo-labels

In [21]:
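# Optional resume point: uncomment to reload the pseudo-labelled frames (and the
# training frame) from CSV instead of recomputing them in this session.
# NOTE(review): the last two lines read 'data/train.csv', while the notebook
# later saves the training frame to 'train.csv' — confirm which path is intended.
#luo = pd.read_csv('luo/luo.csv').dropna().reset_index(drop=True)
#luo["target"] = luo["target"].apply(convert_to_list)
#
#tsn = pd.read_csv('tsn/tsn.csv').dropna().reset_index(drop=True)
#tsn["target"] = tsn["target"].apply(convert_to_list)
#train = pd.read_csv('data/train.csv')
#train["target"] = train["target"].apply(convert_to_list)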

In [None]:
!mkdir -p models/exp6
# Fold -1 matches none of the fold ids 0-4 used for the labelled data
# (presumably so pseudo-labelled rows are never held out — confirm in train_folds).
luo['fold'] = -1
tsn['fold'] = -1
# Retrain as "exp6", this time also passing the pseudo-labelled frames.
# luo_n / tsn_n are forwarded as-is (presumably how many rows of each to use — TODO confirm).
train_folds(
    train, "exp6",labels_to_ids, PRETRAINED_MODEL, LRs=LRs, BATCH_SIZE=BATCH_SIZE, LR=LR, NUM_EPOCHS=NUM_EPOCHS, GRAD_ACCUM_STEPS=GRAD_ACCUM_STEPS,
    label_smooth=0.09,grad_norm=0, luo=luo, tsn=tsn, luo_n=3000, tsn_n=3000
)
# Re-tag the monolingual frames with the new exp6 models and overwrite the CSVs.
luo = create_pseudos('luo', luo, 'exp6')
luo.to_csv('luo/luo.csv', index=False)
tsn = create_pseudos('tsn', tsn, 'exp6')
tsn.to_csv('tsn/tsn.csv', index=False)

## Round 2

In [19]:
from itertools import chain
def infer_test(lang, df, exp):
    """Tag ``df`` with the fold-averaged predictions of experiment ``exp``.

    The steps are exactly those of ``create_pseudos`` (defined in the
    "Create pseudo-labels" section): run ``inference`` for each of the five
    folds, average the ``{lang}/TEST_fold{k}.npy`` arrays, decode them with
    ``folds_inference`` and store the result in ``df["target"]``. To avoid
    keeping two copies of the same code in sync, this function delegates.

    Args:
        lang: Directory holding the per-fold ``TEST_fold{k}.npy`` files.
        df: Frame to tag; modified in place (``target`` column set).
        exp: Experiment name forwarded to ``inference``.

    Returns:
        The same ``df`` object, now carrying the ``target`` column.
    """
    return create_pseudos(lang, df, exp)

In [None]:
!mkdir -p models/exp7
# Fold -1 matches none of the fold ids 0-4 used for the labelled data
# (presumably so pseudo-labelled rows are never held out — confirm in train_folds).
luo['fold'] = -1
tsn['fold'] = -1
# Retrain as "exp7" with the current pseudo-labelled frames; compared with exp6
# this uses label_smooth=0.08 and larger luo_n / tsn_n values.
train_folds(
    train, "exp7",labels_to_ids, PRETRAINED_MODEL, LRs=LRs, BATCH_SIZE=BATCH_SIZE, LR=LR, NUM_EPOCHS=NUM_EPOCHS, GRAD_ACCUM_STEPS=GRAD_ACCUM_STEPS,
    label_smooth=0.08,grad_norm=0, luo=luo, tsn=tsn, luo_n=3600, tsn_n=3100
)

In [None]:
# Re-tag the monolingual frames with the exp7 models. Rows containing NaN are
# dropped (and the index renumbered) both before and after tagging, then the
# CSVs are overwritten.
luo = create_pseudos('luo', luo.dropna().reset_index(drop=True), 'exp7').dropna().reset_index(drop=True)
luo.to_csv('luo/luo.csv', index=False)
tsn = create_pseudos('tsn', tsn.dropna().reset_index(drop=True), 'exp7').dropna().reset_index(drop=True)
tsn.to_csv('tsn/tsn.csv', index=False)

## Round 3

In [None]:
!mkdir -p models/exp8
# Fold -1 matches none of the fold ids 0-4 used for the labelled data
# (presumably so pseudo-labelled rows are never held out — confirm in train_folds).
luo['fold'] = -1
tsn['fold'] = -1
# Retrain as "exp8" with the current pseudo-labelled frames and larger
# luo_n / tsn_n values than the previous round.
train_folds(
    train, "exp8",labels_to_ids, PRETRAINED_MODEL, LRs=LRs, BATCH_SIZE=BATCH_SIZE, LR=LR, NUM_EPOCHS=NUM_EPOCHS, GRAD_ACCUM_STEPS=GRAD_ACCUM_STEPS,
    label_smooth=0.08,grad_norm=0, luo=luo, tsn=tsn, luo_n=4000, tsn_n=3500
)
# Re-tag the monolingual frames with the exp8 models and overwrite the CSVs.
luo = create_pseudos('luo', luo, 'exp8')
luo.to_csv('luo/luo.csv', index=False)
tsn = create_pseudos('tsn', tsn, 'exp8')
tsn.to_csv('tsn/tsn.csv', index=False)

## Round 4

In [None]:
!mkdir -p models/exp9
# Fold -1 matches none of the fold ids 0-4 used for the labelled data
# (presumably so pseudo-labelled rows are never held out — confirm in train_folds).
luo['fold'] = -1
tsn['fold'] = -1

# Final round, "exp9": label_smooth=0.07 and the largest luo_n / tsn_n values.
# These are the models the submission cells below read from models/exp9.
train_folds(
    train, "exp9",labels_to_ids, PRETRAINED_MODEL, LRs=LRs, BATCH_SIZE=BATCH_SIZE, LR=LR, NUM_EPOCHS=NUM_EPOCHS, GRAD_ACCUM_STEPS=GRAD_ACCUM_STEPS,
    label_smooth=0.07,grad_norm=0, luo=luo, tsn=tsn, luo_n=4200, tsn_n=3700
)
# Re-tag the monolingual frames with the exp9 models and overwrite the CSVs.
luo = create_pseudos('luo', luo, 'exp9')
luo.to_csv('luo/luo.csv', index=False)
tsn = create_pseudos('tsn', tsn, 'exp9')
tsn.to_csv('tsn/tsn.csv', index=False)

In [None]:
# Persist the training frame. DataFrame.to_csv returns None when it is given a
# path, so its result must not be assigned back to `train` — doing so would
# replace the frame with None for every later cell.
train.to_csv('train.csv', index=False)

## Postprocess

### Nelder-Mead weight search (optional): this takes a considerable amount of time, so you can skip it and use the hard-coded weights defined below

In [None]:
#fold0_w = find_best_thresh(0, train, PRETRAINED_MODEL, labels_to_ids, 'exp6')
#fold1_w = find_best_thresh(1, train, PRETRAINED_MODEL, labels_to_ids, 'exp6')
#fold2_w = find_best_thresh(2, train, PRETRAINED_MODEL, labels_to_ids, 'exp6')
#fold3_w = find_best_thresh(3, train, PRETRAINED_MODEL, labels_to_ids, 'exp6')
#fold4_w = find_best_thresh(4, train, PRETRAINED_MODEL, labels_to_ids, 'exp6')

In [20]:
# One weight vector per fold; each is multiplied element-wise with that fold's
# TEST_fold{k}.npy array before the folds are averaged. Every vector has 17
# entries, matching the 17 POS labels (presumably one weight per label).
# NOTE(review): folds 0, 1 and 3 carry exactly the same values — confirm this is
# intended and not a copy-paste placeholder.
fold0_w = np.array([
    0.9767511, 0.3981664, 1.61331843, 0.2595545, 1.81338756, 1.28729398,
    0.76055386, 0.09408014, 1.06771219, 1.47217587, 1.25157168, 1.03431401,
    0.33491653, 0.52220695, 1.25210021, 1.6623188, 1.14310743,
])
# Independent copies, so editing one fold's weights never touches another's.
fold1_w = fold0_w.copy()
fold3_w = fold0_w.copy()

fold2_w = np.array([
    0.87911696, 1.52908478, 0.81095612, 0.9107473, 0.00344712, 1.16542127,
    0.89979852, 0.96534758, 1.0812359, 1.22360948, 1.01659991, 1.27223844,
    1.12947197, 0.81287226, 1.21721596, 1.00092072, 1.16421219,
])

fold4_w = np.array([
    1.18264067, 1.49253454, 0.6136005, 1.0873054, 0.64117687, 1.07551867,
    1.32218085, 0.21402064, 1.14864329, 1.01481558, 1.08367489, 0.24427615,
    1.31034121, 1.07435968, 0.76660614, 1.56849931, 1.01845596,
])

In [21]:
# Tag the test frame with the unweighted exp9 fold ensemble. 'models/exp9' is
# passed as the directory for the per-fold TEST_fold{k}.npy files, which the
# weighted-ensemble cell further down loads again.
sub = infer_test('models/exp9', test, 'exp9')
# `target` holds one sequence per row of `test`; flatten them into one flat list.
l = list(chain.from_iterable(sub.target.values))
TEST = pd.read_csv("data/Test.csv")
# A bare `len(l)` in the middle of a cell is never displayed, so the length
# check is made explicit: the flat list must line up with the rows of Test.csv.
assert len(l) == len(TEST), (
    f"expected {len(TEST)} tags (one per row of data/Test.csv), got {len(l)}"
)
TEST["Pos"] = l
TEST[["Id", "Pos"]].to_csv('lacuna_pseudo.csv', index=False)

Some weights of the model checkpoint at Davlan/afro-xlmr-large-61L were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-61L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

In [23]:
from itertools import chain

# Load each fold's saved exp9 array and scale it by that fold's weight vector.
fold0, fold1, fold2, fold3, fold4 = (
    np.load(f"models/exp9/TEST_fold{fold_id}.npy") * fold_weight
    for fold_id, fold_weight in enumerate((fold0_w, fold1_w, fold2_w, fold3_w, fold4_w))
)
# Plain average of the five weighted arrays.
ensemble = (fold0 + fold1 + fold2 + fold3 + fold4) / 5

# Loader with batch size 1 and shuffle off, so rows are visited in frame order.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
test_params = dict(
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)
testing_set = CustomDataset(test, tokenizer, labels_to_ids, 200, True)
test_texts_loader = DataLoader(testing_set, **test_params)

# Decode the ensemble, then flatten the result by two levels of nesting.
p = folds_inference(test_texts_loader, ensemble, ids_to_labels)
l = list(chain.from_iterable(chain.from_iterable(p)))

# Attach the tags to the raw test rows and write the submission file.
TEST = pd.read_csv("data/Test.csv")
TEST["Pos"] = l
TEST[["Id", "Pos"]].to_csv('lacuna_opt.csv', index=False)

100%|██████████| 1208/1208 [00:02<00:00, 586.81it/s]


In [None]:
# Run the project's postprocess step on the raw test file together with the
# weighted-ensemble submission written by the previous cell.
sub = postprocess(pd.read_csv('data/Test.csv'), pd.read_csv('lacuna_opt.csv'))

In [25]:
# Write the post-processed submission (no index column).
sub.to_csv('lacuna_opt_post.csv', index=False)