In [11]:
# Mount Google Drive into the Colab filesystem. The checkpoint archive that is
# unzipped later in this notebook is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
!pip install -q transformers
!pip install -q sentencepiece

In [13]:
import os
import gc
import re
import math
import glob
import random
import warnings
import collections
import numpy as np
import pandas as pd


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    get_cosine_schedule_with_warmup,
)

from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

In [14]:
class cfg:
    """Static run configuration shared by the rest of the notebook."""

    seed = 2023       # value handed to seed_everything()
    batch_size = 64   # DataLoader batch size used during inference
    # `is_available` has to be *called*: the bare function object is always
    # truthy, which would select 'cuda' even on a CPU-only machine.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Sets PYTHONHASHSEED in the process environment, seeds Python's `random`,
    NumPy's legacy global generator and PyTorch (CPU and current CUDA device),
    and asks cuDNN to use deterministic algorithms.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the seed as their only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True

# Seed all random number generators once, up front, with the configured seed.
seed_everything(seed=cfg.seed)

In [15]:
# Load the word-per-row test set and the submission template, then preview both.
TEST_CSV = "/content/Test.csv"
SUBMISSION_TEMPLATE_CSV = "/content/SampleSubmission.csv"

test_df = pd.read_csv(TEST_CSV)
sub_df = pd.read_csv(SUBMISSION_TEMPLATE_CSV)
display(test_df.head(), sub_df.head())

Unnamed: 0,Id,Word,Language,Pos
0,Id00qog2f11n_0,Ne,luo,
1,Id00qog2f11n_1,otim,luo,
2,Id00qog2f11n_2,penj,luo,
3,Id00qog2f11n_3,e,luo,
4,Id00qog2f11n_4,kind,luo,


Unnamed: 0,Id,Pos
0,Id00qog2f11n_0,
1,Id00qog2f11n_1,
2,Id00qog2f11n_2,
3,Id00qog2f11n_3,
4,Id00qog2f11n_4,


In [16]:
# Part-of-speech tag inventory; a tag's position in this list is its class index.
tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET',
    'INTJ', 'NOUN', 'NUM', 'PART',
    'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
]

# Tag name -> class index, plus a 'PAD' entry mapped to -100.
label_vocab = {'PAD': -100}
label_vocab.update({tag: idx for idx, tag in enumerate(tags)})

# Sanity check on the size of the tag inventory.
print(f"Total tags: {len(tags)}")
# Last expression of the cell: render the first rows of the test frame.
test_df.head()

Total tags: 17


Unnamed: 0,Id,Word,Language,Pos
0,Id00qog2f11n_0,Ne,luo,
1,Id00qog2f11n_1,otim,luo,
2,Id00qog2f11n_2,penj,luo,
3,Id00qog2f11n_3,e,luo,
4,Id00qog2f11n_4,kind,luo,


### Preprocessing

In [17]:
def get_samples(df):
    """Group a word-per-row frame into sentences.

    Rows are read in order and accumulated into the current sentence; a row
    whose word (ignoring surrounding whitespace) is exactly '.', '?' or '!'
    closes the sentence. Words that follow the last terminator are emitted as
    a final sentence instead of being dropped, so the total number of words
    returned always equals the number of rows in `df`.

    Args:
        df: DataFrame with a 'Word' column; the values are expected to be
            strings (``str.replace`` is called on each of them).

    Returns:
        list[list[str]]: one list of words per sentence, in row order.
    """
    sentence_enders = ('.', '?', '!')
    sentences = []
    current_sentence = []

    # Process each row in the CSV data
    for index, row in tqdm(df.iterrows(), total=len(df)):
        word = row['Word']

        # Strip the '\x8d' character. NOTE(review): the original comment called
        # this a soft hyphen, but the soft hyphen is '\xad'; the literal is left
        # untouched so preprocessing stays as it was - confirm against training.
        word = word.replace('\x8d', '')

        current_sentence.append(word)

        # The word itself (not merely its last character) must be a terminator.
        if word.strip() in sentence_enders:
            sentences.append(current_sentence)
            current_sentence = []

    # Flush a trailing sentence that never reached a terminator; without this
    # its words would get no prediction and the submission would come up short.
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

# Sentence-split the test words. Kept as a module-level name because infer()
# reads `t_sentences` directly rather than taking it as an argument.
t_sentences = get_samples(test_df)

  0%|          | 0/32045 [00:00<?, ?it/s]

In [18]:
def align_tokenizations(sentences, tokenizer):
    """Sub-word tokenize sentences and mark the first token of every word.

    Each word is tokenized on its own. The label mask carries a 1 for the first
    sub-word token of a word and 0 for its continuation tokens, so that exactly
    one prediction per word can be picked up later.

    A word for which the tokenizer returns no tokens (for example an empty or
    whitespace-only string) is represented by the tokenizer's unknown token.
    Previously such a word contributed a mask entry but no token, which shifted
    the mask against the tokens for the remainder of the sentence.

    Args:
        sentences: iterable of sentences, each a list of word strings.
        tokenizer: object exposing ``tokenize(str) -> list[str]`` and an
            ``unk_token`` attribute (as Hugging Face tokenizers do).

    Returns:
        tuple: ``(tokenized_sentences, label_mask)``, two parallel lists with one
        entry per sentence; the inner lists have identical lengths.
    """
    tokenized_sentences = []
    label_mask = [] # to pick up only the prediction of first token of the word
    for sentence in tqdm(sentences, total=len(sentences)):
        tok_sent = []
        lm_sent = []
        for word in sentence:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # Keep one slot for the word so tokens and mask stay aligned.
                word_tokens = [tokenizer.unk_token]
            token_mask = [1] + [0]*(len(word_tokens)-1)
            tok_sent.extend(word_tokens)
            lm_sent.extend(token_mask)

        tokenized_sentences.append(tok_sent)
        label_mask.append(lm_sent)

    return tokenized_sentences, label_mask

In [19]:
def convert_to_ids(sentences, lb_mask, tokenizer):
    """Turn tokenized sentences and their label masks into long tensors.

    Every sentence is wrapped in the literal '<s>' / '</s>' tokens before being
    mapped to ids; the matching mask gets a 0 at both ends so the two wrapper
    tokens are never selected for prediction.

    Args:
        sentences: list of token lists.
        lb_mask: list of 0/1 lists, parallel to `sentences`.
        tokenizer: object exposing ``convert_tokens_to_ids``.

    Returns:
        tuple: ``(sentences_ids, label_mask)``, two lists of 1-D long tensors.
    """
    sentences_ids, label_mask = [], []

    paired = zip(sentences, lb_mask)
    for _, (tokens, mask) in tqdm(enumerate(paired), total=len(sentences)):
        wrapped_tokens = ['<s>'] + tokens + ['</s>']
        token_ids = tokenizer.convert_tokens_to_ids(wrapped_tokens)

        sentences_ids.append(torch.tensor(token_ids).long())
        label_mask.append(torch.tensor([0] + mask + [0]).long())

    return sentences_ids, label_mask

In [20]:
class PosTaggingDataset(Dataset):
    """Index-aligned pairing of sentences with their label masks.

    Item ``i`` is the tuple ``(sentences[i], label_mask[i])``; the dataset
    length is the number of sentences.
    """

    def __init__(self, sentences, label_mask):
        self.sentences, self.label_mask = sentences, label_mask

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, i):
        sentence = self.sentences[i]
        mask = self.label_mask[i]
        return sentence, mask

def collate_fn(items):
    """Pad a list of (sentence_ids, label_mask) tensor pairs into one batch.

    Sentences are right-padded with id 1 up to the longest sentence in the
    batch. The attention mask is 1 over real tokens and 0 over padding. The
    label mask is copied from the input, except that positions holding token
    id 0, 1 or 2 are forced to 0, and padding positions are 0.

    Args:
        items: list of ``(sentence, lb_mask)`` pairs of 1-D long tensors.

    Returns:
        dict with long tensors "input_ids", "attention_mask" and "label_mask",
        each of shape ``(len(items), max_len)``.
    """
    n_items = len(items)
    max_len = max(len(sentence) for sentence, _ in items)

    input_ids = torch.full((n_items, max_len), 1, dtype=torch.long)
    attention_mask = torch.zeros(n_items, max_len, dtype=torch.long)
    label_mask = torch.zeros(n_items, max_len, dtype=torch.long)

    for row, (sentence, lb_mask) in enumerate(items):
        length = len(sentence)
        input_ids[row, :length] = sentence
        attention_mask[row, :length] = 1

        # Only the overlap of ids and mask is considered.
        overlap = min(length, len(lb_mask))
        ids = sentence[:overlap]
        is_special = (ids == 0) | (ids == 1) | (ids == 2)  # 0: start, 1: pad, 2: end
        label_mask[row, :overlap] = lb_mask[:overlap].masked_fill(is_special, 0)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "label_mask": label_mask,
    }


### Model

In [21]:
class POSModel(nn.Module):
    """Pretrained transformer encoder with a per-token linear tagging head.

    Args:
        model_name: name or path passed to ``AutoModel.from_pretrained``.
        model_config: config object passed through as ``config=``.
        num_labels: output size of the linear head (number of tag classes).
    """

    def __init__(self, model_name, model_config, num_labels):
        super().__init__()

        self.base_model = AutoModel.from_pretrained(model_name, config=model_config)
        self.linear = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-token logits, i.e. the linear head applied to the first backbone output.

        Args:
            input_ids: token id tensor handed to the backbone.
            attention_mask: mask tensor handed to the backbone.

        Returns:
            Tensor of logits whose last dimension is ``num_labels``.
        """
        outputs = self.base_model(input_ids, attention_mask=attention_mask, return_dict=False)
        # Only the first element (token-level embeddings) is used. Indexing
        # instead of unpacking a fixed two-tuple keeps this working for backbones
        # that return no pooled output or that return additional outputs.
        word_emb = outputs[0]
        return self.linear(word_emb)


In [22]:
def infer(model_name="", model_path=""):
    """Tag the global `t_sentences` with one fine-tuned checkpoint.

    Builds the tokenizer and a POSModel for `model_name`, loads the weights
    stored at `model_path`, and runs the test sentences through the model in
    batches of `cfg.batch_size` without shuffling.

    Args:
        model_name: name or path understood by AutoTokenizer / AutoConfig / AutoModel.
        model_path: path of the state dict to load into the model.

    Returns:
        list of 1-D numpy arrays, one per batch, holding the predicted class
        index at every position where the label mask is 1.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_config = AutoConfig.from_pretrained(model_name)

    # One output per entry of label_vocab other than 'PAD'.
    num_labels = len(label_vocab) - 1
    model = POSModel(model_name, model_config, num_labels)
    model.to(cfg.device)

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location=cfg.device)
    model.load_state_dict(state_dict)
    model.eval()

    tokenized_sentences, word_start_mask = align_tokenizations(t_sentences, tokenizer)
    sentences_ids, word_start_mask = convert_to_ids(tokenized_sentences, word_start_mask, tokenizer)
    dataset = PosTaggingDataset(sentences_ids, word_start_mask)
    loader = DataLoader(dataset, collate_fn=collate_fn, batch_size=cfg.batch_size, shuffle=False)

    preds = []
    for _, batch in tqdm(enumerate(loader), total=len(loader)):
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        with torch.no_grad():
            logits = model(input_ids, attention_mask)

        # Highest-scoring class per token, computed on the CPU.
        tag_ids = torch.max(logits.cpu(), 2).indices

        # Keep only positions flagged in the label mask; the result is 1-D.
        keep = batch['label_mask'] == 1
        preds.append(tag_ids[keep].numpy())

    return preds

In [23]:
# Extract the fine-tuned checkpoint from the mounted drive. Per the cell output,
# the archive unpacks to content/best_epoch_1.pth relative to the working directory.
!unzip '/content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/epoch_1st_run_pseudoLabeling.zip'


Archive:  /content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/epoch_1st_run_pseudoLabeling.zip
  inflating: content/best_epoch_1.pth  


In [24]:
# Collects one prediction array per checkpoint for the vote performed further down.
all_preds = []
# Only a single checkpoint is used here. NOTE(review): the commented-out loop
# below suggests this cell once iterated over several fold checkpoints.
# for path in sorted(glob.glob("/kaggle/input/lm-pos-train-0-9-folds/*.pth")):
path = "/content/content/best_epoch_1.pth"




In [25]:
# Run the checkpoint over the test set, flatten the per-batch outputs into one
# array and register it for the vote.
batch_preds = infer(model_name="Davlan/afro-xlmr-large-75L", model_path=str(path))
model_preds = np.concatenate(batch_preds)
all_preds.append(model_preds)

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.27G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large-75L and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1395 [00:00<?, ?it/s]

  0%|          | 0/1395 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [26]:
def pick_tags(x):
    """Return the most frequent value in `x`.

    Args:
        x: 1-D array of non-negative integers.

    Returns:
        The value with the highest count; on a tie the smallest such value.
    """
    return np.argmax(np.bincount(x))

In [27]:
# Majority vote across checkpoints: `all_preds` is converted to a 2-D array with
# one row per checkpoint, and pick_tags is applied to every column (axis=0).
predictions = np.apply_along_axis(pick_tags, axis=0, arr=all_preds)

In [28]:
# Invert the vocabulary (class index -> tag name), fill the submission template
# with the predicted tag names and write it to disk.
label_to_pos = dict(zip(label_vocab.values(), label_vocab.keys()))
sub_df['Pos'] = [label_to_pos[tag_id] for tag_id in predictions]
sub_df.to_csv("submission_best_epoch_1.csv", index=False)
sub_df.head(10)

Unnamed: 0,Id,Pos
0,Id00qog2f11n_0,AUX
1,Id00qog2f11n_1,VERB
2,Id00qog2f11n_2,NOUN
3,Id00qog2f11n_3,ADP
4,Id00qog2f11n_4,ADP
5,Id00qog2f11n_5,PROPN
6,Id00qog2f11n_6,NOUN
7,Id00qog2f11n_7,NUM
8,Id00qog2f11n_8,CCONJ
9,Id00qog2f11n_9,PROPN
