In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!pip install -q transformers
!pip install -q sentencepiece


In [7]:
import os
import gc
import re
import math
import random
import warnings
import numpy as np
import pandas as pd


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForTokenClassification,
    get_cosine_schedule_with_warmup,
)

from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

In [8]:
class cfg:
    seed = 2023
    epochs = 2
    batch_size = 4
    max_grad_norm = 1
    learning_rate = 1e-5
    weight_decay = 1e-4
    adam_epsilon = 1e-8
    ls = 0.01
    num_cycles = 0.5
    max_length = 180
    max_sentences = 20000 # max possible is around 1500, above that means consider all sentences
    device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
    val = False

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=cfg.seed)

In [9]:
test_data_labels = pd.read_csv('/content/submission.csv') # update pseudo labels every round
test_data_labels.head()

Unnamed: 0,Id,Pos
0,Id00qog2f11n_0,AUX
1,Id00qog2f11n_1,VERB
2,Id00qog2f11n_2,NOUN
3,Id00qog2f11n_3,ADP
4,Id00qog2f11n_4,ADP


In [10]:
test_df = pd.read_csv("/content/Test.csv")
test_df.head()

Unnamed: 0,Id,Word,Language,Pos
0,Id00qog2f11n_0,Ne,luo,
1,Id00qog2f11n_1,otim,luo,
2,Id00qog2f11n_2,penj,luo,
3,Id00qog2f11n_3,e,luo,
4,Id00qog2f11n_4,kind,luo,


In [11]:
test_df.shape
test_df_to_concat = test_df[['Id','Word','Language']].merge(test_data_labels,how='left',on='Id')
print(test_df_to_concat.shape)
test_df_to_concat = test_df_to_concat.rename(columns = {'Language':'lang','Word':'word','Pos':'tag'})
test_df_to_concat.head()

(32045, 4)


Unnamed: 0,Id,word,lang,tag
0,Id00qog2f11n_0,Ne,luo,AUX
1,Id00qog2f11n_1,otim,luo,VERB
2,Id00qog2f11n_2,penj,luo,NOUN
3,Id00qog2f11n_3,e,luo,ADP
4,Id00qog2f11n_4,kind,luo,ADP


In [12]:
# train.csv has train data of all languages under one file
train_df = pd.read_csv("/content/train.csv")
train_df = pd.read_csv("/content/train.csv")
print(train_df.shape)
train_df.head()

(654842, 3)


Unnamed: 0,word,tag,lang
0,Do,VERB,pcm
1,senator,NOUN,pcm
2,tok,VERB,pcm
3,dis,DET,pcm
4,one,NUM,pcm


In [13]:
train_df = pd.concat([train_df,test_df_to_concat])
print(train_df.shape)
test_df = pd.read_csv("/content/Test.csv")
train_df = train_df[train_df.word.notnull()]
train_df.head()

(686887, 4)


Unnamed: 0,word,tag,lang,Id
0,Do,VERB,pcm,
1,senator,NOUN,pcm,
2,tok,VERB,pcm,
3,dis,DET,pcm,
4,one,NUM,pcm,


In [14]:
train_df.Id.isna().sum()


654631

In [15]:
train_df.lang.unique()

array(['pcm', 'hau', 'bbj', 'ewe', 'bam', 'zul', 'lug', 'nya', 'yor',
       'wol', 'sna', 'ibo', 'xho', 'fon', 'swa', 'twi', 'kin', 'mos',
       'luo', 'tsn'], dtype=object)

In [16]:
langs = ['pcm','hau', 'bbj', 'ewe', 'bam', 'zul', 'lug', 'nya', 'yor',
       'wol', 'sna', 'ibo', 'xho', 'fon', 'swa', 'twi', 'kin', 'mos',
        'luo','tsn']

label_vocab = {'PAD': -100, 'ADJ': 0, 'ADP': 1, 'ADV': 2, 'AUX': 3, 'CCONJ': 4, 'DET': 5,\
               'INTJ': 6, 'NOUN': 7, 'NUM': 8, 'PART': 9,\
               'PRON': 10, 'PROPN': 11, 'PUNCT': 12, 'SCONJ': 13, 'SYM': 14, 'VERB': 15, 'X': 16}

print(f"Total languages: {len(langs)}   Total tags: {len(label_vocab)-1}")
train_df.head()

Total languages: 20   Total tags: 17


Unnamed: 0,word,tag,lang,Id
0,Do,VERB,pcm,
1,senator,NOUN,pcm,
2,tok,VERB,pcm,
3,dis,DET,pcm,
4,one,NUM,pcm,


In [17]:
if cfg.val:
    train_data = train_df[~train_df['lang'].isin(['swa'])]
    val_data = train_df[train_df['lang'].isin(['swa'])]

else:
    train_data = train_df.copy()

### Preprocessing

In [18]:
def get_samples(df, full_val_samples=False):
    sentences = []
    taggings = []

    # Temporary variables to store sentence and tagging for current sentence
    current_sentence = []
    current_tagging = []

    for lang in tqdm(df.lang.unique(), total=len(df.lang.unique())):
        sentence_count = 0
        # Process each row in the CSV data
        for index, row in df[df.lang==lang].iterrows():
            word = row['word']
            tag = row['tag']

            # removing soft hyphens
            word = word.replace('\x8d', '')


            current_sentence.append(word)
            current_tagging.append(tag)

            if word.strip() in ['.', '?', '!']:
                sentence_count+=1
                assert len(current_sentence)==len(current_tagging)
                sentences.append(current_sentence)
                taggings.append(current_tagging)
                current_sentence = []
                current_tagging = []



    return sentences, taggings

t_sentences, t_taggings = get_samples(train_data)
if cfg.val:
    v_sentences, v_taggings = get_samples(val_data, full_val_samples=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
def rem_duplicates(t_sentences, t_taggings):
    sentence_dict = {}

    unique_sentences = []
    unique_taggings = []

    for index, s in enumerate(t_sentences):
        joined = " ".join(s)

        if joined not in sentence_dict:
            sentence_dict[joined] = index

            unique_sentences.append(s)
            unique_taggings.append(t_taggings[index])

    return unique_sentences, unique_taggings


print(f"Total Sentences: {len(t_sentences)}")

t_sentences, t_taggings = rem_duplicates(t_sentences, t_taggings)

print(f"Unique Sentences: {len(t_sentences)}")

Total Sentences: 25773
Unique Sentences: 25407


In [20]:
def align_tokenizations(sentences, taggings, tokenizer):
    tokenized_sentences = []
    aligned_taggings = []
    for sentence, tagging in tqdm(zip(sentences, taggings), total=len(sentences)):
        tok_sent = []
        tags = []

        for word, tag in zip(sentence, tagging):
            word_tokens = tokenizer.tokenize(word)
            tok_sent.extend(word_tokens)
            tag = [tag] + ['PAD'] * (len(word_tokens) - 1)
            tags.extend(tag)

        tokenized_sentences.append(tok_sent)
        aligned_taggings.append(tags)
        assert len(tok_sent) == len(tags)

    return tokenized_sentences, aligned_taggings

In [21]:
def convert_to_ids(sentences, taggings, tokenizer):
    sentences_ids = []
    taggings_ids = []
    for i, (sentence, tagging) in tqdm(enumerate(zip(sentences, taggings)), total=len(sentences)):
        sentence_tensor = torch.tensor(tokenizer.convert_tokens_to_ids(['<s>'] + sentence[:cfg.max_length-2] + ['</s>'])).long()
        tagging_tensor = torch.tensor([label_vocab['PAD']] + [label_vocab[tag] for tag in tagging[:cfg.max_length-2]] + [label_vocab['PAD']]).long()

        sentences_ids.insert(i, sentence_tensor)
        taggings_ids.insert(i, tagging_tensor)



        assert len(sentence_tensor) == len(tagging_tensor)
    return sentences_ids, taggings_ids

In [22]:
class PosTaggingDataset(Dataset):
    def __init__(self, sentences, taggings):
        assert len(sentences) == len(taggings)
        self.sentences = sentences
        self.taggings = taggings

    def __getitem__(self, i):
        return self.sentences[i], self.taggings[i]

    def __len__(self):
        return len(self.sentences)

def collate_fn(items):
    max_len = max(len(item[0]) for item in items)
    sentences = torch.ones((len(items), max_len)).long()
    taggings = torch.ones((len(items), max_len)).long()
    attention_mask = torch.zeros((len(items), max_len)).long()
    for i, (sentence, tagging) in enumerate(items):
        tagging = torch.tensor([tag for tag in tagging]+[label_vocab['PAD']]*(max_len-len(tagging)))
        mask = torch.tensor([1 for s in sentence]).long()

        sentences[i][:len(sentence)] = sentence
        taggings[i][:len(tagging)] = tagging
        attention_mask[i][:len(mask)] = mask

    return {
        "input_ids": sentences ,
        "labels": taggings,
        "attention_mask": attention_mask
    }



In [23]:
class POSModel(nn.Module):
    def __init__(self, model_name, model_config, num_labels):
        super().__init__()
        self.model_config = model_config.update(
            {
                "hidden_dropout_prob": 0.,
                "hidden_dropout" : 0.,
                "attention_dropout" : 0.,
                "attention_probs_dropout_prob" : 0.,
                "add_pooling_layer": False,
            }
        )
        self.base_model = AutoModel.from_pretrained(model_name, config=self.model_config)
        self.linear = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        word_emb, sent_emb = self.base_model(input_ids, attention_mask, return_dict=False)
        logits = self.linear(word_emb)
        return logits


In [24]:
def train(model, loader, optimizer, scheduler, grad_clip=True):
    model.train()

    scaler = torch.cuda.amp.GradScaler(enabled=True)
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.ls)
    total_loss = correct = num_loss = num_perf = 0
    for batch in tqdm(loader, total=len(loader)):
        with torch.cuda.amp.autocast(enabled=True):
            logits = model(batch['input_ids'].to(cfg.device), batch['attention_mask'].to(cfg.device))

        y = batch['labels'].to(cfg.device)
        loss = criterion(logits.view(-1, len(label_vocab)-1), y.view(-1))
        total_loss += loss.item()
        num_loss += 1
        scaler.scale(loss).backward()

        if grad_clip:
            nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        y_pred = torch.max(logits, 2)[1] # compute highest-scoring tag
        mask = (y != label_vocab['PAD']) # ignore <pad> tags
        correct += torch.sum((y_pred == y) * mask) # compute number of correct predictions
        num_perf += torch.sum(mask).item()

    return total_loss / num_loss, correct.item() / num_perf

def validate(model, loader):
    model.eval()
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.ls)

    total_loss = correct = num_loss = num_perf = 0
    for batch in loader:
        with torch.no_grad():
            logits = model(batch['input_ids'].to(cfg.device), batch['attention_mask'].to(cfg.device))

        y = batch['labels'].to(cfg.device)
        loss = criterion(logits.view(-1, len(label_vocab)-1), y.view(-1))
        total_loss += loss.item()
        num_loss += 1

        y_pred = torch.max(logits, 2)[1] # compute highest-scoring tag
        mask = (y != label_vocab['PAD']) # ignore <pad> tags
        correct += torch.sum((y_pred == y) * mask) # compute number of correct predictions
        num_perf += torch.sum(mask).item()

    return total_loss / num_loss, correct.item() / num_perf

In [25]:
def opt_and_sched(model):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": cfg.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=cfg.learning_rate, eps=cfg.adam_epsilon)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=cfg.num_warmup_steps,\
                                num_training_steps=cfg.num_train_steps, num_cycles=cfg.num_cycles)

    return optimizer, scheduler

In [26]:
def fit(model, model_name, train_loader=None, val_loader=None):
    cfg.epoch_steps = len(train_loader)
    cfg.num_train_steps = cfg.epochs * cfg.epoch_steps
    cfg.num_warmup_steps = 0.1 * cfg.epoch_steps

    optimizer, scheduler = opt_and_sched(model)
    output_model_name = model_name.split("/")[-1]
    best_score = 0
    for epoch in range(cfg.epochs):
        print("#"*30 + f" Epoch {epoch+1} Running " + "#"*30 + "\n")
        train_loss, train_acc = train(model, train_loader, optimizer, scheduler, grad_clip=True)

        if cfg.val:
            val_loss, val_acc = validate(model, val_loader)

        print(("#"*20 + f"Train Loss: {train_loss} | Train Acc: {train_acc} " + "#"*20))
        if cfg.val:
            print(("#"*20 + f"Val Loss: {val_loss} | Val Acc: {val_acc} " + "#"*20))

            if val_acc > best_score:
                best_score = val_acc
                torch.save(model.state_dict(),f"./best_epoch.pth")

        else:
            if train_acc > best_score:
                best_score = train_acc
                torch.save(model.state_dict(),f"./best_epoch.pth")



In [29]:
!unzip '/content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/mlm_model.zip'


Archive:  /content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/mlm_model.zip
   creating: mlm_model/
  inflating: mlm_model/training_args.bin  
   creating: mlm_model/.ipynb_checkpoints/
  inflating: mlm_model/tokenizer_config.json  
  inflating: mlm_model/tokenizer.json  
  inflating: mlm_model/pytorch_model.bin  
   creating: mlm_model/checkpoint-600/
  inflating: mlm_model/checkpoint-600/training_args.bin  
  inflating: mlm_model/checkpoint-600/rng_state.pth  
  inflating: mlm_model/checkpoint-600/scheduler.pt  
  inflating: mlm_model/checkpoint-600/optimizer.pt  
  inflating: mlm_model/checkpoint-600/trainer_state.json  
  inflating: mlm_model/checkpoint-600/pytorch_model.bin  
  inflating: mlm_model/checkpoint-600/config.json  
  inflating: mlm_model/config.json   
  inflating: mlm_model/sentencepiece.bpe.model  
   creating: mlm_model/runs/
   creating: mlm_model/runs/Oct11_16-38-52_43c4fc6d5c2e/
  inflating: ml

In [30]:
model_name="/content/mlm_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_config = AutoConfig.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
train_tokenized_sentences, train_aligned_taggings = align_tokenizations(t_sentences, t_taggings, tokenizer)
train_sentences_ids, train_taggings_ids = convert_to_ids(train_tokenized_sentences, train_aligned_taggings, tokenizer)
train_loader = DataLoader(PosTaggingDataset(train_sentences_ids, train_taggings_ids), collate_fn=collate_fn, batch_size=cfg.batch_size, shuffle=True)

if cfg.val:
    val_tokenized_sentences, val_aligned_taggings = align_tokenizations(v_sentences, v_taggings, tokenizer)
    val_sentences_ids, val_taggings_ids = convert_to_ids(val_tokenized_sentences, val_aligned_taggings, tokenizer)
    val_loader = DataLoader(PosTaggingDataset(val_sentences_ids, val_taggings_ids), collate_fn=collate_fn, batch_size=cfg.batch_size, shuffle=False)

  0%|          | 0/25407 [00:00<?, ?it/s]

  0%|          | 0/25407 [00:00<?, ?it/s]

In [32]:
model = POSModel(model_name, model_config, len(label_vocab)-1)
# model = nn.DataParallel(model, device_ids=[0, 1])
model = model.to(cfg.device)

if cfg.val:
    fit(model, model_name, train_loader, val_loader)

    del val_tokenized_sentences, val_aligned_taggings, val_sentences_ids, val_taggings_ids

else:
    fit(model, model_name, train_loader)

del model, tokenizer, model_config
del train_tokenized_sentences, train_aligned_taggings, train_sentences_ids, train_taggings_ids

for _ in range(5):
    gc.collect()

Some weights of XLMRobertaModel were not initialized from the model checkpoint at /content/mlm_model and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


############################## Epoch 1 Running ##############################



  0%|          | 0/6352 [00:00<?, ?it/s]

####################Train Loss: 0.6082512209337364 | Train Acc: 0.8359210024169953 ####################
############################## Epoch 2 Running ##############################



  0%|          | 0/6352 [00:00<?, ?it/s]

####################Train Loss: 0.3568650156785019 | Train Acc: 0.9117240173005979 ####################


In [None]:
!zip -r epoch_1st_run_pseudoLabeling.zip '/content/best_epoch_1.pth'


  adding: content/best_epoch_1.pth (deflated 7%)


In [None]:
!cp epoch_1st_run_pseudoLabeling.zip '/content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/'
!ls -lt '/content/drive/Shareddrives/ZINDI Data Science/Lacuna/Competition #1/Scripts/Winning Solutions/#4 Overfit Gambit Accpeted/'

total 12194209
-rw------- 1 root root 2075852886 Oct 11 19:07 epoch_1st_run_pseudoLabeling.zip
-rw------- 1 root root     658711 Oct 11 18:18 submission.csv
-rw------- 1 root root 2075850755 Oct 11 18:10 epoch.zip
-rw------- 1 root root 8334495271 Oct 11 17:04 mlm_model.zip
drwx------ 2 root root       4096 Sep 20 07:51 data_used
drwx------ 2 root root       4096 Sep 20 07:51 Notebooks
-rw------- 1 root root       2109 Sep 20 07:51 summary
