In [1]:
!pip install -q transformers
!pip install --upgrade accelerate
!pip install -q sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00

In [2]:
import os
import torch
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Notebook-wide runtime configuration.
# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Environment flags: tokenizer parallelism off, Weights & Biases logging disabled.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "WANDB_DISABLED": "true",
})

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and the
    current CUDA device), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these takes the seed as its single positional argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=69)

In [3]:
# Load the train and test CSVs; training rows with a missing `word` are discarded.
train_df = pd.read_csv("/content/train.csv").dropna(subset=["word"])
test_df = pd.read_csv("/content/Test.csv")
train_df.head()

Unnamed: 0,word,tag,lang
0,Do,VERB,pcm
1,senator,NOUN,pcm
2,tok,VERB,pcm
3,dis,DET,pcm
4,one,NUM,pcm


In [4]:
# Keep only the rows whose language code is 'wol' or 'pcm'.
df = train_df.loc[train_df['lang'].isin(['wol', 'pcm'])]

def get_samples(df, full_val_samples=True, max_sentences=200):
    """Group the token rows of ``df`` into sentences, one language at a time.

    Rows are read in frame order within each language. A sentence is closed when a
    token equal to '.', '?' or '!' (after stripping whitespace) is seen; that
    terminator is kept as the last token of the sentence. Tokens left over at the end
    of a language (no closing terminator) are emitted as a final sentence instead of
    being dropped or merged into the next language's first sentence.

    Args:
        df: DataFrame with ``word``, ``tag`` and ``lang`` columns; ``word`` values
            must be strings.
        full_val_samples: When True every row is used. When False, reading of a
            language stops once ``max_sentences`` sentences have been completed.
        max_sentences: Per-language sentence cap, applied only when
            ``full_val_samples`` is False.

    Returns:
        ``(sentences, taggings)``: two parallel lists where ``sentences[i]`` is a list
        of words and ``taggings[i]`` the list of tags at the same positions.
    """
    sentences = []
    taggings = []
    languages = df.lang.unique()

    for lang in tqdm(languages, total=len(languages)):
        # Fresh buffers per language so an unfinished sentence cannot leak across languages.
        current_sentence = []
        current_tagging = []
        sentence_count = 0

        lang_rows = df[df.lang == lang]
        for word, tag in zip(lang_rows['word'], lang_rows['tag']):
            if not full_val_samples and sentence_count == max_sentences:
                break

            # Strip the U+008D character from the token.
            # NOTE(review): the previous comment called this a soft hyphen, which is
            # U+00AD — confirm which character the data actually contains.
            word = word.replace('\x8d', '')

            current_sentence.append(word)
            current_tagging.append(tag)

            if word.strip() in ('.', '?', '!'):
                sentence_count += 1
                sentences.append(current_sentence)
                taggings.append(current_tagging)
                current_sentence = []
                current_tagging = []

        # Flush tokens that were never closed by a terminator. When the cap stopped
        # the loop the buffers are already empty, so nothing extra is added.
        if current_sentence:
            sentences.append(current_sentence)
            taggings.append(current_tagging)

    return sentences, taggings

# Only the word sequences are used afterwards; the tag sequences are discarded.
train_sent, _ = get_samples(df, full_val_samples=True)

  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
class PretrainingDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per item.

    Tokenization is done lazily in ``__getitem__``; each item is whatever the
    tokenizer returns for a single text.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        """Store the texts and tokenizer settings.

        Args:
            texts: Indexable, sized collection of strings.
            tokenizer: Callable accepting the keyword arguments used in ``tokenize``.
            max_length: Maximum sequence length forwarded to the tokenizer.
        """
        super().__init__()

        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def tokenize(self, text, text_pair=None):
        """Tokenize ``text`` (and optionally ``text_pair``) with the stored settings.

        Args:
            text: The text to encode.
            text_pair: Optional second text; forwarded to the tokenizer (it used to
                be accepted but ignored).

        Returns:
            The tokenizer's output, with the special-tokens mask and attention mask
            requested and token type ids / offset mappings switched off.
        """
        return self.tokenizer(
            text=text,
            text_pair=text_pair,
            max_length=self.max_length,
            truncation=True,
            # NOTE(review): with a single text per call this presumably adds no
            # padding; batch-level padding is left to whatever collates the items.
            padding=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_offsets_mapping=False,
            return_tensors=None,
        )

    def __getitem__(self, index):
        """Return the tokenized form of the text at ``index``."""
        return self.tokenize(self.texts[index])

In [6]:
def main(output_dir, model_name_or_path, tokenizer=None, train_dataset=None):
    """Run masked-language-model training on a pretrained checkpoint and save the result.

    Args:
        output_dir: Directory that receives checkpoints, the final model and the
            tokenizer files.
        model_name_or_path: Identifier or path given to
            ``AutoModelForMaskedLM.from_pretrained``.
        tokenizer: Tokenizer used by the masking collator and saved next to the
            model. Defaults to the notebook-level ``tokenizer`` variable.
        train_dataset: Dataset handed to the ``Trainer``. Defaults to the
            notebook-level ``dataset`` variable.

    Raises:
        NameError: If an argument is omitted and the matching notebook-level
            variable has not been defined yet.
    """
    # Backward-compatible fallback: earlier callers relied on these two globals.
    notebook_vars = globals()
    if tokenizer is None:
        if "tokenizer" not in notebook_vars:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... to main()")
        tokenizer = notebook_vars["tokenizer"]
    if train_dataset is None:
        if "dataset" not in notebook_vars:
            raise NameError("name 'dataset' is not defined; pass train_dataset=... to main()")
        train_dataset = notebook_vars["dataset"]

    model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to(device)

    # Masks 25% of the tokens for the MLM objective.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.25,
    )

    # Checkpoint every 150 steps and keep at most one checkpoint on disk.
    # NOTE(review): no seed is passed here, so the Trainer presumably uses its own
    # default seed rather than the one set by seed_everything — confirm.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        save_strategy="steps",
        save_steps=150,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    # Save the final weights and the tokenizer side by side in output_dir.
    trainer.save_model(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)


In [7]:
import glob
# files = glob.glob("/content/lacuna-luo-tsn-txt-files/*")
def read_text_lines(paths):
    """Return the non-blank lines of the given text files, in the order of ``paths``.

    Args:
        paths: Iterable of file paths; each file is read as UTF-8 text.

    Returns:
        List of line strings with the newline removed. Empty and whitespace-only
        lines are skipped so they do not become empty training examples.
    """
    lines = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            lines.extend(line for line in f.read().split("\n") if line.strip())
    return lines


# Sorted so the corpus order does not depend on the directory listing order.
files = sorted(glob.glob("/content/*.txt"))
t_sentences = read_text_lines(files)

In [8]:
# Flatten each tokenized sentence into one space-separated string.
train_sentences = [" ".join(tokens) for tokens in train_sent]

In [9]:
# Training-set sentences first, followed by the lines read from the text files.
# NOTE: this rebinds `t_sentences` from its own previous value, so re-running the
# cell prepends `train_sentences` a second time.
t_sentences = [*train_sentences, *t_sentences]

In [10]:
# Checkpoint to start from, and the directory the trained model is written to.
model_name_or_path = "Davlan/afro-xlmr-large-75L"
output_dir = "/content/mlm_model"

# Tokenizer is loaded from the same identifier; the dataset tokenizes lazily with it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
dataset = PretrainingDataset(texts=t_sentences, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [11]:
# Launch training with the configured output directory and checkpoint name.
main(output_dir=output_dir, model_name_or_path=model_name_or_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.27G [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.3922


In [12]:
!zip -r mlm_model.zip mlm_model/


  adding: mlm_model/ (stored 0%)
  adding: mlm_model/training_args.bin (deflated 49%)
  adding: mlm_model/.ipynb_checkpoints/ (stored 0%)
  adding: mlm_model/tokenizer_config.json (deflated 76%)
  adding: mlm_model/tokenizer.json (deflated 76%)
  adding: mlm_model/pytorch_model.bin (deflated 7%)
  adding: mlm_model/checkpoint-600/ (stored 0%)
  adding: mlm_model/checkpoint-600/training_args.bin (deflated 49%)
  adding: mlm_model/checkpoint-600/rng_state.pth (deflated 28%)
  adding: mlm_model/checkpoint-600/scheduler.pt (deflated 49%)
  adding: mlm_model/checkpoint-600/optimizer.pt (deflated 7%)
  adding: mlm_model/checkpoint-600/trainer_state.json (deflated 50%)
  adding: mlm_model/checkpoint-600/pytorch_model.bin (deflated 7%)
  adding: mlm_model/checkpoint-600/config.json (deflated 47%)
  adding: mlm_model/config.json (deflated 47%)
  adding: mlm_model/sentencepiece.bpe.model (deflated 49%)
  adding: mlm_model/runs/ (stored 0%)
  adding: mlm_model/runs/Oct11_16-38-52_43c4fc6d5c2e/ (s

In [16]:
# Colab-specific: attach Google Drive under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [17]:
# Copy the archive into the Drive data folder, then list that folder sorted by
# modification time (newest first) to confirm the file arrived.
!cp mlm_model.zip '/content/drive/MyDrive/Colab Notebooks/Data/'
!ls -lt '/content/drive/MyDrive/Colab Notebooks/Data/'

total 8144786
-rw------- 1 root root 8334495271 Oct 11 17:05 mlm_model.zip
-rw------- 1 root root     848575 Nov  2  2022 new_ss.csv
-rw------- 1 root root     848575 Dec  2  2021 SampleSubmission.csv
-rw------- 1 root root    1201313 Dec  2  2021 Test.csv
-rw------- 1 root root    2864060 Dec  2  2021 Train.csv
-rw------- 1 root root       1156 Dec  2  2021 VariableDefinitions.csv
