# CLM (Causal Language Modelling)

# Imports

In [1]:
import logging
import math
import os
import sys
from pathlib import Path
from typing import Callable, Dict, List, Optional

import datasets
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from datasets import DatasetDict, concatenate_datasets, load_dataset
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from transformers import (
    CONFIG_MAPPING, 
    AutoConfig, 
    AutoModelForCausalLM,
    GPT2TokenizerFast, 
    HfArgumentParser, 
    TrainingArguments
)

%load_ext watermark
%watermark -v -p torch,datasets,transformers,wandb

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.9.0

torch       : 2.0.0
datasets    : 2.9.0
transformers: 4.26.0
wandb       : 0.14.0



# Config

In [2]:
class CFG:
    ta_tokenizer_ckpt = '../ckpts/tamil/tokenizer'
    ta_clm_ckpt = '../ckpts/tamil/clm'
    en_tokenizer_ckpt = '../ckpts/english/tokenizer'
    en_clm_ckpt = '../ckpts/english/clm'
    context_length = 128

# Dataset

## Process Validation Data

In [3]:
from glob import glob

import pandas as pd

txt_files = glob('../data/tamil/*.txt')
output_txt = '../data/ta_valid.txt'
output_csv = '../data/ta_valid.csv'

for file in txt_files:
    with open(file, 'r') as f:
        txt = f.read()
        with open(output_txt, 'a') as out:
            out.write(txt + "\n")

text = []
# id = []
for file in txt_files:
    with open(file, 'r') as f:
        txt = f.readlines()
        text.extend(txt)
        # id.extend(list(range(len(id), len(txt))))
df = pd.DataFrame({
    'id': range(len(text)),
    'text': text
})

print(df.head())
df.to_csv(output_csv, index=False)

   id                                               text
0   0  சிங்கமும் கழுதைப்புலியும் பசுவைப் பிடித்து வைத...
1   1  குட்டி கழுதைப்புலி சொல்லியது: “நான் சின்னப்பயல...
2   2  அதைக்கேட்டு கோபமான கழுதைப்புலி அந்தக் குடலோடு ...
3   3  பசுவில் பாதி கேட்க வந்த கழுதைப்புலி தற்போது தன...
4   4  குடலை சிங்கத்திடம் கொடுத்து விட்டு திரும்பிய க...


# Train Tokenizer

In [4]:
# Extracted from https://github.com/AbinayaM02/GPT2-Tamil/blob/main/src/train_tokenizer.py
dataset = load_dataset('oscar', 'unshuffled_deduplicated_ta', split="train")
indic_tamil = load_dataset(
    "text", data_files="../data/data/ta/ta.txt")['train']
dataset = concatenate_datasets([indic_tamil, dataset])


def batch_iterator(batch_size: int = 512):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i+batch_size]['text']


def train_tokenizer():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        batch_iterator(),
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ],
    )
    tokenizer.save(f"{CFG.ta_tokenizer_ckpt}/")

dataset

Found cached dataset oscar (/Users/aneeshaparajit/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_ta/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
Using custom data configuration default-4395bba7710a9245
Found cached dataset text (/Users/aneeshaparajit/.cache/huggingface/datasets/text/default-4395bba7710a9245/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'id'],
    num_rows: 32376070
})

In [5]:
dataset[:2]

{'text': ['2019ல் 5.9 பில்லியன் அமெரிக்க டொலர் கடனை இலங்கை மீள் செலுத்த வேண்டியுள்ளதாக மத்திய வங்கியின் ஆளுநரான பேராசிரியர், இந்திரஜித் குமாரசுவாமி தெரிவித்துள்ளார்.',
  'இப்படி தினமும் செய்தாலும், உதடுகளில் உள்ள கருமை அகலும்.'],
 'id': [None, None]}

## Loading datasets

In [6]:
def get_datasets():
    ds_train = load_dataset('oscar', 'unshuffled_deduplicated_ta', split="train")
    ds_valid = load_dataset("csv", data_files="../data/ta_valid.csv")['train']
    raw_ds = DatasetDict({
        'train': ds_train, 
        'valid': ds_valid
    })
    return raw_ds

raw_dataset = get_datasets()
raw_dataset

Found cached dataset oscar (/Users/aneeshaparajit/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_ta/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
Using custom data configuration default-fc64922f48343321


Downloading and preparing dataset csv/default to /Users/aneeshaparajit/.cache/huggingface/datasets/csv/default-fc64922f48343321/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/aneeshaparajit/.cache/huggingface/datasets/csv/default-fc64922f48343321/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 833101
    })
    valid: Dataset({
        features: ['id', 'text'],
        num_rows: 1740
    })
})

In [23]:
tokenizer = GPT2TokenizerFast(tokenizer_file='../ckpts/tamil/tokenizer/tokenizer.json')
tokenizer.add_special_tokens({
    'bos_token': '<s>',
    'eos_token': '</s>',
    'unk_token': '<unk>',
    'pad_token': '<pad>'
})
tokenizer

GPT2TokenizerFast(name_or_path='', vocab_size=52000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [24]:
text = raw_dataset['train'][:5]['text']
len(text)

5

In [25]:
outputs = tokenizer(
    text=text, 
    truncation=True, 
    max_length=CFG.context_length,
    return_overflowing_tokens=True,
    padding=True,
    return_length=True, 
    return_tensors="pt"
)

In [26]:
outputs['input_ids'].shape

torch.Size([260, 128])

In [11]:
outputs.keys()

dict_keys(['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])

In [12]:
__import__('pprint').pprint(outputs)

{'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[285, 318, 320,  ..., 203, 269, 265],
        [277, 267, 299,  ..., 307, 265, 277],
        [286, 271, 262,  ..., 262, 269, 267],
        ...,
        [302, 306, 262,  ..., 267, 556, 367],
        [309, 267, 301,  ..., 294, 275, 292],
        [275, 279, 421,  ...,   1,   1,   1]]),
 'length': tensor([128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128

In [13]:
def tokenize(item):
    outputs = tokenizer(
        text=item['text'], 
        truncation=True, 
        max_length=CFG.context_length,
        return_overflowing_tokens=True,
        padding=True,
        return_length=True, 
        return_tensors="pt"
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == CFG.context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_dataset.map(
    tokenize, batched=True, remove_columns=raw_dataset["train"].column_names
)
tokenized_datasets

  0%|          | 0/834 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 11943163
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 2657
    })
})

In [19]:
tokenizer.bos_token

'<|endoftext|>'

In [29]:
tokenizer.tokenize('<s>')

['<s>']

In [30]:
tokenizer.bos_token_id

0

In [31]:
config = AutoConfig.from_pretrained(
    "abinayam/gpt-2-tamil",
    vocab_size=len(tokenizer),
    n_ctx=CFG.context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

In [34]:
model = AutoModelForCausalLM.from_config(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 125.8M parameters


In [35]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [36]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [37]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)