# NLP

### 1. Pretraining

##### 1. Casual Language Modeling

##### 2. Masked Language Modeling

##### 3. Denoising / Span Interuption

##### 4. Next Sentence Prediction / Sentence Order Prediciton




### 2. Post Training / Fine Tuning

##### 1. Full Fine Tuning

##### 2. Parameter Efficient Fine Tuning

###### 1. LoRA (Low Rank Adaptation)

###### 2. Prefix Tuning / Prompt Tuning

###### 3. Adapters


##### 3. Instruction Fine Tuning




### 3. Model Characterization

##### 1. Architecture

##### 2. Number of Parameters

##### 3. Number of Layers

##### 4. Number of Attention Heads

##### 5. Context Length



# 1. Pretraining

## Casual Language Modeling


### Casual Pretraining Pipeline

In [5]:
# imports
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# data
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

# tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, max_length=1024)

tokenized = dataset.map(tokenize_fn,batched=True, remove_columns=["text"])

# split datasets
train_dataset = tokenized["train"]
eval_dataset = tokenized["validation"]

# model
model = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./clm_pretrain",
    outwrite_ouput_dir = True,
    evaluation_strategy = "steps",
    eval_steps = 500,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,
    learning_rate = 2e-5,
    num_train_epochs = 3,
    save_steps=1000,
    logging_steps = 100,
    fp16 = True,
    report_to = "none")

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset
)

trainer.train()


'''

✅ This will train a GPT-style causal LM on raw text.


'''

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'outwrite_ouput_dir'

# Pretraining

###

# Quick Route (Hugging Face)

In [None]:
# imports
from datasets import load_dataset
from transformers import AutoTokenizers, DataCollatorForLanguageModeling, AutoModelForMaskedLM, TrainingArguments, Trainer


MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


raw = load_dataset("wikitext", "wikitext-2-raw-v1", split = "train")

# group into blocks for mlm
def tokenize_fn(example):
    return tokenizer(examples["text"], truncation=True, max_length = 512)


tok = raw.map(tokenize_fn, batched=True, remove_columns=["text"])

# mlm automatically
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=True,mlm_probability=0.15)


model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

training_args = TrainingArguments(
    output_dir="./mlm-bert",
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4,
    evaluation_strategy = "no",
    num_train_epochs = 3,
    save_steps = 5000,
    logging_steps = 200,
    learning_rate = 5e-5,
    fp16 = True,
    push_to_hub = False)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tok,
    data_collator = data_collator)

trainer.train()
trainer.save_model("./mlm-bert-finetuned")


# PyTorch Implementation (From Scratch)

In [None]:
# imports
import math, random, torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Masking Function (BERT Style)

MASK_PROB = 0.15

def mask_tokens(inputs, tokenizer)
raw = load_dataset("wikitext", "wikitext-2-raw-v1", split = "train")