# mGPT - Multilingual GPT model

[Huggingface model card](https://huggingface.co/ai-forever/mGPT)

Supported languages:
Arabic, Hebrew, Vietnamese, Indonesian, Javanese, Malay, Tagalog, Latvian, Lithuanian, Basque, Malayalam, Tamil, Telugu, Armenian, Bengali, Marathi, Hindi, Urdu, Afrikaans, Danish, English, German, Swedish, French, Italian, Portuguese, Romanian, Spanish, Greek, Ossetian, Tajik, Persian, Japanese, Georgian, Korean, Thai, Buryat, Kalmyk, Mongolian, Swahili, Yoruba, Belarusian, Bulgarian, Russian, Ukrainian, Polish, Burmese, Uzbek, Bashkir, Kazakh, Kyrgyz, Tatar, Azerbaijani, Chuvash, Turkish, Turkmen, Tuvan, Yakut, Estonian, Finnish, Hungarian

In [1]:
#@title Install transformers
!pip install transformers[torch]
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

In [5]:
#@title Load libraries
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments



In [None]:
#@title Get Model

# Hugging Face Hub id of the pretrained checkpoint. Kept under its own name so
# that `model` always refers to the loaded network and never to a string.
model_name_or_path = "ai-forever/mGPT"

tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# The tokenizer's end-of-text id is passed as the model's padding id.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path, pad_token_id=tokenizer.eos_token_id)


## Generate

In [None]:
#@title Generate
#@markdown Text to start with
prompt = "" #@param {type: "string"}
#@markdown Length of generated text in tokens (100 tokens is about 75 words)
max_length = 250 #@param {type: "integer"}
#@markdown Temperature. Best results in range 0.8-2
temperature = 0.8  #@param {type:"slider", min:0, max:2, step:0.1}

# Encode the prompt. An empty prompt (the form default) encodes to an empty
# tensor that generation cannot continue from, so start from a single BOS
# token instead (EOS when the tokenizer defines no BOS).
if prompt:
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
else:
    seed_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id
    inputs = torch.tensor([[seed_token_id]])
# Keep the inputs on the same device as the model (it may have been moved to
# the GPU, e.g. by a previous fine-tuning run).
inputs = inputs.to(model.device)
# Single, unpadded sequence: every position is a real token.
attention_mask = torch.ones_like(inputs)

# The slider allows 0, but sampling needs a strictly positive temperature;
# treat 0 as a request for deterministic (greedy) decoding.
if temperature > 0:
    sampling_options = {"do_sample": True, "temperature": temperature}
else:
    sampling_options = {"do_sample": False}

outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=max_length,  # total length: prompt tokens + generated tokens
    **sampling_options,
)

# Decode only the newly generated tokens. Slicing by token count (instead of
# by character offset into the decoded text) cannot cut off the first
# generated character and does not depend on decode() round-tripping exactly.
continuation = tokenizer.decode(outputs[0][inputs.shape[1]:])
generated = prompt + continuation

print(generated)

## Finetuning

**Doesn't work on Colab free plan :(**

You can finetune the model on your own texts.
Collect your texts and save them all in a single `.txt` file, with consecutive texts separated by `<|endoftext|>`.
See [this example](https://raw.githubusercontent.com/ai-forever/mgpt/main/data/sah.txt) in the Sakha language.
Upload the file to the Colab notebook and paste its path into the `dataset_path` param of the *Load dataset* cell below.


In [None]:
#@title Mount Google Drive
import os
from google.colab import drive

# Expose Google Drive inside the Colab filesystem; the default model directory
# used later in this notebook lives under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
#@title Load dataset
dataset_path = "/content/sah.txt" #@param {"type": "string"}

# Build the training set from the text file, using the notebook's tokenizer
# and a block size of 64.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=64,
)

# Batch collator with masked-language-modelling turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [5]:
#@title Set Params
#@markdown The output directory where model will be saved (you can store it on the drive to reuse it later)
model_dir = "/content/drive/MyDrive/AI/mGPT" #@param {"type": "string"}
#@markdown Overwrite the content of the output directory
overwrite_output_dir=True #@param {"type": "boolean"}
#@markdown Number of training epochs
num_train_epochs=2 #@param {"type": "integer"}
#@markdown Batch size for training
per_device_train_batch_size=4 #@param {"type": "integer"}
#@markdown Batch size for evaluation
per_device_eval_batch_size=4 #@param {"type": "integer"}
#@markdown Number of warmup steps for learning rate scheduler
warmup_steps=10 #@param {"type": "integer"}
#@markdown To make "virtual" batch size larger
gradient_accumulation_steps=16 #@param {"type": "integer"}
#@markdown Learning rate (set smaller learning rate for smaller datasets)
lr = 0.00001 #@param {type:"slider", min:1e-5, max:1e-4, step:4.5e-5}

# Training configuration assembled from the form values above.
# Note: the Trainer's own output directory is "./output/"; `model_dir` is not
# used in this cell — it is the target of the explicit save after training.
training_args = TrainingArguments(
    output_dir="./output/",
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=warmup_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

# AdamW over all model parameters, using the learning rate chosen in the form.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The `optimizers` pair is (optimizer, lr scheduler); no scheduler is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, None),
)

In [None]:
#@title Run Finetuning
#@markdown This will run the finetuning and save the model after that
# Train first and keep the returned result object around for inspection.
train_output = trainer.train()
# Then write the finetuned model to the directory chosen in the params cell.
trainer.save_model(model_dir)

### Generate with finetuned model

In [None]:
#@title Load finetuned model
#@markdown The directory where finetuned model is stored
model_dir = "/content/drive/MyDrive/AI/mGPT" #@param {"type": "string"}

# Device the model is placed on: the GPU when one is available, else the CPU.
# (This name was previously used without ever being defined.)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer is loaded from the base checkpoint; only the model weights are
# read from `model_dir`.
model_name_or_path = "ai-forever/mGPT"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# Same padding-id setting as the base-model loading cell, for consistency.
model = GPT2LMHeadModel.from_pretrained(model_dir, pad_token_id=tokenizer.eos_token_id).to(DEVICE)

In [None]:
#@title Generate
#@markdown Text to start with
prompt = "" #@param {type: "string"}
#@markdown Length of generated text in tokens (100 tokens is about 75 words)
max_length = 250 #@param {type: "integer"}
#@markdown Temperature. Best results in range 0.8-2
temperature = 0.8  #@param {type:"slider", min:0, max:2, step:0.1}

# Encode the prompt. An empty prompt (the form default) encodes to an empty
# tensor that generation cannot continue from, so start from a single BOS
# token instead (EOS when the tokenizer defines no BOS).
if prompt:
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
else:
    seed_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id
    inputs = torch.tensor([[seed_token_id]])
# The finetuned model may live on the GPU; the inputs must be on the same
# device or generation fails with a device-mismatch error.
inputs = inputs.to(model.device)
# Single, unpadded sequence: every position is a real token.
attention_mask = torch.ones_like(inputs)

# The slider allows 0, but sampling needs a strictly positive temperature;
# treat 0 as a request for deterministic (greedy) decoding.
if temperature > 0:
    sampling_options = {"do_sample": True, "temperature": temperature}
else:
    sampling_options = {"do_sample": False}

outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=max_length,  # total length: prompt tokens + generated tokens
    **sampling_options,
)

# Decode only the newly generated tokens. Slicing by token count (instead of
# by character offset into the decoded text) cannot cut off the first
# generated character and does not depend on decode() round-tripping exactly.
continuation = tokenizer.decode(outputs[0][inputs.shape[1]:])
generated = prompt + continuation

print(generated)