## Install lib

In [None]:
%%writefile setup.sh
#!/bin/sh
# Clone NVIDIA apex (skipped when a checkout already exists, so the script can
# be re-run) and build it with its C++ and CUDA extensions.
set -e  # abort on the first failing command so pip never runs from the wrong directory

[ -d apex/.git ] || git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Overwriting setup.sh


In [None]:
# Run the script written by the previous cell (clones apex and pip-installs it).
!sh setup.sh

In [None]:
# -y answers apt-get's confirmation prompt, which a notebook cell cannot do interactively.
!apt-get install -y llvm-9-dev

In [None]:
# Remove any preinstalled triton before the pinned version is installed.
# -y skips pip's "Proceed (y/n)?" prompt so the cell does not block waiting for input.
!pip uninstall -y triton

In [None]:
# Install the pinned triton release.
# NOTE(review): presumably the version required by DeepSpeed's sparse attention op — confirm.
!pip install triton==0.2.3

In [None]:
# !pip uninstall -y typing

In [None]:
# NOTE(review): unpinned dependency, presumably needed by the DeepSpeed build below — confirm and pin a version.
!pip install cpufeature

In [None]:
# Install DeepSpeed 0.3.7 with the DS_BUILD_CPU_ADAM / DS_BUILD_SPARSE_ATTN flags set;
# the ds_report cell that follows shows cpu_adam and sparse_attn as installed.
!DS_BUILD_CPU_ADAM=1 DS_BUILD_SPARSE_ATTN=1 pip install deepspeed==0.3.7

In [None]:
# Print DeepSpeed's op report to check which C++/CUDA ops are installed and compatible.
!ds_report

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [92m[OKAY][0m
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [92m[YES][0m ...... [92m[OKAY][0m
fused_adam ............. [93m[NO][0m ....... [92m[OKAY][0m
fused_lamb ............. [93m[NO][0m ....... [92m[OKAY][0m
sparse_attn ............ [92m[YES][0m ...... [92m[OKAY][0m
transformer ............ [93m[NO][0m ....... [92m[OKAY][0m
stochastic_transformer . [93m[NO][0m ....... [92m[OKAY][0m
utils ............

In [None]:
import deepspeed.ops.sparse_attention.sparse_attn_op

In [None]:
# Fetch the ru-gpts repository into ./ru-gpts; later cells copy a file from it and import from it.
!git clone  https://github.com/sberbank-ai/ru-gpts

In [None]:
# Pin transformers to 3.5.1; the next cell overwrites one of its modules with the ru-gpts copy.
!pip install transformers==3.5.1

In [None]:
import importlib
import importlib.util
import os
import shutil

# Overwrite the installed transformers' trainer_pt_utils.py with the ru-gpts copy.
# The package directory is looked up instead of hard-coding
# /usr/local/lib/python3.7/dist-packages, so the patch lands in the right place
# on other Python versions and inside virtual environments.
importlib.invalidate_caches()  # transformers may have been pip-installed after the kernel started
# find_spec() locates the package without importing it, so the patched module
# is the one loaded when transformers is imported later.
transformers_spec = importlib.util.find_spec("transformers")
if transformers_spec is None or transformers_spec.origin is None:
    raise ImportError("transformers is not installed; run the pip install cell first")
transformers_dir = os.path.dirname(transformers_spec.origin)
shutil.copy(
    "ru-gpts/src_utils/trainer_pt_utils.py",
    os.path.join(transformers_dir, "trainer_pt_utils.py"),
);

# Test model

### Load model

In [None]:
import warnings
# Install a blanket "ignore" filter so library warnings do not clutter the cell outputs below.
warnings.filterwarnings("ignore")

In [None]:
import sys
# Put the ru-gpts checkout (relative to the current working directory) on the
# import path so that `src.xl_wrapper` can be imported.
sys.path.append("ru-gpts/")

In [None]:
import os
# NOTE(review): presumably read by the ru-gpts code to enable its DeepSpeed path,
# so it is set before `src.xl_wrapper` is imported — confirm.
os.environ["USE_DEEPSPEED"] = "1"

In [None]:
from src.xl_wrapper import RuGPT3XL

Note: `seq_len` is the maximum sequence length (in tokens) used during generation. The maximum available `seq_len` is 2048 tokens.
Inference also takes around 10 GB of GPU memory.

In [None]:
# Load the pretrained ruGPT-3 XL wrapper; the first run downloads the tokenizer files and model weights.
gpt = RuGPT3XL.from_pretrained("sberbank-ai/rugpt3xl", seq_len=512)
# A "model parallel group is not initialized" error here means no GPU is attached to the runtime.

> initializing model parallel with size 1
> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234


Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Use alternating sparse & dense attention layers


### Get logits

In [None]:
# Call the wrapper directly on a raw prompt string and keep the `.logits` field of the result.
logits = gpt("Кто был президентом США в 2020? ").logits

In [None]:
# Show the container type and shape of the logits returned above.
type(logits), logits.shape

### Get loss

In [None]:
# Tokenize the prompt and wrap the ids in an outer list (a batch containing one sequence).
input_ids = [gpt.tokenizer("Кто был президентом США в 2020? ")['input_ids']]
# The labels are the very same list object as the inputs, not a copy.
labels = input_ids

In [None]:
import torch


# Only the loss value is needed here, so gradient tracking is disabled for the forward pass.
with torch.no_grad():
    loss = gpt(input_ids=input_ids, labels=labels).loss

In [None]:
# Display the loss computed in the previous cell.
loss

### Simple generation

In [None]:
def filter_resuls(nr):
    """Cut each generated string at its first ``<|endoftext|>`` marker.

    Args:
        nr: iterable of generated strings.

    Returns:
        list of str: every input string truncated just before its first
        ``<|endoftext|>``; strings that do not contain the marker are
        returned unchanged.
    """
    # str.partition yields the whole string as the head when the marker is
    # missing, whereas slicing with x.find(...) == -1 would silently drop the
    # last character of an unterminated generation.
    return [x.partition("<|endoftext|>")[0] for x in nr]

Greedy decoding

In [None]:
# No sampling or beam arguments are passed, so this relies on generate()'s default
# decoding; the result is trimmed at the end-of-text marker by filter_resuls.
filter_resuls(gpt.generate(
    "Кто был президентом США в 2020? ",
    max_length=50,
    no_repeat_ngram_size=3,
    repetition_penalty=2.,
))

Sampling

In [None]:
# Same prompt and repetition controls as the greedy cell, but with sampling
# switched on and five sequences requested.
filter_resuls(gpt.generate(
    "Кто был президентом США в 2020? ", do_sample=True, num_return_sequences=5,
    max_length=50,
    no_repeat_ngram_size=3,
    repetition_penalty=2.,
))

### Top_k top_p filtering

In [None]:
# Sampling with top_k / top_p / temperature set explicitly; five sequences are
# requested and each is trimmed at the end-of-text marker.
filter_resuls(gpt.generate(
    "Александр Сергеевич Пушкин родился в ",
    top_k=5,
    top_p=0.95,
    temperature=1.2,
    num_return_sequences=5,
    do_sample=True,
    max_length=50,
    no_repeat_ngram_size=3,
    repetition_penalty=2.,
))

### Beamsearch

In [None]:
# Beam search with 10 beams, returning five sequences. Unlike the cells above,
# the prompt is passed through the `text` keyword rather than positionally.
filter_resuls(gpt.generate(
    text="Александр Сергеевич Пушкин родился в ",
    max_length=50,
    num_beams=10,
    no_repeat_ngram_size=3,
    repetition_penalty=2.,
    num_return_sequences=5,
))