In [None]:
!pip install transformers transliterate googletrans --quiet

In [None]:
# Mount Google Drive at /content/drive. The default model paths used later in
# this notebook ("/content/drive/My Drive/fine_tuned_t5" and
# "/content/drive/My Drive/albert") live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transliterate import *
from googletrans import Translator
import transformers
from transformers import *

In [None]:
import re
from transformers import AlbertForMaskedLM, AlbertTokenizer
from transliterate import translit
from googletrans import Translator

class TextChanger:
    """String helpers that prepare inputs for the gloss model and the mask model."""

    def replace_until_next_word(self, text):
        """Blank out every run that starts at '-' or '.' and lasts until the next space.

        Every character of such a run, including the leading '-' or '.', is
        replaced by one '_', so the result always has the same length as
        ``text``. Only the space character ends a run; tabs and newlines are
        swallowed into it.

        Args:
            text: The string to process.

        Returns:
            A copy of ``text`` with those runs replaced by underscores.
        """
        # `[^ ]*` rather than `\S*`: only a literal space terminates a run.
        return re.sub(r"[-.][^ ]*", lambda run: "_" * len(run.group()), text)

    def replace_underscores_with_mask(self, text):
        """Replace each run of two or more underscores with ``" [MASK]"``.

        A single underscore is left untouched.

        Args:
            text: The string to process.

        Returns:
            ``text`` with the underscore runs replaced.
        """
        return re.sub(r"_{2,}", " [MASK]", text)

    def generate_masks_input(self, text, tokenizer):
        """Mask ``text`` and tokenize the masked string.

        Args:
            text: The string to mask (see ``replace_until_next_word`` and
                ``replace_underscores_with_mask``).
            tokenizer: Callable invoked as ``tokenizer(masked, return_tensors="pt")``.

        Returns:
            Whatever ``tokenizer`` returns for the masked string.
        """
        underscored = self.replace_until_next_word(text)
        masked = self.replace_underscores_with_mask(underscored)
        return tokenizer(masked, return_tensors="pt")

    def generate_gloss_text(self, tokenizer, transcription, translation, lang="Ijor", metalang="English", is_segmented=False):
        """Build the glossing prompt and tokenize it.

        Args:
            tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors="pt")``.
            transcription: Text inserted after "Transcription in {lang}:".
            translation: Text inserted after "Translation in {metalang}:".
            lang: Name of the transcribed language used in the prompt.
            metalang: Name of the translation language used in the prompt.
            is_segmented: Value inserted after "Transcription segmented:".

        Returns:
            Whatever ``tokenizer`` returns for the prompt.
        """
        # NOTE: the indentation inside the literal is part of the prompt text;
        # keep it byte-for-byte unless the model is retrained on a new format.
        return tokenizer(f"""Provide the glosses for the following transcription in {lang}.

        Transcription in {lang}: {transcription}
        Transcription segmented: {is_segmented}
        Translation in {metalang}: {translation}\n
        Glosses:
        """, return_tensors="pt")

class ModelLoader:
    """Thin wrappers around ``from_pretrained`` for the models and tokenizers used here."""

    def load_gloss_model(self, path_to_model="/content/drive/My Drive/fine_tuned_t5"):
        """Return a ``T5ForConditionalGeneration`` loaded from ``path_to_model``."""
        gloss_model_cls = transformers.T5ForConditionalGeneration
        return gloss_model_cls.from_pretrained(path_to_model)

    def load_gloss_tokenizer(self, path_to_tokenizer="google/byt5-base"):
        """Return a ``ByT5Tokenizer`` loaded from ``path_to_tokenizer`` with ``use_fast=False``."""
        gloss_tokenizer_cls = transformers.ByT5Tokenizer
        return gloss_tokenizer_cls.from_pretrained(path_to_tokenizer, use_fast=False)

    def load_mask_model(self, path_to_model="/content/drive/My Drive/albert"):
        """Return an ``AlbertForMaskedLM`` loaded from ``path_to_model``."""
        return AlbertForMaskedLM.from_pretrained(path_to_model)

    def load_mask_tokenizer(self, path_to_tokenizer="/content/drive/My Drive/albert"):
        """Return an ``AlbertTokenizer`` loaded from ``path_to_tokenizer``."""
        return AlbertTokenizer.from_pretrained(path_to_tokenizer)

textChanger = TextChanger()
modelLoader = ModelLoader()

GlossModel = modelLoader.load_gloss_model()
MaskModel = modelLoader.load_mask_model()
tokenizerGlossModel = modelLoader.load_gloss_tokenizer()
tokenizerMaskModel = modelLoader.load_mask_tokenizer()

translator = Translator()

i = input()
transcription = translit(i, language_code='ru', reversed=True)
result = await translator.translate(i)
translation = result.text
textToGlossModel = textChanger.generate_gloss_text(tokenizerGlossModel, i, transcription, translation)
GlossModelAnswer = tokenizerGlossModel.batch_decode(
    GlossModel.generate(**textToGlossModel, max_length=1024), skip_special_tokens=True
)
print("GLOSS MODEL ANSWER ->", GlossModelAnswer[0])

MaskTextFromGlossModel = textChanger.replace_underscores_with_mask(textChanger.replace_until_next_word(GlossModelAnswer[0]))
textToMaskModel = textChanger.generate_masks_input(GlossModelAnswer[0], tokenizerMaskModel)

# Получаем logits и inputs
maskModelOutputs = MaskModel(**textToMaskModel)
logits = maskModelOutputs.logits
inputs = textToMaskModel

# Находим все позиции масок
mask_token_indices = (inputs['input_ids'] == tokenizerMaskModel.mask_token_id).nonzero(as_tuple=True)[1]

# Предсказание токенов для каждой маски
predicted_tokens = []
for mask_index in mask_token_indices:
    predicted_token_id = logits[0, mask_index].argmax(axis=-1)
    predicted_token = tokenizerMaskModel.decode(predicted_token_id)
    predicted_tokens.append(predicted_token)

# Замена масок на предсказанные токены
predicted_text = MaskTextFromGlossModel
for token in predicted_tokens:
    predicted_text = predicted_text.replace("[MASK]", token, 1)  # Заменяем только одну маску за раз

print("TEXT TO MASK MODEL ->")
print("FINAL_ANSWER ->", predicted_text)

loading configuration file /content/drive/My Drive/fine_tuned_t5/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3968,
  "d_kv": 64,
  "d_model": 1536,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 12,
  "num_layers": 18,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "ByT5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "vocab_size": 384
}

loading weights file /content/drive/My Drive/fine_tuned_t5/model.safetensors
Ge

tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--byt5-base/snapshots/92d8c008d55cf7c254915bac165171dfe6c20c44/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--google--byt5-base/snapshots/92d8c008d55cf7c254915bac165171dfe6c20c44/tokenizer_config.json
loading file tokenizer.json from cache at None
loading file chat_template.jinja from cache at None


config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--byt5-base/snapshots/92d8c008d55cf7c254915bac165171dfe6c20c44/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3968,
  "d_kv": 64,
  "d_model": 1536,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 12,
  "num_layers": 18,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "ByT5Tokenizer",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "vocab_size": 384
}

loading file 