In [None]:
!git clone https://github.com/YRL-AIDA/T5-GlyF.git
%cd T5-GlyF
!pip install -r requirements.txt

In [2]:
import torch
from glyf.corrector.glyph_corrector import GlyphCorrector
from utils.utils import load_pkl, load_json

In [3]:
# --- Inference configuration -------------------------------------------------
# Prompt prefix handed to the corrector together with every input.
prefix = 'fix homoglyphs: '

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Repo-relative resources (the working directory is the repository root).
path_to_data = 'data/processed/small_dataset_example.pkl'
path_to_glyphs = 'data/external/homoglyphs.pkl'
# NOTE(review): absolute Google Drive location -- this only resolves on a Colab
# runtime with Drive mounted and the model stored under this folder; adjust
# for other environments.
path_to_model = '/content/drive/MyDrive/homoglyphs/model'

# --- Build the corrector and show how it is documented ------------------------
corrector = GlyphCorrector(path_to_model, path_to_glyphs, prefix, device)
print(f'Device: {device}')
print(corrector.__doc__)

Device: cuda

    Inference class for trained model.

    :param model_path: path to model; 
    :param glyphs_path: path to dictionary of homoglyphs;
    :param prefix: additional prompt for model (for example, 'fix homoglyphs: '), some models need it;
    :param device: device on which the model will be located (cpu/gpu);
    


In [4]:
# Load the example dataset from `path_to_data`, which is defined once in the
# configuration cell above (it was previously re-assigned here with the same
# value, leaving two places to edit when the dataset changes).
# NOTE(review): `load_pkl` presumably unpickles the file -- only load pickle
# files from sources you trust.
examples = load_pkl(path_to_data)

# Dataset size, followed by a preview of the first three entries.
print(len(examples))
examples[:3]

1000


[['Tending to calɱ  мᴏԁȩɽɑƫę ơɼ țʀàṅʠᴜі1ɪᴢẹ',
  'Tending to calm  moderate or tranquilize'],
 ['A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGEȘ ȨṬС.',
  'A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGES ETC.'],
 ['ANY ONE OF NUMEROUS SPECIES OF EUROPEAN MOTHS OF THE FAMILY LȨŪÇАṈĪḌĄĘ.',
  'ANY ONE OF NUMEROUS SPECIES OF EUROPEAN MOTHS OF THE FAMILY LEUCANIDAE.']]

In [11]:
# Display the docstring of the single-sentence correction method.
correct_doc = corrector.correct.__doc__
print(correct_doc)


        Corrects a single input sentence.

        :param x: input sentence;
        :return: corrected sentence.
        


In [5]:
# Run the corrector on one example and show it next to the reference text.
# Per the `evaluate` docstring, each dataset entry is [attacked, corrected].
first_pair = examples[0]
corrected_example = corrector.correct(first_pair[0])

print()  # blank line to separate the report from the progress bar
report_lines = (
    f'Original sentence:  {first_pair[1]}',
    f'With homoglyphs:    {first_pair[0]}',
    f'Corrected sentence: {corrected_example}',
)
for report_line in report_lines:
    print(report_line)

100%|██████████| 1/1 [00:03<00:00,  3.93s/it]


Original sentence:  Tending to calm  moderate or tranquilize
With homoglyphs:    Tending to calɱ  мᴏԁȩɽɑƫę ơɼ țʀàṅʠᴜі1ɪᴢẹ
Corrected sentence: Tending to calm  moderate or tranquilize





In [10]:
# Display the docstring of the batched correction method.
batch_correct_doc = corrector.batch_correct.__doc__
print(batch_correct_doc)


        Corrects input list of sentences.

        :param sentences: input list of sentences;
        :param batch_size: size of subsample of input sentences;
        :return: corrected sentences.
        


In [6]:
# Correct the attacked side of the first five examples in a single batch.
attacked_sentences = [pair[0] for pair in examples[:5]]
batch_results = corrector.batch_correct(attacked_sentences, batch_size=5)

# Only the first element of the result is kept.
# NOTE(review): presumably the result is grouped per batch, so `[0]` covers all
# inputs only while they fit into one batch -- confirm against `batch_correct`.
corrected_examples = batch_results[0]
corrected_examples

100%|██████████| 1/1 [00:03<00:00,  3.14s/it]


['Tending to calm  moderate or tranquilize',
 'A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGES ETC.',
 'ANY ONE OF NUMEROUS SPECIES OF EUROPEAN MOTHS OF THE FAMILY LEUCANIDAE.',
 'The positive  or non-acid component of a salt; a substance which combined with an acid neutralizes the latter and forms a salt; -- applied also to the hydroxides of the positive elements or radicals and to certain organic bodies resembling them in their property of forming salts with acids.',
 'A picture or hieroglyph representing and expressing an idea.']

In [7]:
# Side-by-side report for the first five examples: reference text, attacked
# text, and the corrector's output, separated by a horizontal rule.
for idx, pair in enumerate(examples[:5]):
    print(f'Original sentence:  {pair[1]}')
    print(f'With homoglyphs:    {pair[0]}')
    print(f'Corrected sentence: {corrected_examples[idx]}')
    print('-' * 42)

Original sentence:  Tending to calm  moderate or tranquilize
With homoglyphs:    Tending to calɱ  мᴏԁȩɽɑƫę ơɼ țʀàṅʠᴜі1ɪᴢẹ
Corrected sentence: Tending to calm  moderate or tranquilize
------------------------------------------
Original sentence:  A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGES ETC.
With homoglyphs:    A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGEȘ ȨṬС.
Corrected sentence: A VISCID FLUID SECRETED BY MUCOUS MEMBRANES  WHICH IT SERVES TO MOISTEN AND PROTECT. IT COVERS THE LINING MEMBRANES OF ALL THE CAVITIES WHICH OPEN EXTERNALLY SUCH AS THOSE OF THE MOUTH NOSE LUNGS INTESTINAL CANAL URINARY PASSAGES ETC.
---------

In [8]:
# Display the docstring of the dataset evaluation method.
evaluate_doc = corrector.evaluate.__doc__
print(evaluate_doc)


        Evaluate the model on dataset.

        :param dataset: data ([[X, y], ...], X - attacked sentence, y - corrected sentence);
        :param batch_size: size of subsample of input sentences (default = 32);
        :param logs_path: path to file for logging (for example, 'logs.log');
        :return: calculated metrics on the dataset (accuracy and levenshtein ratio)
        


In [9]:
# Evaluate the corrector on the full example set; per the method's docstring
# it returns accuracy and Levenshtein ratio, and logs to `logs_path`.
metrics = corrector.evaluate(
    examples,
    batch_size=128,
    logs_path='logs/logs.log',
)
acc, l_ratio = metrics

print()  # blank line to separate the metrics from the progress bar
print(f'Accuracy: {acc}')
print(f'Levenshtein ratio: {l_ratio}')

100%|██████████| 8/8 [00:46<00:00,  5.81s/it]


Accuracy: 0.993
Levenshtein ratio: 0.9999399820820181



