In [1]:
from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute

In [1]:
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from functools import lru_cache

import Levenshtein
import numpy as np

from access.resources.paths import FASTTEXT_EMBEDDINGS_PATH
from access.resources.prepare import prepare_fasttext_embeddings
from access.text import (to_words, remove_punctuation_tokens, remove_stopwords, spacy_process)
from access.utils.helpers import yield_lines


@lru_cache(maxsize=1)
def get_word2rank(vocab_size=np.inf):
    """Build a mapping from word to its 0-based line rank in the FastText embeddings file.

    The result is memoized with ``lru_cache`` because every helper in this cell
    (``get_rank``, ``get_lexical_complexity_score``) calls this function repeatedly,
    and without caching each call would re-read the whole embeddings file.

    Args:
        vocab_size: Maximum number of entries to read after the header line.
            Defaults to ``np.inf``, i.e. read every line.

    Returns:
        dict mapping the first space-separated token of each line to its
        0-based position after the header. The same dict object is returned
        to every caller (it is cached), so callers must not mutate it.
    """
    prepare_fasttext_embeddings()
    # TODO: Decrease vocab size or load from smaller file
    word2rank = {}
    line_generator = yield_lines(FASTTEXT_EMBEDDINGS_PATH)
    next(line_generator)  # Skip the first line (header)
    for i, line in enumerate(line_generator):
        if (i + 1) > vocab_size:
            break
        # The word is the first space-separated field of the line.
        word = line.split(' ')[0]
        word2rank[word] = i
    return word2rank


def get_rank(word):
    """Return the rank of ``word`` in the word-to-rank table.

    Words that are not in the table get the table size as their rank, i.e. one
    more than the largest rank stored in it.

    The table is fetched once and reused for both the lookup and the fallback,
    instead of calling ``get_word2rank()`` twice per word.
    """
    word2rank = get_word2rank()
    return word2rank.get(word, len(word2rank))


def get_log_rank(word):
    """Return the natural log of one plus the rank given by ``get_rank(word)``."""
    rank = get_rank(word)
    return np.log(1 + rank)


def get_lexical_complexity_score(sentence):
    """Score a sentence by the third quartile of its words' log-ranks.

    The sentence is stripped of punctuation tokens and stopwords and split into
    words; words missing from the word-to-rank table are ignored.

    The table is fetched once per call. Because only in-table words are kept,
    ``log(1 + word2rank[word])`` equals what ``get_log_rank(word)`` would return,
    without going back through ``get_word2rank()`` for every word.

    Args:
        sentence: The sentence to score.

    Returns:
        The 0.75 quantile of ``log(1 + rank)`` over the retained words, or
        ``log(1 + len(word2rank))`` when no word is retained.
    """
    words = to_words(remove_stopwords(remove_punctuation_tokens(sentence)))
    word2rank = get_word2rank()
    log_ranks = [np.log(1 + word2rank[word]) for word in words if word in word2rank]
    if len(log_ranks) == 0:
        return np.log(1 + len(word2rank))  # TODO: This is completely arbitrary
    return np.quantile(log_ranks, 0.75)


def get_levenshtein_similarity(complex_sentence, simple_sentence):
    """Return ``Levenshtein.ratio`` computed between the two sentences.

    Args:
        complex_sentence: The original sentence.
        simple_sentence: The simplified sentence.
    """
    similarity = Levenshtein.ratio(complex_sentence, simple_sentence)
    return similarity


def get_dependency_tree_depth(sentence):
    """Return the depth of the deepest dependency tree found in ``sentence``.

    ``sentence`` is run through ``spacy_process``; for each entry of the result's
    ``.sents`` the depth is measured from its ``.root`` by following ``.children``.
    A node without children has depth 0. If there are no sentences, 0 is returned.
    """
    def depth_below(node):
        # One level per child hop; a leaf contributes 0 via the ``default``.
        return max((1 + depth_below(child) for child in node.children), default=0)

    parsed = spacy_process(sentence)
    return max((depth_below(sent.root) for sent in parsed.sents), default=0)


In [3]:
# Try the lexical complexity score on one long literary sentence.
# NOTE: the first call to get_word2rank() runs prepare_fasttext_embeddings(); the
# recorded output of this cell shows a 1264 MB download and then a KeyboardInterrupt,
# so no score was ever displayed for this run.
sentence = "As he crossed toward the pharmacy at the corner he involuntarily turned his head because of a burst of light that had ricocheted from his temple, and saw, with that quick smile with which we greet a rainbow or a rose, a blindingly white parallelogram of sky being unloaded from the van—a dresser with mirrors across which, as across a cinema screen, passed a flawlessly clear reflection of boughs sliding and swaying not arboreally, but with a human vacillation, produced by the nature of those who were carrying this sky, these boughs, this gliding façade."
get_lexical_complexity_score(sentence)

Downloading...
... 100% - 1264 MB - 11.92 MB/s - 106s
Extracting...


KeyboardInterrupt: 

In [2]:
from pathlib import Path
import shutil
import sys
import tempfile


In [None]:
# `fileinput` is not imported by any other cell, so import it where it is used.
import fileinput

# Tokenize every line of the input file and write the result to a temp file.
# The input file is named explicitly: a bare fileinput.input() reads the paths in
# sys.argv[1:], which inside a Jupyter kernel are the kernel's own launch arguments
# rather than data files. "source.complex" is the file the following cells read.
source_filepath = get_temp_filepath()
with fileinput.input(files=("source.complex",)) as source_lines:
    write_lines([word_tokenize(line) for line in source_lines], source_filepath)

In [None]:
# Scratch cell: opens the input file for reading and leaves the handle in `f`.
# NOTE(review): the handle is never closed here, and the next cell opens the same
# file again — this cell looks redundant; confirm before relying on it.
f = open("source.complex",'r') 


In [3]:
# Tokenize "source.complex" line by line and write the tokenized lines to a temp file
# whose path (`source_filepath`) is later handed to the simplifier.
source_filepath = get_temp_filepath()

# Read inside a `with` block so the input handle is closed as soon as tokenization
# is done (previously it was left open and its name was reused for the output file).
with open("source.complex", 'r') as source_file:
    tokenized_lines = [word_tokenize(line) for line in source_file]

print(source_filepath)

filepath = Path(source_filepath)
# Make sure the destination directory exists before writing.
filepath.parent.mkdir(parents=True, exist_ok=True)
with filepath.open('w') as f:
    for line in tokenized_lines:
        print(line)  # echo each tokenized line so it shows in the cell output
        f.write(line + '\n')

/var/folders/bc/fl55zrvs7l102488vgch97qm0000gn/T/tmpgy9otb4m
Some trails are designated as nature trails , and are used by people learning about the natural world .


In [4]:
best_model_dir = prepare_models()
# Load best model
# This dict must be defined in this cell: it is used just below, and on a fresh
# kernel (Restart & Run All) nothing else defines it.
recommended_preprocessors_kwargs = {
    'LengthRatioPreprocessor': {'target_ratio': 0.25},
    'LevenshteinPreprocessor': {'target_ratio': 0.25},
    'WordRankRatioPreprocessor': {'target_ratio': 0.25},
    'SentencePiecePreprocessor': {'vocab_size': 10000},
}

preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
simplifier = get_fairseq_simplifier(best_model_dir)
# NOTE(review): `preprocessors=None` is passed here, so the `preprocessors` list built
# above is not used by this simplifier; the last cell of the notebook passes
# `preprocessors=preprocessors` instead — confirm which one is intended.
simplifier = get_preprocessed_simplifier(simplifier, preprocessors=None)

# Temp file that will receive the predictions.
pred_filepath = get_temp_filepath()
print(pred_filepath)
print(simplifier)

/var/folders/bc/fl55zrvs7l102488vgch97qm0000gn/T/tmp6m0731wy
<function get_fairseq_simplifier.<locals>.fairseq_simplifier at 0x133e58488>


In [9]:
# Rebuild the preprocessor list from the recommended kwargs.
# NOTE(review): this repeats a statement from the model-setup cell and relies on
# `recommended_preprocessors_kwargs` already existing in the kernel — confirm it is
# defined before this cell runs.
preprocessors = get_preprocessors(recommended_preprocessors_kwargs)

In [None]:
def get_preprocessed_simplifier(simplifier, preprocessors):
    """Wrap ``simplifier`` so that its input is encoded and its output decoded.

    The returned function encodes ``complex_filepath`` with the composed
    preprocessor into a temp file, runs ``simplifier`` on that file, and decodes
    the simplifier's output into ``output_pred_filepath``. It is passed through
    ``memoize_simplifier``, and its ``__name__`` gets the composed
    preprocessor's suffix appended.

    Args:
        simplifier: Callable taking ``(input_filepath, output_filepath)``.
        preprocessors: Value passed as-is to ``ComposedPreprocessor``.

    Returns:
        A callable ``(complex_filepath, output_pred_filepath)`` that returns ``None``.
    """
    # Local imports keep this cell self-contained: no other cell in the notebook
    # brings these names into scope.
    import os
    import tempfile
    from functools import wraps
    # NOTE(review): assumes ComposedPreprocessor lives next to get_preprocessors
    # in access.preprocessors — TODO confirm.
    from access.preprocessors import ComposedPreprocessor

    composed_preprocessor = ComposedPreprocessor(preprocessors)

    def _new_temp_filepath():
        # mkstemp returns an *open* descriptor along with the path; close it so
        # descriptors are not leaked on every call.
        fd, path = tempfile.mkstemp()
        os.close(fd)
        return path

    @memoize_simplifier
    @wraps(simplifier)
    def preprocessed_simplifier(complex_filepath, output_pred_filepath):
        print(f'preprocessors={preprocessors}')
        preprocessed_complex_filepath = _new_temp_filepath()
        composed_preprocessor.encode_file(complex_filepath, preprocessed_complex_filepath)
        preprocessed_output_pred_filepath = _new_temp_filepath()
        simplifier(preprocessed_complex_filepath, preprocessed_output_pred_filepath)
        # The original (un-encoded) input is passed to the decoder as encoder_filepath.
        composed_preprocessor.decode_file(preprocessed_output_pred_filepath,
                                          output_pred_filepath,
                                          encoder_filepath=complex_filepath)

    preprocessed_simplifier.__name__ = f'{preprocessed_simplifier.__name__}_{composed_preprocessor.get_suffix()}'
    return preprocessed_simplifier


preprocessors=[LengthRatioPreprocessor(target_ratio=0.25), LevenshteinPreprocessor(bucket_size=0.05, noise_std=0, target_ratio=0.25), WordRankRatioPreprocessor(target_ratio=0.25), SentencePiecePreprocessor(input_filepaths=None, vocab_size=10000)]
simplifier_type="fairseq_simplifier"  
exp_dir="/Users/alex/personal_projects/access/resources/models/best_model"  


usage: ipykernel_launcher.py [-h] [--no-progress-bar] [--log-interval N]
                             [--log-format {json,none,simple,tqdm}]
                             [--tensorboard-logdir DIR] [--seed N] [--cpu]
                             [--fp16] [--memory-efficient-fp16]
                             [--fp16-no-flatten-grads]
                             [--fp16-init-scale FP16_INIT_SCALE]
                             [--fp16-scale-window FP16_SCALE_WINDOW]
                             [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                             [--min-loss-scale D]
                             [--threshold-loss-scale THRESHOLD_LOSS_SCALE]
                             [--user-dir USER_DIR]
                             [--empty-cache-freq EMPTY_CACHE_FREQ]
                             [--all-gather-list-size ALL_GATHER_LIST_SIZE]
                             [--model-parallel-size N]
                             [--checkpoint-suffix CHECKPOINT_SUFFIX]
              

SystemExit: 2

In [None]:



def memoize_simplifier(simplifier):
    """Return a wrapper around ``simplifier`` that reuses earlier predictions.

    The wrapper keys its cache on the SHA-256 of the *contents* of the input
    file. When the same contents were simplified before and the earlier
    prediction file still exists, that file is copied to the requested output
    path instead of calling ``simplifier`` again.

    Args:
        simplifier: Callable taking ``(complex_filepath, pred_filepath)`` and
            writing its predictions to ``pred_filepath``.

    Returns:
        A callable with the same ``(complex_filepath, pred_filepath)`` signature
        that returns ``None``; metadata is copied from ``simplifier`` via ``wraps``.
    """
    # Local imports keep this cell runnable regardless of cell execution order.
    import hashlib
    import shutil
    from functools import wraps
    from pathlib import Path

    memo = {}  # content hash of the input file -> path of the latest prediction

    def _file_digest(filepath):
        # Hash in chunks so large inputs are not loaded into memory at once.
        digest = hashlib.sha256()
        with open(filepath, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                digest.update(chunk)
        return digest.hexdigest()

    @wraps(simplifier)
    def wrapped(complex_filepath, pred_filepath):
        key = _file_digest(complex_filepath)
        previous_pred_filepath = memo.get(key)
        if previous_pred_filepath is not None and Path(previous_pred_filepath).exists():
            # Reuse the earlier prediction; skip the copy when source and destination
            # are the same file (shutil.copyfile would raise SameFileError).
            if Path(previous_pred_filepath).resolve() != Path(pred_filepath).resolve():
                shutil.copyfile(previous_pred_filepath, pred_filepath)
        else:
            simplifier(complex_filepath, pred_filepath)
        memo[key] = pred_filepath

    return wrapped

In [5]:
# Leftover debugging magic: when no exception has been raised yet it only prints
# "No traceback available to show." (as in the recorded output of this cell).
%tb
# Run the simplifier on the tokenized source file, writing predictions to pred_filepath.
# NOTE(review): the recorded run of this call ended with an argparse usage message and
# `SystemExit: 2`; the cause is not visible in this notebook — TODO investigate before
# relying on this cell.
simplifier(source_filepath, pred_filepath)

preprocessors=[LengthRatioPreprocessor(target_ratio=0.25), LevenshteinPreprocessor(bucket_size=0.05, noise_std=0, target_ratio=0.25), WordRankRatioPreprocessor(target_ratio=0.25), SentencePiecePreprocessor(input_filepaths=None, vocab_size=10000)]
simplifier_type="fairseq_simplifier"  
exp_dir="/Users/alex/personal_projects/access/resources/models/best_model"  


No traceback available to show.
usage: ipykernel_launcher.py [-h] [--no-progress-bar] [--log-interval N]
                             [--log-format {json,none,simple,tqdm}]
                             [--tensorboard-logdir DIR] [--seed N] [--cpu]
                             [--fp16] [--memory-efficient-fp16]
                             [--fp16-no-flatten-grads]
                             [--fp16-init-scale FP16_INIT_SCALE]
                             [--fp16-scale-window FP16_SCALE_WINDOW]
                             [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                             [--min-loss-scale D]
                             [--threshold-loss-scale THRESHOLD_LOSS_SCALE]
                             [--user-dir USER_DIR]
                             [--empty-cache-freq EMPTY_CACHE_FREQ]
                             [--all-gather-list-size ALL_GATHER_LIST_SIZE]
                             [--model-parallel-size N]
                             [--checkpoint-suffix C

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:



# Build the simplifier again, this time passing the `preprocessors` list to the wrapper.
# Relies on `best_model_dir` and `preprocessors` from earlier cells.
simplifier = get_fairseq_simplifier(best_model_dir)
simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
# Fresh temp file to receive the predictions.
pred_filepath = get_temp_filepath()
print(pred_filepath)
# The generation step itself is disabled below.
# NOTE(review): presumably because the simplifier call exits with SystemExit: 2 in an
# earlier cell — confirm before re-enabling.
# with mute():
#     simplifier(source_filepath, pred_filepath)
# for line in yield_lines(pred_filepath):
#     print(line)
