In [None]:
import spacy
import requests
from spacy import displacy
from IPython import display
from IPython.core.display import display, HTML
import os
from IPython.display import IFrame

In [None]:
# Read the whole of 'pg345.txt' into `dracula`. The encoding is stated
# explicitly because open() otherwise falls back to a platform-dependent
# default, which makes the notebook fail or mis-decode on some systems.
with open('pg345.txt', encoding='utf-8') as f:
    dracula = f.read()

In [None]:
!spacy download 'en'

Check that the spaCy model has been downloaded

In [None]:
# Run spaCy's `validate` command to confirm that a model is installed.
!python -m spacy validate

Load the spaCy English model (if you load it from a filesystem path, substitute your own path, which you can find using the cell above)

In [None]:
# Load the model by its installed package name instead of an absolute path
# inside one user's conda environment, so the notebook runs on any machine
# where `en_core_web_sm` has been installed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Keep the first 700 characters that follow the last occurrence of the
# '3 May. Bistritz' marker (the whole text is used if the marker is absent).
snipped = dracula.rpartition('3 May. Bistritz')[2][:700]

In [None]:
# Show the excerpt so the slice can be checked by eye.
snipped

Use spaCy's part-of-speech tagger to tag each token:

In [None]:
# Run the loaded spaCy pipeline over the excerpt; `doc` is iterated token by
# token in the cells below.
doc = nlp(snipped)

In [None]:
# Print one line per token listing its text, lemma, coarse and fine POS tags,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    fields = [
        getattr(token, name)
        for name in ('text', 'lemma_', 'pos_', 'tag_', 'dep_',
                     'shape_', 'is_alpha', 'is_stop')
    ]
    print(*fields)

Get a Yandex Translate API key (see https://translate.yandex.com) and, before starting Jupyter, run `export YANDEX_API_KEY=xxxxx` in the same shell, where `xxxxx` is your key. The cell below reads the key from that environment variable.

In [None]:
# Read the API key from the environment; the value is None when
# YANDEX_API_KEY is not set.
api_key = os.environ.get('YANDEX_API_KEY')

In [None]:
# Send a single test request ('hello', English -> French) to the translation
# endpoint. An explicit timeout is passed because requests otherwise waits
# indefinitely for an unresponsive server, which would hang the cell.
r = requests.post(
    'https://translate.yandex.net/api/v1.5/tr.json/translate',
    data={'key': api_key, 'text': 'hello', 'lang': 'en-fr'},
    timeout=10,
)

In [None]:
# Inspect the decoded JSON body of the test request.
r.json()

In [None]:
def get_translation(word, lang='en-fr', timeout=10):
    """Translate ``word`` through the Yandex Translate HTTP API.

    Parameters
    ----------
    word : str
        Text to translate (a single word or a short phrase).
    lang : str, optional
        Translation direction passed to the API; defaults to ``'en-fr'``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The entries of the response's ``'text'`` list joined with spaces.
        Empty or whitespace-only input is returned unchanged without
        contacting the API.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Nothing to translate: avoid spending an API request on it.
    if not word.strip():
        return word

    response = requests.post(
        'https://translate.yandex.net/api/v1.5/tr.json/translate',
        data={'key': api_key, 'text': word, 'lang': lang},
        timeout=timeout,
    )
    # Fail with a descriptive HTTP error (e.g. missing or invalid key) rather
    # than with a KeyError below when the payload carries no 'text' entry.
    response.raise_for_status()

    return ' '.join(response.json()['text'])
    

## Try and just translate nouns

In [None]:
def render_output_to_html(html_doc, filename):
    """Concatenate the sections of ``html_doc`` and write them to ``filename``.

    Parameters
    ----------
    html_doc : dict
        Mapping with string values under the keys ``'header'``, ``'body'``
        and ``'footer'``; they are written in that order.
    filename : str
        Path of the HTML file to create or overwrite.

    Returns
    -------
    str
        ``filename``, unchanged, so the call can be chained into a viewer.
    """
    full_doc = html_doc['header'] + html_doc['body'] + html_doc['footer']

    # Write UTF-8 explicitly: the translated text contains accented characters
    # and open()'s default encoding differs between platforms.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(full_doc)

    return filename

def display_html(filename, width=200, height=200):
    """Return an ``IFrame`` that embeds ``filename`` at the given size."""
    frame = IFrame(width=width, height=height, src=filename)
    return frame

def mark_translation(text, marker = 'em'):
    """Wrap ``text`` in an HTML element named ``marker`` (``<em>`` by default)."""
    opening_tag = '<' + marker + '>'
    closing_tag = '</' + marker + '>'
    return opening_tag + text + closing_tag

In [None]:
# Walk the document token by token: nouns, determiners and adjectives are
# sent to the translator one at a time, every other token is copied verbatim.
translated_tags = ('NOUN', 'DET', 'ADJ')
output = ''
for token in doc:
    piece = token.text_with_ws
    if token.pos_ in translated_tags:
        piece = get_translation(piece)
    output += piece

In [None]:
# Inspect the mixed original/translated text built by the loop above.
output

Just translating nouns is OK, but jarring, because the determiners are missing, e.g. 'one heure'.

You can't translate the determiners on their own either, so they need to be translated together with their nouns.

The strategy we adopt here is to start accumulating a "trace" whenever we hit a determiner or an adjective. When we get to the noun, we translate the whole trace as one phrase. This seems reasonable because in English adjectives precede nouns.

In [None]:
def _translate_trace(trace):
    """Translate the pieces collected in ``trace`` as one phrase and mark it up.

    Every piece already carries its own trailing whitespace, so the pieces are
    concatenated directly (joining them with ' ' would double each space).
    Trailing whitespace is kept out of the translation request and re-attached
    afterwards, so the spacing of the text does not depend on whether the
    translation service returns it.
    """
    phrase = ''.join(trace)
    core = phrase.rstrip()
    print('translating', trace)
    return mark_translation(get_translation(core)) + phrase[len(core):]


output = ''
trace = []
for token in doc:
    if token.pos_ in ('DET', 'ADJ'):
        # Start or extend a pending noun phrase.
        trace.append(token.text_with_ws)
        print(token.text_with_ws)
    elif token.pos_ in ('NOUN', 'PROPN'):
        # The noun closes the phrase: translate it together with any
        # determiners/adjectives collected before it.
        trace.append(token.text_with_ws)
        output += _translate_trace(trace)
        trace = []
        print('___')
    else:
        # Any other token ends a pending phrase. Flush it first so the words
        # stay in their original order instead of being emitted after
        # this token.
        if trace:
            output += _translate_trace(trace)
            trace = []
        output += token.text_with_ws

# A determiner/adjective at the very end of the text would otherwise be lost.
if trace:
    output += _translate_trace(trace)
    trace = []

In [None]:
# Render the marked-up string as HTML in the notebook output so the markup
# added by mark_translation is interpreted rather than shown literally.
display(HTML(output))

## Rendering HTML externally

In [None]:
def add_tooltip_ref(text):
    """Return ``text`` wrapped in a ``<div class="htmltooltip">`` element."""
    template = '<div class="htmltooltip"> {} </div>'
    return template.format(text)

def mark_translation_tooltip(text):
    """Return ``text`` wrapped in an ``<a rel="htmltooltip">`` anchor element."""
    template = '<a href="#" rel="htmltooltip"> {} </a>'
    return template.format(text)

Setup for HTML rendering

In [None]:
# Skeleton of the page to render; the three sections are filled in by later
# cells before the document is written out.
html_doc = dict(header=None, body=[], footer=[])


In [None]:
# Read the whole of 'hod-raw.txt' into `hod`. The encoding is stated
# explicitly because open() otherwise uses a platform-dependent default.
with open('hod-raw.txt', encoding='utf-8') as f:
    hod = f.read()

In [None]:
# Run the spaCy pipeline over the whole text; only the first `n_sample`
# tokens of the result are used further down.
hod_doc = nlp(hod)

In [None]:
# Show how many tokens the parsed document contains.
len(hod_doc)

## Translate a big chunk (this takes a while)

In [None]:
# Number of leading tokens of `hod_doc` to process in the loop below; each
# translated phrase costs one request to the translation API.
n_sample = 100

In [None]:
def _emit_phrase(trace, output, footer):
    """Translate the phrase held in ``trace`` and record it for rendering.

    The translated phrase is appended to ``output`` as a tooltip link and the
    original phrase is appended to ``footer`` as the tooltip content. Pieces
    already carry their own trailing whitespace, so they are concatenated
    directly; the phrase's trailing whitespace is re-attached after the link
    so that spacing does not depend on what the translation service returns.
    """
    phrase = ''.join(trace)
    core = phrase.rstrip()
    output.append(mark_translation_tooltip(get_translation(core)) + phrase[len(core):])
    footer.append(add_tooltip_ref(core))


output = []
footer = []
trace = []
sample = hod_doc[:n_sample]
for i, token in enumerate(sample):
    if token.pos_ in ('DET', 'ADJ'):
        # Start or extend a pending noun phrase.
        trace.append(token.text_with_ws)
    elif token.pos_ in ('NOUN', 'PROPN'):
        # The noun closes the phrase (which may consist of the noun alone).
        trace.append(token.text_with_ws)
        _emit_phrase(trace, output, footer)
        trace = []
    elif token.pos_ == 'PUNCT' and trace:
        # Terminate traces at punctuation, as these correspond to clause
        # endings. This test must come before the generic "keep accumulating"
        # branch, otherwise it can never be reached. The punctuation token
        # itself is emitted untranslated.
        _emit_phrase(trace, output, footer)
        trace = []
        output.append(token.text_with_ws)
    elif trace:
        # If we're still inside a noun phrase, keep accumulating.
        trace.append(token.text_with_ws)
    else:
        output.append(token.text_with_ws)

    if i % 1000 == 0:
        # Report progress against the number of tokens actually processed.
        print(f"{i} / {len(sample)}")

# Flush a phrase that was still open when the sample ended, so its tokens are
# not silently dropped from the output.
if trace:
    _emit_phrase(trace, output, footer)
    trace = []
    

In [None]:
def wrap_in_container(text):
    """Return ``text`` wrapped in a ``<div class='container'>`` element."""
    template = "<div class='container'> {} </div>"
    return template.format(text)

In [None]:
# Load the page header from 'header.html'. The encoding is stated explicitly
# because open() otherwise uses a platform-dependent default.
with open('header.html', encoding='utf-8') as f:
    html_doc['header'] = f.read()

In [None]:
# Body: all output pieces concatenated and wrapped in the container div.
# Footer: the tooltip divs, one per line.
html_doc.update({
    'body': wrap_in_container(''.join(output)),
    'footer': '\n'.join(footer),
})

In [None]:
# Write header + body + footer to 'test_doc.html'; the call returns the
# filename, which is shown as the cell output.
render_output_to_html(html_doc, 'test_doc.html')

In [None]:
# Embed the generated page in an IFrame with width 600 (height keeps the
# function's default of 200).
display_html('test_doc.html', width = 600)