In [46]:
from collections import OrderedDict

fname = '1x03c.srt'
subdata = OrderedDict()

'''
PART 1: PARSE THE DATA
'''
def process_lines(subdata: OrderedDict, chunk: list) -> OrderedDict:
    subdata[int(chunk[0])] = {
        'time': chunk[1],
        'text': ' '.join(chunk[2:])
    }
    return subdata

with open(fname, encoding='latin-1') as f:
    chunk = []
    lines = f.read().splitlines()

for line in lines:
    if line == '':
        if len(chunk) > 0:
            process_lines(subdata, chunk)
        chunk = []
    if line.isnumeric() or len(chunk) != 0:
        chunk.append(line)
if len(chunk) > 0:
    process_lines(subdata, chunk)

len(subdata)
subdata

OrderedDict([(2,
              {'time': '00:01:04,560 --> 00:01:05,520',
               'text': '              Ella dijo'}),
             (3,
              {'time': '00:01:04,560 --> 00:01:05,520',
               'text': '              que me podia dar un regalo cuando yo vuelva.'}),
             (4,
              {'time': '00:01:04,560 --> 00:01:05,520',
               'text': '              Y tambien dijo que hoy es jueves.'}),
             (5,
              {'time': '00:01:15,400 --> 00:01:16,360',
               'text': '              Roja.'}),
             (6,
              {'time': '00:01:17,119 --> 00:01:18,560',
               'text': '              Rojo.                -Roja.'}),
             (7,
              {'time': '00:01:20,000 --> 00:01:21,040',
               'text': '              Roja.'}),
             (8,
              {'time': '00:01:27,160 --> 00:01:28,160',
               'text': '              Roja.'}),
             (9,
              {'time': '00:03:35,879 --> 00

In [47]:
'''
PART 2: JOIN THE TEXT
'''
import re
from datetime import datetime

all_text = ''
prev_time = ''
it = 0
for id, vals in subdata.items():

    text = vals['text'].strip()

    if len(all_text) > 0 and all_text[-1] != '\n':
        all_text += ' '

    if prev_time == '':
        # take the last time of the sequence
        prev_time = re.findall(r'(\d\d:\d\d:\d\d)', vals['time'])[-1]

    # check if a lot of time has passed since the last text. if so, this is a new sentence
    # get current time, first time of this line
    current_time = re.findall(r'(\d\d:\d\d:\d\d)', vals['time'])[0]
    tdelta = (datetime.strptime(current_time, '%H:%M:%S') - datetime.strptime(prev_time, '%H:%M:%S')).total_seconds()
    if int(tdelta) > 10 and all_text[-1] != '\n':
        all_text += '\n' + text
        it += 1
    else:
        all_text += text
    subdata[id]['map'] = it

    # check if text ends in punctuation. if so, it's the end of the sentence
    if text.rstrip('</i>')[-1] in ['.', '?', '!', ')']:
        all_text += '\n'
        it += 1
    
    prev_time = re.findall(r'(\d\d:\d\d:\d\d)', vals['time'])[-1]

all_text = all_text.split('\n')
'''
with open('ex.txt', 'w') as f:
    f.write(all_text)
'''
if all_text[-1] == '':
    all_text = all_text[:-1]
all_text[:20]


['Ella dijo que me podia dar un regalo cuando yo vuelva.',
 'Y tambien dijo que hoy es jueves.',
 'Roja.',
 'Rojo.                -Roja.',
 'Roja.',
 'Roja.',
 'Hostia,  Â¿habÃ©is visto el campus virtual?',
 'Hoy la clase de Cultura        es en el aula 110.',
 'Pues vamos para allÃ¡,    que si no, no habrÃ¡ sitio...',
 'No, desayunamos,       nos relajamos y ya...',
 'Bueno, ya tenÃ\xada ganas  de sentarme con mis compaÃ±eros, que me meten     puÃ±aladas por la espalda.',
 'Â¿No estÃ¡s enfadado?          -Â¿Yo? Â¡QuÃ© va!',
 'El ejercicio es buenÃ\xadsimo.',
 'AdemÃ¡s, la BolaÃ±o es la bomba como profe.',
 'Pero a ver, explicÃ¡dmelo.',
 'O sea, Â¿ella os ha dicho   que yo llegarÃ\xada tarde a clase?',
 'No, no. Nos ha dicho   que si alguien llegaba tarde, nos preguntarÃ\xada       el color de la carpeta y que dijÃ©ramos rojo.               -SÃ\xad.',
 'Es que si lo piensas, es buenÃ\xadsimo.',
 'Pero tiene un puntito cruel.',
 'No lo dirÃ¡s por mÃ\xad.          -Bueno, pues...']

In [None]:
from transformers import pipeline
model_checkpoint = "Helsinki-NLP/opus-mt-es-en"
translator = pipeline("translation", model=model_checkpoint)

In [48]:
'''
translated_text = []
for sentence in all_text:
    translated = translator(sentence)[0]['translation_text']
    #print(translated)
    translated_text.append(translated)
'''
translated_text = [translator(sentence)[0]['translation_text'] for sentence in all_text[:5]]

In [50]:
translator('Ella dijo que me podia dar un regalo cuando yo vuelva. Y tambien dijo que hoy es jueves.')

[{'translation_text': "She said she could give me a present when I get back, and she also said it's Thursday."}]

In [49]:
translated_text

['She said she could give me a present when I get back.',
 'And he also said that today is Thursday.',
 'Red.',
 'Red.',
 'Red.']

In [24]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-en")


In [25]:
text = '<SEP> '.join(all_text)
tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
translation = model.generate(**tokenized_text)
translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
translated_text = translated_text.split('<SEP> ')
translated_text

["<pad> You're the one who won't answer my calls! ",
 "Don't complain, baby! ",
 'See if I start working and let my head rest. ',
 "I've been told that if I take my driver's license, I could work in a parking lot parking cars, watching... A party! ",
 "Total, I don't know what the fuck to do with my life. ",
 "I'll do the selectivity and study history. ",
 'I already have enough with my grandmother\'s "depre". ',
 'And if I call you, Pol Rubio, take my mobile phone. ',
 'I close at home to cry all day? ',
 'I tried to keep with my life. ',
 'I already have enough with my grandmother\'s "depre".']

In [29]:
len(translated_text)

11

In [37]:
'''
PART 4: replace translated text in the sub file

dummy mode: replace all excerpts of a "sentence" by the whole translated sentence
since we have no alignments
'''
transdata = OrderedDict()
for key, vals in subdata.items():
    transdata[key] = {
        'time': vals['time'],
        'text': translated_text[vals['map']]
    }

transdata


OrderedDict([(2, {'time': '00:01:04,560 --> 00:01:05,520', 'text': 'Red.'}),
             (3, {'time': '00:01:10,040 --> 00:01:11,160', 'text': 'Red.'}),
             (4, {'time': '00:01:14,000 --> 00:01:15,000', 'text': 'Red.'}),
             (5, {'time': '00:01:15,400 --> 00:01:16,360', 'text': 'Red.'}),
             (6, {'time': '00:01:17,119 --> 00:01:18,560', 'text': 'Red.'}),
             (7, {'time': '00:01:20,000 --> 00:01:21,040', 'text': 'Red.'}),
             (8, {'time': '00:01:27,160 --> 00:01:28,160', 'text': 'Red.'}),
             (9,
              {'time': '00:03:35,879 --> 00:03:38,119',
               'text': 'Holy shit, have you seen the virtual campus?'}),
             (10,
              {'time': '00:03:38,280 --> 00:03:40,400',
               'text': 'Today the Culture class is in classroom 110.'}),
             (11,
              {'time': '00:03:41,920 --> 00:03:44,400',
               'text': "Well, let's go over there, otherwise there won't be any room..."}),
  

In [38]:
'''
PART 5: write translated data into .srt
'''

newf = fname.replace('.srt', '-spa.srt')
with open(newf, 'w', encoding='utf8') as writef:
    for key, vals in transdata.items():
        writef.write(str(key) + '\n')
        writef.write(vals['time'] + '\n')
        writef.write(vals['text'] + '\n')
        writef.write('\n')
