Skip to content

Commit

Permalink
Merge remote-tracking branch 'srt-tools/master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
cdown committed Jun 21, 2016
2 parents 0d1a258 + c21e27f commit 4862a51
Show file tree
Hide file tree
Showing 10 changed files with 413 additions and 0 deletions.
34 changes: 34 additions & 0 deletions tools/README.rst
@@ -0,0 +1,34 @@
=========
srt-tools
=========

srt-tools is a repo containing utilities written to process SRT files. All
utilities use the Python srt_ library internally.

.. _srt: https://github.com/cdown/srt

Utilities
---------

- *chinese-lines-only* removes subtitle lines that don't appear to be
Chinese. Useful for turning joint English/Chinese subtitles into Chinese
subtitles only.
- *fix-subtitle-indexing* fixes subtitle indexing. Some badly formed SRT files
will have indexes that occur in a different order than the starting
timestamps for the subtitles they are associated with. This makes some media
players unable to display those subtitles, and they are subsequently lost
into the ether.
- *linear-timeshift* does linear time correction. If you have a movie that
runs slower or faster than the subtitle that you have, it will repeatedly
lose sync. This tool can apply linear time corrections to all subtitles in
the SRT, resyncing it with the video.
- *mux-subs* can mux_ multiple subtitles together into one. For example, if you
have a Chinese subtitle and an English subtitle, and you want to have one
subtitle file that contains both, this tool can do that for you. It also
supports clamping subtitles starting or ending at similar times to the same
time to avoid subtitles jumping around the screen.
- *strip-html* strips HTML formatting from subtitle content. This is especially
prevalent in `SSA/ASS`_ subtitles that have been directly converted to SRT.

.. _mux: https://en.wikipedia.org/wiki/Multiplexing
.. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha
28 changes: 28 additions & 0 deletions tools/chinese-lines-only.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python

from hanzidentifier import has_chinese
import srt
import utils


def strip_to_chinese_lines_only(subtitles):
    '''
    Lazily drop every line that has_chinese() rejects from each subtitle.

    Each subtitle's content is rewritten in place and the same subtitle
    object is yielded back, so a subtitle whose lines are all rejected is
    still yielded, just with empty content.
    '''
    for sub in subtitles:
        kept_lines = []
        for line in sub.content.splitlines():
            if has_chinese(line):
                kept_lines.append(line)
        sub.content = '\n'.join(kept_lines)
        yield sub


def main():
    '''
    Command-line entry point: read SRT text from args.input, keep only the
    lines accepted by strip_to_chinese_lines_only, and write the recomposed
    SRT text to args.output.
    '''
    args = utils.basic_parser().parse_args()
    filtered = strip_to_chinese_lines_only(srt.parse(args.input.read()))
    args.output.write(srt.compose(filtered, strict=args.strict))


if __name__ == '__main__':
    main()
15 changes: 15 additions & 0 deletions tools/fix-subtitle-indexing.py
@@ -0,0 +1,15 @@
#!/usr/bin/env python

import utils
import srt


def main():
    '''
    Command-line entry point: round-trip args.input through srt.parse and
    srt.compose and write the result to args.output.

    No processing of its own happens here; any renumbering is presumably
    done by srt.compose itself.
    '''
    args = utils.basic_parser().parse_args()
    parsed = srt.parse(args.input.read())
    args.output.write(srt.compose(parsed, strict=args.strict))


if __name__ == '__main__':
    main()
36 changes: 36 additions & 0 deletions tools/fixed-timeshift.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import srt
import datetime
import utils


def parse_args():
    '''
    Parse the command line: the options provided by utils.basic_parser()
    plus a mandatory floating-point --seconds.
    '''
    parser = utils.basic_parser()
    parser.add_argument(
        '--seconds', type=float, required=True,
        help='how many seconds to shift',
    )
    parsed = parser.parse_args()
    return parsed


def scalar_correct_subs(subtitles, seconds_to_shift):
    '''
    Lazily move every subtitle by a constant offset.

    seconds_to_shift is a number of seconds; it is added to both the start
    and the end of each subtitle, so a negative value moves subtitles
    earlier. The subtitles are modified in place and yielded back in the
    order they were received.
    '''
    offset = datetime.timedelta(seconds=seconds_to_shift)
    for sub in subtitles:
        sub.start = sub.start + offset
        sub.end = sub.end + offset
        yield sub


def main():
    '''
    Command-line entry point: shift every subtitle read from args.input by
    args.seconds and write the recomposed SRT text to args.output.
    '''
    args = parse_args()
    shifted = scalar_correct_subs(srt.parse(args.input.read()), args.seconds)
    args.output.write(srt.compose(shifted, strict=args.strict))


if __name__ == '__main__':
    main()
91 changes: 91 additions & 0 deletions tools/linear-timeshift.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python

import srt
import datetime
import utils


def timedelta_to_milliseconds(delta):
    '''
    Express a timedelta as a single number of milliseconds.

    The days, seconds and microseconds fields are each converted to
    milliseconds and summed. The microseconds go through "/", so
    sub-millisecond precision survives only where "/" is true division.
    '''
    whole_msecs = (delta.days * 86400 + delta.seconds) * 1000
    return whole_msecs + delta.microseconds / 1000

def parse_args():
    '''
    Parse the command line for the linear timeshift tool.

    On top of the options provided by utils.basic_parser(), four mandatory
    SRT timestamps are accepted: --from-start/--f1, --to-start/--t1,
    --from-end/--f2 and --to-end/--t2. Each one is stored as a millisecond
    count; a value that srt rejects with ValueError is reported through
    parser.error().
    '''
    parser = utils.basic_parser()

    def parse_timestamp(owning_parser, raw_timestamp):
        try:
            delta = srt.srt_timestamp_to_timedelta(raw_timestamp)
        except ValueError:
            owning_parser.error(
                'not a valid SRT timestamp: %s' % raw_timestamp
            )
        else:
            return timedelta_to_milliseconds(delta)

    # (long option, short alias, help text), in the order they are
    # registered with the parser.
    timestamp_options = (
        ('--from-start', '--f1', 'the first desynchronised timestamp'),
        ('--to-start', '--t1', 'the first synchronised timestamp'),
        ('--from-end', '--f2', 'the second desynchronised timestamp'),
        ('--to-end', '--t2', 'the second synchronised timestamp'),
    )
    for long_flag, short_flag, help_text in timestamp_options:
        parser.add_argument(
            long_flag,
            short_flag,
            type=lambda arg: parse_timestamp(parser, arg),
            required=True,
            help=help_text,
        )

    return parser.parse_args()


def calc_correction(to_start, to_end, from_start, from_end):
    '''
    Derive the straight line that maps desynchronised times onto
    synchronised ones.

    The line passes through (from_start, to_start) and (from_end, to_end).
    Returns the pair (angular, linear), i.e. slope and intercept, so that a
    corrected time is current * angular + linear. from_start and from_end
    must differ, otherwise computing the slope divides by zero.
    '''
    to_span = to_end - to_start
    from_span = from_end - from_start
    slope = to_span / from_span
    intercept = to_end - slope * from_end
    return slope, intercept


def correct_time(current_msecs, angular, linear):
    '''
    Apply the correction line to one time value: scale current_msecs by
    angular, add linear, and round the result.
    '''
    corrected = angular * current_msecs + linear
    return round(corrected)


def correct_timedelta(bad_delta, angular, linear):
    '''
    Return a new timedelta holding the corrected version of bad_delta.

    bad_delta is converted to milliseconds, run through correct_time() with
    the given angular and linear terms, and converted back to a timedelta.
    '''
    corrected_msecs = correct_time(
        timedelta_to_milliseconds(bad_delta), angular, linear,
    )
    return datetime.timedelta(milliseconds=corrected_msecs)


def linear_correct_subs(subtitles, angular, linear):
    '''
    Lazily apply the linear correction to every subtitle.

    Both the start and the end of each subtitle are replaced with the value
    returned by correct_timedelta(). The subtitles are modified in place
    and yielded back in the order they were received.
    '''
    for sub in subtitles:
        for field in ('start', 'end'):
            corrected = correct_timedelta(getattr(sub, field), angular, linear)
            setattr(sub, field, corrected)
        yield sub


def main():
    '''
    Command-line entry point: work out the correction from the four
    timestamps on the command line, apply it to every subtitle read from
    args.input, and write the recomposed SRT text to args.output.
    '''
    args = parse_args()
    correction = calc_correction(
        args.to_start, args.to_end, args.from_start, args.from_end,
    )
    # correction is the (angular, linear) pair returned by calc_correction.
    corrected = linear_correct_subs(
        srt.parse(args.input.read()), *correction
    )
    args.output.write(srt.compose(corrected, strict=args.strict))


if __name__ == '__main__':
    main()
51 changes: 51 additions & 0 deletions tools/lines-matching.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import importlib
import srt
import utils


def strip_to_matching_lines_only(subtitles, imports, func_str):
    '''
    Lazily drop every line that a user-supplied function rejects.

    subtitles is an iterable of subtitle objects with a string "content".
    imports is an iterable of module names to make available to the
    function; dotted names such as "os.path" are supported. func_str is a
    Python expression that evaluates to a callable taking one line and
    returning a truthy value for the lines to keep.

    Each subtitle's content is rewritten in place and the same object is
    yielded back. Because this is a generator, nothing is imported or
    evaluated until it is first advanced.
    '''
    for import_name in imports:
        # Modules are published as module globals because that is the
        # namespace the evaluated expression resolves its names in.
        globals()[import_name] = importlib.import_module(import_name)

        # A dotted module is reached from the expression through its
        # top-level package ("os.path" via the name "os"), so that name has
        # to be bound as well; the dotted key alone is not an identifier.
        top_level_name = import_name.partition('.')[0]
        if top_level_name != import_name:
            globals()[top_level_name] = importlib.import_module(
                top_level_name,
            )

    # SECURITY: func_str is executed as arbitrary Python code. That is the
    # purpose of this tool, but it must only ever receive input from the
    # person running it, never from an untrusted source.
    func = eval(func_str)  # pylint: disable=eval-used

    for subtitle in subtitles:
        matching_lines = [
            line for line in subtitle.content.splitlines()
            if func(line)
        ]
        subtitle.content = '\n'.join(matching_lines)
        yield subtitle


def parse_args():
    '''
    Parse the command line: the options provided by utils.basic_parser()
    plus a mandatory -f/--func and a repeatable -m/--module, whose values
    are collected into a list.
    '''
    parser = utils.basic_parser()
    parser.add_argument(
        '-f', '--func', required=True,
        help='a function to use to match lines',
    )
    parser.add_argument(
        '-m', '--module', action='append', default=[],
        help='modules to import in the function context',
    )
    parsed = parser.parse_args()
    return parsed


def main():
    '''
    Command-line entry point: read SRT text from args.input, keep only the
    lines accepted by the function given with --func, and write the
    recomposed SRT text to args.output.
    '''
    args = parse_args()
    parsed = srt.parse(args.input.read())
    matching = strip_to_matching_lines_only(parsed, args.module, args.func)
    args.output.write(srt.compose(matching, strict=args.strict))


if __name__ == '__main__':
    main()
72 changes: 72 additions & 0 deletions tools/mux.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python

import datetime
import srt
import utils
import logging
import operator

# Module-level logger; merge_subs reports each merge through it at DEBUG.
log = logging.getLogger(__name__)

def parse_args():
    '''
    Parse the command line for the mux tool.

    Starts from utils.basic_parser(multi_input=True) and adds --ms, an
    integer number of milliseconds stored as a timedelta (600 ms when not
    given), and --width, an integer that defaults to 5.
    '''
    default_diff = datetime.timedelta(milliseconds=600)

    parser = utils.basic_parser(multi_input=True)
    parser.add_argument(
        '--ms',
        metavar='MILLISECONDS',
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        default=default_diff,
        help='if subs being muxed are within this number of milliseconds '
             'of each other, they will get merged (default: 600)',
    )
    parser.add_argument(
        '--width',
        type=int,
        default=5,
        help='the number of subs to consider merging (default: %(default)s)',
    )
    return parser.parse_args()


def merge_subs(subs, acceptable_diff, attr, width):
    '''
    Snap near-identical times to a shared value so that muxed subtitles do
    not jump around the screen.

    attr names the attribute to align (this tool passes 'start' and 'end'),
    acceptable_diff is how far after the first sub of a window a later
    value may lie and still be merged, and width is handed to
    utils.sliding_window. The subtitle objects are changed in place and
    nothing is returned; the order of the subs sequence is left alone.
    '''
    by_time = sorted(subs, key=operator.attrgetter(attr))

    for window in utils.sliding_window(by_time, width=width):
        anchor = window[0]
        anchor_time = getattr(anchor, attr)

        for candidate in window[1:]:
            candidate_time = getattr(candidate, attr)
            if anchor_time + acceptable_diff <= candidate_time:
                # The window is sorted, so once one candidate is too far
                # away every later one is too.
                break

            log.debug(
                "Merging %d's %s time into %d",
                candidate.index, attr, anchor.index,
            )
            setattr(candidate, attr, anchor_time)


def main():
    '''
    Command-line entry point: gather the subtitles of every input file into
    one list, align nearby start times and then nearby end times, and write
    the composed SRT text to args.output.
    '''
    args = parse_args()
    logging.basicConfig(level=args.log_level)

    muxed_subs = [
        sub
        for file_input in args.input
        for sub in srt.parse(file_input.read())
    ]

    for attr in ('start', 'end'):
        merge_subs(muxed_subs, args.ms, attr, args.width)

    args.output.write(srt.compose(muxed_subs, strict=args.strict))


if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions tools/requirements.txt
@@ -0,0 +1,2 @@
srt
hanzidentifier
29 changes: 29 additions & 0 deletions tools/strip-html.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python

import re
import srt
import utils
import logging


def strip_html_from_subs(subtitles):
    '''
    Lazily remove HTML-style tags from the content of every subtitle.

    Anything of the form "<...>" that contains no further "<" is deleted,
    line by line. The subtitles are modified in place and yielded back in
    the order they were received.
    '''
    tag = re.compile('<[^<]+?>')
    for sub in subtitles:
        cleaned_lines = [
            tag.sub('', line) for line in sub.content.splitlines()
        ]
        sub.content = '\n'.join(cleaned_lines)
        yield sub


def main():
    '''
    Command-line entry point: read SRT text from args.input, strip HTML
    tags from every subtitle, and write the recomposed SRT text to
    args.output. Logging is configured from args.log_level first.
    '''
    args = utils.basic_parser().parse_args()
    logging.basicConfig(level=args.log_level)
    subtitles_in = srt.parse(args.input.read())
    stripped_subs = strip_html_from_subs(subtitles_in)
    # strict is passed by keyword, as in the other tools: positionally the
    # value would land on compose's second parameter (reindex in srt's
    # documented signature) and the strictness option would be ignored.
    output = srt.compose(stripped_subs, strict=args.strict)
    args.output.write(output)


if __name__ == '__main__':
    main()

0 comments on commit 4862a51

Please sign in to comment.