Merge remote-tracking branch 'srt-tools/master' into develop

cdown · Jun 21, 2016 · 4862a51 · 4862a51
2 parents 0d1a258 + c21e27f
commit 4862a51
Show file tree

Hide file tree

Showing 10 changed files with 413 additions and 0 deletions.
diff --git a/tools/README.rst b/tools/README.rst
@@ -0,0 +1,34 @@
+=========
+srt-tools
+=========
+
+srt-tools is a repo containing utilities written to process SRT files. All
+utilities use the Python srt_ library internally.
+
+.. _srt: https://github.com/cdown/srt
+
+Utilities
+---------
+
+- *chinese-lines-only* removes subtitle lines that don't appear to be
+  Chinese. Useful for turning joint English/Chinese subtitles into Chinese
+  subtitles only.
+- *fix-subtitle-indexing* fixes subtitle indexing. Some badly formed SRT files
+  will have indexes that occur in a different order than the starting
+  timestamps for the subtitles they are associated with. This makes some media
+  players unable to display those subtitles, and they are subsequently lost
+  into the ether.
+- *linear-timeshift* does linear time correction. If you have a movie that
+  runs slower or faster than the subtitle that you have, it will repeatedly
+  lose sync. This tool can apply linear time corrections to all subtitles in
+  the SRT, resyncing it with the video.
+- *mux-subs* can mux_ multiple subtitles together into one. For example, if you
+  have a Chinese subtitle and an English subtitle, and you want to have one
+  subtitle file that contains both, this tool can do that for you. It also
+  supports clamping subtitles starting or ending at similar times to the same
+  time to avoid subtitles jumping around the screen.
+- *strip-html* strips HTML formatting from subtitle content. This is especially
+  prevalent in `SSA/ASS`_ subtitles that have been directly converted to SRT.
+
+.. _mux: https://en.wikipedia.org/wiki/Multiplexing
+.. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha
diff --git a/tools/chinese-lines-only.py b/tools/chinese-lines-only.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+from hanzidentifier import has_chinese
+import srt
+import utils
+
+
+def strip_to_chinese_lines_only(subtitles):
+    '''
+    Lazily drop every content line that has_chinese() rejects.
+
+    Each subtitle is modified in place (its content is rebuilt from the
+    surviving lines, joined with newlines) and then yielded, so input order
+    is preserved. A subtitle whose lines are all rejected is still yielded,
+    with empty content.
+    '''
+    for sub in subtitles:
+        kept_lines = filter(has_chinese, sub.content.splitlines())
+        sub.content = '\n'.join(kept_lines)
+        yield sub
+
+
+def main():
+    '''
+    Read the input SRT, filter its lines through
+    strip_to_chinese_lines_only(), and write the recomposed SRT.
+
+    The input/output streams and the strict flag all come from
+    utils.basic_parser().
+    '''
+    args = utils.basic_parser().parse_args()
+    filtered = strip_to_chinese_lines_only(srt.parse(args.input.read()))
+    args.output.write(srt.compose(filtered, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/fix-subtitle-indexing.py b/tools/fix-subtitle-indexing.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import utils
+import srt
+
+
+def main():
+    '''
+    Round-trip the input SRT through srt.parse() and srt.compose().
+
+    No transformation is applied here; any fixing of the indexes is left
+    entirely to srt.compose().
+    '''
+    args = utils.basic_parser().parse_args()
+    parsed = srt.parse(args.input.read())
+    args.output.write(srt.compose(parsed, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/fixed-timeshift.py b/tools/fixed-timeshift.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+import srt
+import datetime
+import utils
+
+
+def parse_args():
+    '''
+    Parse the command line: the options provided by utils.basic_parser()
+    plus a mandatory --seconds float.
+    '''
+    seconds_options = dict(
+        type=float,
+        required=True,
+        help='how many seconds to shift',
+    )
+    parser = utils.basic_parser()
+    parser.add_argument('--seconds', **seconds_options)
+    return parser.parse_args()
+
+
+def scalar_correct_subs(subtitles, seconds_to_shift):
+    '''
+    Lazily move the start and end of every subtitle by a fixed offset.
+
+    subtitles        -- iterable of objects with timedelta "start"/"end"
+    seconds_to_shift -- offset in seconds (passed to datetime.timedelta)
+
+    Each subtitle is modified in place and yielded in input order.
+    '''
+    offset = datetime.timedelta(seconds=seconds_to_shift)
+    for sub in subtitles:
+        for attr in ('start', 'end'):
+            setattr(sub, attr, getattr(sub, attr) + offset)
+        yield sub
+
+
+def main():
+    '''
+    Shift every subtitle in the input by --seconds and write the result.
+    '''
+    args = parse_args()
+    shifted = scalar_correct_subs(srt.parse(args.input.read()), args.seconds)
+    args.output.write(srt.compose(shifted, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/linear-timeshift.py b/tools/linear-timeshift.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+import srt
+import datetime
+import utils
+
+
+def timedelta_to_milliseconds(delta):
+    '''
+    Convert a datetime.timedelta into a number of milliseconds.
+
+    The result is the sum of the timedelta's days, seconds and microseconds
+    fields, each scaled to milliseconds.
+    '''
+    from_days = delta.days * 86400000
+    from_seconds = delta.seconds * 1000
+    from_microseconds = delta.microseconds / 1000
+    return from_days + from_seconds + from_microseconds
+
+def parse_args():
+    '''
+    Parse the command line: the options provided by utils.basic_parser()
+    plus four mandatory SRT timestamps (two "from"/"to" reference pairs).
+
+    Each timestamp is converted to milliseconds while parsing; a value that
+    srt.srt_timestamp_to_timedelta() rejects with ValueError is reported
+    through parser.error().
+    '''
+    def srt_timestamp_to_milliseconds(parser, arg):
+        try:
+            delta = srt.srt_timestamp_to_timedelta(arg)
+        except ValueError:
+            parser.error('not a valid SRT timestamp: %s' % arg)
+        else:
+            return timedelta_to_milliseconds(delta)
+
+    # (long flag, short alias, help text), in the order they are registered.
+    timestamp_options = (
+        ('--from-start', '--f1', 'the first desynchronised timestamp'),
+        ('--to-start', '--t1', 'the first synchronised timestamp'),
+        ('--from-end', '--f2', 'the second desynchronised timestamp'),
+        ('--to-end', '--t2', 'the second synchronised timestamp'),
+    )
+
+    parser = utils.basic_parser()
+    for long_flag, short_alias, help_text in timestamp_options:
+        parser.add_argument(
+            long_flag,
+            short_alias,
+            type=lambda arg: srt_timestamp_to_milliseconds(parser, arg),
+            required=True,
+            help=help_text,
+        )
+    return parser.parse_args()
+
+
+def calc_correction(to_start, to_end, from_start, from_end):
+    '''
+    Work out the linear mapping that sends from_start to to_start and
+    from_end to to_end (all four values in milliseconds).
+
+    Returns a tuple (angular, linear) such that
+    corrected = original * angular + linear.
+
+    Raises ZeroDivisionError if from_start and from_end are equal, because
+    two distinct reference points are needed to define the line.
+    '''
+    # Force true division: with integer millisecond inputs on Python 2 (and
+    # no "from __future__ import division") a plain "/" would truncate the
+    # slope to a whole number and ruin the correction.
+    angular = float(to_end - to_start) / (from_end - from_start)
+    linear = to_end - angular * from_end
+    return angular, linear
+
+
+def correct_time(current_msecs, angular, linear):
+    '''
+    Apply the linear correction to one millisecond value and round the
+    result with the built-in round().
+    '''
+    corrected = linear + angular * current_msecs
+    return round(corrected)
+
+
+def correct_timedelta(bad_delta, angular, linear):
+    '''
+    Return a new timedelta holding the linearly corrected version of
+    bad_delta.
+
+    The value is converted to milliseconds, corrected with correct_time(),
+    and converted back into a datetime.timedelta.
+    '''
+    corrected_msecs = correct_time(
+        timedelta_to_milliseconds(bad_delta), angular, linear,
+    )
+    return datetime.timedelta(milliseconds=corrected_msecs)
+
+
+def linear_correct_subs(subtitles, angular, linear):
+    '''
+    Lazily apply the linear correction to the start and end of every
+    subtitle.
+
+    Each subtitle is modified in place and yielded in input order.
+    '''
+    for sub in subtitles:
+        for attr in ('start', 'end'):
+            corrected = correct_timedelta(getattr(sub, attr), angular, linear)
+            setattr(sub, attr, corrected)
+        yield sub
+
+
+def main():
+    '''
+    Derive the linear correction from the four reference timestamps on the
+    command line, apply it to every subtitle in the input, and write the
+    result.
+    '''
+    args = parse_args()
+    # The correction is computed before the input is read.
+    angular, linear = calc_correction(
+        to_start=args.to_start,
+        to_end=args.to_end,
+        from_start=args.from_start,
+        from_end=args.from_end,
+    )
+    resynced = linear_correct_subs(
+        srt.parse(args.input.read()), angular, linear,
+    )
+    args.output.write(srt.compose(resynced, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/lines-matching.py b/tools/lines-matching.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import importlib
+import srt
+import utils
+
+
+def strip_to_matching_lines_only(subtitles, imports, func_str):
+    '''
+    Lazily keep only the content lines for which a user-supplied predicate
+    returns a truthy value.
+
+    subtitles -- iterable of subtitle objects with a "content" string
+    imports   -- names of modules to import and expose to the predicate
+    func_str  -- Python source that evaluates to a one-argument callable
+
+    Each subtitle is modified in place and yielded in input order. Because
+    this is a generator, the imports and the evaluation of func_str only
+    happen once the first subtitle is requested.
+    '''
+    for import_name in imports:
+        real_import = importlib.import_module(import_name)
+        globals()[import_name] = real_import
+        # For a dotted name such as "os.path" the binding above can never be
+        # reached from an expression, so also expose the top-level package
+        # ("os"), which is the name "os.path.foo" actually looks up.
+        top_level_name = import_name.partition('.')[0]
+        globals()[top_level_name] = importlib.import_module(top_level_name)
+
+    # SECURITY: func_str is executed as arbitrary Python code. That is the
+    # point of this tool, but it must only ever be given trusted input.
+    func = eval(func_str)  # pylint: disable=eval-used
+
+    for subtitle in subtitles:
+        matching_lines = [
+            line for line in subtitle.content.splitlines()
+            if func(line)
+        ]
+        subtitle.content = '\n'.join(matching_lines)
+        yield subtitle
+
+
+def parse_args():
+    '''
+    Parse the command line: the options provided by utils.basic_parser()
+    plus a mandatory -f/--func and a repeatable -m/--module.
+    '''
+    # (flags, keyword options), in the order they are registered.
+    option_specs = (
+        (
+            ('-f', '--func'),
+            dict(help='a function to use to match lines', required=True),
+        ),
+        (
+            ('-m', '--module'),
+            dict(
+                help='modules to import in the function context',
+                action='append',
+                default=[],
+            ),
+        ),
+    )
+
+    parser = utils.basic_parser()
+    for flags, options in option_specs:
+        parser.add_argument(*flags, **options)
+    return parser.parse_args()
+
+
+def main():
+    '''
+    Filter every subtitle in the input through the --func predicate (with
+    the --module imports made available to it) and write the result.
+    '''
+    args = parse_args()
+    filtered = strip_to_matching_lines_only(
+        srt.parse(args.input.read()), args.module, args.func,
+    )
+    args.output.write(srt.compose(filtered, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/mux.py b/tools/mux.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import datetime
+import srt
+import utils
+import logging
+import operator
+
+# Module-level logger named after this module; merge_subs() emits its debug
+# messages through it.
+log = logging.getLogger(__name__)
+
+def parse_args():
+    '''
+    Parse the command line: the multi-input options provided by
+    utils.basic_parser(multi_input=True) plus --ms and --width.
+
+    --ms is converted to a datetime.timedelta while parsing, so args.ms is
+    always a timedelta (including the default).
+    '''
+    default_window = datetime.timedelta(milliseconds=600)
+    ms_help = (
+        'if subs being muxed are within this number of milliseconds of each '
+        'other, they will get merged (default: 600)'
+    )
+    width_help = (
+        'the number of subs to consider merging (default: %(default)s)'
+    )
+
+    parser = utils.basic_parser(multi_input=True)
+    parser.add_argument(
+        '--ms',
+        metavar='MILLISECONDS',
+        default=default_window,
+        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
+        help=ms_help,
+    )
+    parser.add_argument('--width', default=5, type=int, help=width_help)
+    return parser.parse_args()
+
+
+def merge_subs(subs, acceptable_diff, attr, width):
+    '''
+    Snap nearby start/end times together so that muxed subtitles do not
+    jump around the screen.
+
+    subs            -- subtitles to adjust; they are modified in place
+    acceptable_diff -- a later time is pulled back onto an earlier one when
+                       it is less than this far after it
+    attr            -- name of the attribute to compare and overwrite
+                       (e.g. 'start' or 'end')
+    width           -- window size handed to utils.sliding_window()
+
+    Nothing is returned.
+    '''
+    ordered = sorted(subs, key=operator.attrgetter(attr))
+
+    for window in utils.sliding_window(ordered, width=width):
+        anchor = window[0]
+        anchor_time = getattr(anchor, attr)
+
+        for candidate in window[1:]:
+            if getattr(candidate, attr) >= anchor_time + acceptable_diff:
+                # The window is sorted, so once one candidate is too far
+                # away every later one is too.
+                break
+
+            log.debug(
+                "Merging %d's %s time into %d",
+                candidate.index, attr, anchor.index,
+            )
+            setattr(candidate, attr, anchor_time)
+
+
+def main():
+    '''
+    Concatenate the subtitles from every input file, merge near-identical
+    start times and then end times, and write the combined SRT.
+    '''
+    args = parse_args()
+    logging.basicConfig(level=args.log_level)
+
+    muxed_subs = [
+        sub
+        for file_input in args.input
+        for sub in srt.parse(file_input.read())
+    ]
+
+    for attr in ('start', 'end'):
+        merge_subs(muxed_subs, args.ms, attr, args.width)
+
+    args.output.write(srt.compose(muxed_subs, strict=args.strict))
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/tools/requirements.txt b/tools/requirements.txt
@@ -0,0 +1,2 @@
+srt
+hanzidentifier
diff --git a/tools/strip-html.py b/tools/strip-html.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+import re
+import srt
+import utils
+import logging
+
+
+def strip_html_from_subs(subtitles):
+    '''
+    Lazily remove anything that looks like an HTML tag from subtitle content.
+
+    A "tag" is a "<", followed by one or more characters other than "<",
+    up to the nearest ">". Each subtitle is modified in place, line by line,
+    and yielded in input order.
+    '''
+    tag_pattern = re.compile('<[^<]+?>')
+    for sub in subtitles:
+        cleaned_lines = [
+            tag_pattern.sub('', line) for line in sub.content.splitlines()
+        ]
+        sub.content = '\n'.join(cleaned_lines)
+        yield sub
+
+
+def main():
+    '''
+    Strip HTML-style tags from every subtitle in the input and write the
+    recomposed SRT.
+
+    The input/output streams, log level and strict flag all come from
+    utils.basic_parser().
+    '''
+    args = utils.basic_parser().parse_args()
+    logging.basicConfig(level=args.log_level)
+    subtitles_in = srt.parse(args.input.read())
+    stripped_subs = strip_html_from_subs(subtitles_in)
+    # "strict" must be passed by keyword: per the srt documentation the
+    # second positional parameter of srt.compose() is "reindex", so passing
+    # args.strict positionally would toggle reindexing, not strictness.
+    output = srt.compose(stripped_subs, strict=args.strict)
+    args.output.write(output)
+
+
+# Only run when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()