Merge f8a820c into 2d9f734

cdown · Apr 19, 2021 · b53012c · b53012c
2 parents 2d9f734 + f8a820c
commit b53012c
Show file tree

Hide file tree

Showing 4 changed files with 378 additions and 0 deletions.
diff --git a/setup.py b/setup.py
@@ -31,6 +31,7 @@
         "srt_tools/srt-mux",
         "srt_tools/srt-play",
         "srt_tools/srt-process",
+        "srt_tools/srt-remove"
     ],
     license="Public Domain",
     keywords="srt",

diff --git a/srt_tools/README.rst b/srt_tools/README.rst
@@ -57,6 +57,9 @@ Utilities
   you can naively strip some basic HTML-like markup with ``srt process -m re -f
   'lambda sub: re.sub("<[^<]+?>", "", sub)'``. HTML-like syntax is especially
   prevalent in `SSA/ASS`_ subtitles that have been directly converted to SRT.
+- *remove* allows removal of captions by index or timestamp, in sequential or
+  non-sequential order. By giving the indexes/timestamps non-sequentially
+  (e.g. 10, 5), you specify removal of all captions from 10 onward and before 5.
 
 .. _mux: https://en.wikipedia.org/wiki/Multiplexing
 .. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha

diff --git a/srt_tools/srt-remove b/srt_tools/srt-remove
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+
+"""Remove subtitles by index or timestamp."""
+
+import srt
+import datetime
+import srt_tools.utils
+import logging
+
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+
+def parse_args():
+    """Build the command-line parser for ``srt remove`` and parse the arguments.
+
+    The parser comes from ``srt_tools.utils.basic_parser`` and is extended with
+    two options, each accepting zero or more values:
+
+    * ``-t TIMESTAMP``: SRT timestamps, converted with
+      ``srt.srt_timestamp_to_timedelta``.
+    * ``-x INDEX``: caption indexes, converted with ``int``.
+
+    Returns whatever ``parser.parse_args()`` returns.
+    """
+    examples = {
+        "Remove a single caption at index 10": "srt remove -i example.srt -x 10 11",
+        "Remove captions from index 10 to the end of the file.": "srt remove -i example.srt -x 10",
+        "Remove by timestamp all captions within :05 - :08": "srt remove -i example.srt -t 00:00:5,00 00:00:8,00",
+        "Remove by timestamp non-sequentially": "srt remove -i example.srt -t 00:00:30,00 00:00:10,00"
+    }
+    parser = srt_tools.utils.basic_parser(
+        description=__doc__, examples=examples, multi_input=True
+    )
+    parser.add_argument(
+        "-t",
+        metavar="TIMESTAMP",
+        # Hand over the converter itself so argparse can report its name,
+        # rather than "<lambda>", when it rejects a value.
+        type=srt.srt_timestamp_to_timedelta,
+        nargs="*",
+        help="The timestamp to start or stop removing at.",
+    )
+    parser.add_argument(
+        "-x",
+        metavar="INDEX",
+        type=int,
+        nargs="*",
+        help="The index to start or stop removing at.",
+    )
+    return parser.parse_args()
+
+def get_timestamp(subs, obj):
+    """Resolve ``obj`` to a timestamp.
+
+    ``obj`` may be a ``datetime.timedelta``, which is returned unchanged, or an
+    ``int`` position in ``subs`` (negative values count from the end), in which
+    case the start time of that caption is returned.
+
+    ``subs`` is materialised only once, so a one-shot iterator works as well
+    as a list.
+
+    Raises ``IndexError`` if the position is out of range and ``ValueError``
+    if ``obj`` is neither a timedelta nor an int.
+    """
+    if isinstance(obj, datetime.timedelta):
+        return obj
+    if not isinstance(obj, int):
+        raise ValueError("You must enter an index or timestamp.")
+
+    # Build the list a single time: calling list() twice would exhaust an
+    # iterator during the length check and then index into an empty list.
+    captions = list(subs)
+    if not -len(captions) <= obj < len(captions):
+        raise IndexError("There is no caption at the specified index.")
+    return captions[obj].start
+
+def binary_search(arr, n, timestamp, at=False, highest=False):
+    """
+    Search ``arr[0:n]`` for a caption whose start is after ``timestamp``.
+
+    The search bisects on ``.start``, so ``arr`` is expected to be ordered by
+    start time.
+
+    arr: indexable sequence of objects exposing a ``start`` attribute.
+    n: number of leading elements of ``arr`` to search.
+    timestamp: value each ``start`` is compared against.
+    at: if True, a caption whose start equals ``timestamp`` qualifies as well.
+    highest: if True, keep moving right while the next caption has the same
+        start as the current one (or, with ``at``, starts at ``timestamp``).
+
+    Returns the index of the qualifying caption, or -1 if the search ends
+    without finding one.
+    """
+    # Becomes True once a qualifying caption with no qualifying left
+    # neighbour has been located.
+    found = False
+    low = 0
+    high = n
+    mid = 0
+    while low < high:
+        mid = (high + low) // 2
+        if arr[mid].start < timestamp:
+            # Too early: continue in the right half.
+            low = mid + 1
+        elif arr[mid].start > timestamp or (at and arr[mid].start == timestamp):
+            if not found and mid > 0 and (arr[mid - 1].start > timestamp or (at and arr[mid - 1].start == timestamp)):
+                # The left neighbour qualifies too, so keep looking left.
+                high = mid
+            elif not found:
+                # low/high are untouched, so the next pass recomputes the same
+                # mid and falls through to one of the two branches below.
+                found = True
+            elif highest and mid + 1 < n and (arr[mid].start == arr[mid + 1].start or (at and arr[mid + 1].start == timestamp)):
+                low = mid + 1
+            else:
+                return mid
+        else:
+            # start == timestamp while 'at' is False: an exact match does not
+            # qualify, so continue to the right.
+            low = mid + 1
+    return -1
+
+def contains_timestamp(caption, timestamp):
+    """Return whether ``timestamp`` falls in ``[caption.start, caption.end)``."""
+    if caption.start <= timestamp:
+        return timestamp < caption.end
+    return False
+
+def captions_containing_timestamp(subs, timestamp, sorted=False):
+    """Return the captions in ``subs`` whose time span contains ``timestamp``.
+
+    A caption contains the timestamp when ``start <= timestamp < end``.
+
+    subs: iterable of captions.
+    timestamp: the point in time to look up.
+    sorted: pass True when ``subs`` is already ordered by start time;
+        otherwise it is first run through ``srt.sort_and_reindex``.
+
+    Returns a list of the matching captions, empty if there are none.
+    """
+    subs = list(subs) if sorted else list(srt.sort_and_reindex(subs))
+
+    # Measure only after sorting: sort_and_reindex may drop captions it
+    # considers unusable, so a length taken beforehand could overrun the list.
+    len_subs = len(subs)
+    if len_subs == 0 or timestamp < subs[0].start:
+        return []
+
+    # Captions are ordered by start, so nothing from the first caption that
+    # starts after the timestamp onwards can contain it.
+    ndx = binary_search(subs, len_subs, timestamp, highest=True)
+    if ndx == -1:
+        ndx = len_subs
+
+    return [
+        caption for caption in subs[:ndx]
+        if contains_timestamp(caption, timestamp)
+    ]
+
+def split(subs, timestamp):
+    """Split every caption that spans ``timestamp`` into two at that point.
+
+    A caption that starts before ``timestamp`` and ends after it is replaced
+    by two captions with the same content: one ending at ``timestamp`` and one
+    starting there. Captions that start exactly at ``timestamp`` are kept.
+
+    Returns the result of ``srt.sort_and_reindex`` over the updated captions.
+    """
+    # Sort and reindex up front so that index - 1 is each caption's position
+    # in ``subs`` (sort_and_reindex numbers from 1 by default). The pop below
+    # relies on that; with unordered input or stale indexes it would remove
+    # the wrong caption.
+    subs = list(srt.sort_and_reindex(subs))
+    spanning = captions_containing_timestamp(subs, timestamp, sorted=True)
+
+    # Walk from the highest index down so the lower positions stay valid
+    # while captions are popped and the two halves are appended at the end.
+    for caption in reversed(spanning):
+        if caption.start != timestamp and caption.end != timestamp:
+            subs.pop(caption.index - 1)
+            subs.append(srt.Subtitle(caption.index, caption.start, timestamp, caption.content))
+            subs.append(srt.Subtitle(caption.index, timestamp, caption.end, caption.content))
+    return srt.sort_and_reindex(subs)
+
+def remove_caption_index(subs, index_one, index_two=0):
+    """Remove the captions from position ``index_one`` up to, but not
+    including, position ``index_two``.
+
+    Positions are 0-based list positions; negative values count from the end.
+    When ``index_two`` lies before ``index_one`` the range wraps around the
+    end of the list, so the default ``index_two=0`` removes everything from
+    ``index_one`` to the end. When both refer to the same position, every
+    caption is removed.
+
+    Returns the (empty) list itself when ``subs`` is empty, otherwise the
+    result of ``srt.sort_and_reindex`` over the remaining captions.
+
+    Raises ``IndexError`` when either position is out of range;
+    ``index_two`` may equal ``len(subs)`` because it is an exclusive end.
+    """
+    subs = list(subs)
+    len_subs = len(subs)
+    if len_subs == 0:
+        return subs
+
+    # Check bounds; only index_two is allowed to equal len_subs.
+    if index_one >= len_subs or (index_one < 0 and -index_one > len_subs) or index_two > len_subs or (index_two < 0 and -index_two > len_subs):
+        raise IndexError
+
+    # Convert each index to its negative equivalent so both are counted from
+    # the end of the list.
+    if index_one >= 0:
+        index_one -= len_subs
+    if index_two >= 0:
+        index_two -= len_subs
+
+    if index_one == index_two:
+        # NOTE(review): list.clear() does not exist on Python 2 lists, while
+        # main() carries a Python 2 fallback - confirm the supported versions.
+        subs.clear()
+        return srt.sort_and_reindex(subs)
+    elif index_one > index_two:
+        # Wrap-around: move the end past zero so the range below runs through
+        # the end of the list and continues from the front.
+        index_two += len_subs
+
+    for i in range(index_one, index_two):
+        if i < 0:
+            # Counting from the end, the next caption to remove slides into
+            # position i after each pop.
+            subs.pop(i)
+        else:
+            # Wrapped part of the range: remove from the front.
+            subs.pop(0)
+    return srt.sort_and_reindex(subs)
+
+def remove_caption_timestamp(subs, timestamp_one, timestamp_two):
+    """Remove the captions between ``timestamp_one`` and ``timestamp_two``.
+
+    Captions spanning either timestamp are first cut in two with ``split`` so
+    that only the part inside the range is affected. The two timestamps are
+    then turned into list positions with ``binary_search`` and handed to
+    ``remove_caption_index``. When ``timestamp_one`` is later than
+    ``timestamp_two`` the range is treated as non-sequential. Equal
+    timestamps remove every caption.
+
+    Returns the remaining captions: a list on the early-exit paths, otherwise
+    whatever ``remove_caption_index`` / ``srt.sort_and_reindex`` return.
+    """
+    subs = list(subs)
+    if len(subs) == 0:
+        return subs
+    elif timestamp_one == timestamp_two:
+        # index_two defaults to 0, which equals index_one: removes everything.
+        return remove_caption_index(subs, 0)
+
+    # Split the caption at the start and end of the block(s).
+    subs = split(subs, timestamp_one)
+    subs = split(subs, timestamp_two)
+    subs = list(subs)
+
+    # Determine the sequential edge case.
+    sequential = timestamp_one < timestamp_two
+    t2_before_first = timestamp_two <= subs[0].start
+    if sequential and t2_before_first:
+        # The whole range ends before the first caption: nothing to remove.
+        return subs
+
+    # Determine the first bound and its edge cases.
+    len_subs = len(subs)
+    cdx = binary_search(subs, len_subs, timestamp_one, at=True, highest=False)
+    t1_after_last = True if cdx == -1 else False
+    if sequential and t1_after_last:
+        # No caption starts at or after timestamp_one: nothing to remove.
+        return subs
+    elif not sequential and t2_before_first and t1_after_last:
+        return subs
+
+    # Determine the second bound and its edge cases.
+    cdx2 = binary_search(subs, len_subs, timestamp_two, at=True, highest=False)
+    t2_after_last = True if cdx2 == -1 else False
+    if sequential and timestamp_one <= subs[0].start and t2_after_last:
+        # The range covers every caption start.
+        subs.clear()
+        return srt.sort_and_reindex(subs)
+    # NOTE(review): when cdx2 is -1 this reads the last caption and -1 is
+    # forwarded as the end index; confirm that ranges ending after the last
+    # caption start still remove the last caption.
+    # NOTE(review): the end index given to remove_caption_index is exclusive;
+    # confirm that stepping back by one here does not leave the caption just
+    # before timestamp_two in place.
+    cdx2 = cdx2 - 1 if timestamp_two < subs[cdx2].start else cdx2
+    return remove_caption_index(subs, cdx, cdx2)
+
+
+def main():
+    """Entry point: parse arguments, remove the requested captions, write SRT.
+
+    Indexes given with ``-x`` take precedence over timestamps given with
+    ``-t``. A single index removes from that index to the end of the file.
+    Otherwise the first two values are used as the bounds of the range to
+    remove; integer bounds are resolved to caption start times first.
+
+    Exits with an error message when no bound is given, or when only a single
+    timestamp is given.
+    """
+    args = parse_args()
+    logging.basicConfig(level=args.log_level)
+    srt_tools.utils.set_basic_args(args)
+
+    subs = list(args.input[0])
+    time_args = args.x if args.x else args.t
+
+    # Both options are optional and accept zero values, so guard against
+    # having nothing to work with instead of failing on len(None).
+    if not time_args:
+        raise SystemExit("Specify -x INDEX [INDEX] or -t TIMESTAMP TIMESTAMP.")
+
+    if len(time_args) == 1 and isinstance(time_args[0], int):
+        subs = remove_caption_index(subs, time_args[0])
+    elif len(time_args) == 1:
+        # A lone timestamp has no second bound to pair with.
+        raise SystemExit("-t needs both a start and an end timestamp.")
+    else:
+        first = get_timestamp(subs, time_args[0])
+        second = get_timestamp(subs, time_args[1])
+        subs = remove_caption_timestamp(subs, first, second)
+
+    output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)
+
+    try:
+        args.output.write(output)
+    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
+        args.output.write(output.encode(args.encoding))
+
+# Run the tool only when this file is executed as a script, not on import.
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/srt_tools/tests/test_remove_captions.py b/srt_tools/tests/test_remove_captions.py
@@ -0,0 +1,175 @@
+# import unittest
+# import importlib
+# import srt
+# from srt import srt_timestamp_to_timedelta as t
+#
+# def sort(subs):
+#     return list(srt.sort_and_reindex(subs))
+#
+# def create_blocks(setting=0):
+#     subs = []
+#     if setting == 0:
+#         subs.append(srt.Subtitle(1, t('00:00:11,000'), t('00:00:12,701'), "A"))
+#         subs.append(srt.Subtitle(2, t('00:00:12,701'), t('00:00:14,203'), "B"))
+#         subs.append(srt.Subtitle(3, t('00:00:14,500'), t('00:00:19,738'), "C"))
+#         subs.append(srt.Subtitle(4, t('00:00:16,538'), t('00:00:17,272'), "D"))
+#         subs.append(srt.Subtitle(5, t('00:00:17,272'), t('00:00:18,440'), "E"))
+#     elif setting == 1:
+#         subs.append(srt.Subtitle(1, t('00:00:1,000'), t('00:00:10,000'), "A"))
+#         subs.append(srt.Subtitle(2, t('00:00:2,000'), t('00:00:08,000'), "B"))
+#         subs.append(srt.Subtitle(3, t('00:00:3,000'), t('00:00:05,000'), "C"))
+#         subs.append(srt.Subtitle(4, t('00:00:3,500'), t('00:00:04,500'), "D"))
+#         subs.append(srt.Subtitle(5, t('00:00:6,000'), t('00:00:08,000'), "E"))
+#         subs.append(srt.Subtitle(6, t('00:00:9,000'), t('00:00:10,000'), "F"))
+#     return subs
+#
+# from srt_tools.utils import *
+# srt_remove = importlib.import_module('srt-remove')
+# from srt_remove import *
+#
+# class TestRemoveCaptions(unittest.TestCase):
+#     def setUp(self):
+#         self.subs = create_blocks()
+#
+#     def tearDown(self):
+#         pass
+#
+#     def test_get_timestamp(self):
+#         # Indexes
+#         self.assertEqual(get_timestamp(self.subs, 0), self.subs[0].start)
+#         self.assertEqual(get_timestamp(self.subs, 4), self.subs[4].start)
+#         self.assertEqual(get_timestamp(self.subs, -1), self.subs[-1].start)
+#         self.assertEqual(get_timestamp(self.subs, -4), self.subs[-4].start)
+#         with self.assertRaises(IndexError):
+#             get_timestamp(self.subs, -5)
+#
+#         # Strings
+#         self.assertEqual(get_timestamp(self.subs, '00:00:11,000'), self.subs[0].start)
+#         self.assertEqual(get_timestamp(self.subs, '00:00:0,000'), t('00:00:0,000'))
+#         self.assertEqual(get_timestamp(self.subs, '00:00:30,000'), t('00:00:30,000'))
+#         with self.assertRaises(srt.TimestampParseError):
+#             self.assertEqual(get_timestamp(self.subs, '-00:00:50,000'))
+#             self.assertEqual(get_timestamp(self.subs, '00:00:-10,000'))
+#
+#         # Date Time
+#         self.assertEqual(get_timestamp(self.subs, t('00:00:11,000')), self.subs[0].start)
+#
+#     def test_captions_containing_timestamp(self):
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:00,000')), [])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:11,000')), [self.subs[0]])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:12,000')), [self.subs[0]])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:12,701')), [self.subs[1]])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:35,000')), [])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:16,708')), [self.subs[2], self.subs[3]])
+#         self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:17,272')), [self.subs[2], self.subs[4]])
+#
+#         # distanced overlaps
+#         rsubs = create_blocks(1)
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:4,000')), [rsubs[0], rsubs[1], rsubs[2], rsubs[3]])
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:9,000')), [rsubs[0], rsubs[5]])
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:9,500')), [rsubs[0],rsubs[5]])
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:3,450')), [rsubs[0], rsubs[1], rsubs[2]])
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:5,500')), [rsubs[0], rsubs[1]])
+#         self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:11,000')), [])
+#
+#     def test_remove_caption_index(self):
+#         a = sort([self.subs[0], self.subs[2], self.subs[3], self.subs[4]])
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 1, 2)
+#         self.assertEqual(list(result), a)
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 1, -3)
+#         self.assertEqual(list(result), a)
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 1, 3)
+#         self.assertEqual(list(result), sort([self.subs[0], self.subs[3], self.subs[4]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 2, 5)
+#         self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, -2, 3)
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         with self.assertRaises(IndexError):
+#             result = remove_caption_index(rsubs, 1, 8)
+#             result = remove_caption_index(rsubs, -7, 4)
+#             result = remove_caption_index(rsubs, 5, 4)
+#
+#         result = remove_caption_index(rsubs, 3, 1) # reverse
+#         self.assertEqual(list(result), sort([self.subs[1],self.subs[2]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 2, 0) # reverse
+#         self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
+#
+#         a = sort([self.subs[2], self.subs[3]])
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, -1, -3) # reverse
+#         self.assertEqual(list(result), a)
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 4, 2) # reverse
+#         self.assertEqual(list(result), a)
+#
+#         # single parameter
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 0)
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_index(rsubs, 2)
+#         self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
+#
+#     def test_remove_caption_timestamp(self):
+#         result = remove_caption_timestamp([], t('00:00:00,000'), t('00:00:30,000'))
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, rsubs[0].start, rsubs[0].end)
+#         self.assertEqual(list(result), sort([self.subs[1], self.subs[2], self.subs[3], self.subs[4]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, rsubs[0].start, t('00:00:14,500'))
+#         self.assertEqual(list(result), sort([self.subs[2], self.subs[3], self.subs[4]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:11,000'), t('00:00:19,738'))
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:00,000'), t('00:00:30,000'))
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:00,000'), t('00:00:17,500'))
+#         a = [srt.Subtitle(1, t('00:00:17,500'), t('00:00:18,440'), "E"), srt.Subtitle(2, t('00:00:17,500'), t('00:00:19,738'), "C")]
+#         self.assertEqual(list(result), a) # split
+#
+#         # reverse timestamps
+#         result = remove_caption_timestamp([], t('00:00:30,000'), t('00:00:00,000'))
+#         self.assertEqual(list(result), [])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:30,000'), t('00:00:00,000'))
+#         self.assertEqual(list(result), list(self.subs))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:14,500'), rsubs[0].start)
+#         self.assertEqual(list(result), sort([self.subs[0],self.subs[1]]))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, t('00:00:19,738'), t('00:00:11,000'))
+#         self.assertEqual(list(result), list(self.subs))
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, rsubs[0].end, rsubs[0].start)
+#         self.assertEqual(list(result), [self.subs[0]])
+#
+#         rsubs = create_blocks()
+#         result = remove_caption_timestamp(rsubs, rsubs[0].start, rsubs[0].start)
+#         self.assertEqual(list(result), [])