Skip to content

Commit

Permalink
Merge f8a820c into 2d9f734
Browse files Browse the repository at this point in the history
  • Loading branch information
SwitchUpCB committed Apr 19, 2021
2 parents 2d9f734 + f8a820c commit b53012c
Show file tree
Hide file tree
Showing 4 changed files with 378 additions and 0 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"srt_tools/srt-mux",
"srt_tools/srt-play",
"srt_tools/srt-process",
"srt_tools/srt-remove"
],
license="Public Domain",
keywords="srt",
Expand Down
3 changes: 3 additions & 0 deletions srt_tools/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ Utilities
you can naively strip some basic HTML-like markup with ``srt process -m re -f
'lambda sub: re.sub("<[^<]+?>", "", sub)'``. HTML-like syntax is especially
prevalent in `SSA/ASS`_ subtitles that have been directly converted to SRT.
- *remove* allows removal by index/timestamp in sequential or non-sequential
order. By giving the indexes/timestamps non-sequentially (e.g. ``10 5``), you
remove every caption from 10 onward and every caption before 5.

.. _mux: https://en.wikipedia.org/wiki/Multiplexing
.. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha
Expand Down
199 changes: 199 additions & 0 deletions srt_tools/srt-remove
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#!/usr/bin/env python

"""Remove subtitles by index or timestamp."""

import srt
import datetime
import srt_tools.utils
import logging

log = logging.getLogger(__name__)

def parse_args():
    """Build the ``srt remove`` command-line parser and parse the arguments.

    The parser comes from ``srt_tools.utils.basic_parser`` and is extended
    with two options:

    * ``-t TIMESTAMP [TIMESTAMP ...]`` -- SRT timestamps, converted to
      ``datetime.timedelta`` by ``srt.srt_timestamp_to_timedelta``.
    * ``-x INDEX [INDEX ...]`` -- integer caption indexes.

    Both options use ``nargs='*'``, so each attribute is a (possibly empty)
    list when the flag is given and ``None`` when it is absent.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    examples = {
        "Remove a single caption at index 10": "srt remove -i example.srt -x 10 11",
        "Remove captions from index 10 to the end of the file.": "srt remove -i example.srt -x 10",
        "Remove by timestamp all captions within :05 - :08": "srt remove -i example.srt -t 00:00:5,00 00:00:8,00",
        "Remove by timestamp non-sequentially": "srt remove -i example.srt -t 00:00:30,00 00:00:10,00",
    }
    # NOTE(review): multi_input=True is kept because main() reads
    # args.input[0]; confirm whether more than one input is ever intended.
    parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=examples, multi_input=True
    )
    parser.add_argument(
        "-t",
        metavar="TIMESTAMP",
        # Pass the converter directly so argparse can name it in its
        # "invalid ... value" error instead of reporting "<lambda>".
        type=srt.srt_timestamp_to_timedelta,
        nargs="*",
        help="The timestamps to start and stop removing at.",
    )
    parser.add_argument(
        "-x",
        metavar="INDEX",
        type=int,
        nargs="*",
        help=(
            "The indexes to start and stop removing at. A single index "
            "removes everything from that index to the end of the file."
        ),
    )
    return parser.parse_args()

def get_timestamp(subs, obj):
    """Resolve *obj* to a timestamp.

    Args:
        subs: Iterable of captions exposing a ``start`` attribute. It is
            only consumed when *obj* is an index, and then exactly once, so
            one-shot iterators (generators) are supported.
        obj: Either a ``datetime.timedelta``, returned unchanged, or an
            ``int`` index into *subs* (negative values count from the end).

    Returns:
        The ``timedelta`` itself, or the ``start`` of the indexed caption.

    Raises:
        IndexError: If the index does not refer to an existing caption.
        ValueError: If *obj* is neither a ``timedelta`` nor an ``int``.
    """
    if isinstance(obj, datetime.timedelta):
        return obj
    if not isinstance(obj, int):
        raise ValueError("You must enter an index or timestamp.")

    # Materialise once: calling list() twice on a generator would leave the
    # second copy empty and make every index look out of range.
    captions = list(subs)
    count = len(captions)
    if not -count <= obj < count:
        raise IndexError("There is no caption at the specified index.")
    return captions[obj].start

def binary_search(arr, n, timestamp, at=False, highest=False):
    """Find the caption that starts directly after *timestamp*.

    Args:
        arr: Sequence of captions exposing ``start``; assumed to be sorted
            by ``start`` in ascending order (required for a binary search).
        n: Number of leading elements of *arr* to search.
        timestamp: The value each caption's ``start`` is compared against.
        at: When True, a caption whose ``start`` equals *timestamp* also
            counts as a match (``>=`` instead of ``>``).
        highest: When several captions share the matching ``start``, return
            the highest of their indexes instead of the lowest.

    Returns:
        The index of the matching caption, or -1 if no caption matches.
    """
    def matches(position):
        start = arr[position].start
        return start > timestamp or (at and start == timestamp)

    # Lower bound: first position whose start is past (or at) the timestamp.
    low, high = 0, n
    while low < high:
        mid = (low + high) // 2
        if matches(mid):
            high = mid
        else:
            low = mid + 1
    if low >= n:
        return -1
    if not highest:
        return low

    # Upper bound within the run of captions sharing the matched start.
    # A second, independent search is used because moving ``low`` inside the
    # first loop could step past ``high`` and lose the match.
    shared_start = arr[low].start
    first, high = low, n
    while first < high:
        mid = (first + high) // 2
        if arr[mid].start <= shared_start:
            first = mid + 1
        else:
            high = mid
    return first - 1

def contains_timestamp(caption, timestamp):
    """Tell whether *timestamp* falls inside *caption*.

    The span is half-open: ``caption.start`` is included and ``caption.end``
    is excluded.
    """
    started = caption.start <= timestamp
    return started and timestamp < caption.end

def captions_containing_timestamp(subs, timestamp, sorted=False):
    """Return the captions whose time span contains *timestamp*.

    Args:
        subs: Iterable of captions.
        timestamp: The point in time to look for.
        sorted: Pass True when *subs* is already ordered by start time;
            otherwise the captions are first run through
            ``srt.sort_and_reindex``. (The name shadows the builtin but is
            part of the public signature.)

    Returns:
        A list of the captions for which ``contains_timestamp`` is true, in
        sorted order. When *sorted* is False these are the captions yielded
        by ``srt.sort_and_reindex``, not necessarily the objects passed in.
    """
    subs = list(subs)
    if not sorted:
        subs = list(srt.sort_and_reindex(subs))

    # Measure only after sorting: sort_and_reindex may skip subtitles, so a
    # length taken beforehand could point past the end of the list.
    len_subs = len(subs)
    if len_subs == 0 or timestamp < subs[0].start:
        return []

    # Everything from the first caption starting after the timestamp onward
    # cannot contain it, so only the captions before that index are checked.
    ndx = binary_search(subs, len_subs, timestamp)
    if ndx == -1:
        ndx = len_subs

    return [
        caption for caption in subs[:ndx]
        if contains_timestamp(caption, timestamp)
    ]

def split(subs, timestamp):
    """Cut every caption that spans *timestamp* into two captions.

    A caption that starts before and ends after *timestamp* is replaced by
    one caption ending at *timestamp* and one starting at it, both carrying
    the original content. Captions that merely start at *timestamp* are left
    untouched.

    Args:
        subs: Iterable of ``srt.Subtitle`` objects, in any order.
        timestamp: The ``datetime.timedelta`` at which to cut.

    Returns:
        The result of ``srt.sort_and_reindex`` over the updated captions.
    """
    # Sort and reindex up front so that each caption's ``index`` is its list
    # position + 1; the pop below relies on that. Popping by an index taken
    # from a sorted copy out of an unsorted list would remove the wrong item.
    subs = list(srt.sort_and_reindex(subs))
    spanning = captions_containing_timestamp(subs, timestamp, sorted=True)

    # Highest index first, so a pop never shifts a caption still to be popped
    # (replacements are appended at the end and do not disturb positions).
    for caption in reversed(spanning):
        if caption.start == timestamp or caption.end == timestamp:
            continue
        subs.pop(caption.index - 1)
        subs.append(srt.Subtitle(caption.index, caption.start, timestamp, caption.content))
        subs.append(srt.Subtitle(caption.index, timestamp, caption.end, caption.content))
    return srt.sort_and_reindex(subs)

def remove_caption_index(subs, index_one, index_two=0):
    """Remove the captions from *index_one* up to, not including, *index_two*.

    Indexes are zero-based list positions; negative values count from the
    end. The range wraps around the end of the list:

    * ``index_one`` before ``index_two``: the captions in between are removed.
    * ``index_one`` after ``index_two``: everything from ``index_one`` to the
      end and everything before ``index_two`` is removed. With the default
      ``index_two=0`` this means "from ``index_one`` to the end".
    * both refer to the same position: every caption is removed.

    Args:
        subs: Iterable of captions.
        index_one: Position of the first caption to remove.
        index_two: Position at which removal stops; may equal ``len(subs)``.

    Returns:
        The (empty) list itself when *subs* is empty, otherwise the result of
        ``srt.sort_and_reindex`` over the remaining captions.

    Raises:
        IndexError: If either index lies outside the list.
    """
    subs = list(subs)
    len_subs = len(subs)
    if len_subs == 0:
        return subs

    if not -len_subs <= index_one < len_subs or not -len_subs <= index_two <= len_subs:
        raise IndexError("There is no caption at the specified index.")

    # Translate negative indexes into plain positions.
    first = index_one if index_one >= 0 else index_one + len_subs
    stop = index_two if index_two >= 0 else index_two + len_subs

    if first == stop:
        remaining = []
    elif first < stop:
        remaining = subs[:first] + subs[stop:]
    else:
        # Wrapped range: only the stretch between the two bounds survives.
        remaining = subs[stop:first]
    return srt.sort_and_reindex(remaining)

def remove_caption_timestamp(subs, timestamp_one, timestamp_two):
    """Remove the captions that lie between two timestamps.

    Captions spanning either timestamp are first cut in two at that point, so
    only the part inside the removed range disappears.

    * ``timestamp_one`` before ``timestamp_two``: captions starting at or
      after ``timestamp_one`` and before ``timestamp_two`` are removed.
    * ``timestamp_one`` after ``timestamp_two``: captions starting at or after
      ``timestamp_one`` and captions starting before ``timestamp_two`` are
      removed.
    * both equal: every caption is removed.

    Args:
        subs: Iterable of ``srt.Subtitle`` objects.
        timestamp_one: ``datetime.timedelta`` where removal starts.
        timestamp_two: ``datetime.timedelta`` where removal stops.

    Returns:
        A list of the remaining captions, sorted and reindexed.
    """
    subs = list(subs)
    if not subs:
        return subs
    if timestamp_one == timestamp_two:
        return []

    # Sort before splitting so that split() sees indexes matching positions,
    # then cut at both boundaries. Afterwards every caption lies entirely
    # inside or entirely outside the removed range, so its start time alone
    # decides whether it stays.
    subs = list(srt.sort_and_reindex(subs))
    subs = list(split(subs, timestamp_one))
    subs = list(split(subs, timestamp_two))

    if timestamp_one < timestamp_two:
        remaining = [
            caption for caption in subs
            if not timestamp_one <= caption.start < timestamp_two
        ]
    else:
        remaining = [
            caption for caption in subs
            if timestamp_two <= caption.start < timestamp_one
        ]
    return list(srt.sort_and_reindex(remaining))


def main():
    """Entry point: remove captions from the input and write the result.

    Reads the first input from the parsed arguments, removes captions by
    index (``-x``) or timestamp (``-t``), composes the result with
    ``srt_tools.utils.compose_suggest_on_fail`` and writes it to
    ``args.output``. ``-x`` takes precedence when both options carry values.

    Raises:
        SystemExit: If no index/timestamp was given, or only one timestamp.
    """
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    subs = list(args.input[0])
    time_args = args.x if args.x else args.t
    if not time_args:
        # Neither option given (None) or given without values (empty list).
        raise SystemExit("You must enter an index or timestamp.")

    if len(time_args) == 1 and isinstance(time_args[0], int):
        # A lone index means "from here to the end of the file".
        subs = remove_caption_index(subs, time_args[0])
    elif len(time_args) < 2:
        raise SystemExit(
            "Removing by timestamp requires both a start and a stop timestamp."
        )
    else:
        if len(time_args) > 2:
            log.warning("Only the first two values are used; ignoring the rest.")
        timestamps = (get_timestamp(subs, time_args[0]), get_timestamp(subs, time_args[1]))
        subs = remove_caption_timestamp(subs, timestamps[0], timestamps[1])

    output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(output.encode(args.encoding))


if __name__ == "__main__":  # pragma: no cover
    main()
175 changes: 175 additions & 0 deletions srt_tools/tests/test_remove_captions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# import unittest
# import importlib
# import srt
# from srt import srt_timestamp_to_timedelta as t
#
# def sort(subs):
# return list(srt.sort_and_reindex(subs))
#
# def create_blocks(setting=0):
# subs = []
# if setting == 0:
# subs.append(srt.Subtitle(1, t('00:00:11,000'), t('00:00:12,701'), "A"))
# subs.append(srt.Subtitle(2, t('00:00:12,701'), t('00:00:14,203'), "B"))
# subs.append(srt.Subtitle(3, t('00:00:14,500'), t('00:00:19,738'), "C"))
# subs.append(srt.Subtitle(4, t('00:00:16,538'), t('00:00:17,272'), "D"))
# subs.append(srt.Subtitle(5, t('00:00:17,272'), t('00:00:18,440'), "E"))
# elif setting == 1:
# subs.append(srt.Subtitle(1, t('00:00:1,000'), t('00:00:10,000'), "A"))
# subs.append(srt.Subtitle(2, t('00:00:2,000'), t('00:00:08,000'), "B"))
# subs.append(srt.Subtitle(3, t('00:00:3,000'), t('00:00:05,000'), "C"))
# subs.append(srt.Subtitle(4, t('00:00:3,500'), t('00:00:04,500'), "D"))
# subs.append(srt.Subtitle(5, t('00:00:6,000'), t('00:00:08,000'), "E"))
# subs.append(srt.Subtitle(6, t('00:00:9,000'), t('00:00:10,000'), "F"))
# return subs
#
# from srt_tools.utils import *
# srt_remove = importlib.import_module('srt-remove')
# from srt_remove import *
#
# class TestRemoveCaptions(unittest.TestCase):
# def setUp(self):
# self.subs = create_blocks()
#
# def tearDown(self):
# pass
#
# def test_get_timestamp(self):
# # Indexes
# self.assertEqual(get_timestamp(self.subs, 0), self.subs[0].start)
# self.assertEqual(get_timestamp(self.subs, 4), self.subs[4].start)
# self.assertEqual(get_timestamp(self.subs, -1), self.subs[-1].start)
# self.assertEqual(get_timestamp(self.subs, -4), self.subs[-4].start)
# with self.assertRaises(IndexError):
# get_timestamp(self.subs, -5)
#
# # Strings
# self.assertEqual(get_timestamp(self.subs, '00:00:11,000'), self.subs[0].start)
# self.assertEqual(get_timestamp(self.subs, '00:00:0,000'), t('00:00:0,000'))
# self.assertEqual(get_timestamp(self.subs, '00:00:30,000'), t('00:00:30,000'))
# with self.assertRaises(srt.TimestampParseError):
# self.assertEqual(get_timestamp(self.subs, '-00:00:50,000'))
# self.assertEqual(get_timestamp(self.subs, '00:00:-10,000'))
#
# # Date Time
# self.assertEqual(get_timestamp(self.subs, t('00:00:11,000')), self.subs[0].start)
#
# def test_captions_containing_timestamp(self):
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:00,000')), [])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:11,000')), [self.subs[0]])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:12,000')), [self.subs[0]])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:12,701')), [self.subs[1]])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:35,000')), [])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:16,708')), [self.subs[2], self.subs[3]])
# self.assertEqual(captions_containing_timestamp(self.subs, t('00:00:17,272')), [self.subs[2], self.subs[4]])
#
# # distanced overlaps
# rsubs = create_blocks(1)
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:4,000')), [rsubs[0], rsubs[1], rsubs[2], rsubs[3]])
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:9,000')), [rsubs[0], rsubs[5]])
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:9,500')), [rsubs[0],rsubs[5]])
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:3,450')), [rsubs[0], rsubs[1], rsubs[2]])
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:5,500')), [rsubs[0], rsubs[1]])
# self.assertEqual(captions_containing_timestamp(rsubs, t('00:00:11,000')), [])
#
# def test_remove_caption_index(self):
# a = sort([self.subs[0], self.subs[2], self.subs[3], self.subs[4]])
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 1, 2)
# self.assertEqual(list(result), a)
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 1, -3)
# self.assertEqual(list(result), a)
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 1, 3)
# self.assertEqual(list(result), sort([self.subs[0], self.subs[3], self.subs[4]]))
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 2, 5)
# self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, -2, 3)
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# with self.assertRaises(IndexError):
# result = remove_caption_index(rsubs, 1, 8)
# result = remove_caption_index(rsubs, -7, 4)
# result = remove_caption_index(rsubs, 5, 4)
#
# result = remove_caption_index(rsubs, 3, 1) # reverse
# self.assertEqual(list(result), sort([self.subs[1],self.subs[2]]))
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 2, 0) # reverse
# self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
#
# a = sort([self.subs[2], self.subs[3]])
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, -1, -3) # reverse
# self.assertEqual(list(result), a)
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 4, 2) # reverse
# self.assertEqual(list(result), a)
#
# # single parameter
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 0)
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# result = remove_caption_index(rsubs, 2)
# self.assertEqual(list(result), sort([self.subs[0], self.subs[1]]))
#
# def test_remove_caption_timestamp(self):
# result = remove_caption_timestamp([], t('00:00:00,000'), t('00:00:30,000'))
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, rsubs[0].start, rsubs[0].end)
# self.assertEqual(list(result), sort([self.subs[1], self.subs[2], self.subs[3], self.subs[4]]))
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, rsubs[0].start, t('00:00:14,500'))
# self.assertEqual(list(result), sort([self.subs[2], self.subs[3], self.subs[4]]))
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:11,000'), t('00:00:19,738'))
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:00,000'), t('00:00:30,000'))
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:00,000'), t('00:00:17,500'))
# a = [srt.Subtitle(1, t('00:00:17,500'), t('00:00:18,440'), "E"), srt.Subtitle(2, t('00:00:17,500'), t('00:00:19,738'), "C")]
# self.assertEqual(list(result), a) # split
#
# # reverse timestamps
# result = remove_caption_timestamp([], t('00:00:30,000'), t('00:00:00,000'))
# self.assertEqual(list(result), [])
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:30,000'), t('00:00:00,000'))
# self.assertEqual(list(result), list(self.subs))
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:14,500'), rsubs[0].start)
# self.assertEqual(list(result), sort([self.subs[0],self.subs[1]]))
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, t('00:00:19,738'), t('00:00:11,000'))
# self.assertEqual(list(result), list(self.subs))
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, rsubs[0].end, rsubs[0].start)
# self.assertEqual(list(result), [self.subs[0]])
#
# rsubs = create_blocks()
# result = remove_caption_timestamp(rsubs, rsubs[0].start, rsubs[0].start)
# self.assertEqual(list(result), [])

0 comments on commit b53012c

Please sign in to comment.