Skip to content

Commit

Permalink
Allow any non-numeric character to delimit timestamps
Browse files Browse the repository at this point in the history
  • Loading branch information
cdown committed Feb 21, 2016
1 parent 77c3583 commit 1ac24b1
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 7 deletions.
19 changes: 14 additions & 5 deletions srt.py
Expand Up @@ -12,15 +12,17 @@
log = logging.getLogger(__name__)

SRT_REGEX = re.compile(
r'(\d+)\n(\d+:\d+:\d+[,.]\d+) --> (\d+:\d+:\d+[,.]\d+) ?([^\n]*)\n(.*?)'
# We use "(\S+)" as the timestamp regex to avoid duplicating work and code
# that we would do in srt_timestamp_to_timedelta later anyway.
r'(\d+)\n(\S+) --> (\S+) ?([^\n]*)\n(.*?)'
# Many subtitle editors don't add a blank line at the end of the file, and
# many editors accept its absence, so we allow that in input too.
r'(?:\n|\Z)(?:\n|\Z)'
# Some SRT blocks have blank lines inside the subtitle content, even though
# this is technically invalid. As a best-effort workaround, we look ahead a
# little to check that the next lines look like an index and a timestamp
# before treating the blank line as the end of the block.
r'(?=(?:\d+\n\d+:|\Z))',
r'(?=(?:\d+\n[^\n]+ -->|\Z))',
re.DOTALL,
)

Expand Down Expand Up @@ -144,14 +146,21 @@ def srt_timestamp_to_timedelta(srt_timestamp):
r'''
Convert an SRT timestamp to a :py:class:`~datetime.timedelta`.
We're intentionally extremely liberal with the delimiters we will accept.
The only truly "acceptable" ones are ":" to delimit hours, minutes, and
seconds, and "," to delimit the milliseconds, but we will accept any
non-numeric character as a delimiter. We do this as many SRT editors use
non-standard delimiters, including "." and "|", but it's quite possible
(and likely) that there are even more that we can't predict.
.. doctest::
>>> srt_timestamp_to_timedelta('01:23:04,000')
datetime.timedelta(0, 4984)
'''
# "." is not technically a legal separator, but some subtitle editors use
# it to delimit msecs, and some players accept it.
hrs, mins, secs, msecs = (int(x) for x in re.split('[,:.]', srt_timestamp))
hrs, mins, secs, msecs = (
int(x) for x in re.split(r'[^\d]', srt_timestamp)
)
return timedelta(hours=hrs, minutes=mins, seconds=secs, milliseconds=msecs)


Expand Down
4 changes: 2 additions & 2 deletions tests/test_srt.py
Expand Up @@ -279,7 +279,7 @@ def test_parser_noncontiguous(subs, fake_idx, fake_hours, garbage):
# don't really delimit subtitles, it has to look at least a little like an
# SRT block.
composed = composed.replace(
'\n\n', '\n\n%d\n%d:%s' % (
'\n\n', '\n\n%d\n%d -->%s' % (
fake_idx, fake_hours, garbage,
)
)
Expand All @@ -294,7 +294,7 @@ def test_parser_noncontiguous(subs, fake_idx, fake_hours, garbage):
)
def test_parser_didnt_match_to_end_raises(subs, fake_idx, fake_hours, garbage):
srt_blocks = [sub.to_srt() for sub in subs]
garbage = '\n\n%d\n%d:%s' % (fake_idx, fake_hours, garbage)
garbage = '\n\n%d\n%d -->%s' % (fake_idx, fake_hours, garbage)
srt_blocks.append(garbage)
composed = ''.join(srt_blocks)

Expand Down

0 comments on commit 1ac24b1

Please sign in to comment.