Permalink
Browse files

Use difflib for diffing.

This allows us to find insertions and deletions between the 2
sequences, instead of simply matching item by item.

This changes subtitle_data.  Now it's possible to have empty
SubtitleLines in the middle of the list, rather than only at the end.

This also changes time_changed and text_changed:

Before we calculated the number of differences using an item-by-item
comparison, then divided that by the length of the longest sequence.

Now we calculate using difflib's ratio() method.  ratio() calculates
the number of differences, multiplies by 2, then divides by the sum of
the 2 sequence lengths.

This should be the same if the sequence lengths match up, but if items
are inserted or deleted, then the numbers will be different.  I think
this is fine, since before if you inserted/deleted a subtitle, the old
calculation would give you a totally bogus number.
  • Loading branch information...
bendk committed Mar 28, 2013
1 parent e787005 commit d2b503338781307b09e4ee40758d1c6c18976d31
Showing with 110 additions and 44 deletions.
  1. +71 −40 babelsubs/storage.py
  2. +39 −4 babelsubs/tests/test_diffing.py
View
@@ -16,7 +16,8 @@
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see http://www.gnu.org/licenses/agpl-3.0.html.
-from itertools import izip_longest
+import difflib
+from itertools import izip_longest, izip
import os
import re
from lxml import etree
@@ -136,6 +137,73 @@ def to_clock_time(time_expression, tick_rate=None):
return time_expression
return milliseconds_to_time_clock_exp(time_expression_to_milliseconds(time_expression, tick_rate))
+class _Differ(object):
+ """Class that does the work for diff()."""
+ def __init__(self, set_1, set_2, mappings):
+ if len(set_1) == 0 and len(set_2) == 0:
+ # special case empty sets
+ self.result = {
+ 'subtitle_data' : [],
+ 'changed': False,
+ 'text_changed': 0.0,
+ 'time_changed': 0.0,
+ }
+ return
+ items1 = set_1.subtitle_items(mappings)
+ items2 = set_2.subtitle_items(mappings)
+
+ text_changed = self.calc_changed_amout([s.text for s in items1],
+ [s.text for s in items2])
+ time_changed = self.calc_changed_amout(
+ [(s.start_time, s.end_time) for s in items1],
+ [(s.start_time, s.end_time) for s in items2])
+ self.calc_subtitle_data(items1, items2)
+ self.result = {
+ 'text_changed': text_changed,
+ 'time_changed': time_changed,
+ 'changed': items1 != items2,
+ 'subtitle_data': self._subtitle_data,
+ }
+
+ def calc_changed_amout(self, seq1, seq2):
+ sm = difflib.SequenceMatcher(None, seq1, seq2)
+ return 1.0 - sm.ratio()
+
+ def calc_subtitle_data(self, items1, items2):
+ sm = difflib.SequenceMatcher(
+ None,
+ [(i.start_time, i.end_time, i.text) for i in items1],
+ [(i.start_time, i.end_time, i.text) for i in items2])
+ empty_line = SubtitleLine(None, None, None, None)
+ self._subtitle_data = []
+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
+ if tag == 'equal':
+ for i, j in izip(xrange(i1, i2), xrange(j1, j2)):
+ self.make_subtitle_data_item(items1[i], items2[j])
+ elif tag == 'replace':
+ for i, j in izip_longest(xrange(i1, i2), xrange(j1, j2)):
+ if i is None:
+ self.make_subtitle_data_item(empty_line, items2[j])
+ elif j is None:
+ self.make_subtitle_data_item(items1[j], empty_line)
+ else:
+ self.make_subtitle_data_item(items1[i], items2[j])
+
+ elif tag == 'delete':
+ for i in xrange(i1, i2):
+ self.make_subtitle_data_item(items1[i], empty_line)
+ elif tag == 'insert':
+ for j in xrange(j1, j2):
+ self.make_subtitle_data_item(empty_line, items2[j])
+
+ def make_subtitle_data_item(self, s1, s2):
+ self._subtitle_data.append({
+ 'time_changed': ((s1.start_time, s1.end_time) !=
+ (s2.start_time, s2.end_time)),
+ 'text_changed': s1.text != s2.text,
+ 'subtitles': (s1, s2),
+ })
+
def diff(set_1, set_2, mappings=None):
"""
Performs a simple diff, only taking into account:
@@ -151,49 +219,12 @@ def diff(set_1, set_2, mappings=None):
{
time_changed: bool,
text_changed: bool,
- subtitle_1: [the subtitle data, (start_time, end_time, text),
- subtitle_2: [the subtitle data, (start_time, end_time, text),
+ subtitles: [subtitle_line1, subtitle_line2],
}, ... ordered list with both subtitles. If one list is longer , you
will get an empty SubtitleLine named tupple
]
"""
- result = {
- 'subtitle_data' : [],
- 'changed': False,
- 'text_changed': 0,
- 'time_changed': 0,
- }
- text_change_count = 0
- time_change_count = 0
- if len(set_1) == 0 and len(set_2) == 0:
- # empty sets are the same
- return result
- for sub_1, sub_2 in izip_longest([x for x in set_1.subtitle_items(mappings)],
- [x for x in set_2.subtitle_items(mappings)]):
- sub_added = (sub_1 is None) or (sub_2 is None)
- sub_1 = sub_1 or SubtitleLine(None, None, None, None)
- sub_2 = sub_2 or SubtitleLine(None, None, None, None)
- subtitle_result = {
- 'time_changed': False,
- 'text_changed': False,
- 'subtitles' : [sub_1, sub_2]
- }
- if sub_added:
- subtitle_result['text_changed'] = True
- subtitle_result['time_changed'] = True
- else:
- subtitle_result['text_changed'] = sub_1.text != sub_2.text
- subtitle_result['time_changed'] = sub_1.start_time != sub_2.start_time or sub_1.end_time != sub_2.end_time
- if subtitle_result['time_changed']:
- time_change_count +=1
- if subtitle_result['text_changed']:
- text_change_count +=1
- result['subtitle_data'].append(subtitle_result)
- longest_set_count = max(len(set_1),len(set_2))
- result['text_changed'] = text_change_count / (longest_set_count * 1.0)
- result['time_changed'] = time_change_count / (longest_set_count * 1.0)
- result['changed'] = (time_change_count + text_change_count) > 0
- return result
+ return _Differ(set_1, set_2, mappings).result
class SubtitleSet(object):
BASE_TTML = r'''
@@ -1,5 +1,5 @@
from unittest2 import TestCase
-from babelsubs.storage import SubtitleSet, diff
+from babelsubs.storage import SubtitleSet, SubtitleLine, diff
class DiffingTest(TestCase):
def test_empty_subs(self):
@@ -67,6 +67,43 @@ def test_time_changes(self):
self.assertEqual(sub_data['time_changed'], i ==1)
self.assertFalse(sub_data['text_changed'])
+ def test_insert(self):
+ set_1 = SubtitleSet.from_list('en', [
+ (0, 1000, "Hey 1"),
+ (1000, 2000, "Hey 2"),
+ (2000, 3000, "Hey 3"),
+ (3000, 4000, "Hey 4"),
+ ])
+ set_2 = SubtitleSet.from_list('en', [
+ (0, 1000, "Hey 1"),
+ (500, 800, "Hey 1.5"),
+ (1000, 2000, "Hey 2"),
+ (2000, 3000, "Hey 3"),
+ (3000, 4000, "Hey 4"),
+ ])
+ result = diff(set_1, set_2)
+ self.assertEqual(result['changed'], True)
+ # for both time_change and text_changed, we calculate them as follows:
+ # there are 9 total subs (4 in set_1, 5 in set_2). 8 of those are
+ # matches and 1 is new in set_2. So the change amount is 1/9
+ self.assertAlmostEqual(result['time_changed'], 1/9.0)
+ self.assertAlmostEqual(result['text_changed'], 1/9.0)
+ self.assertEqual(len(result['subtitle_data']), 5)
+
+ # check the lines that haven't changed
+ for i in (0, 2, 3, 4):
+ sub_data = result['subtitle_data'][i]
+ self.assertEquals(sub_data['time_changed'], False)
+ self.assertEquals(sub_data['text_changed'], False)
+ self.assertEquals(sub_data['subtitles'][0], set_2[i])
+ self.assertEquals(sub_data['subtitles'][1], set_2[i])
+ # check the line that was inserted
+ insert_sub_data = result['subtitle_data'][1]
+ self.assertEquals(insert_sub_data['time_changed'], True)
+ self.assertEquals(insert_sub_data['text_changed'], True)
+ self.assertEquals(insert_sub_data['subtitles'][0],
+ SubtitleLine(None, None, None, None))
+ self.assertEquals(insert_sub_data['subtitles'][1], set_2[1])
def test_data_ordering(self):
set_1 = SubtitleSet.from_list('en', [
@@ -95,6 +132,4 @@ def test_unsynced_reflect_time_changes(self):
])
result = diff(set_1, set_2)
- self.assertEqual(result['time_changed'], 0.5)
-
-
+ self.assertAlmostEqual(result['time_changed'], 1/3.0)

2 comments on commit d2b5033

@sjl

This comment has been minimized.

Show comment Hide comment
@sjl

sjl Mar 28, 2013

Looks good to me. The only thing I'm unclear on is that the commit message says "This changes subtitle_data." but I don't see any changes outside of the diff functionality.

sjl replied Mar 28, 2013

Looks good to me. The only thing I'm unclear on is that the commit message says "This changes subtitle_data." but I don't see any changes outside of the diff functionality.

@bendk

This comment has been minimized.

Show comment Hide comment
@bendk

bendk Mar 28, 2013

Owner

The only change is that before the empty SubtitleLine objects could only go at the end of the list and only if one list was longer than the other. Now they can be in the middle. Really a small change.

Owner

bendk replied Mar 28, 2013

The only change is that before the empty SubtitleLine objects could only go at the end of the list and only if one list was longer than the other. Now they can be in the middle. Really a small change.

Please sign in to comment.