# Resources
* [Overview of diff](https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/)
* [Patience diff intro](https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/#:~:text=What%20it%20really%20is%20is,like%20Myers%20on%20those%20pieces.&text=Patience%20diff%20splits%20this%20problem,exactly%20once%20in%20both%20versions.)

## Python Notes
dictionary insert/delete complexity: O(1)

dictionary lookup: O(1) on average (O(n) only in the worst case of many hash collisions)

list insert/delete complexity: O(n)

list lookup by index: O(1)


## Steps to Find Diff
1. Read files to be compared
2. Define Slice
3. Find unique matching lines
4. Patience-sort the unique matching lines to keep the largest set of matches that do not cross each other
5. Divide into smaller slices and go to 3

### First read the two files to be compared

In [71]:
# Alternative input pair, kept for reference (currently disabled):
# file_A = 'a.py'
# file_B = 'b.py'

# Paths of the two files that read_files() opens for comparison.
file_A = 'c.txt'
file_B = 'd.txt'

def read_files(file_A, file_B):
    """Read both files and return their contents as two lists of lines.

    Each file is read in full and split on the newline character, so a
    file that ends with a newline produces a trailing empty string.

    Returns:
        A ``(lines of file_A, lines of file_B)`` tuple.
    """
    def _lines_of(path):
        # Read the whole file, then split it into lines.
        with open(path, 'r') as handle:
            return handle.read().split('\n')

    # Tuple elements are evaluated left to right: file_A is read before file_B.
    return _lines_of(file_A), _lines_of(file_B)

# Load both inputs once; find_unique_match reads the module-level a_file / b_file.
a_file, b_file = read_files(file_A, file_B)
# Bare call, presumably so the notebook echoes the loaded line lists as the cell output.
read_files(file_A, file_B)

(['a', 'b', 'c', 'd', 'e', 'f', ''], ['a', 'b', 'c', 'b', 'd', 'f', 'e', ''])

### Define the Slice class (a sub-range of both files after slicing)

In [75]:
class Slice:
    """A pair of line ranges: one into file A and one into file B.

    The four bounds are stored exactly as given. ``__str__`` renders each
    bound one higher than its stored value.
    """

    def __init__(self, a_low, a_high, b_low, b_high):
        self.a_low, self.a_high = a_low, a_high
        self.b_low, self.b_high = b_low, b_high

    def __str__(self):
        # Every bound is shifted up by one for display.
        a_range = (self.a_low + 1, self.a_high + 1)
        b_range = (self.b_low + 1, self.b_high + 1)
        return f"A: {a_range}B: {b_range}"

# Initial slice spanning every line of both files: indices 0 up to (but not
# including) len(a_file) in A and 0 up to len(b_file) in B.
start_slice = Slice(0, len(a_file), 0, len(b_file))
# Slice.__str__ prints each bound one higher than the stored value.
print(start_slice)

A: (1, 8)B: (1, 9)


### Find lines with identical content that appear exactly once in both A and B

In [72]:
def find_unique_match(slice, a_lines=None, b_lines=None):
    """Return the lines that occur exactly once in both halves of ``slice``.

    Args:
        slice: Object exposing ``a_low``, ``a_high``, ``b_low`` and
            ``b_high``. Indices ``a_low`` up to (not including) ``a_high``
            are scanned in A, and likewise ``b_low``..``b_high`` in B.
        a_lines: Lines of file A. When omitted, the module-level ``a_file``
            is used, which keeps existing one-argument calls working.
        b_lines: Lines of file B. When omitted, the module-level ``b_file``
            is used.

    Returns:
        A list of ``(position in A, position in B, line)`` tuples, in the
        iteration order of the bookkeeping dict (first appearance in A on
        Python 3.7+).
    """
    if a_lines is None:
        a_lines = a_file
    if b_lines is None:
        b_lines = b_file

    # line content -> [appearances in A, appearances in B, position in A, position in B]
    line_counts = {}

    for i in range(slice.a_low, slice.a_high):
        line = a_lines[i]
        if line in line_counts:
            line_counts[line][0] += 1
        else:
            line_counts[line] = [1, 0, i, None]

    for i in range(slice.b_low, slice.b_high):
        # A line of B that never occurs in A cannot be a match, so skip it.
        entry = line_counts.get(b_lines[i])
        if entry is not None:
            entry[1] += 1
            entry[3] = i

    # Keep only the lines seen exactly once on each side.
    return [
        (a_pos, b_pos, line)
        for line, (a_count, b_count, a_pos, b_pos) in line_counts.items()
        if a_count == 1 and b_count == 1
    ]

# Unique matches across the whole of both files; passed to patience_sort() later on.
matches = find_unique_match(start_slice)

# for match in matches:
#     print(match)
# Bare call, presumably so the notebook displays the match list as the cell output.
find_unique_match(start_slice)

[(0, 0, 'a'), (2, 2, 'c'), (3, 4, 'd'), (4, 6, 'e'), (5, 5, 'f'), (6, 7, '')]

### Use Patience Sort Algorithm to find the best way to slice code blocks

In [93]:
def patience_sort(matches):
    """Select the longest chain of matches whose B positions keep increasing.

    The matches are dealt onto piles in the order given: each match goes on
    the leftmost pile whose top has a B position that is not smaller than
    its own, or on a new pile at the right end. Each match remembers the top
    of the pile to its left at the moment it is placed, and the result is
    read back by following those links from the top of the last pile.

    Args:
        matches: List of tuples whose element ``[1]`` is the position in B.
            The order of the list is the order in which matches are dealt
            (find_unique_match yields them by position in A).

    Returns:
        ``matches`` itself when it has at most one element; otherwise a new
        list holding the selected matches in their original relative order.
    """
    # Local import so the function does not depend on file-level imports.
    from bisect import bisect_left

    if len(matches) <= 1:
        return matches

    pile_tops = []    # index into `matches` of the match on top of each pile
    pile_tops_b = []  # B position of each pile top; kept sorted so it can be bisected
    # predecessor[i]: index of the match that topped the pile to the left
    # when matches[i] was placed, or None for matches on the first pile.
    predecessor = [None] * len(matches)

    for i, match in enumerate(matches):
        pile = bisect_left(pile_tops_b, match[1])
        if pile > 0:
            predecessor[i] = pile_tops[pile - 1]
        if pile == len(pile_tops):
            pile_tops.append(i)
            pile_tops_b.append(match[1])
        else:
            pile_tops[pile] = i
            pile_tops_b[pile] = match[1]

    # Walk the back-pointers from the top of the last pile. This also covers
    # chains of length one and two, which need no special casing.
    longest_matches = []
    i = pile_tops[-1]
    while i is not None:
        longest_matches.append(matches[i])
        i = predecessor[i]
    longest_matches.reverse()
    return longest_matches

def search(stacks, match):
    """Return the index of the last stack top with a B position below match's.

    Args:
        stacks: Sequence of matches (element ``[1]`` is the position in B),
            ordered by increasing B position. The binary search relies on
            that ordering.
        match: The match being placed; only ``match[1]`` is read.

    Returns:
        The index of the rightmost entry whose B position is strictly
        smaller than ``match[1]``, or ``-1`` when there is none (including
        when ``stacks`` is empty).
    """
    target = match[1]
    low, high = 0, len(stacks)
    # Lower-bound binary search: `low` ends at the first entry whose
    # B position is >= target.
    while low < high:
        middle = (low + high) // 2
        if stacks[middle][1] < target:
            low = middle + 1
        else:
            high = middle
    return low - 1

# Bare call, presumably so the notebook displays the selected matches as the cell output.
patience_sort(matches)

[(0, 0, 'a'), (2, 2, 'c'), (3, 4, 'd'), (5, 5, 'f'), (6, 7, '')]

In [98]:
def patience_diff(slice):
    """Recursively split ``slice`` at its non-crossing unique matches.

    The matches chosen by patience_sort() act as cut points: the region
    before each match becomes a sub-slice, as does any non-empty region after
    the final match. Each sub-slice is split again in the same way.

    Args:
        slice: Slice whose A and B ranges are to be subdivided.

    Returns:
        ``slice`` itself when fewer than two matches are available.
        Otherwise a flat list of Slice objects in file order.
    """
    matches = patience_sort(find_unique_match(slice))
    if len(matches) <= 1:
        return slice

    # Cut the slice at every match; the matched line itself is excluded.
    pieces = []
    a_line = slice.a_low
    b_line = slice.b_low
    for match in matches:
        pieces.append(Slice(a_line, match[0], b_line, match[1]))
        a_line = match[0] + 1
        b_line = match[1] + 1

    # Lines after the last match would otherwise be dropped from the result.
    # An empty tail is skipped because it carries no lines.
    if a_line < slice.a_high or b_line < slice.b_high:
        pieces.append(Slice(a_line, slice.a_high, b_line, slice.b_high))

    slices = []
    for piece in pieces:
        result = patience_diff(piece)
        # A recursive call yields either one Slice or a list of them.
        # Splice lists in so that callers always see a flat list of Slices.
        if isinstance(result, list):
            slices.extend(result)
        else:
            slices.append(result)

    return slices

# Split the whole-file slice and print every resulting piece.
# NOTE(review): patience_diff returns a single Slice (not a list) when it finds
# fewer than two matches; this loop assumes the list form.
slices = patience_diff(start_slice)
# NOTE: the loop variable rebinds the module-level name `slice`, hiding the builtin.
for slice in slices:
    print(str(slice))

A: (1, 1)B: (1, 1)
A: (2, 3)B: (2, 3)
A: (4, 4)B: (4, 5)
A: (5, 6)B: (6, 6)
A: (7, 7)B: (7, 8)
