# Setup

In [1]:
import os, subprocess, re

In [2]:
# Function to execute git commands
def execute_command(cmd, work_dir):
    pipe = subprocess.Popen(cmd, shell=True, cwd=work_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, error) = pipe.communicate()
    if error:
        return error
    return out
    pipe.wait()

## Getting all commit hashes from Butterknife

In [3]:
userhome = os.path.expanduser('~')
project = 'butterknife'
repository = f'{userhome}/Desktop/diff/datasource/{project}'

In [4]:
git_cmd = 'git log --pretty=format:"%H"'
log = execute_command(git_cmd, repository)
list_of_all_commits = log.decode('utf-8').split('\n')[180:] # Get the first 836 commits
total_commits = len(list_of_all_commits)

# Task 1: Compare NLA and NLD

## Utility to count number of added or deleted lines between commits

In [5]:
def count_no_of_lines(commit, parent_commit, algorithm, sign, repository):
    git_cmd = (
        f'git diff -w --ignore-blank-lines --diff-algorithm={algorithm} {commit} {parent_commit} | '
        f'grep \'^[{sign}]\' | '
        f'grep -Ev \'^(--- a/|\+\+\+ b/)\' | '
        f'wc -l'
    )
    return execute_command(git_cmd, repository)

## Compare differences in NLA and NLD for the two algorithms specified

In [6]:
MYERS = 'myers'
HISTOGRAM = 'histogram'

In [7]:
diff_commits = 0 # stores the number of different commits
for parent_commit in range(len(list_of_all_commits)-1,1,-1):
    # parent commit hash
    parent_commit_hash = list_of_all_commits[parent_commit]
    # commit hash
    commit_hash = list_of_all_commits[parent_commit-1]
    # NLA: Myers
    nla_from_myers = int(count_no_of_lines(commit_hash, parent_commit_hash, MYERS, '+', repository).decode('utf-8'))
    # NLA: Histogram
    nla_from_histogram = int(count_no_of_lines(commit_hash, parent_commit_hash, HISTOGRAM, '+', repository).decode('utf-8'))
    # NLD: Myers
    nld_from_myers = int(count_no_of_lines(commit_hash, parent_commit_hash, MYERS, '-', repository).decode('utf-8'))
    # NLD: Histogram
    nld_from_histogram = int(count_no_of_lines(commit_hash, parent_commit_hash, HISTOGRAM, '-', repository).decode('utf-8'))
    if nla_from_histogram != nla_from_myers and nld_from_histogram != nld_from_myers:
        diff_commits += 1

## Results

In [8]:
print(f'Total Commits: {total_commits} Difference: {diff_commits}')
print(f'%Different: {diff_commits/total_commits * 100}')

Total Commits: 836 Difference: 36
%Different: 4.30622009569378


# Task 2: Compare location of changes

## Get formatted diffs between commits

In [9]:
def parse_line_and_number(parent_commit_hash, commit_hash, algorithm):
    git_cmd = (
        f'git diff -w --ignore-blank-lines --diff-algorithm={algorithm} {parent_commit_hash} {commit_hash} | '
        'grep -P \'(^\+(?!\+{2})|^-(?!-{2})|^@{2}|^\s)\''
    )
    diff = execute_command(git_cmd, repository)
    return diff.decode('utf-8').split('\n')

In [10]:
def get_changes(parent_commit_hash, commit_hash, algorithm):
    lines = parse_line_and_number(parent_commit_hash,commit_hash, algorithm)
    start = -1
    additions = -1
    d_add = {}
    d_del = {}
    for line in lines:
        if line == '':
            continue
        if re.search(r'(?<=(@@) )(.*)(?= (@@))', line) is not None:
            changes = re.search(r'(?<=(@@) )(.*)(?= (@@))', line).group()
            extract_line_additons = re.search(r'(?<=\+).*', changes).group()
            start = extract_line_additons.split(',')[0]
            start = int(start) # get the starting line of changes in a block
        elif line[0] == '+': # store added line
            d_add[start] = line
        elif line[0] == '-': # store deleted line
            d_del[start] = line
        start+=1
    return d_add, d_del

## Compare the lines and their numbers

In [11]:
diff_commits = 0 # stores the number of different commits
for parent_commit in range(len(list_of_all_commits)-1,1,-1):
    # parent commit hash
    parent_commit_hash = list_of_all_commits[parent_commit]
    # commit hash
    commit_hash = list_of_all_commits[parent_commit-1]
    # location of added and deleted lines in myers' output
    ma, md = get_changes(parent_commit_hash, commit_hash, MYERS)
    # location of added and deleted lines in histogram's output
    ha, hd = get_changes(parent_commit_hash, commit_hash, HISTOGRAM)
    if ma != ha and md != hd: # compare locations deeply
        diff_commits += 1

## Results

In [12]:
print(f'Total Commits: {total_commits} Difference: {diff_commits}')
print(f'%Different: {diff_commits/total_commits * 100}')

Total Commits: 836 Difference: 49
%Different: 5.861244019138756
