## Exercise 3 – identifying vulnerability-contributing commits

In [1]:
! pip3 install GitPython
! pip3 install seaborn
! pip3 install pydriller

import numpy as np
import pandas as pd
import os
import time

import git
from git import RemoteProgress
from git import Repo

import matplotlib.pyplot as plt
import seaborn as sns

BLUE   = '\033[94m'
GREEN  = '\033[92m'
ORANGE = '\033[93m'
RED    = '\033[91m'
ENDC   = '\033[0m'



In [122]:
url_a = "https://github.com/opennetworkinglab/onos"
dir_a = "onos"

url_b = "https://github.com/apache/ofbiz/"
dir_b = "ofbiz"

url_c = "https://github.com/apache/struts/"
dir_c = "struts"

if not os.path.isdir(dir_a):
    Repo.clone_from(url_a, dir_a)
if not os.path.isdir(dir_b):
    Repo.clone_from(url_b, dir_b)
if not os.path.isdir(dir_c):
    Repo.clone_from(url_c, dir_c)

In [134]:
##### (1) ONOS
# local_link = "onos"
# fixing_commit = "af1fa39a53c0016e92c1de246807879c16f507d6"

##### (2) OFBIZ
# local_link = "ofbiz"
# fixing_commit = "34125e42d1db74064482c296c871e11c92dc4527"   # Not commit listed in spreadsheet 

##### (3) STRUTS
local_link = "struts"
fixing_commit = "9d47af6ffa355977b5acc713e6d1f25fac260a28"

repo = Repo(local_link)

In [135]:
for commit in list(repo.iter_commits()):
    # must include '.hexsha' or '==' won't match them even if console output looks the same
    if commit.hexsha == fixing_commit:
        affected_files = commit.stats.files
        print(commit.stats.files)

{'core/src/main/java/com/opensymphony/xwork2/validator/validators/URLValidator.java': {'insertions': 11, 'deletions': 12, 'lines': 23}}


In [136]:
import io
import sys

# printing the 'difference': so just the lines added or deleted
# is not enough: need the whole file + the lines added/deleted

# 1. store the lines added and deleted
# 2. loop through the full file, highlight the lines added or deleted
# 3. 'expand' added lines to the full scope

# (1) if line has no indentation, it is of global scope and it will be treated as a single line
# (2) if not global scope, 'keep looking upwards till it encounters a starting bracket'
# (3)  if not global scope'keep looking downwards till it encounters a closing bracket'

full_lines = []
add_lines  = []
sub_lines  = []
blame_lines= []

diff_data = repo.git.diff(fixing_commit + "^", fixing_commit).splitlines()

commit = repo.commit(fixing_commit)

for line in diff_data:

    # ignore 'file path' lines
    if line.startswith("++") or line.startswith("--"):
        continue

    if line.startswith("+"):
        add_lines.append(line)
    if line.startswith("-"):
        sub_lines.append(line)  

for affected_file in affected_files:
    print("############################## File: " + affected_file + " ##################################")
    searchScope = False
    
    # retrieve the full contents of the modified file
    targetfile = commit.tree / affected_file
    with io.BytesIO(targetfile.data_stream.read()) as f:
        full_lines = f.read().decode('utf-8').splitlines()

    print("addlines[0] = ", add_lines[0])

    for full_line in full_lines:
        if len(add_lines) == 0:
            break
            
        # if the line has no leading whitespace, it is of global scope and is treated individually
        if len(full_line.lstrip()) == len(full_line):
            searchScope = False
       
        elif "}" in full_line:
            searchScope = False            
            
        # to prevent duplicates, always take the first item, then remove it
        if add_lines[0][1:] == full_line:
            searchScope = True
            print("[" + BLUE, full_line, ENDC + "]")
            # do NOT add 'added' lines to the blame list: these have been added by
            # the fixing commit, and so will just be 'blamed' on the vulnerability fixer
            # blame_lines.append(full_line)
            add_lines.pop(0)  
        else:
            if searchScope == True:
                print("[" + ORANGE + full_line + ENDC + "]")
                blame_lines.append(full_line)
            else:
                print("[" + full_line + "]")
       

print("should be empty: ", add_lines) 
 

############################## File: core/src/main/java/com/opensymphony/xwork2/validator/validators/URLValidator.java ##################################
addlines[0] =  +    public static final String DEFAULT_URL_REGEX = "^(?:https?|ftp)://" +
[/*]
[ * Copyright 2002-2006,2009 The Apache Software Foundation.]
[ * ]
[ * Licensed under the Apache License, Version 2.0 (the "License");]
[ * you may not use this file except in compliance with the License.]
[ * You may obtain a copy of the License at]
[ * ]
[ *      http://www.apache.org/licenses/LICENSE-2.0]
[ * ]
[ * Unless required by applicable law or agreed to in writing, software]
[ * distributed under the License is distributed on an "AS IS" BASIS,]
[ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.]
[ * See the License for the specific language governing permissions and]
[ * limitations under the License.]
[ */]
[package com.opensymphony.xwork2.validator.validators;]
[]
[import com.opensymphony.xwork2.valida

### Assign blame to lines identified in 'vulnerable scope' (from lines *added*)

In [137]:
# git blame = "Annotates each line in the given file with information from the revision which last modified the line"
    
blame_commits = []
blame_commits_unique = []

# loop through the array of files affected by the fixing commit
for affected_file in affected_files:
    print("############################## New file ##################################")

    print("blamelines[0] = ", blame_lines[0])

    for commit, lines in repo.blame(fixing_commit, affected_file):
        for line in lines:
            
            if len(blame_lines) == 0:
                break
                
            if blame_lines[0] == line:
                blame_lines.pop(0)  
                print("[" + BLUE + line + ENDC + "] commit [" + GREEN, commit, ENDC + "]")
                blame_commits.append(commit)
                if commit not in blame_commits_unique:
                    blame_commits_unique.append(commit)
            else:
                print("[" + line + "]")


        print("")
    

############################## New file ##################################
blamelines[0] =              "[a-z][a-z0-9-]*[a-z0-9]" +
[/*]
[ * Copyright 2002-2006,2009 The Apache Software Foundation.]
[ *]
[ * Licensed under the Apache License, Version 2.0 (the "License");]
[ * you may not use this file except in compliance with the License.]
[ * You may obtain a copy of the License at]
[ *]
[ *      http://www.apache.org/licenses/LICENSE-2.0]
[ *]
[ * Unless required by applicable law or agreed to in writing, software]
[ * distributed under the License is distributed on an "AS IS" BASIS,]
[ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.]
[ * See the License for the specific language governing permissions and]
[ * limitations under the License.]
[ */]
[package com.opensymphony.xwork2.validator.validators;]
[]
[import com.opensymphony.xwork2.validator.ValidationException;]

[import org.apache.commons.lang3.StringUtils;]

[import org.apache.logging.log4j.LogMana

### Add commits from lines *subtracted* to the blame list

In [138]:
print(sub_lines)

print("[" + sub_lines[0][1:] + "]")

for affected_file in affected_files:
    for commit, lines in repo.blame(fixing_commit, affected_file):
        for line in lines:  
            print("[" + line + "]")
            if line.strip() == sub_lines[0][1:].strip():
                
                print("%s changed this line: %s" % (commit, line))
                blame_commits.append(commit)
                if commit not in blame_commits_unique:
                    blame_commits_unique.append(commit)
                
                sub_lines.pop(0)

['-    public static final String DEFAULT_URL_REGEX = "^(https?|ftp):\\\\/\\\\/" +', '-            "(([a-z0-9$_\\\\.\\\\+!\\\\*\\\\\'\\\\(\\\\),;\\\\?&=\\\\-]|%[0-9a-f]{2})+" +', '-            "(:([a-z0-9$_\\\\.\\\\+!\\\\*\\\\\'\\\\(\\\\),;\\\\?&=\\\\-]|%[0-9a-f]{2})+)?" +', '-            "@)?(#?" +', '-            ")((([a-z0-9]\\\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\\\.)*" +', '-            "|((\\\\d|[1-9]\\\\d|1\\\\d{2}|2[0-4][0-9]|25[0-5])\\\\.){3}" +', '-            "(\\\\d|[1-9]\\\\d|1\\\\d{2}|2[0-4][0-9]|25[0-5])" +', '-            ")(:\\\\d+)?" +', '-            ")(((\\\\/([a-z0-9$_\\\\.\\\\+!\\\\*\\\\\'\\\\(\\\\),;:@&=\\\\-]|%[0-9a-f]{2})*)*" +', '-            "(\\\\?([a-z0-9$_\\\\.\\\\+!\\\\*\\\\\'\\\\(\\\\),;:@&=\\\\-\\\\/\\\\:]|%[0-9a-f]{2})*)" +', '-            "?)?)?" +', '-            "(#([a-z0-9$_\\\\.\\\\+!\\\\*\\\\\'\\\\(\\\\),;:@&=\\\\-]|%[0-9a-f]{2})*)?" +']
[    public static final String DEFAULT_URL_REGEX = "^(https?|ftp):\\/\\/" +]
[/*]
[ * Copyright 2002-2006,2009 The 

### Calculate the VCC

In [139]:
print("List of commits that contributed to the vulnerable code portions:\n")
worst_commit = None
worst_number = 0
for unique_commit in blame_commits_unique:
    if blame_commits.count(unique_commit) > worst_number:
        worst_commit = unique_commit
        worst_number = blame_commits.count(unique_commit)
    print(GREEN, unique_commit, ENDC, " : ", BLUE, blame_commits.count(unique_commit), ENDC)

print("\nworst commit (VCC): ", RED, worst_commit, ENDC)

List of commits that contributed to the vulnerable code portions:

[92m 931df54ab379bf4eb5a625bf05066b8563c3737b [0m  :  [94m 1 [0m

worst commit (VCC):  [91m 931df54ab379bf4eb5a625bf05066b8563c3737b [0m


### (a) Message and title of fixing commit

In [140]:
show_data = repo.git.show("-s", worst_commit).splitlines()
for line in show_data:
    print(line)

commit 931df54ab379bf4eb5a625bf05066b8563c3737b
Author: Lukasz Lenart <lukaszlenart@apache.org>
Date:   Wed Nov 16 07:46:29 2016 +0100

    Optimises validator to reduce number of compiling pattern


### (b) total files affected 

In [141]:
# 'commit.hexsha' can be replaced with just 'commit'
print("total repo commits: ", len(list(repo.iter_commits())), "\n")

for commit in list(repo.iter_commits()):
#     print(commit)
#     print(commit.message)
    if commit.hexsha == worst_commit.hexsha: 
        print(commit.stats.files)
        print("\nnumber of files affected: ", len(commit.stats.files), "\n")
        print("commit hash: ", commit)

total repo commits:  5678 

{'core/src/main/java/com/opensymphony/xwork2/validator/validators/URLValidator.java': {'insertions': 38, 'deletions': 23, 'lines': 61}}

number of files affected:  1 

commit hash:  931df54ab379bf4eb5a625bf05066b8563c3737b


### (c) total directories affected 

In [142]:
dirs = []

for commit in list(repo.iter_commits()):
    if str(commit) == worst_commit.hexsha: 
        
        for file in commit.stats.files:
#             print(file)
            folder = file.split("/")[0:-1]
            folderpath =  "/".join(folder)
            print("folder: " + folderpath)
            if folderpath not in dirs:
                dirs.append(folderpath)

print("\nTotal unique directories: ", len(dirs))

folder: core/src/main/java/com/opensymphony/xwork2/validator/validators

Total unique directories:  1


### (d) total lines of code (INCLUDING comments and blank lines) deleted
### (e) total lines of code (INCLUDING comments and blank lines) added
### (f) total lines of code (EXCLUDING comments and blank lines) deleted
### (g) total lines of code (EXCLUDING comments and blank lines) added

In [143]:
diff_data = repo.git.diff(worst_commit.hexsha + "^", worst_commit).splitlines()

total_lines_added_including_blank_comments   = 0
total_lines_added_excluding_blank_comments   = 0
total_lines_removed_including_blank_comments = 0
total_lines_removed_excluding_blank_comments = 0

isComment = False

for line in diff_data:

    # determine whether the line is an addition or a deletion
    isAdded   = False
    isRemoved = False
    
    if line.startswith("+"):
        isAdded = True 
    if line.startswith("-"):
        isRemoved = True

    # remove the leading '+'
    line = line[1:]

    # ignore 'file path' lines
    if line.startswith("++") or line.startswith("--"):
        continue
    
    # detect when comment code starts
    if line.strip().startswith("/*") or line.startswith('"""'):
        isComment = True

    # if line is empty or is a single-line comment, it is a 'blank or comment' line
    if len(line.strip()) == 0:
        total_lines_added_including_blank_comments += 1
#         print("[" + ORANGE + "BLANK" + ENDC + "]")
    elif isAdded and (isComment or line.startswith("//") or line.startswith("#")):
        total_lines_added_including_blank_comments += 1
#         print("[" + ORANGE + line + ENDC + "]")
    elif isAdded:
#             print("Whitespace chars: ", len(line), " vs total chars: ", len(line))
        total_lines_added_including_blank_comments += 1
        total_lines_added_excluding_blank_comments += 1
#         print("[" + BLUE + line + ENDC + "]")
    elif isRemoved:
#         print("[" + RED + line + ENDC + "]")
        total_lines_removed_including_blank_comments += 1
        total_lines_removed_excluding_blank_comments += 1      
       
        # detect when comment code ends
    if line.strip().endswith("*/") or line.endswith('"""'):
        isComment = False
        
print("total lines added (INCLUDING comments and blanks): ", total_lines_added_including_blank_comments)
print("total lines added (EXCLUDING comments and blanks): ", total_lines_added_excluding_blank_comments)
print("total lines removed (INCLUDING comments and blanks): ", total_lines_removed_including_blank_comments)
print("total lines removed (EXCLUDING comments and blanks): ", total_lines_removed_excluding_blank_comments)

total lines added (INCLUDING comments and blanks):  47
total lines added (EXCLUDING comments and blanks):  34
total lines removed (INCLUDING comments and blanks):  23
total lines removed (EXCLUDING comments and blanks):  23


### (h) How many days between fixing commit and previous commit to the same file?
### (i) How many times has the file been modified since creation?
### (j) Which developers have modifed the file?

In [144]:
import math

last_commit_time = -1
between_time = -1
prevCommit = False
contributors = []

print("worst_commit: ", worst_commit, "\n")

for affected_file in affected_files:
    print("\t", ORANGE, affected_file, ENDC)
    commits_touching_path = list(repo.iter_commits(paths=affected_file))

#     print(commits_touching_path)

    for commit in commits_touching_path:

        # GitPython differentiates between 'author' and 'commiter'
        # commited_date is in epoch time
        year  = str(time.localtime(commit.committed_date).tm_year)
        month = str(time.localtime(commit.committed_date).tm_mon)
        day   = str(time.localtime(commit.committed_date).tm_mday)

        if prevCommit:
            print(GREEN + commit.hexsha + ENDC + "\t" + day + "/" + month + "/" + year + "\t" + str(commit.committer))
            prevCommit = False
            between_time = last_commit_time - commit.committed_date
        elif commit.hexsha == worst_commit.hexsha:
            print(ORANGE + commit.hexsha + ENDC + "\t" + day + "/" + month + "/" + year + "\t" + str(commit.committer))
            last_commit_time = commit.committed_date
            prevCommit = True
        elif commit.hexsha == fixing_commit:
            print(BLUE + commit.hexsha + ENDC + "\t" + day + "/" + month + "/" + year + "\t" + str(commit.committer))
        else:
            print(commit.hexsha + "\t" + day + "/" + month + "/" + year + "\t" + str(commit.committer))

        # add the contributor to the list of contributors, if they are not already on the list
        if str(commit.committer) not in contributors:
            contributors.append(str(commit.committer))

    print("\nTotal commits to file:", len(commits_touching_path))
    print("Days between worst commit and previous commit:", math.ceil(between_time / 86400), "days")
    print("Unique contributors: ", GREEN, contributors, ENDC)
    print("Total unique contributors to file: ", len(contributors), "\n\n")

worst_commit:  931df54ab379bf4eb5a625bf05066b8563c3737b 

	 [93m core/src/main/java/com/opensymphony/xwork2/validator/validators/URLValidator.java [0m
b37af16aeea4cb3a9f9a843b9fb671ade1db0101	21/10/2017	Lukasz Lenart
0ae6aa84eed74fefd5dbfd1407d61023000140d9	21/10/2017	Lukasz Lenart
418a20c0594f23764fe29ced400c1219239899a8	4/8/2017	Stefaan Dutry
8a04e80f01350c90f053d71366d5e0c2186fded5	2/8/2017	Stefaan Dutry
[94m9d47af6ffa355977b5acc713e6d1f25fac260a28[0m	2/8/2017	Stefaan Dutry
8df5a897f61f3ef45c36fdd9275e66669ae4516c	1/8/2017	Lukasz Lenart
1ad6d2c31726864275458c02f9cbbe22936f7833	14/4/2017	Lukasz Lenart
569e1f6c82dbb562f105279df68e3dd2cf0545bd	14/4/2017	Lukasz Lenart
6f272e4873bdba03c1b3fe3ac8c0670b680d2c6c	14/4/2017	Lukasz Lenart
[93m931df54ab379bf4eb5a625bf05066b8563c3737b[0m	16/11/2016	Lukasz Lenart
[92m554b9dddb0fbd1e581ef577dd62a7c22955ad0f6[0m	16/11/2016	Lukasz Lenart
d19b9eaa82753a5dd671ee4f0847574f65f4aebf	28/7/2016	Lukasz Lenart
a0fdca138feec2c2e94eb75ca1f8b76678b4d152

### (k) for each developer identified, how many commits hav### (k) For each developer in (j), how may commits have they submitted? Are they experienced or new?e they made? Are they experienced?

In [145]:
commit_authors = repo.git.shortlog("-sne", "--all").splitlines()

author_commits = []

for commit_author in commit_authors:
    commit, author = commit_author.split("\t")
    # remove author's email
    author_clean = author.split("<")[0].strip()
#     print("author: [" + author_clean + "]")
    if author_clean in contributors:
        print("author: [" + author_clean + "]")
        author_commits.append([int(commit), author])
    
df = pd.DataFrame(author_commits, columns=['Commit', 'Author'])
df.head(100)

author: [Lukasz Lenart]
author: [Lukasz Lenart]
author: [Stefaan Dutry]


Unnamed: 0,Commit,Author
0,2213,Lukasz Lenart <lukaszlenart@apache.org>
1,240,Lukasz Lenart <lukasz.lenart@gmail.com>
2,61,Stefaan Dutry <stefaan.dutry@gmail.com>
