## Exercise 4 – identifying vulnerability-contributing commits

### Initialise modules

In [2]:
! pip3 install GitPython
! pip3 install seaborn
! pip3 install pydriller

import numpy as np
import pandas as pd
import os
import time

import git
from git import RemoteProgress
from git import Repo

import matplotlib.pyplot as plt
import seaborn as sns

BLUE   = '\033[94m'
GREEN  = '\033[92m'
ORANGE = '\033[93m'
RED    = '\033[91m'
ENDC   = '\033[0m'



### Clone github repositories

In [3]:
url_a = "https://github.com/jenkinsci/groovy-sandbox"
dir_a = "groovy-sandbox"

# for cloning.
if not os.path.isdir(dir_a):
    Repo.clone_from(url_a, dir_a)

### Set repository ('local_link') and fixing commit ('fixing_commit')

In [4]:
local_link = "groovy-sandbox"
fixing_commit = "0cd7ec12b7c56cfa3167d99c5f43147ce05449d3"
manual_vcc = "00fcf59a73dee30b29c496630b6b8ec09c5b13e4"


repo = Repo(local_link)

### Checking if fixing commit exists

In [133]:
all_commits = repo.iter_commits()
for commit in list(all_commits):
    if commit.hexsha == fixing_commit:
        affected_files = commit.stats.files
        print("Fixing commit exists")

Fixing commit exists


### Calculate the VCC

In [172]:
import io
import sys

full_lines = []
add_lines  = []
blame_lines= []

diff_data = repo.git.diff(fixing_commit + "^", fixing_commit).splitlines()

commit = repo.commit(fixing_commit)

for line in diff_data:
    if line.startswith("++") or line.startswith("--"):
        continue
    if line.startswith("+"):
        add_lines.append(line)

for affected_file in affected_files:
    searchScope = False
    
    # retrieve the full contents of the modified file
    targetfile = commit.tree / affected_file
    with io.BytesIO(targetfile.data_stream.read()) as f:
        full_lines = f.read().decode('utf-8').splitlines()
    for full_line in full_lines:
        if len(add_lines) == 0:
            break
            
        # if the line has no leading whitespace, it is of global scope and is treated individually
        if len(full_line.lstrip()) == len(full_line):
            searchScope = False
       
        elif full_line.strip().endswith("}"):
            searchScope = False            
            
        # to prevent duplicates, always take the first item, then remove it
        if add_lines[0][1:] == full_line:
            searchScope = True
            add_lines.pop(0)  
        else:
            if searchScope == True:
                blame_lines.append(full_line)
blame_commits = []
blame_commits_unique = []

# loop through the array of files affected by the fixing commit
for affected_file in affected_files:
    for commit, lines in repo.blame(fixing_commit, affected_file):
        for line in lines:
            if len(blame_lines) == 0:
                break
            if blame_lines[0] == line:
                blame_lines.pop(0)  
                blame_commits.append(commit)
                if commit not in blame_commits_unique:
                    blame_commits_unique.append(commit)
print("vulnerable code portions commits:\n")
worst_commit = None
worst_number = 0
for unique_commit in blame_commits_unique:
    if blame_commits.count(unique_commit) > worst_number:
        worst_commit = unique_commit
        worst_number = blame_commits.count(unique_commit)
    print(GREEN, unique_commit, ENDC, " : ", BLUE, blame_commits.count(unique_commit), ENDC)

print("\nworst commit (VCC): ", RED, worst_commit, ENDC)

vulnerable code portions commits:

[92m 15f420cb799350455182159f3c0f63145b23c19b [0m  :  [94m 1 [0m
[92m 00fcf59a73dee30b29c496630b6b8ec09c5b13e4 [0m  :  [94m 15 [0m

worst commit (VCC):  [91m 00fcf59a73dee30b29c496630b6b8ec09c5b13e4 [0m


### Title and committer of VCC commit

In [13]:
show_data = repo.git.show("-s", manual_vcc).splitlines()
for line in show_data:
    print(line)

commit 00fcf59a73dee30b29c496630b6b8ec09c5b13e4
Author: Jesse Glick <jglick@cloudbees.com>
Date:   Fri Jul 28 13:55:51 2017 -0400

    Closed sandbox bypasses via:
    [SECURITY-566] interface coercion
    [SECURITY-567] method referencing
    [SECURITY-580] list to constructor coercion
    [SECURITY-582] super constructor


### Total files and Directory affected 

In [49]:
prev_commit = "0f2ed68c3c76c7fc22887965d4d26612753e43b8"
diff_data = repo.git.diff('--name-status',manual_vcc, prev_commit).splitlines()
print("Total Files modified: "+ str(len(diff_data)))
diff_dir = repo.git.diff('--dirstat=files,0',manual_vcc, prev_commit).splitlines()
print("Total unique directories modified: "+ str(len(diff_dir)))

Total Files modified: 7
Total unique directories modified: 3


### lines added and deleted (including & excluding comments):

In [170]:
diff_data = repo.git.diff(worst_commit.hexsha + "^", worst_commit).splitlines()

total_lines_added_including_blank_comments   = 0
total_lines_added_excluding_blank_comments   = 0
total_lines_removed_including_blank_comments = 0
total_lines_removed_excluding_blank_comments = 0

isComment = False

for line in diff_data:
    isAdded   = False
    isRemoved = False
    if line.startswith("+"):
        isAdded = True 
    if line.startswith("-"):
        isRemoved = True


    # ignore 'file path' lines
    if line.startswith("++") or line.startswith("--"):
        continue
    
    # detect when comment code starts
    if line.strip().startswith("/*") or line.startswith('"""'):
        isComment = True

    if len(line.strip()) == 0:
        total_lines_added_including_blank_comments += 1
    elif isAdded and (isComment or line.startswith("//") or line.startswith("#")):
        total_lines_added_including_blank_comments += 1
    elif isAdded:
        total_lines_added_including_blank_comments += 1
        total_lines_added_excluding_blank_comments += 1
    elif isRemoved:
        total_lines_removed_including_blank_comments += 1
        total_lines_removed_excluding_blank_comments += 1      
       
        # detect when comment code ends
    if line.strip().endswith("*/") or line.endswith('"""'):
        isComment = False
        
print("lines added (INCLUDING comments and blanks): ", total_lines_added_including_blank_comments)
print("lines added (EXCLUDING comments and blanks): ", total_lines_added_excluding_blank_comments)
print("lines removed (INCLUDING comments and blanks): ", total_lines_removed_including_blank_comments)
print("lines removed (EXCLUDING comments and blanks): ", total_lines_removed_excluding_blank_comments)

lines added (INCLUDING comments and blanks):  334
lines added (EXCLUDING comments and blanks):  260
lines removed (INCLUDING comments and blanks):  15
lines removed (EXCLUDING comments and blanks):  15


####  How many days between current VCC - previous commit

In [105]:
affected_vcc_files=[]
all_commits = repo.iter_commits()
for commit in list(all_commits):
    if commit.hexsha == manual_vcc:
        affected_vcc_files = commit.stats.files
# list(affected_vcc_files)

In [107]:
prev_commit = "0f2ed68c3c76c7fc22887965d4d26612753e43b8"
# repo.git.log('--pretty=tformat:%C(auto)%h (%ar, %ad, %an)', manual_vcc, prev_commit).splitlines()
all_dates = repo.git.log('--pretty=tformat:%cs', manual_vcc,prev_commit, list(affected_vcc_files)[0]).splitlines()
from datetime import datetime

for commit in list(affected_vcc_files):
    all_dates= repo.git.log('--pretty=tformat:%cs', manual_vcc,commit).splitlines()
    d1 = datetime.strptime(all_dates[0], "%Y-%m-%d")
    d2 = datetime.strptime(all_dates[1], "%Y-%m-%d")
    diff = abs((d2 - d1).days)
    print(commit, ":", diff, "days")


pom.xml : 3 days
src/main/java/org/kohsuke/groovy/sandbox/GroovyInterceptor.java : 314 days
src/main/java/org/kohsuke/groovy/sandbox/SandboxTransformer.java : 15 days
src/main/java/org/kohsuke/groovy/sandbox/ScopeTrackingClassCodeExpressionTransformer.java : 17 days
src/main/java/org/kohsuke/groovy/sandbox/impl/Checker.java : 3 days
src/main/java/org/kohsuke/groovy/sandbox/impl/SandboxedMethodClosure.java : 1352 days
src/test/groovy/org/kohsuke/groovy/sandbox/TheTest.groovy : 3 days


#### How many times has the files in VCC been modified since creation?

In [117]:
for commit in list(affected_vcc_files):
    ind_commits = repo.git.log('--follow','--pretty=tformat:%h', manual_vcc,prev_commit, commit).splitlines()
    print(commit, ":", len(list(ind_commits)))

pom.xml : 41
src/main/java/org/kohsuke/groovy/sandbox/GroovyInterceptor.java : 9
src/main/java/org/kohsuke/groovy/sandbox/SandboxTransformer.java : 43
src/main/java/org/kohsuke/groovy/sandbox/ScopeTrackingClassCodeExpressionTransformer.java : 3
src/main/java/org/kohsuke/groovy/sandbox/impl/Checker.java : 18
src/main/java/org/kohsuke/groovy/sandbox/impl/SandboxedMethodClosure.java : 3
src/test/groovy/org/kohsuke/groovy/sandbox/TheTest.groovy : 48


#### Which developers have modifed the file(since creation)?

In [126]:
for commit in list(affected_vcc_files):
    ind_commits = repo.git.log('--follow','--pretty=tformat:%cn', manual_vcc,prev_commit, commit).splitlines()
    contributors = []
    for committer in ind_commits: 
        if committer not in contributors:
            contributors.append(committer)
    print(commit, ":", (contributors))

pom.xml : ['Jesse Glick', 'Kohsuke Kawaguchi']
src/main/java/org/kohsuke/groovy/sandbox/GroovyInterceptor.java : ['Jesse Glick', 'Kohsuke Kawaguchi']
src/main/java/org/kohsuke/groovy/sandbox/SandboxTransformer.java : ['Jesse Glick', 'Andrew Bayer', 'Kohsuke Kawaguchi', 'Oliver Gondža', 'Julien Viet']
src/main/java/org/kohsuke/groovy/sandbox/ScopeTrackingClassCodeExpressionTransformer.java : ['Jesse Glick']
src/main/java/org/kohsuke/groovy/sandbox/impl/Checker.java : ['Jesse Glick', 'Kohsuke Kawaguchi']
src/main/java/org/kohsuke/groovy/sandbox/impl/SandboxedMethodClosure.java : ['Jesse Glick', 'Kohsuke Kawaguchi']
src/test/groovy/org/kohsuke/groovy/sandbox/TheTest.groovy : ['Jesse Glick', 'Andrew Bayer', 'Kohsuke Kawaguchi', 'Oliver Gondža', 'Nicolas De Loof', 'Julien Viet']


#### How many commits have each of the developer submitted?

In [131]:
commit_authors = repo.git.shortlog("-sne", "--all").splitlines()

author_commits = []

for commit_author in commit_authors:
    commit, author = commit_author.split("\t")
    author_clean = author.split("<")[0].strip()
    if author_clean in contributors:
        author_commits.append([int(commit), author])
    
df = pd.DataFrame(author_commits, columns=['Commit', 'Author'])
df.head(100)

Unnamed: 0,Commit,Author
0,116,Kohsuke Kawaguchi <kk@kohsuke.org>
1,40,Jesse Glick <jglick@cloudbees.com>
2,29,Andrew Bayer <andrew.bayer@gmail.com>
3,2,Julien Viet <julien@julienviet.com>
4,1,Nicolas De Loof <nicolas.deloof@gmail.com>
5,1,Oliver Gondža <ogondza@gmail.com>
