In [None]:
import os
import copy
import math
import shutil
import Levenshtein
from git import Repo

In [2]:
# GitHub repository handed to the Gitruck helper in the next cell
owner = "fzaninotto"
repo = "Faker"
# HTTPS clone URL derived from the owner and repository name
url = f"https://github.com/{owner}/{repo}.git"
# Local directory path, relative to the notebook's working directory
path = "./tmp"

In [3]:
# Project-local helper; its implementation is not part of this notebook
from src.gitruck import Gitruck

# Builds the helper for the owner/repository pair configured above
gitruck = Gitruck(owner, repo)

In [4]:
# Displays the truck factor reported by the Gitruck helper as the cell output
gitruck.calculate_truck_factor()

33

# GitPython

In [None]:
# Repository analysed step by step in the cells below
owner = "puppetlabs"
# NOTE(review): `repo` holds the repository *name* here, but the clone cell rebinds
# it to a git.Repo object; re-running this cell afterwards breaks the later cells
repo = "puppet"
# HTTPS clone URL derived from the owner and repository name
url = f"https://github.com/{owner}/{repo}.git"
# Directory the repository is cloned into (wiped before every clone)
path = "./tmp"

In [None]:
# Removes any previous clone so that clone_from starts from a path that does not exist
if os.path.exists(path):
    shutil.rmtree(path)
# Clones the remote repository into `path`; `repo` is a git.Repo object from here on
repo = Repo.clone_from(url, path)

#### Getting all code files

In [None]:
# Asks github-linguist for its per-file breakdown and splits the report into lines
git_cmd = repo.git
raw_files = git_cmd.execute(["github-linguist", "-b"]).split("\n")

# Drops everything up to and including the first blank line (the top analysis)
if "" in raw_files:
    raw_files = raw_files[raw_files.index("") + 1 :]

# Keeps only the lines that contain a double space, stripped of surrounding whitespace
# NOTE(review): presumably these are the indented file paths of the breakdown — confirm
# against the output format of the installed github-linguist version
_files = [line.strip() for line in raw_files if "  " in line]

In [None]:
def print_files_from_git(root, files, level=0, allowed_paths=None):
    """Collect the entries of a git tree whose path is in a set of wanted paths.

    Despite its name, this function prints nothing: it walks ``root``
    recursively and appends every matching non-tree entry to ``files``.

    Args:
        root: Iterable of tree entries; each entry exposes ``type`` and ``path``,
            and entries whose ``type`` is ``"tree"`` are themselves iterable.
        files: List that receives the matching entries (mutated in place).
        level: Recursion depth, incremented on every nested tree. It is not used
            for filtering and is kept only for backward compatibility.
        allowed_paths: Paths to keep. When omitted, the module-level ``_files``
            list is used, which is what existing callers rely on.

    Returns:
        None. Results are delivered through ``files``.
    """
    if allowed_paths is None:
        allowed_paths = _files
    # Converted once at the top-level call so every blob is checked in O(1)
    # instead of scanning a list of paths for each entry
    if not isinstance(allowed_paths, (set, frozenset)):
        allowed_paths = set(allowed_paths)

    for entry in root:
        if entry.type == "tree":
            print_files_from_git(entry, files, level + 1, allowed_paths)
        elif entry.path in allowed_paths:
            files.append(entry)


# Collects, from the tree of the HEAD commit, the entries whose path is listed in `_files`
files = []
print_files_from_git(repo.head.commit.tree, files)

In [None]:
# Number of entries collected from the HEAD tree
len(files)

#### Getting contributors

In [None]:
# Lists the authors of all refs, one per line, as "<commit count>\t<name> <<email>>"
# (-s: counts only, -n: sorted by count, -e: include the e-mail address)
git_cmd = repo.git
shortlog_output = git_cmd.execute(["git", "shortlog", "-sne", "--all"])

# Parses every line into a (commit count, name, email) tuple
contributors = []
for shortlog_line in shortlog_output.split("\n"):
    shortlog_line = shortlog_line.strip()
    # An empty history yields a blank line, which has no fields to unpack
    if not shortlog_line:
        continue
    # Splits only on the first tab so a tab inside the name cannot break the unpacking
    count, identifier = shortlog_line.split("\t", 1)
    # Splits on the last "<" so a "<" inside the name stays part of the name
    name, _, email = identifier.rpartition("<")
    contributors.append((int(count), name.strip(), email.rstrip(">").strip()))

In [None]:
# Number of (count, name, email) entries parsed from the shortlog
len(contributors)

In [None]:
# Buckets the author names by e-mail address (contributors hold (count, name, email))
grouped_emails = {}
for _commit_count, author_name, author_email in contributors:
    grouped_emails.setdefault(author_email, []).append(author_name)

# Merges the e-mail buckets whose names are near-identical
grouped_names = []
while grouped_emails:
    # Takes the oldest remaining bucket as the group being grown
    reference_group = grouped_emails.pop(next(iter(grouped_emails)))
    # Single pass over the other buckets: names gained through a merge are compared
    # with the buckets still ahead, but buckets already skipped are not revisited
    for key in list(grouped_emails):
        comparison_group = grouped_emails[key]
        # Two buckets belong together as soon as one pair of names is within
        # a Levenshtein distance of 1; any() stops at the first such pair
        if any(
            Levenshtein.distance(lhs_name, rhs_name) <= 1
            for lhs_name in reference_group
            for rhs_name in comparison_group
        ):
            reference_group += grouped_emails.pop(key)
    grouped_names.append(reference_group)

# Maps every known name to the first name of its group (the canonical developer name)
dev_name = {
    name: name_group[0] for name_group in grouped_names for name in name_group
}

#### Getting commits per file path

In [None]:
# Maps each collected file path to the list of commits (from all refs) that touched it
commits_per_filepath = {
    blob.path: list(repo.iter_commits(all=True, paths=blob.path)) for blob in files
}

#### Calculating DOA (Degree of Authorship)

In [None]:
# From here on `files` holds path strings instead of tree entries. The paths come from
# `commits_per_filepath` so that every file is guaranteed to have a commit list, and so
# that this cell does not depend on `_files`, which the truck-factor cell overwrites.
files = list(commits_per_filepath)
# Canonical developer names; sorted so that ties in later rankings are resolved
# the same way on every run (plain set iteration order varies between runs)
contributors = sorted(set(dev_name.values()))

In [None]:
# Defines DOA data structure: DOA[contributor][file] -> degree-of-authorship score
DOA = {contributor: {} for contributor in contributors}

# Calculates DOA
for file in files:
    file_commits = commits_per_filepath[file]

    # Deliveries: commits per canonical developer, counted in one pass over the
    # file's history instead of re-scanning the history for every contributor.
    # Commits whose author name is not in dev_name count as acceptances for everybody.
    # NOTE(review): such names may come from .mailmap rewriting in shortlog — confirm
    deliveries_per_dev = {}
    for commit in file_commits:
        developer = dev_name.get(commit.author.name)
        if developer is not None:
            deliveries_per_dev[developer] = deliveries_per_dev.get(developer, 0) + 1

    # Authorship: the last commit in the list is treated as the one that created the
    # file; an empty history simply has no first author instead of raising IndexError
    first_author = dev_name.get(file_commits[-1].author.name) if file_commits else None

    for contributor in contributors:
        authorship = 1 if first_author == contributor else 0
        deliveries = deliveries_per_dev.get(contributor, 0)
        # Acceptances: every commit to the file that was not delivered by this contributor
        acceptances = len(file_commits) - deliveries

        # Final value
        DOA[contributor][file] = (
            3.293
            + 1.098 * authorship
            + 0.164 * deliveries
            - 0.321 * math.log(1 + acceptances)
        )

In [None]:
# Normalizes DOA per file with min-max scaling across contributors
normalized_DOA = copy.deepcopy(DOA)
for file in files:
    # Gets minimum and maximum values from the real scores (no sentinel bounds)
    file_scores = [DOA[contributor][file] for contributor in contributors]
    if not file_scores:
        continue
    min_val = min(file_scores)
    max_val = max(file_scores)
    spread = max_val - min_val

    # Normalizes each value
    for contributor in contributors:
        if spread == 0:
            # Every contributor ties (e.g. a single contributor): all of them hold the
            # maximum score, so they get 1.0 instead of triggering a division by zero
            normalized_DOA[contributor][file] = 1.0
        else:
            normalized_DOA[contributor][file] = (
                DOA[contributor][file] - min_val
            ) / spread

In [None]:
# Parameters
k = 0.75  # minimum normalized DOA for a contributor to count as an author of a file
m = 3.293  # minimum absolute DOA for a contributor to count as an author of a file

# Computes authored files: a file is authored when both thresholds are reached
authored_files = {}
for contributor in contributors:
    authored_files[contributor] = [
        file
        for file in files
        if normalized_DOA[contributor][file] >= k and DOA[contributor][file] >= m
    ]

# Orders it based on amount of files, most files first (dicts keep insertion order)
authored_files = dict(
    sorted(
        authored_files.items(),
        key=lambda author_and_files: len(author_and_files[1]),
        reverse=True,
    )
)

In [None]:
# Works on a copy so that `authored_files` (and the `_files` list read by earlier
# cells) stay intact and this cell gives the same answer every time it is run
remaining_authors = dict(authored_files)

# Calculates the superset of files authored by the remaining contributors
covered_files = set().union(*remaining_authors.values())

# Computes the truck factor: removes the top author (the dict is ordered by number of
# authored files) for as long as the remaining authors cover at least 50% of the files
truck_factor = 0
while covered_files and len(covered_files) >= 0.5 * len(files):
    # Removes top author and increases truck factor
    remaining_authors.pop(next(iter(remaining_authors)))
    truck_factor += 1

    # Recalculates file set
    covered_files = set().union(*remaining_authors.values())

In [None]:
# Prints the truck factor computed in the previous cell
print(truck_factor)