# Preprocessing

In [1]:
%pip install pydriller
%pip install lizard

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pydriller import Repository
import os
import datetime as dt
import pandas as pd

In [3]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [4]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [5]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Exercise 1

In [42]:
file_changes = {}
# i = 30
for commit in repo.traverse_commits():

    author = commit.author.name

    for file in commit.modified_files:

        filename = file.new_path if file.new_path else file.old_path

        if(filename not in file_changes):
            file_changes[filename] = {
                'num of revisions': 0,
                'authors': {},
                'revisions': {},
            }

        file_changes[filename]['num of revisions'] += 1
        
        if(author not in file_changes[filename]['authors']):
            file_changes[filename]['authors'][author] = {
                "added": 0,
                "removed": 0,
                'total': 0,
            }

        file_changes[filename]['authors'][author]['added'] += file.added_lines
        file_changes[filename]['authors'][author]['removed'] += file.deleted_lines
        file_changes[filename]['authors'][author]['total'] += file.added_lines + file.deleted_lines


        file_changes[filename]['revisions'][commit.hash] = {
            'changed lines': file.added_lines + file.deleted_lines
        }

In [7]:
# save data into json "exercise1.json"
import json
with open('exercise1.json', 'w') as outfile:
    json.dump(file_changes, outfile, indent=4)

# Exercise 2

In [8]:
import lizard

In [9]:
def calculate_indentation_complexity(file_path):
    """
    Calculate the indentation-based complexity of a Python file.

    :param file_path: Path to the Python file.
    :return: The total indentation count and the number of lines
    """
    total_indentation = 0
    nr_lines = 0

    with open(file_path, 'r') as file:
        for line in file:
            # Count leading spaces and tabs
            indentation = len(line) - len(line.lstrip())
            
            # Update total and max indentation
            total_indentation += indentation
            nr_lines += 1

    return total_indentation, nr_lines

In [23]:
def get_nr_changes_between_commits(relative_file_path):
    """
    Get the number of changes between two commits for a file.

    :param relative_file_path: relative filepath in the analysed repository.
    :return: The number of changes.
    """
    # if relative file path has not \\ like separator replace them
    if relative_file_path in file_changes:
        return file_changes[relative_file_path]['num of revisions']
    else:
        return -1

In [40]:
get_nr_changes_between_commits("clients\\src\\test\\java\\org\\apache\\kafka\\clients\\MockClient.java")

1

In [41]:
# Calculate lines of code for each .java file

# Get all java files
java_files = []
for root, dirs, files in os.walk(repo_path):
    # print(root, dirs)
    for file in files:
        if file.endswith(".java"):
             java_files.append(os.path.join(root, file))

# Calculate lines of code and cyclomatic complexity for each file
analysis = {}
for file in java_files:
    i = lizard.analyze_file(file)

    # filename = relative path to cwd
    filename = file.replace(repo_path+"\\", "")

    analysis[filename] = {
        'loc': i.nloc,
        'cc': sum([func.cyclomatic_complexity for func in i.function_list]),
        'ic': calculate_indentation_complexity(file)[0], # for absolute values TODO maybe divide by nr of lines returned
        'nr_changes': get_nr_changes_between_commits(filename),
    }


true


In [43]:
# convert to pandas dataframe
df = pd.DataFrame.from_dict(analysis, orient='index')
df = df.sort_values(by=['nr_changes'], ascending=False)
df.head(10)

Unnamed: 0,loc,cc,ic,nr_changes
streams\src\test\java\org\apache\kafka\streams\processor\internals\assignment\AssignmentTestUtils.java,926,111,11180,7
streams\src\main\java\org\apache\kafka\streams\processor\internals\assignment\RackAwareTaskAssignor.java,454,89,6724,7
core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,1396,64,14304,7
group-coordinator\src\main\java\org\apache\kafka\coordinator\group\GroupMetadataManager.java,1900,311,26980,6
metadata\src\main\java\org\apache\kafka\controller\QuorumController.java,1753,265,22092,6
streams\src\test\java\org\apache\kafka\streams\processor\internals\assignment\RackAwareTaskAssignorTest.java,929,54,8015,6
core\src\main\java\kafka\log\remote\RemoteLogManager.java,1196,247,17963,6
connect\runtime\src\main\java\org\apache\kafka\connect\runtime\isolation\PluginUtils.java,346,52,4323,5
group-coordinator\src\test\java\org\apache\kafka\coordinator\group\GroupCoordinatorServiceTest.java,486,20,4031,5
group-coordinator\src\test\java\org\apache\kafka\coordinator\group\GroupMetadataManagerTest.java,7225,280,86937,5


# Exercise 3