# Preprocessing

In [1]:
%pip install pydriller
%pip install lizard

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pydriller import Repository
import os
import datetime as dt
import pandas as pd
import tqdm

In [3]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [4]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [5]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Get file changes - like in EX1

In [6]:
file_changes = {}
# i = 30
for commit in repo.traverse_commits():

    author = commit.author.name

    for file in commit.modified_files:

        filename = file.new_path if file.new_path else file.old_path

        if(filename not in file_changes):
            file_changes[filename] = {
                'num of revisions': 0,
            }

        file_changes[filename]['num of revisions'] += 1
        

# Exercise 2

In [7]:
import lizard

In [8]:
def calculate_indentation_complexity(file_path):
    """
    Calculate the indentation-based complexity of a Python file.

    :param file_path: Path to the Python file.
    :return: The total indentation count and the number of lines
    """
    total_indentation = 0
    nr_lines = 0

    with open(file_path, 'r') as file:
        for line in file:
            # Count leading spaces and tabs
            total_indentation += line.count("    ")
            nr_lines += 1

    return total_indentation, nr_lines

In [9]:
def get_nr_changes_between_commits(relative_file_path):
    """
    Get the number of changes between two commits for a file.

    :param relative_file_path: relative filepath in the analysed repository.
    :return: The number of changes.
    """
    # if relative file path has not \\ like separator replace them
    if relative_file_path in file_changes:
        return file_changes[relative_file_path]['num of revisions']
    else:
        return -1

In [10]:
get_nr_changes_between_commits("clients\\src\\test\\java\\org\\apache\\kafka\\clients\\MockClient.java")

1

In [11]:
# Calculate lines of code for each .java file

# Get all java files
java_files = []
for root, dirs, files in os.walk(repo_path):
    # print(root, dirs)
    for file in files:
        if file.endswith(".java"):
             java_files.append(os.path.join(root, file))

# Calculate lines of code and cyclomatic complexity for each file
analysis = {}
for file in tqdm.tqdm(java_files):
    i = lizard.analyze_file(file)

    # filename = relative path to cwd
    filename = file.replace(repo_path+"\\", "")

    analysis[filename] = {
        'loc': i.nloc,
        'cc': sum([func.cyclomatic_complexity for func in i.function_list]),
        'ic': calculate_indentation_complexity(file)[0], # for absolute values 
        # 'relative_ic': calculate_indentation_complexity(file)[0] / calculate_indentation_complexity(file)[1], # for relative values '
        'nr_changes': get_nr_changes_between_commits(filename),
    }


100%|██████████| 4264/4264 [00:50<00:00, 84.77it/s] 


In [12]:
# convert to pandas dataframe
df = pd.DataFrame.from_dict(analysis, orient='index')
df = df.sort_values(by=['nr_changes'], ascending=False)
df

Unnamed: 0,loc,cc,ic,nr_changes
core\src\main\java\kafka\log\remote\RemoteLogManager.java,1196,247,4422,15
core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,1396,64,3501,13
metadata\src\main\java\org\apache\kafka\controller\QuorumController.java,1753,265,5461,9
metadata\src\main\java\org\apache\kafka\metadata\migration\KRaftMigrationDriver.java,730,138,2294,8
metadata\src\test\java\org\apache\kafka\controller\QuorumControllerTest.java,1378,57,4381,8
...,...,...,...,...
clients\src\test\java\org\apache\kafka\common\utils\ImplicitLinkedHashMultiCollectionTest.java,139,16,273,-1
clients\src\test\java\org\apache\kafka\common\utils\JavaTest.java,73,8,115,-1
clients\src\test\java\org\apache\kafka\common\utils\LogCaptureAppender.java,93,18,184,-1
clients\src\test\java\org\apache\kafka\common\utils\LoggingSignalHandlerTest.java,8,1,6,-1


In [13]:
# with plotly display heapmap of the top 10 files with the most changes
import plotly.express as px

df_normalized = df.apply(lambda x: (x - x.min()) / (x.max() - x.min()) if x.name != 'files' else x)

df_normalized['relative_count'] = df_normalized['nr_changes'] + df_normalized['cc']  + df_normalized['loc'] + df_normalized['ic']
df_normalized = df_normalized.sort_values(by=['relative_count'], ascending=False).head(50)
df_normalized.drop(columns=['relative_count'], inplace=True)
# Create the heatmap using Plotly
fig = px.imshow(df_normalized,
                labels=dict(color="Relative Value"),
                x=df_normalized.columns,  # Use the columns of the dataframe as the x-axis labels
                y=df_normalized.index,  # Use the index for the y-axis labels
                aspect="auto",
                title="Heatmap of File Metrics (Relative to Column)")

# Customizing the layout to have meaningful axis titles
fig.update_xaxes(side="top")  # To move the x-axis to the top of the heatmap
fig.update_layout(xaxis_title="Metric", yaxis_title="File")

# Show the heatmap
fig.show()

In [14]:
fig.write_html("ex2_heatmap.html")