# Preprocessing

In [46]:
%pip install pydriller

Note: you may need to restart the kernel to use updated packages.


In [47]:
from pydriller import Repository
import os
import datetime as dt
from datetime import timedelta, datetime
import numpy as np

In [48]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [49]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [50]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Exercise 3

In [51]:
# Create a matrix of all current java files
import os

def get_unique_java_files(directory):
    unique_files = set()
    for root, dirs, files in os.walk(directory):
        # print(root)
        # print(files)
        for file in files:
            if file.endswith(".java") or True: #Hotfix
                file_path = os.path.join(root, file)
                # remove the repo path from the file path
                file_path = file_path.replace(directory+"\\", "")
                unique_files.add(file_path)
    return list(unique_files)

# Replace 'your_repo_directory' with the path to your repository
unique_files = get_unique_java_files(repo_path)

In [52]:
def add_file_to_MATRIX(file):
    if file not in FILE_INDEX_MAP:
        print("Adding file to MATRIX: ", file)
        
        new_index = len(FILE_INDEX_MAP)
        FILE_INDEX_MAP[file] = new_index
        unique_files.append(file)  # Update the unique_files list
        
        for window in MATRIX:
            # Correctly expand each matrix to include the new file
            current_matrix = MATRIX[window]
            new_matrix_size = len(unique_files)
            new_matrix = np.zeros((new_matrix_size, new_matrix_size))
            # Copy the old matrix values into the new matrix
            new_matrix[:current_matrix.shape[0], :current_matrix.shape[1]] = current_matrix
            MATRIX[window] = new_matrix


In [53]:
#add them to shared indexes
def process_files(files):
    modified_file_indices = []
    for file in files:
        # TODO if you want to select only java files, do it there!
        # print(file.new_path if file.new_path else file.old_path)
        # Accessing filename correctly 
        filename = file.new_path if file.new_path else file.old_path
        add_file_to_MATRIX(filename) #check if file is already in the MATRIX, if not add it (issue for files which were created and are not in repo now)
        modified_file_indices.append(FILE_INDEX_MAP[filename])
    return modified_file_indices

In [54]:
def update_MATRIX2(files_current, files_others, window):
    # print("Updating MATRIX for window: ", window, " for files: ", files_current, files_others)
    for i in files_current:
            for j in files_others:
                if i != j:  # Skip incrementing for the same file
                    # print("Updating MATRIX for window: ", window, " for files: ", i, j)
                    MATRIX[window][i][j] += 1
                         

In [55]:
time_windows = [24, 48, 72, 168]  # time windows in hours
MATRIX = {window: np.zeros((len(unique_files), len(unique_files))) for window in time_windows}
FILE_INDEX_MAP = {file: index for index, file in enumerate(unique_files)}


In [56]:
from collections import deque
from datetime import timedelta
all_commits = Repository(path_to_repo=repo_path, 
                        #  only_modifications_with_file_types=['.java'],
                         since=datetime(2023, 8, 20, 17, 59, 0)
                         ).traverse_commits()

commit_queue = deque()
for commit in all_commits:
        commit_data = (commit.committer_date, process_files(commit.modified_files))
        commit_queue.append(commit_data)

        # Check if the time span exceeds 168 hours
        if commit_queue[-1][0] - commit_queue[0][0] > timedelta(hours=168):
            # Start processing the queue
            while commit_queue:
                current_commit = commit_queue.popleft()
                files = current_commit[1]
                current_commit_time = current_commit[0]
                # Process files and update MATRIX for each time window


                #TODO can be done more efficient
                for window in time_windows:
                    # Determine adjacent files within the time window
                    adjacent_files = []
                    for (date, file_list) in commit_queue:
                        if current_commit_time - date <= timedelta(hours=window):
                            adjacent_files.append(file_list)

                    update_MATRIX2(files, adjacent_files, window)

                # Maintain the rolling window up to 168 hours
                while commit_queue and commit_queue[-1][0] - commit_queue[0][0] > timedelta(hours=168):
                    commit_queue.popleft()

Adding file to MATRIX:  core\src\main\scala\kafka\common\LongRef.scala
Adding file to MATRIX:  storage\src\test\java\org\apache\kafka\tiered\storage\utils\ActionUtils.java


In [57]:
import pandas as pd

def MATRIX_to_dataframe(MATRIX, unique_files):
    df_list = []
    for window, matrix in MATRIX.items():
        df = pd.DataFrame(matrix, index=unique_files, columns=unique_files)
        df = df.stack().reset_index()
        df.columns = ['File1', 'File2', 'Count']
        df['Time_Window'] = window
        df_list.append(df)

    return pd.concat(df_list, ignore_index=True)

result_df = MATRIX_to_dataframe(MATRIX, unique_files)

In [58]:
# Example: Filter out pairs with zero counts
result_df = result_df[result_df['Count'] > 0]

# Sort, reset index, etc.
result_df = result_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)

In [59]:
# make file 1 and file2 display the whole name
pd.set_option('display.max_colwidth', None)
result_df.head(30)
# filtered_df = result_df[result_df['File1'].str.contains('RemoteLogManager.java')]
# filtered_df.head(30)

Unnamed: 0,File1,File2,Count,Time_Window
0,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\main\java\kafka\log\remote\RemoteLogManager.java,10.0,168
1,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\main\java\kafka\log\remote\RemoteLogManager.java,10.0,48
2,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\main\java\kafka\log\remote\RemoteLogManager.java,10.0,72
3,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\main\java\kafka\log\remote\RemoteLogManager.java,10.0,24
4,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\src\test\java\org\apache\kafka\server\log\remote\storage\LocalTieredStorage.java,9.0,48
5,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\src\test\java\org\apache\kafka\server\log\remote\storage\LocalTieredStorage.java,9.0,168
6,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\src\test\java\org\apache\kafka\server\log\remote\storage\LocalTieredStorage.java,9.0,24
7,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\src\test\java\org\apache\kafka\server\log\remote\storage\LocalTieredStorage.java,9.0,72
8,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\api\src\main\java\org\apache\kafka\server\log\remote\storage\RemoteStorageManager.java,8.0,48
9,core\src\main\java\kafka\log\remote\RemoteLogManager.java,storage\api\src\main\java\org\apache\kafka\server\log\remote\storage\RemoteStorageManager.java,8.0,24
