# Preprocessing

In [18]:
%pip install pydriller

Note: you may need to restart the kernel to use updated packages.


In [19]:
from pydriller import Repository
import os
import datetime as dt
from datetime import timedelta

In [20]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [21]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [22]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Exercise 3

In [23]:
# Create a matrix of all current java files
import os

def get_unique_java_files(directory):
    unique_files = set()
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".java"):
                file_path = os.path.join(root, file)
                # remove the repo path from the file path
                file_path = file_path.replace(directory+"\\", "")
                unique_files.add(file_path)
    return list(unique_files)

# Replace 'your_repo_directory' with the path to your repository
unique_files = get_unique_java_files(repo_path)

In [24]:
# initalize matrix
import numpy as np

time_windows = [24, 48, 72, 168]  # time windows in hours
matrices = {window: np.zeros((len(unique_files), len(unique_files))) for window in time_windows}
file_index_map = {file: index for index, file in enumerate(unique_files)}



In [45]:
file_index_map

{'connect\\runtime\\src\\test\\resources\\test-plugins\\bad-packaging\\test\\plugins\\CoLocatedPlugin.java': 0,
 'group-coordinator\\src\\main\\java\\org\\apache\\kafka\\coordinator\\group\\GroupCoordinatorConfig.java': 1,
 'streams\\src\\test\\java\\org\\apache\\kafka\\streams\\integration\\RestoreIntegrationTest.java': 2,
 'clients\\src\\test\\java\\org\\apache\\kafka\\clients\\consumer\\internals\\CompletedFetchTest.java': 3,
 'connect\\mirror\\src\\main\\java\\org\\apache\\kafka\\connect\\mirror\\MirrorMakerConfig.java': 4,
 'server-common\\src\\test\\java\\org\\apache\\kafka\\timeline\\TimelineHashMapTest.java': 5,
 'streams\\src\\main\\java\\org\\apache\\kafka\\streams\\state\\internals\\MemoryNavigableLRUCache.java': 6,
 'connect\\runtime\\src\\test\\java\\org\\apache\\kafka\\connect\\integration\\SessionedProtocolIntegrationTest.java': 7,
 'streams\\src\\test\\java\\org\\apache\\kafka\\streams\\state\\internals\\RocksDBWindowStoreTest.java': 8,
 'streams\\upgrade-system-tests-2

In [58]:
def add_file_to_matrices(file, file_index_map, matrices):
    if file not in file_index_map:
        new_index = len(file_index_map)
        file_index_map[file] = new_index
        unique_files.append(file)  # Update the unique_files list
        
        for window in matrices:
            # Correctly expand each matrix to include the new file
            current_matrix = matrices[window]
            new_matrix_size = len(unique_files)
            new_matrix = np.zeros((new_matrix_size, new_matrix_size))
            # Copy the old matrix values into the new matrix
            new_matrix[:current_matrix.shape[0], :current_matrix.shape[1]] = current_matrix
            matrices[window] = new_matrix
        

In [59]:
from datetime import timedelta

def update_matrices(commit, matrices, file_index_map):
    commit_time = commit.committer_date  # Adjusted to use your attribute

    for file in commit.modified_files:
        # Accessing filename correctly as per your example
        filename = file.new_path if file.new_path else file.old_path
        add_file_to_matrices(filename, file_index_map, matrices)

    modified_file_indices = [file_index_map[file.new_path if file.new_path else file.old_path] for file in commit.modified_files]

    for i in modified_file_indices:
        for j in modified_file_indices:
            if i != j:  # Skip incrementing for the same file
                for window in time_windows:
                    if commit_time - commit.committer_date <= timedelta(hours=window):
                        matrices[window][i, j] += 1

In [65]:
i = 1
all_commits = Repository(path_to_repo=repo_path, 
                         only_modifications_with_file_types=['.java'],
                         num_workers = 1,
                         since=datetime(2023, 8, 9, 17, 59, 0)
                         ).traverse_commits()

for commit in all_commits:
    update_matrices(commit, matrices, file_index_map)
    # i -= 1
    # if i == 0:
    #     break

In [66]:
import pandas as pd

def matrices_to_dataframe(matrices, unique_files):
    df_list = []
    for window, matrix in matrices.items():
        df = pd.DataFrame(matrix, index=unique_files, columns=unique_files)
        df = df.stack().reset_index()
        df.columns = ['File1', 'File2', 'Count']
        df['Time_Window'] = window
        df_list.append(df)

    return pd.concat(df_list, ignore_index=True)

result_df = matrices_to_dataframe(matrices, unique_files)

In [67]:
# After adding a file in add_file_to_matrices
print(f"Updated unique_files length: {len(unique_files)}")

# Just before creating the DataFrame
print(f"Matrix dimensions: {matrices[next(iter(matrices))].shape}")
print(f"Unique files count: {len(unique_files)}")


Updated unique_files length: 4774
Matrix dimensions: (4774, 4774)
Unique files count: 4774


In [68]:
# Example: Filter out pairs with zero counts
result_df = result_df[result_df['Count'] > 0]

# Sort, reset index, etc.
result_df = result_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)

In [72]:
# make file 1 and file2 display the whole name
pd.set_option('display.max_colwidth', None)
result_df

Unnamed: 0,File1,File2,Count,Time_Window
0,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,23.0,168
1,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,23.0,48
2,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,23.0,24
3,core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,core\src\main\java\kafka\log\remote\RemoteLogManager.java,23.0,24
4,core\src\main\java\kafka\log\remote\RemoteLogManager.java,core\src\test\java\kafka\log\remote\RemoteLogManagerTest.java,23.0,72
...,...,...,...,...
197771,server-common\src\test\java\org\apache\kafka\timeline\TimelineIntegerTest.java,server-common\src\test\java\org\apache\kafka\timeline\BaseHashTableTest.java,1.0,48
197772,server-common\src\test\java\org\apache\kafka\timeline\TimelineIntegerTest.java,server-common\src\main\java\org\apache\kafka\timeline\TimelineLong.java,1.0,48
197773,server-common\src\test\java\org\apache\kafka\timeline\TimelineIntegerTest.java,server-common\src\test\java\org\apache\kafka\timeline\TimelineHashSetTest.java,1.0,48
197774,server-common\src\test\java\org\apache\kafka\timeline\TimelineIntegerTest.java,server-common\src\main\java\org\apache\kafka\timeline\SnapshotRegistry.java,1.0,48
