# Preprocessing

In [22]:
%pip install pydriller

Note: you may need to restart the kernel to use updated packages.


In [23]:
from pydriller import Repository
import os
import datetime as dt
from datetime import timedelta

In [24]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [25]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [26]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Exercise 3

In [27]:
# Create a matrix of all current java files
import os

def get_unique_java_files(directory):
    unique_files = set()
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".java"):
                file_path = os.path.join(root, file)
                # remove the repo path from the file path
                file_path = file_path.replace(directory+"\\", "")
                unique_files.add(file_path)
    return list(unique_files)

# Replace 'your_repo_directory' with the path to your repository
unique_files = get_unique_java_files(repo_path)

In [28]:
# initalize matrix
import numpy as np

time_windows = [24, 48, 72, 168]  # time windows in hours
matrices = {window: np.zeros((len(unique_files), len(unique_files))) for window in time_windows}
file_index_map = {file: index for index, file in enumerate(unique_files)}



In [29]:
def add_file_to_matrices(file, file_index_map, matrices):
    if file not in file_index_map:
        new_index = len(file_index_map)
        file_index_map[file] = new_index
        unique_files.append(file)  # Update the unique_files list
        
        for window in matrices:
            # Expand each matrix for the new file
            matrices[window] = np.pad(matrices[window], ((0, 1), (0, 1)), mode='constant')

In [30]:
from datetime import timedelta

def update_matrices(commit, matrices, file_index_map):
    commit_time = commit.committer_date  # Adjusted to use your attribute

    for file in commit.modified_files:
        # Accessing filename correctly as per your example
        filename = file.new_path if file.new_path else file.old_path
        add_file_to_matrices(filename, file_index_map, matrices)

    modified_file_indices = [file_index_map[file.new_path if file.new_path else file.old_path] for file in commit.modified_files]

    for i in modified_file_indices:
        for j in modified_file_indices:
            if i != j:  # Skip incrementing for the same file
                for window in time_windows:
                    if commit_time - commit.committer_date <= timedelta(hours=window):
                        matrices[window][i, j] += 1

In [31]:
i = 1
for commit in Repository(path_to_repo=repo_path, only_modifications_with_file_types=['.java']).traverse_commits():
    update_matrices(commit, matrices, file_index_map)
    i -= 1
    if i == 0:
        break

In [32]:
import pandas as pd

def matrices_to_dataframe(matrices, unique_files):
    df_list = []
    for window, matrix in matrices.items():
        df = pd.DataFrame(matrix, index=unique_files, columns=unique_files)
        df = df.stack().reset_index()
        df.columns = ['File1', 'File2', 'Count']
        df['Time_Window'] = window
        df_list.append(df)

    return pd.concat(df_list, ignore_index=True)

result_df = matrices_to_dataframe(matrices, unique_files)

In [33]:
# Example: Filter out pairs with zero counts
result_df = result_df[result_df['Count'] > 0]

# Sort, reset index, etc.
result_df = result_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)