# Preprocessing

In [4]:
%pip install pydriller --upgrade
%pip install ujson --upgrade
%pip install numpy --upgrade
%pip install scipy --upgrade

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
from pydriller import Repository, Git
import numpy as np
from scipy.sparse import csc_matrix, vstack
from datetime import timedelta, datetime
import os
import ujson
from collections import deque

In [6]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} \"{repo_path}\"" 

os.system(clone) # Cloning

128

In [183]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

# Exercise 3

In [184]:
# Configuration

# Timeframes to look at
timeframes = [168, 72, 48, 24]

temporal_coupling_save_file = "./ex_3_temporal_coupling.json"
logical_coupling_save_file = "./ex_3_logical_coupling.json"

repo_path = "./kafka"

In [185]:
# Get all files currently in the repository
files = [os.path.relpath(entity, repo_path) for entity in Git(repo_path).files()]

number_of_files = len(files)
commits_since = datetime(2023, 9, 1, 0, 0, 0)

commits = [c for c in Repository(repo_path).traverse_commits()]

timeframes.sort(reverse=True)

In [186]:
commit_window = deque()
temporal_update_vectors = [{'v': [], 'w': []} for _ in timeframes]
logical_update_vectors = []

for commit in commits:
    modifications = [file.new_path for file in commit.modified_files]
    mask = np.isin(files, modifications)
    indices = np.where(mask)[0]
    c1_modifications = csc_matrix((np.ones_like(indices), (np.zeros_like(indices), indices)), shape=(1, len(files)))
    c1_time = commit.committer_date
    c1 = {"time": c1_time, "modifications": c1_modifications}
    inside_timeframe = [False for _ in timeframes]
    i = 0
    logical_update_vectors.append(c1_modifications)
    while i < len(commit_window):
        c2 = commit_window[i]
        delta = c1_time - c2["time"]
        for idx, timeframe in enumerate(timeframes):
            if inside_timeframe[idx] or delta <= timedelta(hours=timeframe):
                temporal_update_vectors[idx]['v'].append(c1_modifications)
                temporal_update_vectors[idx]['w'].append(c2["modifications"])
                inside_timeframe[idx] = True
        if inside_timeframe[0]:
            i += 1
        else:
            commit_window.popleft()
    commit_window.append(c1)              

In [187]:
# For each timeframe compute a temporal coupling matrix
temporal_matrices = []
for update_vector in temporal_update_vectors:
    v1_stack = vstack(update_vector['v'])
    v2_stack = vstack(update_vector['w'])
    m = v1_stack.transpose(copy=True) @ v2_stack
    a = v1_stack.multiply(v2_stack)
    duplicates = a.transpose(copy=True) @ a
    temporal_matrices.append(m + m.transpose(copy=True) - duplicates)

In [188]:
# Compute the logical coupling matrix and joint commits vector
logical_coupling_matrix = vstack(logical_update_vectors).transpose() @ vstack(logical_update_vectors)

joint_commits_vector = logical_coupling_matrix.sum(axis=0).A1 - logical_coupling_matrix.diagonal()

In [189]:
# Convert the temporal coupling matrix to the required format
temporal_coupling = {}
for idx, matrix in enumerate(temporal_matrices):
    for row, col in zip(*matrix.nonzero()):
        if row == col:
            continue
        val = int(matrix[row, col])
        key = f"{row}_{col}"
        if key not in temporal_coupling:
            temporal_coupling[key] = {
                "file_pair": [
                    files[row],
                    files[col]
                ],
                "coupled_commits": [
                    {
                        "time_window": timeframe,
                        "commit_count": 0
                    } for timeframe in reversed(timeframes)
                ]
            }
        temporal_coupling[key]["coupled_commits"][len(timeframes) - 1 - idx]["commit_count"] = val

In [190]:
# Convert the logical coupling matrix and vector to the required format
logical_coupling = []
for row, col in zip(*logical_coupling_matrix.nonzero()):
    # We just need to look at the upper triangle because the matrix is symmetric
    if row >= col:
        continue
    val = int(logical_coupling_matrix[row, col])
    file_name_1 = files[row]
    file_name_2 = files[col]
    
    logical_coupling.append({
        "file_pair": [file_name_1, file_name_2],
        "logical_coupling": {
            "Joint": val,
            file_name_1: int(joint_commits_vector[row]) - val,
            file_name_2: int(joint_commits_vector[col]) - val
        }
    })

In [191]:
# Saving temporal couplings
with open(temporal_coupling_save_file, "w") as f:
    ujson.dump(list(temporal_coupling.values()), f, indent=4)

In [192]:
# Saving logical couplings
with open(logical_coupling_save_file, "w") as f:
    ujson.dump(logical_coupling, f, indent=4)