# Preprocessing

In [1]:
%pip install pydriller

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pydriller import Repository
import os
import datetime as dt
from datetime import timedelta

In [3]:
# Clone code from the repo and save it for code portability -> via normal git clone
url = "https://github.com/apache/kafka"
repo_path = os.path.join(os.getcwd(), 'kafka')
clone = f"git clone {url} {repo_path}" 

os.system(clone) # Cloning

128

In [4]:
# checkout the tag 3.6.0
os.chdir(repo_path)
os.system("git checkout 3.6.0")
# back to the "home" folder
os.chdir("..")

In [5]:
from_tag = "3.5.1"
from_date=dt.datetime(2023, 7, 14, 18, 51, 0)

to_tag = "3.6.0"
to_date=dt.datetime(2023, 9, 29, 6, 56, 0)

# Using datetimes of the releases, since tags don't yield any commits
# Open question 
# repo = Repository(path_to_repo=repo_path, from_tag=to_tag, to_tag=from_tag)
repo = Repository(path_to_repo=repo_path, since=from_date, to=to_date)

# Exercise 3

In [6]:
# return array of files which were modified in following time horizon given the from_time
from datetime import timedelta
from collections import defaultdict

def get_mod_files_between(from_time, windows):
    results = {window: defaultdict(int) for window in windows}

    # Iterate through commits once, checking against each window
    for commit in Repository(path_to_repo=repo_path, since=from_time, to=from_time+timedelta(hours=168)).traverse_commits():
        for file in commit.modified_files:
            filename = file.new_path if file.new_path else file.old_path

            if(not filename.endswith(".java")):
                continue


            # append into each suitable window
            for window in windows:
                window_end_time = from_time + timedelta(hours=window)
                if from_time <= commit.committer_date <= window_end_time:
                        results[window][filename] += 1

    # Convert defaultdict to regular dict for final output
    return {window: dict(file_dict) for window, file_dict in results.items()}


In [14]:
from collections import Counter
from datetime import datetime
#iterate over each changes from the start of the repo?
# -> look in a windof of next commits within a window of X
# -> calculate the number of changes in this window

file_connections = defaultdict(lambda: {24: {}, 48: {}, 72: {}, 168: {}})
windows = [24, 48, 72, 168]
i = 1

all_commits = Repository(path_to_repo=repo_path, 
                         only_modifications_with_file_types=['.java'],
                         since=datetime(2023, 8, 20, 17, 59, 0)
                         ).traverse_commits()

for commit in all_commits:
    for file in commit.modified_files:
        filename = file.new_path if file.new_path else file.old_path

        #if not java files -> skip
        if(not filename.endswith(".java")):
            continue
        file_dict = file_connections[filename]

        mod_files_results = get_mod_files_between(commit.committer_date, windows)
        for window, mod_list in mod_files_results.items():
            file_dict[window] = dict(Counter(file_dict[window]) + Counter(mod_list))
                
    # i -= 1
    # if i <= 0:
    #     break

In [15]:
#This function has 2 assumptions 
#   -> we don't remove modification of the file itself in the array
#   -> we have an causality -> given modification of file 1 how many times was FILE2 modified in the time window
def generate_commit_structure(file_connections):
    output = {}
    # Collect data for each file pair across all time windows
    for file, time_windows in file_connections.items():
        for time_window, connections in time_windows.items():
            for connected_file, count in connections.items():
                file_pair = tuple(sorted([file, connected_file]))
                if file_pair not in output:
                    output[file_pair] = {}
                if time_window not in output[file_pair]:
                    output[file_pair][time_window] = 0
                output[file_pair][time_window] += count

    # Format the output
    formatted_output = []
    for file_pair, time_windows in output.items():
        coupled_commits = [{"time_window": tw, "commit_count": count} for tw, count in time_windows.items()]
        formatted_output.append({
            "file_pair": list(file_pair),
            "coupled_commits": coupled_commits
        })

    return formatted_output

In [16]:
#Test the transformation into Slides type of code
file_connections_test = {"FILE1": {12: {"FILE2": 1, "FILE3": 1, "FILE4": 5},
                                   24: {"FILE2": 1, "FILE3": 1, "FILE4": 5}},
                        "FILE4": {12: {"FILE2": 1, "FILE3": 1, "FILE41": 5}}}
new_structure = generate_commit_structure(file_connections_test)
new_structure

[{'file_pair': ['FILE1', 'FILE2'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 1},
   {'time_window': 24, 'commit_count': 1}]},
 {'file_pair': ['FILE1', 'FILE3'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 1},
   {'time_window': 24, 'commit_count': 1}]},
 {'file_pair': ['FILE1', 'FILE4'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 5},
   {'time_window': 24, 'commit_count': 5}]},
 {'file_pair': ['FILE2', 'FILE4'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 1}]},
 {'file_pair': ['FILE3', 'FILE4'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 1}]},
 {'file_pair': ['FILE4', 'FILE41'],
  'coupled_commits': [{'time_window': 12, 'commit_count': 5}]}]

In [17]:
# transform repository into the structure for the slides
new_structure = generate_commit_structure(file_connections)

new_structure

[{'file_pair': ['core\\src\\main\\java\\kafka\\log\\remote\\RemoteLogManager.java',
   'core\\src\\main\\java\\kafka\\log\\remote\\RemoteLogManager.java'],
  'coupled_commits': [{'time_window': 24, 'commit_count': 1},
   {'time_window': 48, 'commit_count': 1},
   {'time_window': 72, 'commit_count': 1},
   {'time_window': 168, 'commit_count': 1}]},
 {'file_pair': ['core\\src\\main\\java\\kafka\\log\\remote\\RemoteLogManager.java',
   'core\\src\\test\\java\\kafka\\log\\remote\\RemoteLogManagerTest.java'],
  'coupled_commits': [{'time_window': 24, 'commit_count': 2},
   {'time_window': 48, 'commit_count': 2},
   {'time_window': 72, 'commit_count': 2},
   {'time_window': 168, 'commit_count': 2}]},
 {'file_pair': ['core\\src\\main\\java\\kafka\\log\\remote\\RemoteLogManager.java',
   'storage\\api\\src\\main\\java\\org\\apache\\kafka\\server\\log\\remote\\storage\\RemoteStorageManager.java'],
  'coupled_commits': [{'time_window': 24, 'commit_count': 1},
   {'time_window': 48, 'commit_count

In [18]:
# convert into pandas with following headers FILE1, FILE2, window_24, window_48, window_72, window_168
import pandas as pd

def to_pandas(data):
    # Initialize a list to store the rows for the DataFrame
    rows = []

    # Process each item in the data to create a row for the DataFrame
    for item in data:
        row = {
            'FILE1': item['file_pair'][0],
            'FILE2': item['file_pair'][1],
            'window_24': 0,
            'window_48': 0,
            'window_72': 0,
            'window_168': 0
        }
        # Update the row with the commit counts for each time window
        for commit in item['coupled_commits']:
            time_window = commit['time_window']
            count = commit['commit_count']
            if time_window == 24:
                row['window_24'] += count
            elif time_window == 48:
                row['window_48'] += count
            elif time_window == 72:
                row['window_72'] += count
            elif time_window == 168:
                row['window_168'] += count

        rows.append(row)

    # Create a DataFrame from the rows
    df = pd.DataFrame(rows)

    return df



In [19]:
# sortby window 24
df = to_pandas(new_structure)
df.sort_values(by=['window_168'], ascending=False)
df

Unnamed: 0,FILE1,FILE2,window_24,window_48,window_72,window_168
0,core\src\main\java\kafka\log\remote\RemoteLogM...,core\src\main\java\kafka\log\remote\RemoteLogM...,1,1,1,1
1,core\src\main\java\kafka\log\remote\RemoteLogM...,core\src\test\java\kafka\log\remote\RemoteLogM...,2,2,2,2
2,core\src\main\java\kafka\log\remote\RemoteLogM...,storage\api\src\main\java\org\apache\kafka\ser...,1,2,2,2
3,core\src\main\java\kafka\log\remote\RemoteLogM...,storage\src\main\java\org\apache\kafka\storage...,1,1,1,1
4,core\src\main\java\kafka\log\remote\RemoteLogM...,storage\src\test\java\org\apache\kafka\server\...,1,1,1,1
...,...,...,...,...,...,...
336,connect\runtime\src\test\java\org\apache\kafka...,streams\src\main\java\org\apache\kafka\streams...,0,1,1,1
337,metadata\src\main\java\org\apache\kafka\contro...,metadata\src\main\java\org\apache\kafka\contro...,1,1,1,1
338,metadata\src\main\java\org\apache\kafka\contro...,streams\src\main\java\org\apache\kafka\streams...,0,1,1,1
339,streams\src\main\java\org\apache\kafka\streams...,streams\src\main\java\org\apache\kafka\streams...,1,1,1,1
