# Patchwork Code Review Analysis

In [1]:
from pathlib import Path

import pymongo
from pymongo.cursor import Cursor
import pandas as pd
import numpy as np
from numpy import median
import whatthepatch
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def parse_diff(diff_text: str):
    """Count the files and lines touched by a diff.

    The text is parsed with ``whatthepatch.parse_patch``. A line counts as
    changed when it exists in only the old or only the new version, i.e. its
    ``old`` or ``new`` attribute is ``None``.

    Returns:
        A ``(changed_files, changed_lines, incomplete_flag)`` tuple.
        ``incomplete_flag`` is ``True`` when parsing the text, or walking the
        changes of any parsed file, raised an exception; whatever was counted
        before the failure is still returned.
    """
    file_count = 0
    line_count = 0
    incomplete = False

    try:
        parsed_files = list(whatthepatch.parse_patch(diff_text))
        file_count = len(parsed_files)

        # count modified lines -- lines that exist only in the old or the new version
        for file_diff in parsed_files:
            try:
                for change in file_diff.changes:
                    one_sided = change.old is None or change.new is None
                    line_count += int(one_sided)
            except Exception:
                # e.g. a newly created blank file carries no changes to walk
                incomplete = True

    except Exception:
        # parsing error - skip
        # e.g., https://patchwork.ozlabs.org/project/qemu-devel/patch/1643044621-15892-11-git-send-email-eric.devolder@oracle.com/
        incomplete = True

    return (file_count, line_count, incomplete)


In [3]:
def generate_file_names(project: str, level: int, cleaned=False):
    """Build the CSV paths used for one project at one grouping level.

    Args:
        project: project identifier embedded in the file names.
        level: grouping level embedded as ``change{level}``.
        cleaned: when falsy, paths point into ``data/``; otherwise into
            ``output/`` with a ``cleaned_`` prefix.

    Returns:
        A two-element list: ``[data_path, description_path]``.
    """
    if cleaned:
        folder, data_prefix, describe_prefix = "output", "cleaned", "cleaned_describe"
    else:
        folder, data_prefix, describe_prefix = "data", "data", "describe"

    tail = f"{project}_change{level}.csv"
    return [
        f"{folder}/{data_prefix}_{tail}",
        f"{folder}/{describe_prefix}_{tail}",
    ]
    

def export_dataset(patch_group_info_list: list, project: str, level: int):
    """Write the per-group metrics and their descriptive statistics to CSV.

    Args:
        patch_group_info_list: one dict of metrics per patch group.
        project: project identifier, used to build the file names.
        level: grouping level, used to build the file names.

    Nothing is written when the list yields an empty frame:
    ``DataFrame.describe`` raises ``ValueError`` on a frame without columns,
    and an empty data file left behind would make the next run treat the
    project as already extracted.
    """
    patch_group_info_df = pd.DataFrame(patch_group_info_list)

    data_filename, description_filename = generate_file_names(
        project=project,
        level=level
    )

    if patch_group_info_df.empty:
        print(project, level, "no patch groups to export, skipping")
        return

    # to_csv does not create missing directories
    for filename in (data_filename, description_filename):
        Path(filename).parent.mkdir(parents=True, exist_ok=True)

    # raw data
    patch_group_info_df.to_csv(data_filename, index=False)

    # project level - descriptive analysis
    patch_group_info_df.describe(include="all").to_csv(
        description_filename
    )

## Database Connection

In [4]:
# Connection settings for the MongoDB instance that holds the review data.
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "code_review_db"

client = pymongo.MongoClient(MONGO_URI)
db = client[DATABASE_NAME]

## Target Project

In [5]:
# Project identifier -> short display name used in tables and plots.
project_name = {
    "ffmpeg-project-1": "ffmpeg",
    "ozlabs-project-18": "u-boot",
    "kernel-project-399": "netdev + bpf",
    "ozlabs-project-14": "qemu",
    "kernel-project-62": "arm",
}

# Projects to analyse, in processing order (same order as the mapping above).
selected_projects = list(project_name)

### Parameters

In [6]:
# When True, the extraction loop stops a project early once more than 1000
# patch groups have been processed; False processes every group.
test_run = False

## Data Retrieving

In [7]:
def get_patch_groups(level: int, project: str) -> Cursor:
    """Query the patch groups of one project at one grouping level.

    Reads the ``patchwork_change{level}`` collection of the module-level
    ``db`` and returns the cursor for documents whose ``project`` field
    equals ``project``.
    """
    collection_name = f"patchwork_change{level}"
    project_filter = {"project": project}
    return db[collection_name].find(project_filter)


def get_patches_in_group(
    level: int,
    patch_group: dict
) -> list:
    """Return the patches of a patch group, oldest first.

    Args:
        level: grouping level; patches are matched on their
            ``change{level}`` field.
        patch_group: patch-group document; only its ``original_id`` is read.

    Returns:
        The matching documents from ``patchwork_patch`` as a list sorted by
        their ``date`` field (empty when nothing matches).
    """
    patch_col = db["patchwork_patch"]
    patch_group_column_name = f"change{level}"

    grouped_patches = patch_col.find(
        {
            patch_group_column_name: patch_group["original_id"]
        }
    )

    # The cursor is consumed exactly once here, so no clone is needed.
    return sorted(grouped_patches, key=lambda p: p["date"])


def get_comments_in_group(
        level: int,
        patch_group_id: str
) -> list:
    """Return the comments of a patch group, oldest first.

    Args:
        level: grouping level; comments are matched on their
            ``change{level}`` field.
        patch_group_id: value that field must equal.

    Returns:
        The matching documents from ``patchwork_comment`` as a list sorted by
        their ``date`` field (empty when nothing matches).
    """
    patch_group_column_name = f"change{level}"
    comment_col = db["patchwork_comment"]

    grouped_comments = comment_col.find(
        {
            patch_group_column_name: patch_group_id
        }
    )

    # The cursor is consumed exactly once here, so no clone is needed.
    return sorted(grouped_comments, key=lambda c: c["date"])


def prepare_code_review_data(
    level: int,
    patch_group: dict
) -> dict:
    """Collect the patches and comments of one patch group.

    Args:
        level: grouping level passed on to the query helpers.
        patch_group: patch-group document; only its ``original_id`` is read
            here.

    Returns:
        A dict with the keys ``all_patches_list``, ``all_comments_list``,
        ``first_patch``, ``last_patch``, ``first_comment`` and
        ``last_comment``. Every value stays ``None`` when the group has no
        patches; the comment entries stay ``None`` when it has no comments.
        Comments are only queried for groups that have patches.
    """
    code_review_data = {
        "all_patches_list": None,
        "all_comments_list": None,
        "first_patch": None,
        "last_patch": None,
        "first_comment": None,
        "last_comment": None,
    }

    # patches in group - already sorted by timestamp
    all_patches_list = get_patches_in_group(
        level=level,
        patch_group=patch_group
    )

    # safe guard - in case no patches in the group
    if not all_patches_list:
        return code_review_data

    code_review_data["all_patches_list"] = all_patches_list
    code_review_data["first_patch"] = all_patches_list[0]
    code_review_data["last_patch"] = all_patches_list[-1]

    # comments in group - already sorted by timestamp
    all_comments_list = get_comments_in_group(
        level=level,
        patch_group_id=patch_group["original_id"]
    )

    # safe guard - in case no comments were made
    if all_comments_list:
        code_review_data["first_comment"] = all_comments_list[0]
        code_review_data["last_comment"] = all_comments_list[-1]
        code_review_data["all_comments_list"] = all_comments_list

    return code_review_data

## Metric Query

### Metrics Computation

In [8]:
def metrics_computation(
    patch_group: dict,
    all_patches_list: list,
    all_comments_list: list,
    first_patch: dict,
    first_comment: dict,
    last_comment: dict
) -> dict:
    """Compute the code review metrics of one patch group.

    Args:
        patch_group: patch-group document; its ``is_accepted`` value is read.
        all_patches_list: patches of the group; each needs ``code_diff`` and
            ``submitter_individual``.
        all_comments_list: comments of the group (each with
            ``submitter_individual``), or ``None`` when there are none.
        first_patch: patch whose ``date`` starts both time measurements.
        first_comment: earliest comment, or ``None``.
        last_comment: latest comment, or ``None``.

    Returns:
        A dict with the keys ``reviewers``, ``authors``, ``comments``,
        ``iterations``, ``changed_lines_avg/first/last``,
        ``changed_files_avg/first/last``, ``incomplete_change_info``,
        ``response_time``, ``finalizing_time`` (both in hours, ``0`` when the
        corresponding comment is ``None``) and ``is_accepted``.
        ``incomplete_change_info`` is ``True`` when the diff of *any* patch
        in the group could not be fully parsed. The averages are ``0.0`` for
        an empty patch list.
    """
    # iterations
    iterations = len(all_patches_list)

    # intensity
    # comments or discussion length
    comments = len(all_comments_list) if all_comments_list is not None else 0

    total_changed_files = 0
    total_changed_lines = 0
    changed_files_first = 0
    changed_lines_first = 0
    changed_files_last = 0
    changed_lines_last = 0
    incomplete_change_info = False

    # individual submitters in grouped patches
    individuals = set()

    last_index = iterations - 1

    for i, patch in enumerate(all_patches_list):
        (
            changed_files,
            changed_lines,
            patch_incomplete,
        ) = parse_diff(diff_text=patch["code_diff"])

        # accumulate: one unparsable diff marks the whole group, instead of
        # the flag being overwritten by whichever patch comes last
        incomplete_change_info = incomplete_change_info or patch_incomplete

        total_changed_files += changed_files
        total_changed_lines += changed_lines

        # first version
        if i == 0:
            changed_files_first = changed_files
            changed_lines_first = changed_lines

        # last version
        if i == last_index:
            changed_files_last = changed_files
            changed_lines_last = changed_lines

        # patch author
        individuals.add(patch["submitter_individual"])

    # code churn / files changed - guarded so an empty group cannot divide by zero
    changed_lines_avg = total_changed_lines / iterations if iterations else 0.0
    changed_files_avg = total_changed_files / iterations if iterations else 0.0

    # participation
    # authors
    authors = len(individuals)

    # reviewers - commentators that are not authors
    reviewers = 0
    if all_comments_list is not None:
        reviewers = len(
            {
                comment["submitter_individual"]
                for comment in all_comments_list
                if comment["submitter_individual"] not in individuals
            }
        )

    # time
    # first response
    response_time_seconds = (
        (first_comment["date"] - first_patch["date"]).total_seconds()
        if first_comment is not None
        else 0
    )

    # finalization - first patch until last comment
    # suggested by Rigby and Bird
    finalizing_time_seconds = (
        (last_comment["date"] - first_patch["date"]).total_seconds()
        if last_comment is not None
        else 0
    )

    # result per group
    is_accepted = bool(patch_group["is_accepted"])

    return {
        "reviewers": reviewers,
        "authors": authors,
        "comments": comments,
        "iterations": iterations,
        "changed_lines_avg": changed_lines_avg,
        "changed_lines_first": changed_lines_first,
        "changed_lines_last": changed_lines_last,
        "changed_files_avg": changed_files_avg,
        "changed_files_first": changed_files_first,
        "changed_files_last": changed_files_last,
        "incomplete_change_info": incomplete_change_info,
        "response_time": response_time_seconds / 60 / 60,
        "finalizing_time": finalizing_time_seconds / 60 / 60,
        "is_accepted": is_accepted,
    }


In [9]:
# code review metrics calculation - per patch group (change1 and change2)
patch_group_info_list = []

def append_list_structure(patch_group_info_list:list, entry: dict):
    """Append ``entry`` to ``patch_group_info_list`` after a column check.

    Asserts that every expected column is a key of ``entry``, appends the
    entry to the given list in place and returns that same list.
    """
    required_columns = (
        "project_original_id",
        "patch_group_original_id",
        "reviewer_count",
        "author_count",
        "comment_count",
        "iteration_count",
        "changed_lines_avg",
        "changed_lines_first",
        "changed_lines_last",
        "changed_files_avg",
        "changed_files_first",
        "changed_files_last",
        "incomplete_change_info",
        "response_time",
        "finalizing_time",
        "is_accepted",
    )

    # ensure all columns are available
    for column in required_columns:
        assert column in entry

    patch_group_info_list.append(entry)

    return patch_group_info_list


### Extract Data

In [10]:
# (output column, metrics_computation key) pairs, in output column order
metric_columns = (
    ("reviewer_count", "reviewers"),
    ("author_count", "authors"),
    ("comment_count", "comments"),
    ("iteration_count", "iterations"),
    ("changed_lines_avg", "changed_lines_avg"),
    ("changed_lines_first", "changed_lines_first"),
    ("changed_lines_last", "changed_lines_last"),
    ("changed_files_avg", "changed_files_avg"),
    ("changed_files_first", "changed_files_first"),
    ("changed_files_last", "changed_files_last"),
    ("incomplete_change_info", "incomplete_change_info"),
    ("response_time", "response_time"),
    ("finalizing_time", "finalizing_time"),
    ("is_accepted", "is_accepted"),
)

print("start query")
for level in (1, 2):

    for project in selected_projects:

        data_filename, description_filename = generate_file_names(
            project=project, level=level
        )

        # a data file from an earlier run means this project/level is done
        if Path(data_filename).is_file():
            print(project, level, data_filename, "results already exist, skipping")
            continue

        total_group_executed = 0
        patch_group_info_list = []

        # get patch groups
        patch_groups = get_patch_groups(level=level, project=project)

        print("# of total groups: ", len(list(patch_groups.clone())))

        # running through grouped patches (including patches identified as individual)
        for patch_group in patch_groups:
            print(patch_group["original_id"])

            # prepare code review dataset in patch group from database
            code_review_data = prepare_code_review_data(
                level=level,
                patch_group=patch_group
            )

            # groups without patches are not measured
            if code_review_data["all_patches_list"] is not None:

                # metrics computation
                metric_results = metrics_computation(
                    patch_group=patch_group,
                    all_patches_list=code_review_data["all_patches_list"],
                    all_comments_list=code_review_data["all_comments_list"],
                    first_patch=code_review_data["first_patch"],
                    first_comment=code_review_data["first_comment"],
                    last_comment=code_review_data["last_comment"]
                )

                entry = {
                    "project_original_id": project,
                    "patch_group_original_id": patch_group["original_id"],
                }
                entry.update(
                    (column, metric_results[key])
                    for column, key in metric_columns
                )

                patch_group_info_list = append_list_structure(
                    patch_group_info_list=patch_group_info_list,
                    entry=entry
                )

                total_group_executed += 1

            # test run circuit breaker
            if test_run and total_group_executed > 1000:
                break

        # export data set once per project
        export_dataset(
            patch_group_info_list=patch_group_info_list,
            project=project,
            level=level
        )
            


start query
ffmpeg-project-1 1 data/data_ffmpeg-project-1_change1.csv results already exist, skipping
ozlabs-project-18 1 data/data_ozlabs-project-18_change1.csv results already exist, skipping
kernel-project-399 1 data/data_kernel-project-399_change1.csv results already exist, skipping
ozlabs-project-14 1 data/data_ozlabs-project-14_change1.csv results already exist, skipping
kernel-project-62 1 data/data_kernel-project-62_change1.csv results already exist, skipping
ffmpeg-project-1 2 data/data_ffmpeg-project-1_change2.csv results already exist, skipping
ozlabs-project-18 2 data/data_ozlabs-project-18_change2.csv results already exist, skipping
kernel-project-399 2 data/data_kernel-project-399_change2.csv results already exist, skipping
ozlabs-project-14 2 data/data_ozlabs-project-14_change2.csv results already exist, skipping
kernel-project-62 2 data/data_kernel-project-62_change2.csv results already exist, skipping


## Descriptive Analysis and Visualization
- Data cleaning - Non-positive response times and finalizing times are removed from the time calculations (incomplete data)

In [11]:
# Per-file results are collected in lists and concatenated once after the
# loop, instead of repeatedly concatenating onto an initially empty DataFrame.
concat_cleaned_description_list = []
counting_metric_frames = []
time_metric_frames = []

# metrics summarised in the cleaned description (the same for every file)
metrics = [
    "reviewer_count",
    "author_count",
    "comment_count",
    "iteration_count",
    "changed_lines_avg",
    "changed_files_avg",
    "changed_lines_last",
    "changed_files_last",
    "response_time",
    "finalizing_time",
    "is_accepted",
]

# calculate cleaned version of data description
for level in range(1, 3):
    for project in selected_projects:

        data_filename, _ = generate_file_names(
            project=project,
            level=level,
            cleaned=False
        )

        # explicit check instead of assert, which is stripped under `python -O`
        if not Path(data_filename).is_file():
            raise FileNotFoundError(
                f"missing extracted data file: {data_filename}"
            )
        print("cleaning data in:", data_filename)

        cleaned_description_list = []
        data_df = pd.read_csv(data_filename)

        # only consider valid code review activity
        # groups must have at least one reviewer, one comment, one code change
        valid_review = (
            (data_df["comment_count"] > 0)
            & (data_df["reviewer_count"] > 0)
            & (data_df["changed_lines_avg"] > 0)
        )
        # copy: columns are added below, which must not target a slice of
        # the frame that was read from disk
        data_df = data_df[valid_review].copy()

        for metric in metrics:

            # time metrics: keep strictly positive values only
            if metric in ["response_time", "finalizing_time"]:
                target_metric = data_df.loc[data_df[metric] > 0, metric]
            else:
                target_metric = data_df[metric]

            if metric != "is_accepted":
                cleaned_description_list.append({
                    "metric": metric,
                    "count": len(target_metric),
                    "mean": np.mean(target_metric),
                    "std": np.std(target_metric),
                    "min": min(target_metric),
                    "25%": np.percentile(target_metric, 25),
                    "50%": np.percentile(target_metric, 50),
                    "75%": np.percentile(target_metric, 75),
                    "max": max(target_metric)
                })
            else:
                # NOTE(review): the "mean" column holds the SUM of
                # is_accepted here, not a mean - confirm this is intended
                cleaned_description_list.append({
                    "metric": metric,
                    "count": len(target_metric),
                    "mean": sum(target_metric),
                    "std": None,
                    "min": None,
                    "25%": None,
                    "50%": None,
                    "75%": None,
                    "max": None
                })

        cleaned_description_df = pd.DataFrame(cleaned_description_list)

        _, cleaned_describe_filename = generate_file_names(
            project=project,
            level=level,
            cleaned=True
        )

        # to_csv does not create missing directories
        Path(cleaned_describe_filename).parent.mkdir(parents=True, exist_ok=True)
        cleaned_description_df.to_csv(cleaned_describe_filename, index=False)

        # store description in one large table (project name added after the
        # per-project file was written, so that file does not contain it)
        cleaned_description_df["project_name"] = project_name[project]
        concat_cleaned_description_list.append(cleaned_description_df)

        print("concatenating data from:", data_filename)
        data_df["level"] = f"change-{level}"

        # rename project
        data_df["project_original_id"] = data_df["project_original_id"].replace(
            project, project_name[project]
        )

        # separate data frame for counting metrics
        data_df["log_comment_count"] = np.log10(data_df["comment_count"])
        data_df["log_changed_lines_avg"] = np.log10(data_df["changed_lines_avg"])
        data_df["log_changed_files_avg"] = np.log10(data_df["changed_files_avg"])

        counting_metric_frames.append(
            data_df.loc[:, ~data_df.columns.isin(["response_time", "finalizing_time"])]
        )

        # separate data frame for time metrics
        time_df = data_df.loc[:, data_df.columns.isin([
            "project_original_id",
            "patch_group_original_id",
            "level",
            "response_time",
            "finalizing_time"
        ])]

        # only take entries whose time metrics are both strictly positive
        positive_times = (time_df["response_time"] > 0) & (time_df["finalizing_time"] > 0)
        time_df = time_df[positive_times].copy()

        time_df["log_response_time"] = np.log10(time_df["response_time"])
        time_df["log_finalizing_time"] = np.log10(time_df["finalizing_time"])

        time_metric_frames.append(
            time_df.loc[:, ~time_df.columns.isin(["response_time", "finalizing_time"])]
        )

concat_cleaned_description_df = pd.concat(concat_cleaned_description_list)
concat_counting_metrics_df = pd.concat(counting_metric_frames, axis=0)
concat_time_metrics_df = pd.concat(time_metric_frames, axis=0)

print("total normal metric dataframe size:", concat_counting_metrics_df.shape)
print("total time metric dataframe size:", concat_time_metrics_df.shape)

# melt time metrics to compare with violin plot
# (the "time_hr" values are the log10-transformed hours computed above)
concat_time_metrics_df = pd.melt(
    concat_time_metrics_df,
    id_vars=["project_original_id", "patch_group_original_id", "level"],
    var_name="time_metric",
    value_name="time_hr"
)


cleaning data in: data/data_ffmpeg-project-1_change1.csv
concatenating data from: data/data_ffmpeg-project-1_change1.csv
cleaning data in: data/data_ozlabs-project-18_change1.csv
concatenating data from: data/data_ozlabs-project-18_change1.csv
cleaning data in: data/data_kernel-project-399_change1.csv
concatenating data from: data/data_kernel-project-399_change1.csv
cleaning data in: data/data_ozlabs-project-14_change1.csv
concatenating data from: data/data_ozlabs-project-14_change1.csv
cleaning data in: data/data_kernel-project-62_change1.csv
concatenating data from: data/data_kernel-project-62_change1.csv
cleaning data in: data/data_ffmpeg-project-1_change2.csv
concatenating data from: data/data_ffmpeg-project-1_change2.csv
cleaning data in: data/data_ozlabs-project-18_change2.csv
concatenating data from: data/data_ozlabs-project-18_change2.csv
cleaning data in: data/data_kernel-project-399_change2.csv
concatenating data from: data/data_kernel-project-399_change2.csv
cleaning data in

In [12]:
# Summary table
# Lift the display limits only while rendering this table; option_context
# restores the previous values afterwards (even if display raises) and
# leaves every unrelated pandas option untouched.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
):
    display(concat_cleaned_description_df)

Unnamed: 0,metric,count,mean,std,min,25%,50%,75%,max,project_name
0,reviewer_count,13032,1.578959,1.096787,1.0,1.0,1.0,2.0,14.0,ffmpeg
1,author_count,13032,1.027778,0.16894,1.0,1.0,1.0,1.0,3.0,ffmpeg
2,comment_count,13032,4.020565,5.120827,1.0,1.0,2.0,5.0,125.0,ffmpeg
3,iteration_count,13032,1.646486,1.508528,1.0,1.0,1.0,2.0,37.0,ffmpeg
4,changed_lines_avg,13032,94.45377,487.052552,0.5,4.0,12.0,45.0,25300.5,ffmpeg
5,changed_files_avg,13032,2.585931,11.254737,1.0,1.0,1.0,2.0,513.0,ffmpeg
6,changed_lines_last,13032,95.256983,493.954452,0.0,4.0,12.0,45.0,25301.0,ffmpeg
7,changed_files_last,13032,2.616483,11.366668,1.0,1.0,1.0,2.0,513.0,ffmpeg
8,response_time,12998,148.631392,885.436229,0.005278,1.473611,9.900278,43.365556,26739.402778,ffmpeg
9,finalizing_time,13021,474.89967,1850.216112,0.013056,14.151111,49.447778,227.635,47614.636667,ffmpeg


In [13]:
# Preview the first rows of the combined counting-metrics table.
concat_counting_metrics_df.head()

Unnamed: 0,project_original_id,patch_group_original_id,reviewer_count,author_count,comment_count,iteration_count,changed_lines_avg,changed_lines_first,changed_lines_last,changed_files_avg,changed_files_first,changed_files_last,incomplete_change_info,is_accepted,level,log_comment_count,log_changed_lines_avg,log_changed_files_avg
0,ffmpeg,ffmpeg-change1-1,10,1,44,11,324.545455,328,327,4.727273,5,5,False,False,change-1,1.643453,2.511276,0.674611
1,ffmpeg,ffmpeg-change1-2,5,1,15,10,8.5,11,7,2.2,3,2,False,True,change-1,1.176091,0.929419,0.342423
2,ffmpeg,ffmpeg-change1-3,2,1,3,10,10.0,10,10,1.0,1,1,False,False,change-1,0.477121,1.0,0.0
3,ffmpeg,ffmpeg-change1-4,2,1,2,2,4.5,8,1,2.0,3,1,False,True,change-1,0.30103,0.653213,0.30103
4,ffmpeg,ffmpeg-change1-5,2,1,3,2,54.0,58,50,3.0,3,3,False,False,change-1,0.477121,1.732394,0.477121


In [14]:
# Preview the first rows of the melted time-metrics table.
concat_time_metrics_df.head()

Unnamed: 0,project_original_id,patch_group_original_id,level,time_metric,time_hr
0,ffmpeg,ffmpeg-change1-1,change-1,log_response_time,-0.656482
1,ffmpeg,ffmpeg-change1-2,change-1,log_response_time,1.986793
2,ffmpeg,ffmpeg-change1-3,change-1,log_response_time,0.575701
3,ffmpeg,ffmpeg-change1-4,change-1,log_response_time,-0.395534
4,ffmpeg,ffmpeg-change1-5,change-1,log_response_time,0.516205


In [15]:
# export violin plots for counting metrics
normal_metrics = [
    "reviewer_count",
    "author_count",
    "comment_count",
    "log_comment_count",
    "iteration_count",
    "log_changed_lines_avg",
    "log_changed_files_avg",
]
sns.set(style="darkgrid")
plt.figure(figsize=(8,6))

# savefig does not create missing directories
Path("output").mkdir(parents=True, exist_ok=True)

for metric in normal_metrics:
    values = concat_counting_metrics_df[metric]

    # Raw counts: keep positive values. Log10 columns: a value <= 0 is a
    # valid observation (raw value <= 1), so only non-finite entries are
    # dropped; filtering them with `> 0` would discard e.g. every group
    # with exactly one comment from the log_comment_count plot.
    if metric.startswith("log_"):
        plottable = np.isfinite(values)
    else:
        plottable = values > 0

    sns.violinplot(
        x="project_original_id",
        y=metric,
        hue="level",
        data=concat_counting_metrics_df[plottable],
        palette="Pastel1"
    )

    plt.xlabel("project")
    plt.savefig(f"output/plot_{metric}.pdf")
    # reuse the same figure for the next metric
    plt.clf()

# release the now-empty figure
plt.close()


<Figure size 800x600 with 0 Axes>

In [16]:
# only plot change-1 for time metrics
sns.set(style="darkgrid")
plt.figure(figsize=(8,8))

change1_times = concat_time_metrics_df[concat_time_metrics_df["level"] =="change-1"]

# arguments shared by the violin layer and the median-point layer
shared_plot_args = {
    "x": "project_original_id",
    "y": "time_hr",
    "hue": "time_metric",
    "data": change1_times,
    "palette": "Pastel1",
}

# split violins: one half per time metric
ax = sns.violinplot(
    split=True,
    scale_hue=False,
    inner=None,
    linewidth=1,
    **shared_plot_args
)

# medians drawn on the same axes, without connecting lines
sns.pointplot(
    estimator=median,
    linestyles="",
    scale=0.5,
    ax=ax,
    **shared_plot_args
)

# only keep  legends for median
handles, _labels = ax.get_legend_handles_labels()
plt.legend(
    handles[2:4],
    ["Median of response time", "Median of code review time"],
    bbox_to_anchor=(1.05, 1),
    loc=2,
    borderaxespad=0.
)

sns.move_legend(
    ax,
    "lower center",
    bbox_to_anchor=(.5, 1),
    ncol=3,
    title=None,
    frameon=False
)

# customized scale: the plotted values are log10 of hours, so each tick sits
# at the log10 position of the duration named in its label
tick_positions = [-1.78, 0, 1.38, 2.22, 2.86, 3.94]
tick_labels = ["1min", "1hr", "1day", "1week", "1month", "1year"]
plt.yticks(tick_positions, tick_labels)

plt.ylabel("Response and Code Review Time")
plt.xlabel("Project")
plt.savefig("output/plot_time_metrics.pdf")
plt.clf()


<Figure size 800x800 with 0 Axes>