In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from datetime import datetime
pd.set_option('display.max_colwidth', None)

GK_GREEN = '#07b871'
RED = "#f55b5b"
DARK_RED = "#db0000"
GREY = "#c2c2c2"
BLUE = "#4287f5"
MAJOR_RELEASE_TYPE = 'major'
MINOR_RELEASE_TYPE = 'minor'
PATCH_RELEASE_TYPE = 'patch'
NA_RELEASE_TYPE = pd.NA

In [3]:
issues = pd.read_csv('../csv/greenkeeper_issues.csv')
comments = pd.read_csv('../csv/greenkeeper_comments.csv')
events = pd.read_csv('../csv/greenkeeper_events.csv')
commits = pd.read_csv('../csv/greenkeeper_commits.csv')
package_names = pd.read_csv('../csv/greenkeeper_package_names.csv')
library_versions = pd.read_csv('../csv/breaking_library_versions.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Helper Functions

In [4]:
def get_issue(id):
    return issues[issues['issue_id'] == id]

def get_issue_url(id):
    return issues[issues['issue_id'] == id]['issue_url']

def get_comment(id):
    return comments[comments['comment_id'] == id]

def get_comments_for_issue(id):
    return comments[comments['comment_issue_id'] == id]

def get_event(id):
    return events[events['event_id'] == id]

def calculate_percent(numer, denom):
    return round((numer/denom)*100, 2)

def get_update_type(prev_ver, new_ver):
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        prev_split = prev_ver.split('.')
        new_split = new_ver.split('.')
        if int(new_split[0]) > int(prev_split[0]):
            return MAJOR_RELEASE_TYPE
        elif int(new_split[1]) > int(prev_split[1]):
            return MINOR_RELEASE_TYPE
        elif int(new_split[2]) > int(prev_split[2]):
            return PATCH_RELEASE_TYPE
        else:
            return NA_RELEASE_TYPE
    except Exception as e:
        return NA_RELEASE_TYPE
    
def get_issue_id_for_commit(commit):
    try:
        event_id = commit.commit_event_id
        event = get_event(event_id)
        if event.size == 0:
            return np.nan
        issue = get_issue(event.event_issue_id.values[0].astype(np.int64))
        if issue.size == 0:
            return np.nan
        return issue.issue_id.values[0].astype(np.int64)
    except Exception:
        return np.nan

Add issue ID to commits data frame

In [5]:
commits['commit_issue_id'] = commits.apply(get_issue_id_for_commit, axis=1)

Add update type to issues table

In [6]:
issues['update_type'] = \
    issues.apply(lambda row: get_update_type(row['issue_dependency_actual_version'], row['issue_dependency_next_version']), axis=1)

Create library_releases and all_release_types

In [17]:
library_versions["version_published_at"] = library_versions["version_published_at"].astype("datetime64")
library_versions = library_versions.dropna(subset=['package_name'])
library_versions = library_versions.sort_values(['package_name', 'version_published_at'])

grouped_package_releases = library_versions.groupby(by='package_name')
packages = list()
total_time_diffs = list()
avg_time_between_releases = list()
first_release_dates = list()
last_release_dates = list()
total_releases = list()
all_release_types = list()
for package_name, group in grouped_package_releases:
    packages.append(package_name)
    prev_release_date = None
    curr_release_date = None
    prev_release_number = None
    curr_release_number = None
    first_release_date = None
    releases_count = 0
    total_time_diff = pd.Timedelta(seconds=0)
    for row_index, row in group.iterrows():
        prev_release_date = curr_release_date
        prev_release_number = curr_release_number
        curr_release_date = row['version_published_at']
        curr_release_number = row['version']
        releases_count += 1
        if prev_release_date is None:
            first_release_date = curr_release_date
            all_release_types.append(NA_RELEASE_TYPE)
            continue
        local_time_diff = (curr_release_date - prev_release_date)
        total_time_diff += local_time_diff
        all_release_types.append(get_update_type(prev_release_number, curr_release_number))
    avg = total_time_diff / releases_count
    total_time_diffs.append(total_time_diff)
    avg_time_between_releases.append(avg)
    first_release_dates.append(first_release_date)
    last_release_dates.append(curr_release_date)
    total_releases.append(releases_count)

library_releases = pd.DataFrame({
    'package_name': packages,
    'total_time_diff': total_time_diffs,
    'avg_time_between_releases': avg_time_between_releases,
    'first_release_date': first_release_dates,
    'last_release_date': last_release_dates,
    'total_releases': total_releases,
})

library_releases['broken_builds_caused'] = \
    library_releases.apply(lambda row: len(issues[issues['issue_dependency_name'] == row['package_name']]), axis=1)

library_versions['version_release_type'] = all_release_types

In [18]:
library_releases['avg_time_between_releases_seconds'] = \
    (library_releases['avg_time_between_releases'].dt.days * 24 * 60 * 60) + \
    (library_releases['avg_time_between_releases'].dt.seconds)

Unnamed: 0,package_name,total_time_diff,avg_time_between_releases,first_release_date,last_release_date,total_releases,broken_builds_caused,avg_time_between_releases_seconds
0,3box,769 days 17:12:07.032000,5 days 11:57:05.193085714,2018-08-20 14:13:14.031,2020-09-28 07:25:21.063,140,2,475025
1,3box-orbitdb-plugins,395 days 02:20:06.886000,17 days 23:00:54.858454545,2019-05-27 12:47:27.249,2020-06-25 15:07:34.135,22,3,1551654
2,42-cent-base,1769 days 10:45:57.075000,126 days 09:20:25.505357142,2014-09-16 21:39:26.712,2019-07-22 08:25:23.787,14,5,10920025
3,@0x/utils,741 days 15:01:30.934000,15 days 18:42:09.594340425,2018-10-18 14:03:17.353,2020-10-29 05:04:48.287,47,1,1363329
4,@0x/web3-wrapper,741 days 15:01:26.440000,14 days 06:17:20.123846153,2018-10-18 14:03:33.658,2020-10-29 05:05:00.098,52,1,1232240
...,...,...,...,...,...,...,...,...
7356,zora,1277 days 20:10:50.134000,31 days 04:00:15.856926829,2016-12-18 17:46:00.962,2020-06-18 13:56:51.096,41,3,2692815
7357,zos,1677 days 18:45:55.671000,22 days 16:08:43.725283783,2015-07-18 19:17:09.688,2020-02-20 14:03:05.359,74,1,1958923
7358,zos-lib,670 days 23:29:35.817000,8 days 04:23:02.631914634,2018-04-20 14:33:24.803,2020-02-20 14:03:00.620,82,3,706982
7359,zotero-plugin,1083 days 03:23:44.548000,10 days 00:41:53.190259259,2017-12-05 16:13:26.240,2020-11-22 19:37:10.788,108,1,866513


Write out augmented csv

In [19]:
commits.to_csv('../csv/aug_greenkeeper_commits.csv', index=False)
issues.to_csv('../csv/aug_greenkeeper_issues.csv', index=False)
library_versions.to_csv('../csv/aug_breaking_library_versions.csv', index=False)
library_releases.to_csv('../csv/breaking_library_releases.csv', index=False)