In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib as mpl
from datetime import datetime
# Notebook-wide display settings: do not truncate long cell values when
# DataFrames are rendered, and apply seaborn's default plot theme.
pd.set_option('display.max_colwidth', None)
sns.set()

# Named colours used by the plotting helpers.
# NOTE: GK_GREEN is re-bound further down to an entry of the 'Greens_r'
# palette, so this hex value is not the one the palettes below end up using.
GK_GREEN = '#21B534'
GK_GREEN_LIGHT = '#07f596'
GK_PURPLE = '#7494EA'
GK_BLUE = '#19535F'
RED = "#f55b5b"
DARK_RED = "#db0000"
GREY = "#a6a6a6"
BLUE = "#4287f5"
LIGHT_GREEN = '#88D39B'

# Palettes of increasing size; each one extends the previous one.
COLOUR_PALETTE = sns.color_palette('Greens_r')
GK_GREEN = COLOUR_PALETTE[1]  # overrides the hex GK_GREEN defined above
TWO_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE]
THREE_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE, GK_BLUE]
FOUR_COLOUR_PALETTE = THREE_COLOUR_PALETTE + [LIGHT_GREEN]

# Labels returned by get_update_type(); NA marks an update that could not be classified.
MAJOR_RELEASE_TYPE = 'major'
MINOR_RELEASE_TYPE = 'minor'
PATCH_RELEASE_TYPE = 'patch'
NA_RELEASE_TYPE = pd.NA

# Paths are relative to the notebook's working directory.
PROJECT_ROOT = '..'
PLOTS_PATH = f'{PROJECT_ROOT}/plots'

# NOTE(review): create_hist_plot() defaults to bins=80 rather than this
# constant — confirm which of the two is meant to be used.
BINS_COUNT = 40

## Load Data

In [2]:
# Read the seven CSV exports under PROJECT_ROOT/csv into DataFrames.
# Every later cell works on these module-level frames.
issues = pd.read_csv(f'{PROJECT_ROOT}/csv/aug_greenkeeper_issues.csv')
comments = pd.read_csv(f'{PROJECT_ROOT}/csv/greenkeeper_comments.csv')
events = pd.read_csv(f'{PROJECT_ROOT}/csv/greenkeeper_events.csv')
commits = pd.read_csv(f'{PROJECT_ROOT}/csv/aug_greenkeeper_commits.csv')
package_names = pd.read_csv(f'{PROJECT_ROOT}/csv/greenkeeper_package_names.csv')
library_versions = pd.read_csv(f'{PROJECT_ROOT}/csv/aug_breaking_library_versions.csv')
library_releases = pd.read_csv(f'{PROJECT_ROOT}/csv/breaking_library_releases.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Print the row count of each loaded frame as a quick sanity check.
# Projects are counted by distinct 'issue_repo_url' values.
unique_package_count = len(issues['issue_repo_url'].unique())
print(f"{len(issues)} Greenkeeper Breaking Build issue reports (across {unique_package_count} unique projects)")
print(f"{len(comments)} comments for those issue reports")
print(f"{len(events)} events on those issue reports")
# print(f"\tIncludes {events['event_description'].unique()}")
print(f"{len(commits)} commits that were referenced in these breaking issue reports")
print(f"Total of {len(library_versions)} version release records for libraries that have at least 1 breaking release")
print(f"Release frequency information on {len(library_releases)} libraries")

123197 Greenkeeper Breaking Build issue reports (across 12134 unique projects)
365625 comments for those issue reports
209750 events on those issue reports
17623 commits that were referenced in these breaking issue reports
Total of 556742 version release records for libraries that have at least 1 breaking release
Release frequency information on 7361 libraries


#### Custom Helper Functions

In [4]:
def get_issue(id):
    """Return the rows of ``issues`` whose ``issue_id`` equals ``id``."""
    matches_id = issues['issue_id'] == id
    return issues[matches_id]

def get_issue_url(id):
    """Return the ``issue_url`` column for the issue rows whose ``issue_id`` equals ``id``."""
    matching_rows = issues[issues['issue_id'] == id]
    return matching_rows['issue_url']

def get_comment(id):
    """Return the rows of ``comments`` whose ``comment_id`` equals ``id``."""
    matches_id = comments['comment_id'] == id
    return comments[matches_id]

def get_comments_for_issue(id):
    """Return every comment row whose ``comment_issue_id`` equals ``id``."""
    belongs_to_issue = comments['comment_issue_id'] == id
    return comments[belongs_to_issue]

def get_event(id):
    """Return the rows of ``events`` whose ``event_id`` equals ``id``."""
    matches_id = events['event_id'] == id
    return events[matches_id]

def calculate_percent(numer, denom):
    """Return ``numer`` as a percentage of ``denom``, rounded to two decimals."""
    ratio = numer / denom
    return round(ratio * 100, 2)

def get_update_type(prev_ver, new_ver):
    """Classify the step from ``prev_ver`` to ``new_ver`` as major, minor or patch.

    Only the first two dot-separated components of each version string are
    inspected, and both must parse as integers.

    Args:
        prev_ver: Version string before the update (e.g. ``'1.2.3'``).
        new_ver: Version string after the update.

    Returns:
        ``MAJOR_RELEASE_TYPE`` when the major component increased,
        ``MINOR_RELEASE_TYPE`` when the major component is unchanged and the
        minor component increased, ``PATCH_RELEASE_TYPE`` for every other
        parseable pair, and ``NA_RELEASE_TYPE`` when either input is null or
        cannot be parsed.
    """
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        prev_parts = prev_ver.split('.')
        new_parts = new_ver.split('.')
        prev_major = int(prev_parts[0])
        new_major = int(new_parts[0])
        if new_major > prev_major:
            return MAJOR_RELEASE_TYPE
        # Parsed only when there is no major bump, so a single-component
        # version such as '2' following '1' is still reported as major.
        new_minor = int(new_parts[1])
        prev_minor = int(prev_parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Non-string input, missing components or non-numeric components.
        return NA_RELEASE_TYPE
    # A larger minor number only means a minor release within the same major
    # line; without this check 2.1.0 -> 1.5.0 would be labelled 'minor'.
    if new_major == prev_major and new_minor > prev_minor:
        return MINOR_RELEASE_TYPE
    return PATCH_RELEASE_TYPE
    
def get_issue_id_for_commit(commit):
    """Resolve the issue id that a commit row points to through its event.

    Looks up the event named by ``commit.commit_event_id``, then the issue
    named by that event's ``event_issue_id``. Returns the issue id as
    ``np.int64``, or ``np.nan`` when either lookup is empty or anything in
    the chain raises.
    """
    try:
        matching_event = get_event(commit.commit_event_id)
        if matching_event.empty:
            return np.nan
        linked_issue_id = matching_event.event_issue_id.values[0].astype(np.int64)
        matching_issue = get_issue(linked_issue_id)
        if matching_issue.empty:
            return np.nan
        return matching_issue.issue_id.values[0].astype(np.int64)
    except Exception:
        return np.nan
    
def strfdelta(tdelta, fmt):
    """Format a timedelta using a ``str.format`` template.

    The template may reference ``{days}``, ``{hours}``, ``{minutes}`` and
    ``{seconds}``; hours, minutes and seconds are derived from
    ``tdelta.seconds``, i.e. the part of the delta left over after whole days.
    """
    hours, leftover = divmod(tdelta.seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return fmt.format(days=tdelta.days, hours=hours, minutes=minutes, seconds=seconds)

def save_plot(plot, file_name, dpi=400):
    """Save ``plot`` under ``PLOTS_PATH``.

    Args:
        plot: Object exposing ``savefig(path, dpi=...)``.
        file_name: File name (including extension) relative to ``PLOTS_PATH``.
        dpi: Resolution passed to ``savefig``. Previously this argument was
            accepted but ignored because 400 was hard-coded in the call.
    """
    plot.savefig(f'{PLOTS_PATH}/{file_name}', dpi=dpi)
    
def create_hist_plot(params, ax, cbar_ax=None, color=GK_GREEN, bins=80):
    """Draw a seaborn histogram with a KDE overlay on ``ax``.

    ``params`` is unpacked into ``sns.histplot`` as keyword arguments. A
    colour bar is requested only when ``cbar_ax`` is supplied. Returns
    whatever ``sns.histplot`` returns.
    """
    wants_colour_bar = cbar_ax is not None
    return sns.histplot(
        ax=ax,
        bins=bins,
        color=color,
        kde=True,
        cbar=wants_colour_bar,
        cbar_ax=cbar_ax,
        **params,
    )

def create_reg_line(params, ax):
    """Draw a regression line (no scatter points) on ``ax`` in ``GK_PURPLE``.

    ``params`` is unpacked into ``sns.regplot`` as keyword arguments. Returns
    whatever ``sns.regplot`` returns.
    """
    line_style = {"color": GK_PURPLE}
    return sns.regplot(ax=ax, scatter=False, line_kws=line_style, **params)

### Augment Data

In [5]:
# Cast the raw CSV columns (read as plain objects) to proper dtypes.
events['event_description'] = events['event_description'].astype('category')

# The datetime unit is spelled out: a unit-less "datetime64" cast is rejected
# by pandas >= 2.0, while "datetime64[ns]" is accepted by old and new versions.
issues["issue_created_at"] = issues["issue_created_at"].astype("datetime64[ns]")
issues["issue_updated_at"] = issues["issue_updated_at"].astype("datetime64[ns]")
issues["issue_closed_at"] = issues["issue_closed_at"].astype("datetime64[ns]")
issues['update_type'] = issues['update_type'].astype('category')
library_releases["first_release_date"] = library_releases["first_release_date"].astype("datetime64[ns]")
library_releases["last_release_date"] = library_releases["last_release_date"].astype("datetime64[ns]")

library_releases["avg_time_between_releases"] = \
    pd.to_timedelta(library_releases["avg_time_between_releases"])

comments['comment_created_at'] = pd.to_datetime(comments['comment_created_at'])
comments['comment_updated_at'] = pd.to_datetime(comments['comment_updated_at'])
comments['comment_user_type'] = comments['comment_user_type'].astype('category')

### How many dependencies do packages have?

In [6]:
import json


def _parse_dependency_map(raw):
    """Turn one serialized dependency cell into a ``{name: version}`` dict.

    Missing (NaN) and empty cells yield an empty dict. Anything else is parsed
    as JSON after swapping single quotes for double quotes, matching how the
    column was parsed before.
    """
    if pd.isna(raw) or not raw:
        return dict()
    return json.loads(raw.replace("'", '"'))


# One record per package with its parsed runtime and dev dependency maps.
# Both columns go through the same helper; previously the runtime branch was
# guarded by `~pd.isna(...)`, which is always truthy on a Python bool.
deps_dtos = [
    {
        'package_name': row['package_name'],
        'deps': _parse_dependency_map(row['package_dependencies']),
        'dev_deps': _parse_dependency_map(row['package_dev_dependencies']),
    }
    for _, row in package_names.iterrows()
]

DEP = 'Dependency'
DEV_DEP = 'Dev Dependency'
packages = list()
deps_names = list()
deps_versions = list()
deps_types = list()

# Flatten to one row per (package, dependency); for each package the runtime
# dependencies are emitted before the dev dependencies.
for deps_dto in deps_dtos:
    package = deps_dto['package_name']
    for deps_key, deps_type in (('deps', DEP), ('dev_deps', DEV_DEP)):
        for dep_name, dep_version in deps_dto[deps_key].items():
            packages.append(package)
            deps_names.append(dep_name)
            deps_versions.append(dep_version)
            deps_types.append(deps_type)

dependencies_df = pd.DataFrame({
    'package': packages,
    'deps_name': deps_names,
    'deps_version': deps_versions,
    'deps_type': deps_types,
})


# grouped_deps = \
#     dependencies_df.groupby(by=['package', 'deps_type'])['deps_name']\
#     .count()\
#     .reset_index(name='count')


In [7]:
# Invert the dependency table: for every depended-upon package ("provider"),
# list the packages that declare it ("clients").
# Grouping by a scalar key (not a one-element list) keeps the group names as
# plain strings; with a list key, newer pandas yields 1-tuples when iterating.
deps_groups = dependencies_df.groupby(by='deps_name')
provider_packages_list = list()
clients_list = list()
for name, group in deps_groups:
    provider_packages_list.extend([name] * len(group))
    clients_list.extend(group['package'].tolist())

provider_to_clients = pd.DataFrame({
    'provider': provider_packages_list,
    'client': clients_list,
})

In [8]:
# Working copy of the issue columns needed to attribute breaks to dependencies.
issues_dist = issues.loc[:, [
    'issue_id',
    'issue_repo_url',
    'issue_dependency_name',
    'issue_dependency_actual_version',
    'issue_dependency_next_version',
]].copy()
issues_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123197 entries, 0 to 123196
Data columns (total 5 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   issue_id                         123197 non-null  int64 
 1   issue_repo_url                   123197 non-null  object
 2   issue_dependency_name            117384 non-null  object
 3   issue_dependency_actual_version  117402 non-null  object
 4   issue_dependency_next_version    79193 non-null   object
dtypes: int64(1), object(4)
memory usage: 4.7+ MB


In [9]:
# Per provider: number of client packages and number of breaking issue reports
# that name the provider as the offending dependency.
ptcp = provider_to_clients.groupby('provider').size().to_frame('clients').reset_index()

# Count issues per dependency once and look each provider up, instead of
# scanning every issue row again for every provider.
breaking_issue_counts = issues_dist['issue_dependency_name'].value_counts()
ptcp['breaking_issues'] = ptcp['provider'].map(breaking_issue_counts).fillna(0).astype(int)

In [10]:
# Show only the providers that have at least one breaking issue attributed to them.
ptcp[ptcp['breaking_issues'] > 0]

Unnamed: 0,provider,clients,breaking_issues
3,3box,2,2
6,3box-orbitdb-plugins,2,3
11,42-cent-base,12,5
22,@0x/utils,1,1
23,@0x/web3-wrapper,1,1
...,...,...,...
25830,zombie,7,3
25831,zone.js,312,61
25833,zora,4,3
25834,zotero-plugin,1,1


## Most breaking packages by proportion

In [11]:
# The unit is explicit because a unit-less "datetime64" cast is rejected by pandas >= 2.0.
library_versions["version_published_at"] = library_versions["version_published_at"].astype("datetime64[ns]")
library_versions = library_versions.dropna(subset=['package_name'])
library_versions = library_versions.sort_values(['package_name', 'version_published_at'])

# Pair every release with the release that precedes it within the same package
# (in publication order) and record the time elapsed between the two.
grouped_package_releases = library_versions.groupby(by='package_name')
release_pairs = pd.DataFrame({
    'package': library_versions['package_name'],
    'prev_release': grouped_package_releases['version'].shift(),
    'curr_release': library_versions['version'],
    'time_between_release': grouped_package_releases['version_published_at'].diff(),
})

# The first release of each package has no predecessor and yields no pair.
has_predecessor = grouped_package_releases.cumcount() > 0
my_df = release_pairs[has_predecessor].reset_index(drop=True)

In [76]:
# Build the provider -> client-count lookup explicitly. Using ptcp.loc[<name>]
# only works when ptcp is indexed by provider; with the default integer index
# every lookup fails and, with the error swallowed, every row would be dropped.
if 'provider' in ptcp.columns:
    clients_by_provider = ptcp.set_index('provider')['clients']
else:
    # Covers the case where ptcp has already been re-indexed by provider.
    clients_by_provider = ptcp['clients']


def get_client_count(row):
    """Return the number of client packages for ``row['package']``.

    Returns ``pd.NA`` when the package is not a known provider.
    """
    return clients_by_provider.get(row['package'], pd.NA)


# Attach the client count to every release pair, drop pairs whose package has
# no known clients, and keep the count as an integer column.
my_df = (
    my_df
    .assign(client_count=my_df['package'].map(clients_by_provider))
    .dropna(subset=['client_count'])
    .astype({'client_count': 'int64'})
)

In [89]:
# Preview the first ten rows of the release-pair table.
my_df.head(10)


Unnamed: 0,package,prev_release,curr_release,time_between_release,client_count
0,3box,0.0.1,0.0.2,15 days 21:44:20.609000,2
1,3box,0.0.2,0.0.3,1 days 16:38:51.812000,2
2,3box,0.0.3,0.0.4,0 days 00:59:30.090000,2
3,3box,0.0.4,0.0.5-beta-1,4 days 07:42:30.188000,2
4,3box,0.0.5-beta-1,0.0.5-beta-2,0 days 00:00:51.060000,2
5,3box,0.0.5-beta-2,1.0.0-beta-1,1 days 07:40:06.798000,2
6,3box,1.0.0-beta-1,1.0.0-beta-2,0 days 18:49:29.783000,2
7,3box,1.0.0-beta-2,1.0.0-beta-3,0 days 01:37:18.113000,2
8,3box,1.0.0-beta-3,1.0.0-beta-4,18 days 16:02:20.994000,2
9,3box,1.0.0-beta-4,1.0.0-beta-5,1 days 00:02:39.515000,2


In [94]:
# Rebuild the working copy, this time keeping only issues where the dependency
# name and both version fields are present.
issues_dist = (
    issues[[
        'issue_id',
        'issue_repo_url',
        'issue_dependency_name',
        'issue_dependency_actual_version',
        'issue_dependency_next_version',
    ]]
    .copy()
    .dropna(subset=[
        'issue_dependency_name',
        'issue_dependency_actual_version',
        'issue_dependency_next_version',
    ])
)
issues_dist

Unnamed: 0,issue_id,issue_repo_url,issue_dependency_name,issue_dependency_actual_version,issue_dependency_next_version
5631,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/cli,7.8.3,7.8.4
5632,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/core,7.8.6,7.8.7
5633,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/plugin-proposal-class-properties,7.8.0,7.8.3
5634,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/plugin-transform-runtime,7.8.0,7.8.3
5635,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/preset-env,7.8.6,7.8.7
...,...,...,...,...,...
85124,359772997,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.50,0.6.51
85125,359571661,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.49,0.6.50
85126,359504712,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.48,0.6.49
85127,359487306,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.47,0.6.48


In [None]:
# Count the breaking issue reports per (dependency, from-version, to-version)
# in a single pass. Filtering all of issues_dist once per release pair does not
# scale to the size of my_df.
breaks_per_release = (
    issues_dist
    .groupby([
        'issue_dependency_name',
        'issue_dependency_actual_version',
        'issue_dependency_next_version',
    ])
    .size()
    .to_dict()
)


def get_number_of_breaks(row):
    """Return how many breaking issues report the ``prev_release -> curr_release`` step.

    ``row`` must provide ``package``, ``prev_release`` and ``curr_release``.
    Returns 0 when no issue matches and ``pd.NA`` when the key cannot be
    looked up.
    """
    try:
        release_key = (row['package'], row['prev_release'], row['curr_release'])
        return int(breaks_per_release.get(release_key, 0))
    except TypeError:
        return pd.NA


my_df['breaking_issues'] = [
    breaks_per_release.get(release_key, 0)
    for release_key in zip(my_df['package'], my_df['prev_release'], my_df['curr_release'])
]
# my_df.head(10)

In [114]:
# Spot check: show the release pairs recorded for a single package.
my_df[my_df['package'] == 'binary-ui-styles']

Unnamed: 0,package,prev_release,curr_release,time_between_release,client_count
217915,binary-ui-styles,0.0.1-rc.0,0.0.1-rc.1,0 days 00:04:38.574000,1
217916,binary-ui-styles,0.0.1-rc.1,0.0.1-rc.2,0 days 00:01:57.474000,1
217917,binary-ui-styles,0.0.1-rc.2,0.0.1-rc.3,12 days 23:43:30.652000,1
217918,binary-ui-styles,0.0.1-rc.3,0.0.1-rc.4,17 days 13:20:41.288000,1
217919,binary-ui-styles,0.0.1-rc.4,0.1.0-rc.0,13 days 06:30:57.739000,1
...,...,...,...,...,...
218048,binary-ui-styles,0.6.50,0.6.51,0 days 14:19:56.127000,1
218049,binary-ui-styles,0.6.51,0.6.52,0 days 03:18:16.500000,1
218050,binary-ui-styles,0.6.52,0.6.53,6 days 04:28:43.184000,1
218051,binary-ui-styles,0.6.53,0.6.54,0 days 08:11:09.897000,1


In [113]:
# Display the issue subset that get_number_of_breaks() matches release pairs against.
issues_dist

Unnamed: 0,issue_id,issue_repo_url,issue_dependency_name,issue_dependency_actual_version,issue_dependency_next_version
5631,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/cli,7.8.3,7.8.4
5632,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/core,7.8.6,7.8.7
5633,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/plugin-proposal-class-properties,7.8.0,7.8.3
5634,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/plugin-transform-runtime,7.8.0,7.8.3
5635,576010086,https://api.github.com/repos/ealush/butter-toast,@babel/preset-env,7.8.6,7.8.7
...,...,...,...,...,...
85124,359772997,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.50,0.6.51
85125,359571661,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.49,0.6.50
85126,359504712,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.48,0.6.49
85127,359487306,https://api.github.com/repos/andcards/react-gesture,binary-ui-styles,0.6.47,0.6.48
