In [2]:
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from datetime import datetime
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import r2_score
# Register tqdm's progress-bar integration with pandas.
tqdm_notebook.pandas()
# Show wide frames and long cell contents in full when displaying DataFrames.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Location of the CSV inputs, relative to the notebook's working directory.
PROJECT_ROOT = '..'
CSV_FOLDER = 'csv'

# Named colours used by the plots.
# NOTE: GK_GREEN is defined further down from the seaborn palette. The hex
# literal '#21B534' that used to be assigned here was always overwritten
# before any use, so that dead assignment has been removed.
GK_GREEN_LIGHT = '#07f596'
GK_PURPLE = '#7494EA'
GK_BLUE = '#19535F'
RED = "#f55b5b"
DARK_RED = "#db0000"
GREY = "#a6a6a6"
BLUE = "#4287f5"
LIGHT_GREEN = '#88D39B'

# Labels for release/update types and dependency kinds.
MAJOR = 'MAJOR'
MINOR = 'MINOR'
PATCH = 'PATCH'
PINNED = 'PINNED'
NA_RELEASE_TYPE = pd.NA
DEP = 'Dependency'
DEV_DEP = 'Dev Dependency'

# Palettes derived from seaborn's reversed 'Greens' palette.
COLOUR_PALETTE = sns.color_palette('Greens_r')
GK_GREEN = COLOUR_PALETTE[1]
TWO_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE]
THREE_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE, GK_BLUE]
FOUR_COLOUR_PALETTE = THREE_COLOUR_PALETTE + [LIGHT_GREEN]

# Default number of histogram bins.
BINS_COUNT = 40

# Font sizes for axis labels and tick labels.
PLOT_LABEL_SIZE = 16
PLOT_TICK_LABEL_SIZE = 14

# Default figure size (width, height).
FIG_SIZE_W = 7.5
FIG_SIZE_H = 6
# Backward-compatible alias: the height constant was originally misspelled.
FIF_SIZE_H = FIG_SIZE_H


# Loading Data Functions

In [3]:
def load_issues():
    """Load the augmented Greenkeeper issues CSV with parsed column dtypes.

    Reads ``aug_greenkeeper_issues.csv`` from ``PROJECT_ROOT/CSV_FOLDER``,
    casts the three ``issue_*_at`` columns to ``datetime64[ns]`` and
    ``update_type`` to ``category``.

    Returns:
        pandas.DataFrame: the loaded issues.
    """
    issues = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_issues.csv')
    # An explicit unit is required: pandas >= 2.0 rejects the unit-less
    # "datetime64" dtype that older versions treated as nanoseconds.
    for column in ("issue_created_at", "issue_updated_at", "issue_closed_at"):
        issues[column] = issues[column].astype("datetime64[ns]")
    issues['update_type'] = issues['update_type'].astype('category')
    return issues.copy()

def load_events():
    """Load the Greenkeeper events CSV.

    Reads ``greenkeeper_events.csv`` from ``PROJECT_ROOT/CSV_FOLDER`` and
    stores ``event_description`` as a ``category`` column.

    Returns:
        pandas.DataFrame: the loaded events.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_events.csv'
    events = pd.read_csv(csv_path).astype({'event_description': 'category'})
    return events.copy()

def load_comments():
    """Load the Greenkeeper comments CSV.

    Reads ``greenkeeper_comments.csv`` from ``PROJECT_ROOT/CSV_FOLDER``,
    parses the two ``comment_*_at`` columns with ``pd.to_datetime`` and
    stores ``comment_user_type`` as a ``category`` column.

    Returns:
        pandas.DataFrame: the loaded comments.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_comments.csv'
    comments = pd.read_csv(csv_path)
    for timestamp_column in ('comment_created_at', 'comment_updated_at'):
        comments[timestamp_column] = pd.to_datetime(comments[timestamp_column])
    comments['comment_user_type'] = comments['comment_user_type'].astype('category')
    return comments.copy()
    
def load_commits():
    """Load ``aug_greenkeeper_commits.csv`` from ``PROJECT_ROOT/CSV_FOLDER``.

    Returns:
        pandas.DataFrame: the commits, with the dtypes inferred by ``read_csv``.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_commits.csv'
    return pd.read_csv(csv_path).copy()

def load_package_names():
    """Load ``greenkeeper_package_names.csv`` from ``PROJECT_ROOT/CSV_FOLDER``.

    Returns:
        pandas.DataFrame: the package names, with the dtypes inferred by
        ``read_csv``.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_package_names.csv'
    return pd.read_csv(csv_path).copy()

def load_library_versions():
    """Load the augmented library-versions CSV with parsed column dtypes.

    Reads ``aug_breaking_library_versions.csv`` from
    ``PROJECT_ROOT/CSV_FOLDER`` and converts:

    * ``version_published_at`` to ``datetime64[ns]``
    * ``version_release_type`` to ``category``
    * ``time_until_next_release`` to timedeltas via ``pd.to_timedelta``

    Returns:
        pandas.DataFrame: the loaded library versions.
    """
    library_versions = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_breaking_library_versions.csv')
    # An explicit unit is required: pandas >= 2.0 rejects the unit-less
    # "datetime64" dtype that older versions treated as nanoseconds.
    library_versions["version_published_at"] = \
        library_versions["version_published_at"].astype("datetime64[ns]")
    library_versions["version_release_type"] = library_versions["version_release_type"].astype('category')
    library_versions["time_until_next_release"] = \
        pd.to_timedelta(library_versions["time_until_next_release"])
    return library_versions.copy()

def load_library_releases():
    """Load the library-releases CSV with parsed column dtypes.

    Reads ``breaking_library_releases.csv`` from ``PROJECT_ROOT/CSV_FOLDER``,
    casts ``first_release_date`` and ``last_release_date`` to
    ``datetime64[ns]`` and converts ``avg_time_between_releases`` to
    timedeltas via ``pd.to_timedelta``.

    Returns:
        pandas.DataFrame: the loaded library releases.
    """
    library_releases = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/breaking_library_releases.csv')
    # An explicit unit is required: pandas >= 2.0 rejects the unit-less
    # "datetime64" dtype that older versions treated as nanoseconds.
    for column in ("first_release_date", "last_release_date"):
        library_releases[column] = library_releases[column].astype("datetime64[ns]")
    library_releases["avg_time_between_releases"] = \
        pd.to_timedelta(library_releases["avg_time_between_releases"])
    return library_releases.copy()

def load_package_releases_and_breaks():
    """Load ``releases_and_breaks_counts_by_package.csv``.

    The file is read from ``PROJECT_ROOT/CSV_FOLDER``.

    Returns:
        pandas.DataFrame: the per-package counts, with the dtypes inferred
        by ``read_csv``.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/releases_and_breaks_counts_by_package.csv'
    return pd.read_csv(csv_path).copy()

def load_provider_to_clients():
    """Load ``provider_to_clients.csv`` from ``PROJECT_ROOT/CSV_FOLDER``.

    Returns:
        pandas.DataFrame: the provider/client pairs, with the dtypes
        inferred by ``read_csv``.
    """
    # NOTE(review): the preview of this frame shows an 'Unnamed: 0' column,
    # so the CSV was presumably written with its index; it is left in place
    # here in case later cells rely on the column layout.
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/provider_to_clients.csv'
    return pd.read_csv(csv_path).copy()

def load_package_dependencies():
    """Load ``package_dependencies.csv`` from ``PROJECT_ROOT/CSV_FOLDER``.

    Returns:
        pandas.DataFrame: the package dependencies, with the dtypes inferred
        by ``read_csv``.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/package_dependencies.csv'
    return pd.read_csv(csv_path).copy()

In [7]:
# Sanity check: display the first rows of the provider -> client mapping.
load_provider_to_clients().head()

Unnamed: 0,provider,client
0,0http,k-fastify-gateway
1,101,observable-backoff
2,101,primus-graphql
3,101,validate-reql
4,101,rethinkdb-validator-stream


# Helper Functions

In [None]:
###################
# Helper functions
###################
def get_issue(issues, id):
    """Return the rows of ``issues`` whose ``issue_id`` equals ``id``.

    The result is a (possibly empty) DataFrame, not a single row.
    """
    matches_id = issues['issue_id'] == id
    return issues[matches_id]

def get_issue_url(issues, id):
    """Return the ``issue_url`` values for rows whose ``issue_id`` equals ``id``.

    The result is a (possibly empty) Series that keeps the original index.
    """
    matches_id = issues['issue_id'] == id
    return issues.loc[matches_id, 'issue_url']

def get_comment(comments, id):
    """Return the rows of ``comments`` whose ``comment_id`` equals ``id``.

    The result is a (possibly empty) DataFrame.
    """
    matches_id = comments['comment_id'] == id
    return comments[matches_id]

def get_comments_for_issue(comments, id):
    """Return the rows of ``comments`` whose ``comment_issue_id`` equals ``id``.

    The result is a (possibly empty) DataFrame.
    """
    belongs_to_issue = comments['comment_issue_id'] == id
    return comments[belongs_to_issue]

def get_event(events, id):
    """Return the rows of ``events`` whose ``event_id`` equals ``id``.

    The result is a (possibly empty) DataFrame.
    """
    matches_id = events['event_id'] == id
    return events[matches_id]

def calculate_percent(numer, denom):
    """Return ``numer`` as a percentage of ``denom``, rounded to 2 decimals.

    No guard is applied for a zero ``denom``; plain Python numbers raise
    ``ZeroDivisionError`` in that case.
    """
    percentage = numer / denom * 100
    return round(percentage, 2)

def get_update_type(prev_ver, new_ver):
    """Classify the upgrade from ``prev_ver`` to ``new_ver``.

    Both versions are split on ``.`` and compared numerically, component by
    component (major, then minor, then patch). The first component that
    differs decides the result.

    Args:
        prev_ver: previous version string such as ``"1.2.3"``, or a null value.
        new_ver: new version string, or a null value.

    Returns:
        ``MAJOR``, ``MINOR`` or ``PATCH`` when ``new_ver`` is greater than
        ``prev_ver`` at that level; ``NA_RELEASE_TYPE`` when either version
        is null, the versions are equal, the new version is lower, or a
        needed component is missing or not an integer (e.g. ``"^1.2.3"``).
    """
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        prev_split = prev_ver.split('.')
        new_split = new_ver.split('.')
        for position, release_type in enumerate((MAJOR, MINOR, PATCH)):
            new_part = int(new_split[position])
            prev_part = int(prev_split[position])
            if new_part > prev_part:
                return release_type
            if new_part < prev_part:
                # A lower component at a more significant level means this is
                # a downgrade. Without this check, 2.0.0 -> 1.5.0 would be
                # reported as MINOR because 5 > 0.
                return NA_RELEASE_TYPE
        return NA_RELEASE_TYPE
    except Exception:
        # Best effort: non-string input or unparsable components are "unknown".
        return NA_RELEASE_TYPE
    
def get_update_type_v2(prev_ver, new_ver):
    """Classify the change from ``prev_ver`` to ``new_ver`` by string comparison.

    The first three dot-separated components are compared as strings, in
    order. The first pair that differs (in either direction) decides the
    result: ``MAJOR``, ``MINOR`` or ``PATCH``.

    Returns ``NA_RELEASE_TYPE`` when either version is null, when all three
    components are equal, or when a version cannot be split into at least
    three components.
    """
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        # Unpacking requires three components on both sides; anything
        # shorter (or a non-string input) raises and ends up in the handler.
        prev_major, prev_minor, prev_patch = prev_ver.split('.')[:3]
        new_major, new_minor, new_patch = new_ver.split('.')[:3]
        levels = (
            (MAJOR, prev_major, new_major),
            (MINOR, prev_minor, new_minor),
            (PATCH, prev_patch, new_patch),
        )
        for release_type, previous, current in levels:
            if current != previous:
                return release_type
        return NA_RELEASE_TYPE
    except Exception:
        return NA_RELEASE_TYPE
    
def get_issue_id_for_commit(events, issues, commit):
    """Resolve the issue id that a commit is linked to through its event.

    The commit's ``commit_event_id`` is looked up in ``events``; that event's
    ``event_issue_id`` is then looked up in ``issues``.

    Returns:
        The matching ``issue_id`` as ``np.int64``, or ``np.nan`` when the
        event or the issue is not found, or when any step of the lookup
        raises.
    """
    try:
        matching_event = get_event(events, commit.commit_event_id)
        if matching_event.size == 0:
            return np.nan
        linked_issue_id = matching_event.event_issue_id.values[0].astype(np.int64)
        matching_issue = get_issue(issues, linked_issue_id)
        # Only report an id that actually exists in the issues table.
        if matching_issue.size == 0:
            return np.nan
        return matching_issue.issue_id.values[0].astype(np.int64)
    except Exception:
        return np.nan

def strfdelta(tdelta, fmt):
    """Format a timedelta using a ``str.format`` template.

    ``fmt`` may reference the fields ``{days}``, ``{hours}``, ``{minutes}``
    and ``{seconds}``; the last three are derived from ``tdelta.seconds``.
    """
    hours, remainder = divmod(tdelta.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt.format(
        days=tdelta.days,
        hours=hours,
        minutes=minutes,
        seconds=seconds,
    )

# Plotting Functions

In [None]:
def create_hist_plot(params, ax, cbar_ax=None, color=GK_GREEN, bins=BINS_COUNT):
    """Draw a seaborn histogram with a KDE overlay on ``ax``.

    Args:
        params: dict of keyword arguments forwarded to ``sns.histplot``.
        ax: axes to draw on.
        cbar_ax: optional axes for a colour bar; the colour bar is enabled
            only when this is given.
        color: bar colour.
        bins: number of bins passed to ``sns.histplot``.

    Returns:
        Whatever ``sns.histplot`` returns.
    """
    show_colour_bar = cbar_ax is not None
    return sns.histplot(
        **params,
        ax=ax,
        kde=True,
        bins=bins,
        color=color,
        cbar=show_colour_bar,
        cbar_ax=cbar_ax,
    )

def create_reg_line(params, ax):
    """Draw a seaborn regression line (no scatter points) on ``ax``.

    Args:
        params: dict of keyword arguments forwarded to ``sns.regplot``.
        ax: axes to draw on.

    Returns:
        Whatever ``sns.regplot`` returns.
    """
    line_style = {"color": GK_PURPLE}
    return sns.regplot(**params, scatter=False, line_kws=line_style, ax=ax)

def save_plot(plot, full_file_path, dpi=400):
    """Save ``plot`` to ``full_file_path``.

    Args:
        plot: object exposing ``savefig(path, dpi=...)``.
        full_file_path: destination passed straight through to ``savefig``.
        dpi: output resolution; defaults to 400.
    """
    # Forward the caller's dpi; it was previously ignored in favour of a
    # hard-coded 400.
    plot.savefig(full_file_path, dpi=dpi)