In [3]:
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
import warnings
warnings.filterwarnings('ignore', 'This pattern has match groups')
from matplotlib.ticker import FuncFormatter
from datetime import datetime
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import r2_score
from scipy.stats import zscore, wilcoxon, mannwhitneyu
tqdm_notebook.pandas()
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# --- Paths -------------------------------------------------------------
# Data files are read from f'{PROJECT_ROOT}/{CSV_FOLDER}/...' by the loader
# functions; figures are written under IMAGES_PATH by save_fig().
PROJECT_ROOT = '..'
CSV_FOLDER = 'csv'
IMAGES_PATH = os.path.join(PROJECT_ROOT, 'images')
# Side effect at import/run time: make sure the images folder exists.
os.makedirs(IMAGES_PATH, exist_ok=True)

# --- Colours (hex strings) ----------------------------------------------
# NOTE: GK_GREEN is reassigned further down from the seaborn palette, so
# this hex value is not the one in effect after this cell has run.
GK_GREEN = '#21B534'
GK_GREEN_LIGHT = '#07f596'
GK_PURPLE = '#7494EA'
GK_BLUE = '#19535F'
RED = "#f55b5b"
DARK_RED = "#db0000"
GREY = "#a6a6a6"
BLUE = "#4287f5"
LIGHT_GREEN = '#88D39B'

# --- Labels used for release/update types and dependency kinds -----------
MAJOR = 'MAJOR'
MINOR = 'MINOR'
PATCH = 'PATCH'
PINNED = 'PINNED'
# Value returned by get_update_type*/ when the update type cannot be determined.
NA_RELEASE_TYPE = pd.NA
DEP = 'Dependency'
DEV_DEP = 'Dev Dependency'

# --- Palettes -------------------------------------------------------------
COLOUR_PALETTE = sns.color_palette('Greens_r')
# Overrides the hex GK_GREEN defined above with the second palette entry.
GK_GREEN = COLOUR_PALETTE[1]
TWO_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE]
THREE_COLOUR_PALETTE = [GK_GREEN, GK_PURPLE, GK_BLUE]
FOUR_COLOUR_PALETTE = THREE_COLOUR_PALETTE + [LIGHT_GREEN]

# Default number of histogram bins (see create_hist_plot).
BINS_COUNT = 40

PLOT_LABEL_SIZE = 16
PLOT_TICK_LABEL_SIZE = 14

FIG_SIZE_W = 7.5
# NOTE(review): name looks like a typo for FIG_SIZE_H; left unchanged because
# other notebooks/cells may reference it under this spelling.
FIF_SIZE_H = 6

%run _cliffs_delta.ipynb

# Loading Data Functions

In [16]:
def load_issues():
    """Load the augmented Greenkeeper issues CSV.

    Reads ``aug_greenkeeper_issues.csv`` with explicit dtypes, parses the
    three timestamp columns, strips the ``issue_`` prefix from the column
    names (``update_type`` has no prefix and is kept as is) and adds a
    ``repo_name`` column derived from ``repo_url``.

    Returns:
        pandas.DataFrame: one row per CSV record, with renamed columns.
    """
    prefix = 'issue_'
    dtypes = dict(
        issue_id='int64',
        issue_number='int64',
        issue_url='object',
        issue_title='object',
        issue_state='category',
        issue_is_locked='bool',
        issue_created_at='object',
        issue_updated_at='object',
        issue_closed_at='object',
        issue_user_login='category',
        issue_labels='category',
        issue_num_comments='int64',
        issue_events_url='object',
        issue_dependency_name='category',
        issue_dependency_type='category',
        issue_dependency_actual_version='object',
        issue_dependency_next_version='object',
        issue_dependency_bundle_name='category',
        issue_body_parser='category',
        issue_repo_url='object',
        update_type='category',
    )
    # Every raw column maps to its name without the 'issue_' prefix.
    renaming = {
        raw: raw[len(prefix):] if raw.startswith(prefix) else raw
        for raw in dtypes
    }

    def _repo_name_from_api_url(api_url):
        # Everything after the GitHub API prefix, i.e. '<owner>/<repo>'.
        return api_url.split('https://api.github.com/repos/')[1]

    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_issues.csv'
    raw_issues = pd.read_csv(
        csv_path,
        dtype=dtypes,
        parse_dates=['issue_created_at', 'issue_updated_at', 'issue_closed_at'],
    )
    issues = raw_issues.rename(columns=renaming)
    issues['repo_name'] = issues['repo_url'].apply(_repo_name_from_api_url)
    return issues

def load_events():
    """Load ``greenkeeper_events.csv`` with ``event_description`` as a category.

    Returns:
        pandas.DataFrame: a fresh frame independent of the one read from disk.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_events.csv'
    raw_events = pd.read_csv(csv_path)
    # assign() returns a new frame, so the caller gets its own copy.
    return raw_events.assign(
        event_description=raw_events['event_description'].astype('category')
    )

def load_comments():
    """Load ``greenkeeper_comments.csv`` with typed timestamp and user-type columns.

    ``comment_created_at`` and ``comment_updated_at`` are converted with
    ``pd.to_datetime``; ``comment_user_type`` becomes a category.

    Returns:
        pandas.DataFrame: a fresh frame independent of the one read from disk.
    """
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_comments.csv'
    raw_comments = pd.read_csv(csv_path)
    return raw_comments.assign(
        comment_created_at=pd.to_datetime(raw_comments['comment_created_at']),
        comment_updated_at=pd.to_datetime(raw_comments['comment_updated_at']),
        comment_user_type=raw_comments['comment_user_type'].astype('category'),
    )
    
def load_commits():
    """Load the augmented Greenkeeper commits CSV.

    Reads ``aug_greenkeeper_commits.csv`` with explicit dtypes and renames
    the ``commit_*`` columns to shorter names (see ``renaming`` below).

    Returns:
        pandas.DataFrame: the commits table with renamed columns.
    """
    dtypes = {
        'commit_event_url': 'object',
        'commit_event_id': 'int64',
        'commit_message': 'object',
        'commit_git_committer_email': 'object',
        'commit_git_committer_name': 'object',
        'commit_git_author_email': 'object',
        'commit_git_author_name': 'object',
        'commit_github_committer_login': 'object',
        # presumably float so that missing ids can be represented as NaN
        'commit_github_committer_id': 'float64',
        'commit_github_committer_type': 'category',
        'commit_github_author_login': 'object',
        'commit_github_author_id': 'float64',
        'commit_github_author_type': 'category',
        'commit_stats_deletions': 'int64',
        'commit_stats_additions': 'int64',
        'commit_stats_total': 'int64',
        'commit_tree_sha': 'object',
        'commit_sha': 'object',
        'commit_num_parents': 'int64',
        'commit_num_comments': 'int64',
        'commit_file_name': 'category',
        'commit_file_additions': 'int64',
        'commit_file_deletions': 'int64',
        'commit_file_changes': 'int64',
        'commit_file_sha': 'object',
        'commit_file_status': 'category',
        'commit_issue_id': 'float64',
    }
    renaming = {
        'commit_event_url': 'event_url',
        'commit_event_id': 'event_id',
        'commit_message': 'message',
        'commit_git_committer_email': 'committer_email',
        'commit_git_committer_name': 'committer_name',
        'commit_git_author_email': 'author_email',
        # Fixed: this used to map to 'author_email' as well, which produced
        # two 'author_email' columns and no 'author_name' column.
        'commit_git_author_name': 'author_name',
        'commit_github_committer_login': 'committer_login',
        'commit_github_committer_id': 'committer_id',
        'commit_github_committer_type': 'committer_type',
        'commit_github_author_login': 'author_login',
        'commit_github_author_id': 'author_id',
        'commit_github_author_type': 'author_type',
        'commit_stats_deletions': 'deletions',
        'commit_stats_additions': 'additions',
        'commit_stats_total': 'stats_total',
        'commit_tree_sha': 'tree_sha',
        'commit_sha': 'sha',
        'commit_num_parents': 'num_parents',
        'commit_num_comments': 'num_comments',
        'commit_file_name': 'file_name',
        'commit_file_additions': 'file_additions',
        'commit_file_deletions': 'file_deletions',
        'commit_file_changes': 'file_changes',
        'commit_file_sha': 'file_sha',
        'commit_file_status': 'file_status',
        'commit_issue_id': 'issue_id',
    }
    file_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_commits.csv'
    commits = pd.read_csv(
        file_path,
        dtype=dtypes,
    ).rename(columns=renaming)
    return commits

def load_package_names():
    """Load ``greenkeeper_package_names.csv``.

    Columns are read with explicit dtypes and then renamed by dropping the
    ``package_`` prefix (e.g. ``package_gh_url`` becomes ``gh_url``).

    Returns:
        pandas.DataFrame: the package table with un-prefixed column names.
    """
    prefix = 'package_'
    dtypes = dict(
        package_name='category',
        package_gh_url_api='object',
        package_gh_url='object',
        package_author='category',
        package_description='object',
        package_repo_url='object',
        package_repo_type='category',
        package_version='object',
        package_dependencies='object',
        package_dev_dependencies='object',
        package_peer_dependencies='object',
    )
    # All raw columns start with the prefix, so slicing it off is enough.
    renaming = {raw: raw[len(prefix):] for raw in dtypes}
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_package_names.csv'
    raw_packages = pd.read_csv(csv_path, dtype=dtypes)
    return raw_packages.rename(columns=renaming)

def load_library_versions():
    """Load ``aug_breaking_library_versions.csv``.

    ``version_published_at`` is parsed as a date by ``read_csv`` and
    ``time_until_next_release`` is converted with ``pd.to_timedelta``.

    Returns:
        pandas.DataFrame: the library-versions table.
    """
    column_types = dict(
        package_name='category',
        version='object',
        version_published_at='object',
        version_release_type='category',
        broken_clients_count='int64',
        time_until_next_release='object',
    )
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_breaking_library_versions.csv'
    versions = pd.read_csv(
        csv_path,
        dtype=column_types,
        parse_dates=['version_published_at'],
    )
    gap_until_next = pd.to_timedelta(versions["time_until_next_release"])
    versions["time_until_next_release"] = gap_until_next
    return versions

def load_library_releases():
    """Load ``breaking_library_releases.csv`` with typed date/duration columns.

    ``first_release_date`` and ``last_release_date`` are converted to
    datetimes and ``avg_time_between_releases`` to timedeltas.

    Returns:
        pandas.DataFrame: a copy of the loaded table.
    """
    library_releases = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/breaking_library_releases.csv')
    # pd.to_datetime replaces .astype("datetime64"): the unit-less dtype
    # string is rejected by pandas 2.x, and to_datetime matches how
    # load_comments converts its timestamp columns.
    for date_col in ("first_release_date", "last_release_date"):
        library_releases[date_col] = pd.to_datetime(library_releases[date_col])
    library_releases["avg_time_between_releases"] = \
        pd.to_timedelta(library_releases["avg_time_between_releases"])
    return library_releases.copy()

def load_package_releases_and_breaks():
    """Load ``releases_and_breaks_counts_by_package.csv`` and return a copy of it."""
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/releases_and_breaks_counts_by_package.csv'
    return pd.read_csv(csv_path).copy()

def load_provider_to_clients():
    """Load ``provider_to_clients.csv`` and return a copy of it."""
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/provider_to_clients.csv'
    return pd.read_csv(csv_path).copy()

def load_package_dependencies():
    """Load ``package_dependencies.csv`` and return a copy of it."""
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/package_dependencies.csv'
    return pd.read_csv(csv_path).copy()

In [None]:
def load_all_issues():
    """Load ``all_project_issues.csv`` with explicit dtypes and parsed timestamps.

    Returns:
        pandas.DataFrame: the issues table; ``created_at``, ``updated_at`` and
        ``closed_at`` are passed to ``read_csv`` as ``parse_dates``.
    """
    # Columns grouped by the dtype they are read with.
    int_cols = ['id', 'number', 'user_id', 'comments']
    category_cols = ['repo_name', 'user_login', 'user_type', 'state']
    bool_cols = ['locked', 'is_pull_request']
    object_cols = [
        'url', 'repository_url', 'comments_url', 'events_url', 'html_url',
        'title', 'created_at', 'updated_at', 'closed_at', 'body',
    ]
    dtypes = {
        **dict.fromkeys(int_cols, 'int64'),
        **dict.fromkeys(category_cols, 'category'),
        **dict.fromkeys(bool_cols, 'bool'),
        **dict.fromkeys(object_cols, 'object'),
    }
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/all_project_issues.csv'
    return pd.read_csv(
        csv_path,
        dtype=dtypes,
        parse_dates=['created_at', 'updated_at', 'closed_at'],
    )

In [1]:
def load_non_gk_issues_for_analysis():
    """Load ``non_gk_issues_for_analysis.csv`` with explicit dtypes.

    Returns:
        pandas.DataFrame: the issues table; ``created_at``, ``updated_at`` and
        ``closed_at`` are passed to ``read_csv`` as ``parse_dates``.
    """
    # Columns grouped by the dtype they are read with.
    int_cols = ['id', 'number', 'user_id', 'comments']
    category_cols = ['repo_name', 'user_login', 'user_type', 'state']
    bool_cols = ['locked', 'is_pull_request']
    object_cols = [
        'url', 'repository_url', 'comments_url', 'events_url', 'html_url',
        'title', 'created_at', 'updated_at', 'closed_at', 'body',
    ]
    dtypes = {
        **dict.fromkeys(int_cols, 'int64'),
        **dict.fromkeys(category_cols, 'category'),
        **dict.fromkeys(bool_cols, 'bool'),
        **dict.fromkeys(object_cols, 'object'),
    }
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/non_gk_issues_for_analysis.csv'
    return pd.read_csv(
        csv_path,
        dtype=dtypes,
        parse_dates=['created_at', 'updated_at', 'closed_at'],
    )

In [None]:
def load_gk_issues_for_analysis():
    """Load ``gk_issues_for_analysis.csv`` with explicit dtypes.

    Returns:
        pandas.DataFrame: the issues table; ``created_at``, ``updated_at`` and
        ``closed_at`` are passed to ``read_csv`` as ``parse_dates``.
    """
    # Columns grouped by the dtype they are read with.
    int_cols = ['id', 'number', 'num_comments']
    category_cols = [
        'state', 'user_login', 'labels', 'dependency_name',
        'dependency_bundle_name', 'body_parser', 'update_type',
    ]
    bool_cols = ['is_locked']
    object_cols = [
        'url', 'title', 'created_at', 'updated_at', 'closed_at', 'events_url',
        'dependency_type', 'dependency_actual_version',
        'dependency_next_version', 'repo_url', 'repo_name', 'html_url', 'body',
    ]
    dtypes = {
        **dict.fromkeys(int_cols, 'int64'),
        **dict.fromkeys(category_cols, 'category'),
        **dict.fromkeys(bool_cols, 'bool'),
        **dict.fromkeys(object_cols, 'object'),
    }
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/gk_issues_for_analysis.csv'
    return pd.read_csv(
        csv_path,
        dtype=dtypes,
        parse_dates=['created_at', 'updated_at', 'closed_at'],
    )

In [17]:
def load_non_gkirbbi_comments():
    """Load ``non_gkirbbi_project_issue_comments.csv`` with explicit dtypes.

    Returns:
        pandas.DataFrame: the comments table; ``created_at`` and
        ``updated_at`` are passed to ``read_csv`` as ``parse_dates``.
    """
    # Columns grouped by the dtype they are read with.
    int_cols = ['id', 'issue_id', 'user_id']
    category_cols = ['repo_name', 'user_login', 'user_type']
    object_cols = ['url', 'issue_url', 'created_at', 'updated_at', 'body']
    dtypes = {
        **dict.fromkeys(int_cols, 'int64'),
        **dict.fromkeys(category_cols, 'category'),
        **dict.fromkeys(object_cols, 'object'),
    }
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/non_gkirbbi_project_issue_comments.csv'
    return pd.read_csv(
        csv_path,
        dtype=dtypes,
        parse_dates=['created_at', 'updated_at'],
    )

In [23]:
def load_bens_collected_commits():
    """Load ``bens_collected_issue_commits.csv`` with pandas' inferred dtypes."""
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/bens_collected_issue_commits.csv'
    return pd.read_csv(csv_path)

In [15]:
def load_ngkir_commits():
    """Load ``ngkir_bens_collected_issue_commits.csv`` without its ``issue_id`` column.

    The file is read with explicit dtypes and the ``issue_id`` column is
    dropped from the result.

    Returns:
        pandas.DataFrame: the commits table minus ``issue_id``.
    """
    # Columns grouped by the dtype they are read with.
    object_cols = [
        'commit_sha', 'repo_name', 'url', 'html_url', 'message',
        'author_login', 'author_type', 'committer_login', 'committer_type',
        'file_name', 'file_status', 'file_patch',
    ]
    int_cols = [
        'stats_total', 'stats_additions', 'stats_deletions',
        'file_additions', 'file_deletions', 'file_changes',
    ]
    dtypes = {
        **dict.fromkeys(object_cols, 'object'),
        **dict.fromkeys(int_cols, 'int64'),
        'issue_id': 'float64',
    }
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/ngkir_bens_collected_issue_commits.csv'
    raw_commits = pd.read_csv(csv_path, dtype=dtypes)
    return raw_commits.drop(columns=['issue_id'])

def load_commit_event_relationships():
    """Load ``non_gkirbbi_issue_commit_events.csv``.

    The ``commit_id`` column is renamed to ``commit_sha``.

    Returns:
        pandas.DataFrame: the event/commit relationship table.
    """
    column_types = dict(
        event_id='int64',
        issue_id='int64',
        repo_name='category',
        event_type='category',
        commit_id='object',
        commit_url='object',
    )
    csv_path = f'{PROJECT_ROOT}/{CSV_FOLDER}/non_gkirbbi_issue_commit_events.csv'
    raw_relationships = pd.read_csv(csv_path, dtype=column_types)
    return raw_relationships.rename(columns={'commit_id': 'commit_sha'})

# Helper Functions

In [None]:
###################
# Helper functions
###################
def get_issue(issues, id):
    """Return the rows of ``issues`` whose ``issue_id`` equals ``id``."""
    matches_id = issues['issue_id'] == id
    return issues.loc[matches_id]

def get_issue_url(issues, id):
    """Return the ``issue_url`` values (as a Series) for rows whose ``issue_id`` equals ``id``."""
    matches_id = issues['issue_id'] == id
    return issues.loc[matches_id, 'issue_url']

def get_comment(comments, id):
    """Return the rows of ``comments`` whose ``comment_id`` equals ``id``."""
    matches_id = comments['comment_id'] == id
    return comments.loc[matches_id]

def get_comments_for_issue(comments, id):
    """Return the rows of ``comments`` whose ``comment_issue_id`` equals ``id``."""
    belongs_to_issue = comments['comment_issue_id'] == id
    return comments.loc[belongs_to_issue]

def get_event(events, id):
    """Return the rows of ``events`` whose ``event_id`` equals ``id``."""
    matches_id = events['event_id'] == id
    return events.loc[matches_id]

def calculate_percent(numer, denom):
    """Return ``numer / denom`` expressed as a percentage, rounded to 2 decimals."""
    fraction = numer / denom
    return round(fraction * 100, 2)

def get_update_type(prev_ver, new_ver):
    """Classify the bump from ``prev_ver`` to ``new_ver`` as MAJOR, MINOR or PATCH.

    Versions are dot-separated strings whose first three components are
    compared numerically. A lower-level component is only considered when all
    higher-level components are equal, so a downgrade such as
    ``'2.1.0' -> '1.2.0'`` is no longer reported as MINOR.

    Components are parsed lazily: ``'1.0.0' -> '2.0.0-beta'`` is still MAJOR
    even though the patch part is not an integer.

    Args:
        prev_ver: previous version string (may be null).
        new_ver: new version string (may be null).

    Returns:
        MAJOR, MINOR or PATCH, or NA_RELEASE_TYPE when either version is
        null, cannot be parsed, or the new version is not greater.
    """
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        prev_split = prev_ver.split('.')
        new_split = new_ver.split('.')
        # Walk major -> minor -> patch; stop at the first component that differs.
        for position, label in enumerate((MAJOR, MINOR, PATCH)):
            prev_part = int(prev_split[position])
            new_part = int(new_split[position])
            if new_part > prev_part:
                return label
            if new_part < prev_part:
                # Downgrade at this level: not an update of any type.
                return NA_RELEASE_TYPE
        return NA_RELEASE_TYPE
    except Exception:
        # Deliberate best-effort: unparseable versions are "unknown".
        return NA_RELEASE_TYPE
    
def get_update_type_v2(prev_ver, new_ver):
    """Classify a version change by the first component that differs.

    The first three dot-separated components are compared as strings, so any
    difference counts, not only increases.

    Returns:
        MAJOR, MINOR or PATCH for the first differing component, or
        NA_RELEASE_TYPE when either version is null, has fewer than three
        components / cannot be split, or all three components are equal.
    """
    if pd.isnull(prev_ver) or pd.isnull(new_ver):
        return NA_RELEASE_TYPE
    try:
        # Unpacking fails (and is caught) when fewer than three parts exist.
        prev_major, prev_minor, prev_patch = prev_ver.split('.')[:3]
        new_major, new_minor, new_patch = new_ver.split('.')[:3]
    except Exception:
        return NA_RELEASE_TYPE
    component_pairs = (
        (MAJOR, prev_major, new_major),
        (MINOR, prev_minor, new_minor),
        (PATCH, prev_patch, new_patch),
    )
    for label, before, after in component_pairs:
        if after != before:
            return label
    return NA_RELEASE_TYPE
    
def get_issue_id_for_commit(events, issues, commit):
    """Resolve the issue id a commit belongs to via its event.

    Looks up the event with ``commit.commit_event_id`` in ``events``, then the
    issue with that event's ``event_issue_id`` in ``issues``.

    Returns:
        The matching ``issue_id`` as ``np.int64``, or ``np.nan`` when the
        event or issue is not found or any step raises.
    """
    try:
        matching_events = get_event(events, commit.commit_event_id)
        if matching_events.size == 0:
            return np.nan
        linked_issue_id = matching_events.event_issue_id.values[0].astype(np.int64)
        matching_issues = get_issue(issues, linked_issue_id)
        if matching_issues.size == 0:
            return np.nan
        return matching_issues.issue_id.values[0].astype(np.int64)
    except Exception:
        # Best-effort lookup: any failure is reported as "no issue".
        return np.nan

def strfdelta(tdelta, fmt):
    """Format a timedelta using ``{days}``, ``{hours}``, ``{minutes}`` and ``{seconds}`` fields.

    Args:
        tdelta: object exposing ``days`` and ``seconds`` attributes.
        fmt: ``str.format`` template that may reference the four fields.

    Returns:
        str: the formatted template.
    """
    hours, leftover = divmod(tdelta.seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return fmt.format(
        days=tdelta.days,
        hours=hours,
        minutes=minutes,
        seconds=seconds,
    )

In [12]:
from tabulate import tabulate

def tfns(df, cols, title='', should_round=True, verbose=True):
    """Short alias for ``table_five_number_summary``; arguments are forwarded unchanged."""
    table_five_number_summary(
        df,
        cols,
        title=title,
        should_round=should_round,
        verbose=verbose,
    )

def table_five_number_summary(df, cols, title='', should_round=True, verbose=True):
    """Print a table of summary statistics, one row per column in ``cols``.

    Args:
        df: DataFrame holding the columns.
        cols: iterable of column names to summarise.
        title: text used as the header of the first (name) column.
        should_round: forwarded to ``five_number_summary_for_table``.
        verbose: accepted for interface compatibility; not used here.
    """
    rows = [
        five_number_summary_for_table(col, df[col], should_round)
        for col in cols
    ]
    headers = [title, 'Count', 'Mean', 'STD', 'Min', '25%', 'Median', '75%', 'Max']
    print(tabulate(rows, headers=headers))
    print()

def five_number_summary_for_table(title, s, should_round=True, verbose=True):
    """Build one table row of descriptive statistics for the Series ``s``.

    ``s.describe()`` is computed once and read positionally with ``.iloc``:
    count, mean, std, min, 25%, 50%, 75%, max. The earlier version recomputed
    it for every field and used plain ``[]`` with integer positions, which is
    deprecated on a label-indexed Series.

    Args:
        title: label placed in the first cell of the row.
        s: pandas Series to summarise; its ``describe()`` must yield at
            least eight entries.
        should_round: when true, every statistic except the count is passed
            through ``my_round(value, 3)``.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        list: ``[title, count, mean, std, min, q1, median, q3, max]``.
    """
    stats = s.describe()
    count = stats.iloc[0]
    # Positions 1..7: mean, std, min, 25%, 50%, 75%, max.
    values = [stats.iloc[position] for position in range(1, 8)]
    if should_round:
        values = [my_round(value, 3) for value in values]
    return [title, count, *values]


def my_round(val, prec):
    """Round ``val`` for display.

    Args:
        val: a number, or a ``pandas.Timedelta``.
        prec: number of decimal places for numeric values. This used to be
            ignored in favour of a hard-coded 3; existing callers in this
            file pass 3, so their results are unchanged.

    Returns:
        A Timedelta rounded to whole seconds, or ``round(val, prec)``.
    """
    if isinstance(val, pd.Timedelta):
        # Decimal precision is not meaningful here; snap to whole seconds.
        return val.round('1s')
    return round(val, prec)

In [None]:
def calculate_percent(numer, denom):
    """Return ``numer`` as a percentage of ``denom``, rounded to 2 decimals.

    NOTE: a function with the same name and behaviour is also defined in the
    helper-functions cell; whichever cell runs last is the one in effect.
    """
    percent = (numer / denom) * 100
    return round(percent, 2)

# Plotting Functions

In [None]:
# Colormap built with seaborn's cubehelix_palette; as_cmap=True asks for a
# colormap object rather than a list of colours. With rot=0 and a very low
# hue the ramp is presumably close to greyscale, running from light (0.9)
# to dark (0) — verify visually.
GRAYSCALE_CMAP = sns.cubehelix_palette(
    50,
    hue=0.05,
    rot=0,
    light=0.9,
    dark=0,
    as_cmap=True
)

def create_hist_plot(params, ax, cbar_ax=None, color=GK_GREEN, bins=BINS_COUNT):
    """Draw a seaborn histogram with a KDE overlay on ``ax``.

    Args:
        params: dict of extra keyword arguments for ``sns.histplot``
            (e.g. ``data``/``x``); must not repeat the keywords set here.
        ax: matplotlib axes to draw on.
        cbar_ax: optional axes for a colour bar; a colour bar is requested
            only when this is not ``None``.
        color: bar colour.
        bins: bin specification passed to ``sns.histplot``.

    Returns:
        Whatever ``sns.histplot`` returns.
    """
    wants_cbar = cbar_ax is not None
    return sns.histplot(
        **params,
        ax=ax,
        kde=True,
        bins=bins,
        color=color,
        cbar=wants_cbar,
        cbar_ax=cbar_ax,
    )

def create_reg_line(params, ax):
    """Draw a regression line (no scatter points) on ``ax`` in GK_PURPLE.

    Args:
        params: dict of extra keyword arguments for ``sns.regplot``
            (e.g. ``data``/``x``/``y``); must not repeat the keywords set here.
        ax: matplotlib axes to draw on.

    Returns:
        Whatever ``sns.regplot`` returns.
    """
    line_style = {"color": GK_PURPLE}
    return sns.regplot(**params, scatter=False, line_kws=line_style, ax=ax)

def save_plot(plot, full_file_path, dpi=400):
    """Save ``plot`` to ``full_file_path`` via its ``savefig`` method.

    Args:
        plot: any object exposing ``savefig(path, dpi=...)``.
        full_file_path: destination path; converted to text before use.
        dpi: output resolution. This used to be ignored in favour of a
            hard-coded 400; the default keeps that value.
    """
    plot.savefig(f'{full_file_path}', dpi=dpi)
    
def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: file name without extension.
        tight_layout: call ``plt.tight_layout()`` before saving.
        fig_extension: file extension, also used as the output format.
        resolution: dpi passed to ``plt.savefig``.
    """
    file_name = '.'.join([fig_id, fig_extension])
    target = os.path.join(IMAGES_PATH, file_name)
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)
    
def show_values_on_bars(axs, h_v="v", h_offset=0, v_offset=0, val_formatter=int):
    """Annotate every bar patch on one or more axes with its value.

    Args:
        axs: a single axes object, or a numpy array of axes.
        h_v: ``'v'`` for vertical bars (label = bar height, placed at the top
            centre) or ``'h'`` for horizontal bars (label = bar width, placed
            at the right end). Any other value draws nothing.
        h_offset: horizontal shift of the label; only used when ``h_v == 'h'``.
        v_offset: vertical shift of the label; only used when ``h_v == 'h'``.
        val_formatter: callable applied to the width; only used when
            ``h_v == 'h'``.
    """
    def _annotate(ax):
        if h_v not in ('v', 'h'):
            return
        for bar in ax.patches:
            if h_v == 'v':
                x_pos = bar.get_x() + bar.get_width() / 2
                y_pos = bar.get_y() + bar.get_height()
                label = bar.get_height()
            else:
                x_pos = bar.get_x() + bar.get_width() + float(h_offset)
                y_pos = bar.get_y() + bar.get_height() / 2 + float(v_offset)
                label = val_formatter(bar.get_width())
            ax.text(x_pos, y_pos, label, ha='center')

    if not isinstance(axs, np.ndarray):
        _annotate(axs)
        return
    for single_ax in axs.flat:
        _annotate(single_ax)

<h3>FuncFormatters</h3>

In [6]:
def format_log_exp(value):
    """Format a base-10 exponent as ``10`` followed by Unicode superscript digits.

    The exponent is ``int(value)`` (truncated toward zero). A superscript
    minus is added when ``value`` is negative.

    Fixes over the previous version:
      * the superscript for 1 was U+2071 (superscript letter i) instead of
        U+00B9 (superscript one);
      * exponents with more than one digit raised ``IndexError``; each digit
        is now translated individually.

    Args:
        value: numeric exponent (e.g. a tick position on a log10 axis).

    Returns:
        str: e.g. ``'$10²$'`` for 2, ``'$10⁻³$'`` for -3.
    """
    superscript_digits = [
        '\u2070', '\u00B9', '\u00B2', '\u00B3', '\u2074',
        '\u2075', '\u2076', '\u2077', '\u2078', '\u2079',
    ]
    magnitude = abs(int(value))
    digits = ''.join(superscript_digits[int(ch)] for ch in str(magnitude))
    sign = '' if value >= 0 else '\u207B'
    return f'$10{sign}{digits}$'

# Matplotlib tick formatter that labels a tick value x via format_log_exp(x);
# the tick position argument is ignored.
log10_func_formatter = FuncFormatter(lambda x, pos: format_log_exp(x))

In [None]:
# Matplotlib tick formatter that renders a proportion as a whole-number
# percentage (0.29 -> "29%"); the tick position argument is ignored.
# The '.0%' format spec rounds to the nearest percent. The previous
# int(x*100) truncated, so values such as 0.29 (stored as 0.28999...) were
# labelled "28%".
proportion_func_formatter = FuncFormatter(lambda x, pos: f'{x:.0%}')

In [14]:
def filter_outliers(df=None, col=None, threshold=3):
    """Drop rows whose z-score magnitude is not below ``threshold``.

    Args:
        df: DataFrame to filter.
        col: when given, only this column is tested
            (``filter_outliers_series``); otherwise every column is tested
            (``filter_outliers_dataframe``).
        threshold: exclusive upper bound on ``|z|`` for a row to be kept.

    Returns:
        pandas.DataFrame: a filtered copy of ``df``.
    """
    if col is not None:
        return filter_outliers_series(df, col, threshold)
    return filter_outliers_dataframe(df, threshold)

def filter_outliers_dataframe(df, threshold=3):
    """Keep only rows where every column's ``|z-score|`` is below ``threshold``.

    Z-scores are computed with ``scipy.stats.zscore(df, nan_policy='omit')``.
    A NaN z-score compares false, so rows containing one are dropped too.

    Returns:
        pandas.DataFrame: a copy of the retained rows.
    """
    scores = zscore(df, nan_policy='omit')
    within_bounds = np.abs(scores) < threshold
    keep_row = within_bounds.all(axis=1)
    return df.loc[keep_row].copy()

def filter_outliers_series(df, col, threshold=3):
    """Keep only rows where ``|z-score|`` of column ``col`` is below ``threshold``.

    Z-scores are computed with ``scipy.stats.zscore(df[col], nan_policy='omit')``.
    A NaN z-score compares false, so such rows are dropped too.

    Returns:
        pandas.DataFrame: a copy of the retained rows (all columns).
    """
    scores = zscore(df[col], nan_policy='omit')
    keep_row = np.abs(scores) < threshold
    return df.loc[keep_row].copy()

In [28]:
def mannwhitneyu_cliffsdelta(x, y):
    """Print the Mann-Whitney U test result and Cliff's delta for two samples.

    Args:
        x: first sample.
        y: second sample.

    Prints the test statistic and p-value from ``scipy.stats.mannwhitneyu``
    followed by the delta and size label returned by ``cliffsDelta``
    (brought into scope by the ``%run _cliffs_delta.ipynb`` cell).
    Returns nothing.
    """
    # Use mann-whitney test because wilcoxon test requires same length
    (statistic, pvalue) = mannwhitneyu(x, y)
    (delta, size) = cliffsDelta(x, y)
    # Label typo fixed: "Cliff'a Delta" -> "Cliff's Delta".
    print(f'''\
\tMann-Whitney: statistic={statistic} pvalue={pvalue}
\tCliff's Delta: delta={delta} size={size}
    ''')