In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from datetime import datetime
from tqdm.notebook import tqdm_notebook
# Register tqdm's pandas integration; later cells call `progress_apply` on DataFrames.
tqdm_notebook.pandas()
# Raise pandas' display limits so wide frames and long cell values are shown in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Every CSV path in the cells below is built as f'{PROJECT_ROOT}/{CSV_FOLDER}/<file>.csv'.
PROJECT_ROOT = '..'
CSV_FOLDER = 'csv'

%run _utils.ipynb

In [None]:
# Load the raw input tables from PROJECT_ROOT/CSV_FOLDER.
# NOTE(review): `comments` is loaded but not referenced by any cell visible in this
# notebook - confirm whether it is still needed.
issues = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_issues.csv')
comments = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_comments.csv')
events = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_events.csv')
commits = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_commits.csv')
package_names = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_package_names.csv')
library_versions = pd.read_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/breaking_library_versions.csv')

<h3>Commits</h3>
<p>Add <b>commit_issue_id</b> to commits</p>

In [None]:
def local_get_issue_id_for_commit(commit):
    """Row-wise adapter: return the issue id for one row of ``commits``.

    Delegates to ``get_issue_id_for_commit``, passing the notebook-level ``events``
    and ``issues`` frames along with the row. That helper is not defined in this
    notebook's visible cells (presumably it comes from ``%run _utils.ipynb`` -
    TODO confirm).
    """
    return get_issue_id_for_commit(events, issues, commit)

# Compute the id for every commit row (with a progress bar), then save the augmented
# frame under a new file name so the raw CSV is left untouched.
commits['commit_issue_id'] = commits.progress_apply(local_get_issue_id_for_commit, axis=1)
commits.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_commits.csv', index=False)

<h3>Issues</h3>
<p>Add <b>update_type</b> to issues</p>

In [None]:
def local_get_update_type(row):
    """Classify the dependency bump described by one issue row.

    Hands the row's current and next dependency versions to ``get_update_type_v2``
    and returns that helper's result unchanged.
    """
    return get_update_type_v2(
        row['issue_dependency_actual_version'],
        row['issue_dependency_next_version'],
    )

# Label every issue row, then save the augmented table under a new file name.
issues['update_type'] = issues.progress_apply(local_get_update_type, axis=1)
issues.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_issues.csv', index=False)

In [None]:
# Reduce each distinct repository API URL to the text that follows the API prefix,
# and store the result as a one-column frame named `repo`.
repo_api_prefix = 'https://api.github.com/repos/'
unique_repo_urls = pd.Series(issues['repo_url'].unique())
raw_repo_names = (
    unique_repo_urls
    .apply(lambda url: url.split(repo_api_prefix)[1])
    .to_frame(name='repo')
)
raw_repo_names.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/raw_repo_names.csv', index=False)

<h3>Package dependencies</h3>
<p>Parse the dependency and dev-dependency strings to JSON, then flatten the resulting DTOs into rows for the dataframe</p>

In [None]:
import ast
import json

def str_to_dict(s):
    """Parse a stringified dependency mapping into a dict.

    Parameters
    ----------
    s : str or missing value
        Text of a mapping written with single or double quotes, e.g.
        ``"{'lodash': '^4.17.0'}"``. Anything ``pd.isna`` treats as missing
        (``None``, ``NaN``) is accepted.

    Returns
    -------
    dict
        The parsed mapping, or an empty dict when ``s`` is missing.

    Raises
    ------
    json.JSONDecodeError
        If ``s`` can be read neither as JSON (after swapping single quotes for
        double quotes) nor as a Python literal.
    """
    if pd.isna(s):
        return dict()
    try:
        # Fast path: single-quoted text becomes valid JSON once the quotes are swapped.
        return json.loads(s.replace("'", '"'))
    except json.JSONDecodeError as json_error:
        # The blanket quote swap corrupts values that contain an apostrophe
        # (e.g. {'note': "it's"}); retry on the untouched text as a Python literal.
        try:
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            # Keep reporting the same exception type as before the fallback existed.
            raise json_error from None

# Parse the dependency and dev-dependency strings of every package into dicts,
# keeping one DTO per row of `package_names`.
deps_dtos = [
    {
        'package_name': package_row['package_name'],
        'deps': str_to_dict(package_row['package_dependencies']),
        'dev_deps': str_to_dict(package_row['package_dev_dependencies']),
    }
    for _, package_row in tqdm_notebook(package_names.iterrows())
]

# Flatten the DTOs into four parallel columns: one entry per (package, dependency) pair.
DEP = 'Dependency'
DEV_DEP = 'Dev Dependency'
packages, deps_names, deps_versions, deps_types = [], [], [], []
for dto in tqdm_notebook(deps_dtos):
    owner = dto['package_name']
    # For each package: regular dependencies first, then dev dependencies.
    for dto_key, type_label in (('deps', DEP), ('dev_deps', DEV_DEP)):
        for dependency, version_spec in dto[dto_key].items():
            packages.append(owner)
            deps_names.append(dependency)
            deps_versions.append(version_spec)
            deps_types.append(type_label)

dependencies_df = pd.DataFrame({
    'package': packages,
    'deps_name': deps_names,
    'deps_version': deps_versions,
    'deps_type': deps_types,
})
dependencies_df.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/package_dependencies.csv', index=False)

<h3>Provider-to-clients</h3>
<p><i>Invert</i> the dependencies df to list, for each provider, the clients that depend on it</p>

In [None]:
# Build one (provider, client) row per dependency edge, ordered by provider name.
#
# Group by the bare column name rather than a one-element list: with a list key,
# pandas >= 2.0 yields every group key as a 1-tuple, which would store tuples such
# as ('lodash',) in the `provider` column instead of the plain name.
deps_groups = dependencies_df.groupby(by='deps_name')
provider_packages_list = list()
clients_list = list()
for name, group in tqdm_notebook(deps_groups):
    # Repeat the provider once per client so both lists stay the same length.
    provider_packages_list.extend([name] * len(group))
    # The clients are simply the `package` column of the group; no row-wise apply needed.
    clients_list.extend(group['package'].tolist())

provider_to_clients = pd.DataFrame({
    'provider': provider_packages_list,
    'client': clients_list,
})

provider_to_clients.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/provider_to_clients.csv', index=False)

# library_versions

In [None]:
# Parse the publication timestamps. The dtype carries an explicit unit because
# pandas >= 2.0 rejects the unit-less "datetime64" spelling in `astype`.
library_versions["version_published_at"] = library_versions["version_published_at"].astype("datetime64[ns]")
library_versions = library_versions.dropna(subset=['package_name'])
# Order the releases chronologically within each package, then renumber the rows
# 0..n-1. Later steps in this cell assign per-row lists positionally and do label
# arithmetic (`idx + 1`) on the index, both of which need a contiguous index that
# follows the sorted order.
library_versions = (
    library_versions
    .sort_values(['package_name', 'version_published_at'])
    .reset_index(drop=True)
)

# One release-type label per row of `library_versions`, appended in iteration order
# by `calculate_release_data_for_library`.
all_release_types = list()

# Column-wise accumulator for the per-library summary frame.
result = {
    'package_name': list(),
    'total_time_diff': list(),
    'avg_time_between_releases': list(),
    'first_release_date': list(),
    'last_release_date': list(),
    'total_releases': list(),
}

grouped_package_releases = library_versions.groupby(by='package_name')

# Loop over every release record of one library.
# Keep adding the time between consecutive releases to total_time_diff.
# After the loop, calculate the average time between releases.
# Also keep track of each release type (MAJOR, MINOR, PATCH) and
# save it in all_release_types, to be added as a column to the library_versions df.
def calculate_release_data_for_library(library, versions_df):
    """Summarise the release history of one library.

    Parameters
    ----------
    library
        The library's name; stored in ``result['package_name']``.
    versions_df : pandas.DataFrame
        That library's release rows, with ``version`` and ``version_published_at``
        columns. Rows are walked in frame order, so they should already be sorted
        by publication date. The frame is expected to contain at least one row,
        because the average divides by the row count.

    Side effects
    ------------
    Appends one entry per row to the notebook-level ``all_release_types`` list
    (``NA_RELEASE_TYPE`` for the first row, ``get_update_type_v2(prev, curr)`` for
    each later row) and one entry to every list in the notebook-level ``result`` dict.
    """
    prev_release_date = None
    curr_release_date = None
    prev_release_number = None
    curr_release_number = None
    first_release_date = None
    releases_count = 0
    total_time_diff = pd.Timedelta(seconds=0)
    # Iterate over the frame that was passed in, not over whatever global the
    # caller's loop variable happens to be called.
    for _, row in versions_df.iterrows():
        prev_release_date = curr_release_date
        prev_release_number = curr_release_number
        curr_release_date = row['version_published_at']
        curr_release_number = row['version']
        releases_count += 1
        if prev_release_date is None:
            # First release: there is no predecessor to diff against.
            first_release_date = curr_release_date
            all_release_types.append(NA_RELEASE_TYPE)
            continue
        total_time_diff += curr_release_date - prev_release_date
        all_release_types.append(get_update_type_v2(prev_release_number, curr_release_number))
    # NOTE(review): this divides by the number of releases, not by the number of
    # gaps between them (releases - 1) - confirm which average is intended.
    avg = total_time_diff / releases_count
    result['package_name'].append(library)
    result['total_time_diff'].append(total_time_diff)
    result['avg_time_between_releases'].append(avg)
    result['first_release_date'].append(first_release_date)
    result['last_release_date'].append(curr_release_date)
    result['total_releases'].append(releases_count)


# Run the per-library aggregation for every package.
# NOTE: these loop variables become notebook globals; do not rename them without
# checking that no function in this cell reads `package_name` or `group` directly.
for package_name, group in tqdm_notebook(grouped_package_releases):
    calculate_release_data_for_library(package_name, group)

# One label was appended per visited row, so the list is assigned positionally.
library_versions['version_release_type'] = all_release_types

library_releases = pd.DataFrame(result)


def _count_broken_builds(release_row):
    """Return how many rows of ``issues`` name this row's package as the dependency."""
    names_this_package = issues['issue_dependency_name'] == release_row['package_name']
    return len(issues[names_this_package])


library_releases['broken_builds_caused'] = library_releases.progress_apply(_count_broken_builds, axis=1)

def get_count_of_issues(library_version, df):
    """Count the rows of ``df`` whose ``issue_dependency_next_version`` equals ``library_version``."""
    is_this_version = df['issue_dependency_next_version'] == library_version
    return int(is_this_version.sum())

# For every library version, count the issues that name it as the dependency's
# next version.
#
# The counts are collected group by group and then assigned positionally, so this
# relies on `library_versions` being ordered by package_name (it is sorted earlier
# in this cell); groupby visits the groups in that same sorted key order.
#
# Group by the bare column name: with a one-element list, pandas >= 2.0 yields each
# key as a 1-tuple, the comparison against `issue_dependency_name` then matches
# nothing, and every count silently becomes 0.
broken_clients_count = list()
grouped = library_versions.groupby(by='package_name')
for package, df in tqdm_notebook(grouped):
    # Narrow the issues to this package once, instead of once per version.
    issues_for_package = issues[issues['issue_dependency_name'] == package]
    broken_clients_count.extend(
        get_count_of_issues(version, issues_for_package) for version in df['version']
    )

library_versions['broken_clients_count'] = broken_clients_count


# Time from each release to the next release of the same package.
#
# `shift(-1)` within each package pairs every row with the row that follows it in
# that package, aligned on the index, so no assumption is made about the index being
# contiguous or starting at 0. Rows are already in chronological order per package
# (sorted earlier in this cell). The last release of each package has no successor
# and gets a missing value (NaT).
published_at = library_versions['version_published_at']
next_published_at = library_versions.groupby('package_name')['version_published_at'].shift(-1)
library_versions['time_until_next_release'] = next_published_at - published_at


library_versions.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_breaking_library_versions.csv', index=False)

<h3>library_releases</h3>
<p>Calculate <b>avg_time_between_releases_seconds</b></p>

In [None]:
# Express the average gap as whole seconds: full days converted to seconds, plus
# the seconds component of the remainder (sub-second precision is not included).
seconds_per_day = 24 * 60 * 60
avg_gap = library_releases['avg_time_between_releases'].dt
library_releases['avg_time_between_releases_seconds'] = avg_gap.days * seconds_per_day + avg_gap.seconds
library_releases.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/breaking_library_releases.csv', index=False)

# packages_release_and_breaks

In [None]:
# Group by the bare column name rather than a one-element list: with a list key,
# pandas >= 2.0 yields every group key as a 1-tuple, which would break the
# per-package issue filter below and store tuples in the `package` column.
grouped = library_versions.groupby(by='package_name')

# Column-wise accumulator: one entry per package in every list.
result = {
    'package': list(),
    'total_releases': list(),
    'total_breaks': list(),
    'major_releases': list(),
    'major_breaks': list(),
    'minor_releases': list(),
    'minor_breaks': list(),
    'patch_releases': list(),
    'patch_breaks': list()
}

def release_broke_a_client(package, version, df):
    """Return True when at least one row of ``df`` has ``version`` as its next version.

    ``package`` is accepted for call-site symmetry but not used: ``df`` is expected
    to be filtered to the package already.
    """
    names_this_version = df['issue_dependency_next_version'] == version
    return bool(names_this_version.any())

# For every package, count its PATCH / MINOR / MAJOR releases and how many of each
# broke at least one client. Releases with any other type label are ignored.
for package, df in tqdm_notebook(grouped):
    # A groupby keyed on a one-element list yields 1-tuples on pandas >= 2.0;
    # normalise so the filter and the stored name work with either key form.
    if isinstance(package, tuple):
        package = package[0]
    patch_count = 0
    patch_breaks_count = 0
    minor_count = 0
    minor_breaks_count = 0
    major_count = 0
    major_breaks_count = 0
    # Uses `issues`, the frame loaded at the top of the notebook, as the earlier
    # per-version count does. Only the `issue_dependency_name` and
    # `issue_dependency_next_version` columns are needed.
    # NOTE(review): this previously read `issues_with_update_type_count`, which no
    # visible cell defines - confirm no filtered variant of `issues` was intended.
    issues_for_package = issues[issues['issue_dependency_name'] == package]
    for _, row in df.iterrows():
        release_type = row['version_release_type']
        if release_type == PATCH:
            patch_count += 1
            if release_broke_a_client(package, row['version'], issues_for_package):
                patch_breaks_count += 1
        elif release_type == MINOR:
            minor_count += 1
            if release_broke_a_client(package, row['version'], issues_for_package):
                minor_breaks_count += 1
        elif release_type == MAJOR:
            major_count += 1
            if release_broke_a_client(package, row['version'], issues_for_package):
                major_breaks_count += 1
    result['package'].append(package)
    result['total_releases'].append(major_count + minor_count + patch_count)
    result['total_breaks'].append(major_breaks_count + minor_breaks_count + patch_breaks_count)
    result['major_releases'].append(major_count)
    result['major_breaks'].append(major_breaks_count)
    result['minor_releases'].append(minor_count)
    result['minor_breaks'].append(minor_breaks_count)
    result['patch_releases'].append(patch_count)
    result['patch_breaks'].append(patch_breaks_count)
        
packages_release_and_breaks_df = pd.DataFrame(result)

# Break ratio = breaking releases / all releases, overall and then per release type.
# The columns are added in the order total, major, minor, patch.
for release_kind in ('total', 'major', 'minor', 'patch'):
    packages_release_and_breaks_df[f'{release_kind}_ratio'] = (
        packages_release_and_breaks_df[f'{release_kind}_breaks']
        / packages_release_and_breaks_df[f'{release_kind}_releases']
    )

packages_release_and_breaks_df.to_csv(f'{PROJECT_ROOT}/{CSV_FOLDER}/releases_and_breaks_counts_by_package.csv', index=False)