In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from datetime import datetime
from tqdm.notebook import tqdm_notebook
# Register tqdm's progress-bar integration with pandas (adds the `progress_*` methods).
tqdm_notebook.pandas()
# Show wide frames in full: up to 500 columns and untruncated cell contents.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Location of the input data; every CSV below is read from f'{PROJECT_ROOT}/{CSV_FOLDER}/...'.
PROJECT_ROOT = '..'
CSV_FOLDER = 'csv'

# Execute the shared helper notebook in this kernel's namespace.
%run _utils.ipynb

# Disabled loader calls, presumably defined by _utils.ipynb (TODO confirm).
# The cells below read each CSV explicitly instead.
# issues = load_issues()
# comments = load_comments()
# events = load_events()
# commits = load_commits()
# package_names = load_package_names()
# library_versions = load_library_versions()
# library_releases = load_library_releases()



<h3>Issues</h3>

In [21]:
# Load the Greenkeeper issue reports.
# Timestamp columns are declared as 'str' here and converted by `parse_dates`.
dtypes = {
    'issue_id': 'int64',
    'issue_number': 'int64',
    'issue_url': 'str',
    'issue_title': 'str',
    'issue_state': 'category',
    'issue_is_locked': 'bool',
    'issue_created_at': 'str',
    'issue_updated_at': 'str',
    'issue_closed_at': 'str',
    'issue_user_login': 'str',
    'issue_labels': 'str',
    'issue_num_comments': 'int64',
    'issue_events_url': 'str',
    'issue_dependency_name': 'category',
    'issue_dependency_type': 'category',
    'issue_dependency_actual_version': 'str',
    'issue_dependency_next_version': 'str',
    'issue_dependency_bundle_name': 'category',
    'issue_body_parser': 'category',
    'issue_repo_url': 'str',
    'update_type': 'category',
}
parse_dates = ['issue_created_at', 'issue_updated_at', 'issue_closed_at']

# Column renames: the `issue_` prefix is dropped; `update_type` keeps its name.
# NOTE(review): 'numer' looks like a typo for 'number'. It is kept as-is because
# other cells may already reference `issues['numer']` — confirm before renaming.
renaming = {
    'issue_id': 'id',
    'issue_number': 'numer',
    'issue_url': 'url',
    'issue_title': 'title',
    'issue_state': 'state',
    'issue_is_locked': 'is_locked',
    'issue_created_at': 'created_at',
    'issue_updated_at': 'updated_at',
    'issue_closed_at': 'closed_at',
    'issue_user_login': 'user_login',
    'issue_labels': 'labels',
    'issue_num_comments': 'num_comments',
    'issue_events_url': 'events_url',
    'issue_dependency_name': 'dependency_name',
    'issue_dependency_type': 'dependency_type',
    'issue_dependency_actual_version': 'dependency_actual_version',
    'issue_dependency_next_version': 'dependency_next_version',
    'issue_dependency_bundle_name': 'dependency_bundle_name',
    'issue_body_parser': 'body_parser',
    'issue_repo_url': 'repo_url',
    'update_type': 'update_type',
}

# Read with the raw column names first, then apply the short names.
issues = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_issues.csv',
    dtype=dtypes,
    parse_dates=parse_dates,
)
issues = issues.rename(columns=renaming)


<h3>Comments</h3>

In [26]:
# Load the comments attached to the issue reports.
# Timestamp columns are declared as 'str' here and converted by `parse_dates`.
dtypes = {
    'comment_issue_url': 'str',
    'comment_issue_id': 'int64',
    'comment_id': 'int64',
    'comment_url': 'str',
    'comment_created_at': 'str',
    'comment_updated_at': 'str',
    'comment_body': 'object',
    'comment_author_association': 'str',
    'comment_user_id': 'int64',
    'comment_user_login': 'str',
    'comment_user_type': 'category',
}
parse_dates = ['comment_created_at', 'comment_updated_at']

# Every column is renamed by dropping its leading `comment_` prefix
# (e.g. 'comment_issue_url' -> 'issue_url', 'comment_id' -> 'id').
renaming = {column: column[len('comment_'):] for column in dtypes}

# Read with the raw column names first, then apply the short names.
comments = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_comments.csv',
    dtype=dtypes,
    parse_dates=parse_dates,
)
comments = comments.rename(columns=renaming)

<h3>Events</h3>

In [45]:
# Load the events recorded on the issue reports.
# `event_created_at` is declared as 'str' here and converted by `parse_dates`.
dtypes = {
    'event_issue_url': 'str',
    'event_issue_id': 'int64',
    'event_id': 'int64',
    'event_url': 'str',
    'event_created_at': 'str',
    'event_description': 'category',
    'event_actor_id': 'float64',
    'event_actor_login': 'category',
    'event_commit_id': 'str',
    'event_commit_url': 'str',
    'event_label': 'category',
}
parse_dates = ['event_created_at']

# Every column is renamed by dropping its leading `event_` prefix
# (e.g. 'event_issue_id' -> 'issue_id', 'event_label' -> 'label').
renaming = {column: column[len('event_'):] for column in dtypes}

# Read with the raw column names first, then apply the short names.
events = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_events.csv',
    dtype=dtypes,
    parse_dates=parse_dates,
)
events = events.rename(columns=renaming)


<h3>Commits</h3>

In [12]:
# Load the commits referenced from the issue reports.
# No date parsing is requested for this file.
dtypes = {
    'commit_event_url': 'str',
    'commit_event_id': 'int64',
    'commit_message': 'str',
    'commit_git_committer_email': 'str',
    'commit_git_committer_name': 'str',
    'commit_git_author_email': 'str',
    'commit_git_author_name': 'str',
    'commit_github_committer_login': 'str',
    'commit_github_committer_id': 'float64',
    'commit_github_committer_type': 'str',
    'commit_github_author_login': 'str',
    'commit_github_author_id': 'float64',
    'commit_github_author_type': 'str',
    'commit_stats_deletions': 'int64',
    'commit_stats_additions': 'int64',
    'commit_stats_total': 'int64',
    'commit_tree_sha': 'str',
    'commit_sha': 'str',
    'commit_num_parents': 'int64',
    'commit_num_comments': 'int64',
    'commit_file_name': 'str',
    'commit_file_additions': 'int64',
    'commit_file_deletions': 'int64',
    'commit_file_changes': 'int64',
    'commit_file_sha': 'str',
    'commit_file_status': 'str',
    'commit_issue_id': 'float64',
}

# Short column names. These are not a plain prefix strip: the git/github
# qualifiers are dropped and some totals are reworded
# ('commit_stats_total' -> 'total_modifications',
#  'commit_file_changes' -> 'file_modifications').
renaming = {
    'commit_event_url': 'event_url',
    'commit_event_id': 'event_id',
    'commit_message': 'message',
    'commit_git_committer_email': 'committer_email',
    'commit_git_committer_name': 'committer_name',
    'commit_git_author_email': 'author_email',
    'commit_git_author_name': 'author_name',
    'commit_github_committer_login': 'committer_login',
    'commit_github_committer_id': 'committer_id',
    'commit_github_committer_type': 'committer_type',
    'commit_github_author_login': 'author_login',
    'commit_github_author_id': 'author_id',
    'commit_github_author_type': 'author_type',
    'commit_stats_deletions': 'deletions',
    'commit_stats_additions': 'additions',
    'commit_stats_total': 'total_modifications',
    'commit_tree_sha': 'tree_sha',
    'commit_sha': 'sha',
    'commit_num_parents': 'num_parents',
    'commit_num_comments': 'num_comments',
    'commit_file_name': 'file_name',
    'commit_file_additions': 'file_additions',
    'commit_file_deletions': 'file_deletions',
    'commit_file_changes': 'file_modifications',
    'commit_file_sha': 'file_sha',
    'commit_file_status': 'file_status',
    'commit_issue_id': 'issue_id',
}

# Read with the raw column names first, then apply the short names.
commits = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_greenkeeper_commits.csv',
    dtype=dtypes,
)
commits = commits.rename(columns=renaming)


<h3>Package Names</h3>

In [54]:
# Load the package metadata table.
dtypes = {
    'package_name': 'category', 'package_gh_url_api': 'str', 'package_gh_url': 'str',
    'package_author': 'category', 'package_description': 'str',
    'package_repo_url': 'str', 'package_repo_type': 'category',
    'package_version': 'str', 'package_dependencies': 'str',
    'package_dev_dependencies': 'str', 'package_peer_dependencies': 'str',
}

# Short column names: the `package_` prefix is dropped, and the `gh_` part is
# also dropped for the two GitHub URL columns.
renaming = {
    'package_name': 'name', 'package_gh_url_api': 'url_api', 'package_gh_url': 'url',
    'package_author': 'author', 'package_description': 'description',
    'package_repo_url': 'repo_url', 'package_repo_type': 'repo_type',
    'package_version': 'version', 'package_dependencies': 'dependencies',
    'package_dev_dependencies': 'dev_dependencies',
    'package_peer_dependencies': 'peer_dependencies',
}

# Read with the raw column names first, then apply the short names.
package_names = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/greenkeeper_package_names.csv',
    dtype=dtypes,
)
package_names = package_names.rename(columns=renaming)


<h3>Library Versions</h3>

In [60]:
# Load the per-version release records; columns keep their CSV names.
dtypes = {
    'package_name': 'category', 'version': 'str',
    'version_published_at': 'str', 'version_release_type': 'category',
    'broken_clients_count': 'int64', 'time_until_next_release': 'str',
}
parse_dates = ['version_published_at']

# `time_until_next_release` is read as text and converted to a timedelta
# as the last step of the chain.
library_versions = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/aug_breaking_library_versions.csv',
    dtype=dtypes,
    parse_dates=parse_dates,
).assign(
    time_until_next_release=lambda frame: pd.to_timedelta(frame["time_until_next_release"])
)

<h3>Library Releases</h3>

In [68]:
# Load the per-library release-frequency summary; columns keep their CSV names.
dtypes = {
    'package_name': 'object', 'total_time_diff': 'object',
    'avg_time_between_releases': 'str',
    'first_release_date': 'str', 'last_release_date': 'str',
    'total_releases': 'int64', 'broken_builds_caused': 'int64',
    'avg_time_between_releases_seconds': 'int64',
}
parse_dates = ['first_release_date', 'last_release_date']

# `avg_time_between_releases` is read as text and converted to a timedelta
# as the last step of the chain; `total_time_diff` is left as read.
library_releases = pd.read_csv(
    f'{PROJECT_ROOT}/{CSV_FOLDER}/breaking_library_releases.csv',
    dtype=dtypes,
    parse_dates=parse_dates,
).assign(
    avg_time_between_releases=lambda frame: pd.to_timedelta(frame["avg_time_between_releases"])
)

<h3>General Explore</h3>

In [2]:
# Print the size of every dataset loaded above.
#
# The issues loading cell renames `issue_repo_url` to `repo_url`, so looking up
# `issue_repo_url` directly raises KeyError after a Restart & Run All. Resolve the
# column name against the frame so the cell also works if `issues` was produced
# by a loader that keeps the raw CSV column names (TODO confirm what
# load_issues() returns).
repo_url_column = 'repo_url' if 'repo_url' in issues.columns else 'issue_repo_url'
unique_package_count = len(issues[repo_url_column].unique())
print(f"{len(issues)} Greenkeeper Breaking Build issue reports (across {unique_package_count} unique projects)")
print(f"{len(comments)} comments for those issue reports")
print(f"{len(events)} events on those issue reports")
# Disabled: list the distinct event kinds (the events loading cell renames
# `event_description` to `description`).
# print(f"\tIncludes {events['description'].unique()}")
print(f"{len(commits)} commits that were referenced in these breaking issue reports")
print(f"Total of {len(library_versions)} version release records for libraries that have at least 1 breaking release")
print(f"Release frequency information on {len(library_releases)} libraries")

123197 Greenkeeper Breaking Build issue reports (across 12134 unique projects)
365625 comments for those issue reports
209750 events on those issue reports
17623 commits that were referenced in these breaking issue reports
Total of 556742 version release records for libraries that have at least 1 breaking release
Release frequency information on 7361 libraries
