In [93]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from datetime import datetime
# Do not truncate long cell contents (e.g. descriptions, URLs) when pandas renders a frame.
pd.set_option('display.max_colwidth', None)

import sys
# Install tabulate with the same interpreter that runs this kernel;
# DataFrame.to_markdown (used by print_markdown_table) requires it.
!{sys.executable} -m pip install tabulate



In [94]:
def print_markdown_table(columns, dtypes, descriptions):
    """Print a markdown data-dictionary table with one row per column.

    Parameters
    ----------
    columns : sequence of str
        Column names, in the order they should be listed.
    dtypes, descriptions : iterable or dict
        Either an iterable that is positionally aligned with ``columns``
        (for example ``some_dict.values()``), or a dict keyed by column
        name. A dict is looked up by name, so its ordering does not have
        to match ``columns``.

    Returns
    -------
    None
        The table is written to stdout.

    Raises
    ------
    KeyError
        If a dict is supplied and a name in ``columns`` is missing from it.
    ValueError
        Raised by pandas when a positional iterable does not contain
        exactly one entry per column.

    Notes
    -----
    Rendering uses ``DataFrame.to_markdown``, which needs ``tabulate``.
    """
    names = list(columns)

    def _aligned(values):
        # Dicts are matched by column name; anything else is taken positionally.
        if isinstance(values, dict):
            return [values[name] for name in names]
        return list(values)

    table = pd.DataFrame({
        'col_index': list(range(len(names))),
        'attribute': names,
        'type': _aligned(dtypes),
        'description': _aligned(descriptions),
    })
    print(table.to_markdown(index=False))

## GKIRs

In [95]:
# One (column, pandas dtype, description) triple per GKIR CSV column;
# the two lookup dicts below are derived from this single table.
_gkir_schema = [
    ('id', 'int64', 'Issue ID'),
    ('number', 'int64', 'Issue number in project.'),
    ('url', 'str', 'GitHub API url of the issue.'),
    ('title', 'str', 'Issue title.'),
    ('state', 'category', 'Issue state (open, closed, etc.).'),
    ('is_locked', 'bool', 'Whether the issue has been locked.'),
    ('created_at', 'str', 'When the issue was created.'),
    ('updated_at', 'str', 'When the issue was last updated.'),
    ('closed_at', 'str', 'When the issue was closed (if applicable).'),
    ('user_login', 'category', 'Username of the entity that created the issue.'),
    ('labels', 'str', 'Any labels the issue has.'),
    ('num_comments', 'int64', 'The number of comments on the issue.'),
    ('events_url', 'str', 'Url for related events.'),
    ('dependency_name', 'category', 'The name of the dependency that caused Greenkeeper to open the issue.'),
    ('dependency_type', 'category', 'The type of the dependency that caused Greenkeeper to open the issue.'),
    ('dependency_actual_version', 'str', 'The version of the dependency used by the client project.'),
    ('dependency_next_version', 'str', 'The version of the dependency that was just released and caused Greenkeeper to open the issue.'),
    ('dependency_bundle_name', 'category', 'The bundle type name of the issue (if applicable).'),
    ('body_parser', 'category', 'The parser type that was used to parse the issue (if applicable).'),
    ('repo_url', 'str', 'The url of the client repo.'),
    ('update_type', 'category', 'The update type (major, minor, or patch) between dependency_actual_version and dependency_next_version'),
    ('repo_name', 'category', 'The full name of the parent repo.'),
    ('html_url', 'str', 'GitHub url of the issue.'),
    ('body', 'str', 'The body of the issue.'),
]
gkir_dtypes = {column: dtype for column, dtype, _ in _gkir_schema}
gkir_descriptions = {column: text for column, _, text in _gkir_schema}
# The timestamp columns are declared as text in gkir_dtypes and are also
# handed to parse_dates so that pandas parses them as dates on load.
parse_dates = ['created_at', 'updated_at', 'closed_at']
gkirs = pd.read_csv(
    '../greenkeeper_data/gkirs.csv',
    dtype=gkir_dtypes,
    parse_dates=parse_dates,
)

In [96]:
# Data dictionary for the GKIR frame: the loaded column names are paired
# positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=gkirs.columns.to_numpy(),
    dtypes=gkir_dtypes.values(),
    descriptions=gkir_descriptions.values(),
)

|   col_index | attribute                 | type     | description                                                                                            |
|------------:|:--------------------------|:---------|:-------------------------------------------------------------------------------------------------------|
|           0 | id                        | int64    | Issue ID                                                                                               |
|           1 | number                    | int64    | Issue number in project.                                                                               |
|           2 | url                       | str      | GitHub API url of the issue.                                                                           |
|           3 | title                     | str      | Issue title.                                                                                           |
|           4 | state                   

## GKIR Comments

In [97]:
# One (column, pandas dtype, description) triple per GKIR-comment CSV column;
# the two lookup dicts below are derived from this single table.
_gkir_comments_schema = [
    ('comment_issue_url', 'str', 'The GitHub API url of the issue the comment was created on.'),
    ('comment_issue_id', 'int64', 'The ID of the issue the comment was created on.'),
    ('comment_id', 'int64', 'The comment ID.'),
    ('comment_url', 'str', 'The GitHub API url of the comment.'),
    ('comment_created_at', 'str', 'When the comment was created.'),
    ('comment_updated_at', 'str', 'When the comment was last updated.'),
    ('comment_body', 'str', 'The body of the comment.'),
    ('comment_author_association', 'category', 'The comment authors association to the parent repo.'),
    ('comment_user_id', 'int64', 'The ID of the entity who created the comment.'),
    ('comment_user_login', 'category', 'The username of the entity who created the comment.'),
    ('comment_user_type', 'category', 'The type of the entity who created the comment.'),
]
gkir_comments_dtypes = {column: dtype for column, dtype, _ in _gkir_comments_schema}
gkir_comments_descriptions = {column: text for column, _, text in _gkir_comments_schema}
# Both comment timestamps are declared as text in gkir_comments_dtypes and
# are also handed to parse_dates so that pandas parses them as dates on load.
parse_dates = ['comment_created_at', 'comment_updated_at']
gkir_comments = pd.read_csv(
    '../greenkeeper_data/gkir_comments.csv',
    dtype=gkir_comments_dtypes,
    parse_dates=parse_dates,
)

In [98]:
# Data dictionary for the GKIR comments frame: the loaded column names are
# paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=gkir_comments.columns.to_numpy(),
    dtypes=gkir_comments_dtypes.values(),
    descriptions=gkir_comments_descriptions.values(),
)

|   col_index | attribute                  | type     | description                                                 |
|------------:|:---------------------------|:---------|:------------------------------------------------------------|
|           0 | comment_issue_url          | str      | The GitHub API url of the issue the comment was created on. |
|           1 | comment_issue_id           | int64    | The ID of the issue the comment was created on.             |
|           2 | comment_id                 | int64    | The comment ID.                                             |
|           3 | comment_url                | str      | The GitHub API url of the comment.                          |
|           4 | comment_created_at         | str      | When the comment was created.                               |
|           5 | comment_updated_at         | str      | When the comment was last updated.                          |
|           6 | comment_body               | str      | 

## GKIR Commits

In [99]:
# One (column, pandas dtype, description) triple per GKIR-commit CSV column;
# the two lookup dicts below are derived from this single table.
_gkir_commits_schema = [
    ('commit_sha', 'str', 'The commit sha - note that this is not unique in the dataset, as a commit can have multiple file changes.'),
    ('issue_id', 'int64', 'The ID of the issue that references the commit.'),
    ('repo_name', 'category', 'The full name of the repo the commit belongs to.'),
    ('url', 'str', 'The GitHub API url of the commit.'),
    ('html_url', 'str', 'The GitHub url of the commit.'),
    ('message', 'str', 'The commit message'),
    ('author_login', 'category', 'The username of the author of the commit.'),
    ('author_type', 'category', 'The user type of the author of the commit.'),
    ('committer_login', 'category', 'The username of the committer of the commit.'),
    ('committer_type', 'category', 'The user type of the committer of the commit.'),
    ('stats_total', 'int64', 'The total additions and deletions the commit consists of.'),
    ('stats_additions', 'int64', 'The additions the commit consists of.'),
    ('stats_deletions', 'int64', 'The deletions the commit consists of.'),
    ('file_name', 'str', 'The file name the commit affects.'),
    ('file_status', 'category', 'The type of modification made to the file.'),
    ('file_additions', 'int64', 'The additions of the file change.'),
    ('file_deletions', 'int64', 'The deletions of the file change.'),
    ('file_changes', 'int64', 'The total additions and deletions of the file change.'),
    ('file_patch', 'str', 'The raw file patch from the commit.'),
]
gkir_commits_dtypes = {column: dtype for column, dtype, _ in _gkir_commits_schema}
gkir_commits_descriptions = {column: text for column, _, text in _gkir_commits_schema}
# Load the GKIR commit/file-change rows with the dtypes declared above.
gkir_commits = pd.read_csv(
    '../greenkeeper_data/gkir_commits.csv',
    dtype=gkir_commits_dtypes,
)

In [100]:
# Data dictionary for the GKIR commits frame: the loaded column names are
# paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=gkir_commits.columns.to_numpy(),
    dtypes=gkir_commits_dtypes.values(),
    descriptions=gkir_commits_descriptions.values(),
)

|   col_index | attribute       | type     | description                                                                                               |
|------------:|:----------------|:---------|:----------------------------------------------------------------------------------------------------------|
|           0 | commit_sha      | str      | The commit sha - note that this is not unique in the dataset, as a commit can have multiple file changes. |
|           1 | issue_id        | int64    | The ID of the issue that references the commit.                                                           |
|           2 | repo_name       | category | The name of the repo the commit belongs to.                                                               |
|           3 | url             | str      | The GitHub API url of the commit.                                                                         |
|           4 | html_url        | str      | The GitHub url of the commit.        

## Non-GKIRs

In [101]:
# One (column, pandas dtype, description) triple per non-GKIR CSV column;
# the two lookup dicts below are derived from this single table.
_non_gkirs_schema = [
    ('id', 'float64', 'The issue ID.'),
    ('repo_name', 'category', 'The full name of the repo the issue was created in.'),
    ('url', 'object', 'The GitHub API url of the issue.'),
    ('repository_url', 'str', 'The GitHub API url of the repo the issue was created in.'),
    ('comments_url', 'str', 'The GitHub API url of the comments for the issue.'),
    ('events_url', 'str', 'The GitHub API url of the events for the issue.'),
    ('html_url', 'str', 'The GitHub HTML url of the issue.'),
    ('number', 'float64', 'The issue number of the parent repo.'),
    ('title', 'str', 'The issue title.'),
    ('user_id', 'float64', 'The ID of the entity that created the issue.'),
    ('user_login', 'category', 'The username of the entity that created the issue.'),
    ('user_type', 'category', 'The type of the entity that created the issue.'),
    ('state', 'category', 'The current state of the issue.'),
    ('locked', 'bool', 'Whether the issue has been locked.'),
    ('comments', 'float64', 'The number of comments on the issue'),
    ('created_at', 'str', 'When the issue was created.'),
    ('updated_at', 'str', 'When the issue was last updated.'),
    ('closed_at', 'str', 'When the issue was closed (if applicable).'),
    ('body', 'object', 'The issue body.'),
    ('is_pull_request', 'bool', 'Whether the issue is a pull request.'),
]
non_gkirs_dtypes = {column: dtype for column, dtype, _ in _non_gkirs_schema}
non_gkirs_descriptions = {column: text for column, _, text in _non_gkirs_schema}

# Timestamp columns to hand to read_csv's parse_dates.
parse_dates = ['created_at', 'updated_at', 'closed_at']
# Load the non-GKIR issues; parse_dates is the list assigned earlier in this cell.
non_gkirs = pd.read_csv(
    '../greenkeeper_data/non_gkirs.csv',
    dtype=non_gkirs_dtypes,
    parse_dates=parse_dates,
)

In [102]:
# Data dictionary for the non-GKIR issues frame: the loaded column names are
# paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=non_gkirs.columns.to_numpy(),
    dtypes=non_gkirs_dtypes.values(),
    descriptions=non_gkirs_descriptions.values(),
)

|   col_index | attribute       | type     | description                                              |
|------------:|:----------------|:---------|:---------------------------------------------------------|
|           0 | id              | float64  | The issue ID.                                            |
|           1 | repo_name       | category | The name of the repo the issue was created in.           |
|           2 | url             | object   | The GitHub API url of the issue.                         |
|           3 | repository_url  | str      | The GitHub API url of the repo the issue was created in. |
|           4 | comments_url    | str      | The GitHub API url of the comments for the issue.        |
|           5 | events_url      | str      | The GitHub API url of the events for the issue.          |
|           6 | html_url        | str      | The GitHub HTML url of the issue.                        |
|           7 | number          | float64  | The issue number of

## Non-GKIR Comments

In [103]:
# One (column, pandas dtype, description) triple per non-GKIR-comment CSV
# column; the two lookup dicts below are derived from this single table.
_non_gkir_comments_schema = [
    ('id', 'int64', 'The comment ID.'),
    ('issue_id', 'int64', 'The ID of the issue the comment was created on.'),
    ('repo_name', 'category', 'The full name of the repo the comment was created in.'),
    ('url', 'str', 'The GitHub API url of the comment.'),
    ('issue_url', 'str', 'The GitHub API url of the issue the comment was created on.'),
    ('user_id', 'int64', 'The ID of the entity who created the comment.'),
    ('user_login', 'category', 'The username of the entity who created the comment.'),
    ('user_type', 'category', 'The type of the entity who created the comment.'),
    ('created_at', 'str', 'When the comment was created.'),
    ('updated_at', 'str', 'When the comment was last updated.'),
    ('body', 'str', 'The body of the comment.'),
]
non_gkir_comments_dtypes = {column: dtype for column, dtype, _ in _non_gkir_comments_schema}
non_gkir_comments_descriptions = {column: text for column, _, text in _non_gkir_comments_schema}

# Timestamp columns to hand to read_csv's parse_dates.
parse_dates = ['created_at', 'updated_at']

# Load the non-GKIR comments; parse_dates is the list assigned earlier in this cell.
non_gkir_comments = pd.read_csv(
    '../greenkeeper_data/non_gkir_comments.csv',
    dtype=non_gkir_comments_dtypes,
    parse_dates=parse_dates,
)

In [104]:
# Data dictionary for the non-GKIR comments frame: the loaded column names
# are paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=non_gkir_comments.columns.to_numpy(),
    dtypes=non_gkir_comments_dtypes.values(),
    descriptions=non_gkir_comments_descriptions.values(),
)

|   col_index | attribute   | type     | description                                                 |
|------------:|:------------|:---------|:------------------------------------------------------------|
|           0 | id          | int64    | The comment ID.                                             |
|           1 | issue_id    | int64    | The ID of the issue the comment was created on.             |
|           2 | repo_name   | category | The name of the repo the comment was created in.            |
|           3 | url         | str      | The GitHub API url of the comment.                          |
|           4 | issue_url   | str      | The GitHub API url of the issue the comment was created on. |
|           5 | user_id     | int64    | The ID of the entity who created the comment.               |
|           6 | user_login  | category | The username of the entity who created the comment.         |
|           7 | user_type   | category | The type of the entity who creat

## Non-GKIR Commits

In [105]:
# One (column, pandas dtype, description) triple per non-GKIR-commit CSV
# column; the two lookup dicts below are derived from this single table.
_non_gkir_commits_schema = [
    ('commit_sha', 'str', 'The commit sha - note that this is not unique in the dataset, as a commit can have multiple file changes.'),
    ('issue_id', 'float64', 'The ID of the issue that references the commit.'),
    ('repo_name', 'category', 'The full name of the repo the commit belongs to.'),
    ('url', 'str', 'The GitHub API url of the commit.'),
    ('html_url', 'str', 'The GitHub url of the commit.'),
    ('message', 'str', 'The commit message'),
    ('author_login', 'category', 'The username of the author of the commit.'),
    ('author_type', 'category', 'The user type of the author of the commit.'),
    ('committer_login', 'category', 'The username of the committer of the commit.'),
    ('committer_type', 'category', 'The user type of the committer of the commit.'),
    ('stats_total', 'int64', 'The total additions and deletions the commit consists of.'),
    ('stats_additions', 'int64', 'The additions the commit consists of.'),
    ('stats_deletions', 'int64', 'The deletions the commit consists of.'),
    ('file_name', 'str', 'The file name the commit affects.'),
    ('file_status', 'category', 'The type of modification made to the file.'),
    ('file_additions', 'int64', 'The additions of the file change.'),
    ('file_deletions', 'int64', 'The deletions of the file change.'),
    ('file_changes', 'int64', 'The total additions and deletions of the file change.'),
    ('file_patch', 'str', 'The raw file patch from the commit.'),
]
non_gkir_commits_dtypes = {column: dtype for column, dtype, _ in _non_gkir_commits_schema}
non_gkir_commits_descriptions = {column: text for column, _, text in _non_gkir_commits_schema}
# Load the non-GKIR commit/file-change rows with the dtypes declared above.
non_gkir_commits = pd.read_csv(
    '../greenkeeper_data/non_gkir_commits.csv',
    dtype=non_gkir_commits_dtypes,
)

In [106]:
# Data dictionary for the non-GKIR commits frame: the loaded column names
# are paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=non_gkir_commits.columns.to_numpy(),
    dtypes=non_gkir_commits_dtypes.values(),
    descriptions=non_gkir_commits_descriptions.values(),
)

|   col_index | attribute       | type     | description                                                                                               |
|------------:|:----------------|:---------|:----------------------------------------------------------------------------------------------------------|
|           0 | commit_sha      | str      | The commit sha - note that this is not unique in the dataset, as a commit can have multiple file changes. |
|           1 | issue_id        | float64  | The ID of the issue that references the commit.                                                           |
|           2 | repo_name       | category | The name of the repo the commit belongs to.                                                               |
|           3 | url             | str      | The GitHub API url of the commit.                                                                         |
|           4 | html_url        | str      | The GitHub url of the commit.        

## Non-GKIR Commit-Event Relationships

In [107]:
# One (column, pandas dtype, description) triple per commit-event relationship
# CSV column; the two lookup dicts below are derived from this single table.
_non_gkir_commit_event_relationships_schema = [
    ('event_id', 'int64', 'The associated event ID.'),
    ('issue_id', 'int64', 'The associated issue ID.'),
    ('repo_name', 'category', 'The name of the parent repo.'),
    ('event_type', 'category', 'The event type.'),
    ('commit_id', 'category', 'The referenced commit sha.'),
    ('commit_url', 'object', 'The GitHub API url of the commit.'),
]
non_gkir_commit_event_relationships_dtypes = {
    column: dtype for column, dtype, _ in _non_gkir_commit_event_relationships_schema
}
non_gkir_commit_event_relationships_descriptions = {
    column: text for column, _, text in _non_gkir_commit_event_relationships_schema
}
# Load the event-to-commit link rows with the dtypes declared above.
non_gkir_commit_event_relationships = pd.read_csv(
    '../greenkeeper_data/non_gkir_commit_event_relationships.csv',
    dtype=non_gkir_commit_event_relationships_dtypes,
)

In [108]:
# Data dictionary for the commit-event relationships frame: the loaded column
# names are paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=non_gkir_commit_event_relationships.columns.to_numpy(),
    dtypes=non_gkir_commit_event_relationships_dtypes.values(),
    descriptions=non_gkir_commit_event_relationships_descriptions.values(),
)

|   col_index | attribute   | type     | description                       |
|------------:|:------------|:---------|:----------------------------------|
|           0 | event_id    | int64    | The associated event ID.          |
|           1 | issue_id    | int64    | The associated issue ID.          |
|           2 | repo_name   | category | The name of the parent repo.      |
|           3 | event_type  | category | The event type.                   |
|           4 | commit_id   | category | The referenced commit sha.        |
|           5 | commit_url  | object   | The GitHub API url of the commit. |


## Repos Info

In [112]:
# pandas dtype for each column of repos_info.csv, passed to read_csv.
repos_info_dtypes = {
    'repo_name': 'category',
    'is_fork': 'bool',
    'size': 'int64',
    'stargazers_count': 'int64',
    'watchers_count': 'int64',
    'language': 'category',
    'package_name': 'category',
    'use_repo_name': 'bool',
    'on_libraries_io_npm': 'bool',
    'npm_dependent_repos_count': 'float64',
    'npm_dependents_count': 'float64',
    'npm_forks': 'float64',
    'npm_language': 'category',
    'npm_rank': 'float64',
    'npm_stars': 'float64',
}
# Human-readable description for each column, in the same key order as
# repos_info_dtypes (the table printer pairs the two positionally).
repos_info_descriptions = {
    'repo_name': 'The full name of the repo',
    'is_fork': 'Whether the repo is a fork.',
    'size': 'The size of the repo.',
    'stargazers_count': 'The number of stars the repo has.',
    'watchers_count': 'The number of watchers the repo has.',
    'language': 'The primary language of the repo.',
    'package_name': 'The package name on npm.',
    'use_repo_name': 'Whether the repo name is different from the name in the project package.json file.',
    'on_libraries_io_npm': 'Whether the package was found on libraries.io.',
    # Previously a verbatim copy of the npm_dependents_count text, which made
    # the two columns indistinguishable in the generated data dictionary.
    'npm_dependent_repos_count': 'The number of repos that depend on the package - from npm.',
    'npm_dependents_count': 'The number of dependents on the package - from npm.',
    'npm_forks': 'The number of forks of the package - from npm.',
    'npm_language': 'The primary language of the package - from npm.',
    'npm_rank': 'The rank of the package - from npm.',
    'npm_stars': 'The number of stars the package has - from npm ',
}
# Load the per-repo metadata rows with the dtypes declared above.
repos_info = pd.read_csv(
    '../greenkeeper_data/repos_info.csv',
    dtype=repos_info_dtypes,
)

In [113]:
# Data dictionary for the repos info frame: the loaded column names are
# paired positionally with the declared dtypes and descriptions.
print_markdown_table(
    columns=repos_info.columns.to_numpy(),
    dtypes=repos_info_dtypes.values(),
    descriptions=repos_info_descriptions.values(),
)

|   col_index | attribute                 | type     | description                                                                        |
|------------:|:--------------------------|:---------|:-----------------------------------------------------------------------------------|
|           0 | repo_name                 | category | The full name of the repo                                                          |
|           1 | is_fork                   | bool     | Whether the repo is a fork.                                                        |
|           2 | size                      | int64    | The size of the repo.                                                              |
|           3 | stargazers_count          | int64    | The number of stars the repo has.                                                  |
|           4 | watchers_count            | int64    | The number of watchers the repo has.                                               |
|           5 | lang