In [2]:
%run _utils.ipynb

In [42]:
issues = load_issues()
comments = load_comments()

In [43]:
NOT_CLASSIFIED = 'Other'
# Set all comments to 'Not Classified'
comments['comment_classification'] = NOT_CLASSIFIED
users_comments = comments[comments['comment_user_type'] == 'User'].copy().reset_index(drop=True)
issues_with_user_comment = issues[issues['id'].isin(users_comments['comment_issue_id'])].copy().reset_index(drop=True)
percent_of_issues_with_user_comment = f'{calculate_percent(len(issues_with_user_comment), len(issues))}%'
print(f'{percent_of_issues_with_user_comment} of in-range issues have a comment from a user.')
users_comments = users_comments.drop(users_comments[pd.isna(users_comments['comment_body'])].index)

8.58% of in-range issues have a comment from a user.


In [49]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

def clean_comment_body(c):
    text = c['comment_body']
    tokens = [t for t in text.split()]
    clean_tokens = list()
    for token in tokens:
        if token not in stopwords.words('english'):
            clean_tokens.append(lemma.lemmatize(token))
    return ' '.join(clean_tokens)

users_comments['lemm_body'] = users_comments.progress_apply(clean_comment_body, axis=1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10723.0), HTML(value='')))




In [52]:
################################
# START FIX_REFERENCED
################################
FIX_REFERENCED = 'Fix Referenced'

reg_pr_url = '|'.join([
    'https:\/\/github\.com\/[\S]*\/(pull|issues)\/[\S]*'
])
reg_closed_by = '|'.join([
    '(closed|fixed|resolved|done|updated) (in|by|via|with)'
])
reg_number = '|'.join([
    '^#\d*|(Merged|Close|PR|see).*#\d*'
])

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_pr_url, case=True, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_closed_by, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_number, case=False, regex=True))
    ),
    'comment_classification'
] = FIX_REFERENCED

fix_referenced_comments = users_comments[users_comments['comment_classification'] == FIX_REFERENCED]
print(f"{calculate_percent(len(fix_referenced_comments), len(users_comments))}% of user comments explicitly link a fix.")
# fix_referenced_comments['comment_body'].sample(10)
################################
# END FIX_REFERENCED
################################

34.9% of user comments explicitly link a fix.


In [59]:
################################
# START FALSE_ALARM
################################
FALSE_ALARM = 'False Alarm'

reg_flaky = '|'.join([
    'flake',
    'flaky',
    'flakiness',
    'fluke',
    '(server|test|CI) hiccup',
    'inconsistent test',
    'brittle test',
    'unstable unit test',
    'unstable test',
    'Spurious test failure',
    'Tests are passing',
    'tests are actually passing',
])
reg_random = '|'.join([
    '(fail|failing) random',
    'random.*fail.*',
    'random build error',
    'intermittent',
])
reg_retrigger = '|'.join([
    'retrigger',
    're-trigger',
    'rebuild',
    're-build',
    'restart',
    're-starting',
    're-run',
    'retried',
    'rerun',
    'reran',
    're-ran',
])
reg_false_positive = '|'.join([
    'false (positive|alarm|negative|alert)',
    'falso positive',
])
reg_invalid = '|'.join([
    '^(invalid|not applicable|not (a|an) issue|unrelated)',
    'Couldn\'t reproduce',
    'all tests pass',
])
reg_unrelated = '|'.join([
    '(unrelated.*fail)',
    'failed for other reasons',
    'build is actually passing',
    'build hiccup',
])
reg_timeout = '|'.join([
    'timeout|time-out',
])

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_flaky, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_random, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_retrigger, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_false_positive, case=False, regex=True)) | 
        (users_comments['lemm_body'].str.contains(pat=reg_invalid, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_unrelated, case=False, regex=True)) |
        (users_comments['lemm_body'].str.contains(pat=reg_timeout, case=False, regex=True))
    ),
    'comment_classification'
] = FALSE_ALARM


false_alarm_comments = users_comments[users_comments['comment_classification'] == FALSE_ALARM]
print(f"{calculate_percent(len(false_alarm_comments), len(users_comments))}% of user comments indicate a false alarm.")
# false_alarm_comments['comment_body'].sample(10)
################################
# END FALSE_ALARM
################################

18.39% of user comments indicate a false alarm.


In [53]:
################################
# START TRANSIENT_FAILURE
################################
TRANSIENT_FAILURE = 'Transient Failure'

reg_transient = '|'.join([
    'transient.*fail',
])

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_transient, case=False, regex=True))
    ),
    'comment_classification'
] = TRANSIENT_FAILURE

transient_failure_comments = users_comments[users_comments['comment_classification'] == TRANSIENT_FAILURE]
print(f"{calculate_percent(len(transient_failure_comments), len(users_comments))}% of user comments saying failure was caused by transient dependency")
# transient_failure_comments['comment_body'].sample(10)
################################
# END TRANSIENT_FAILURE
################################

0.24% of user comments saying failure was caused by transient dependency


In [54]:
################################
# START MENTION_GREENKEEPER
################################
MENTION_GREENKEEPER = 'Mention Greenkeeper'

reg_greenkeeper = '|'.join([
    'greenkeeper',
])

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_greenkeeper, case=False, regex=True))
    ),
    'comment_classification'
] = MENTION_GREENKEEPER

mention_greenkeeper_comments = users_comments[users_comments['comment_classification'] == MENTION_GREENKEEPER]
print(f"{calculate_percent(len(mention_greenkeeper_comments), len(users_comments))}% of user comments mentioning Greenkeeper")
# mention_greenkeeper_comments['comment_body'].sample(10)
################################
# END MENTION_GREENKEEPER
################################

3.32% of user comments mentioning Greenkeeper


In [55]:
################################
# START MENTION_CI_SYSTEM
################################
MENTION_CI_SYSTEM = 'Mention CI System'

reg_ci_system = '|'.join([
    'travis',
    'CircleCI',
    'circle ci',
    'jarvis',
    'jenkins',
    'CI.*issue',
])

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_ci_system, case=False, regex=True))
    ),
    'comment_classification'
] = MENTION_CI_SYSTEM

mention_ci_system_comments = users_comments[users_comments['comment_classification'] == MENTION_CI_SYSTEM]
print(f"{calculate_percent(len(mention_ci_system_comments), len(users_comments))}% of user comments mention the projects CI build system")
# mention_greenkeeper_comments['comment_body'].sample(10)
################################
# END MENTION_CI_SYSTEM
################################

4.54% of user comments mention the projects CI build system


In [56]:
################################
# START FIX_MENTIONED
################################
FIX_MENTIONED = 'Fix Mentioned'

reg_fixed = '(fix(ed)*|resolved|done|solved|closed|updated|upgraded|closing|merged)|(fix|fixed|bumped|merged|upgraded).*manually|manually.*merged'
reg_update_in = '.*update.*in.*|Making a PR'

users_comments.loc[
    (users_comments['comment_classification'] == NOT_CLASSIFIED) & (
        (users_comments['lemm_body'].str.contains(pat=reg_fixed, case=False, regex=True)) | 
        (users_comments['lemm_body'].str.contains(pat=reg_update_in, case=False, regex=True))
    ),
    'comment_classification'
] = FIX_MENTIONED

fix_mentioned_comments = users_comments[users_comments['comment_classification'] == FIX_MENTIONED]
print(f"{calculate_percent(len(fix_mentioned_comments), len(users_comments))}% of user comments saying the issues has been fixed (without linking the fix).")
# fix_mentioned_comments['comment_body'].sample(10)
################################
# END FIX_MENTIONED
################################

17.25% of user comments saying the issues has been fixed (without linking the fix).


In [60]:
# TODO: Classify others
users_comments[users_comments['comment_classification'] == NOT_CLASSIFIED][['comment_body']].sample(10)

Unnamed: 0,comment_body
77,Duplicate of #80.
9582,it's not 😛
9210,no longer a dependency
111,Problem rectified \r\n![cat-typing](https://user-images.githubusercontent.com/36107/79525708-20aed580-8096-11ea-8428-f6f492e42bc1.gif)\r\n
9169,v2.4.0 was unpublished.
5993,Failures were due to unrelated build issues.
2845,Covered by #472
1316,"````\r\nserver-jre8 v8.0.202 [Approved] - Possibly broken\r\n15server-jre8 package files install completed. Performing other installation steps.\r\n16ERROR: Exception calling ""DownloadFile"" with ""2"" argument(s): ""The remote server returned an error: (404) Not Found.""\r\n17Environment Vars (like PATH) have changed. Close/reopen your shell to\r\n18 see the changes (or in powershell/cmd.exe just type `refreshenv`).\r\n19The install of server-jre8 was NOT successful.\r\n20Error while running 'C:\ProgramData\chocolatey\lib\server-jre8\tools\chocolateyInstall.ps1'.\r\n21 See log for details.\r\n22\r\n23elasticsearch v6.7.1 [Approved] - Likely broken for FOSS users (due to download location changes)\r\n24elasticsearch package files install completed. Performing other installation steps.\r\n````"
10143,Deleted build cache and reinstalled dependencies. No more Snyk errors.
2308,Faaaalse positive.
