<h1>Processing for Failed Pins manual analysis</h1>

<h3>Load helper functions</h3>

In [3]:
%run ../../_utils.ipynb

<h3>Load raw <code>sample_failed_pins</code></h3>

In [66]:
# Explicit column dtypes for the raw sample: integer ids, free-text columns kept
# as plain objects, and the pin status stored as a categorical.
dtypes = dict(
    id='int64',
    url='object',
    title='object',
    html_url='object',
    body='object',
    comment_issue_id='int64',
    comment_url='object',
    comment_body='object',
    pin_status='category',
)
# Path is relative to the notebook's working directory.
file_path = './sample_failed_pins.csv'
sample_failed_pins = pd.read_csv(file_path, dtype=dtypes)

<h3>Clean <code>sample_failed_pins</code> and write to CSV for the Excel sheet</h3>

In [67]:
# Build the sheet used for manual labelling: keep only the columns a reviewer
# needs and add two empty columns ('notes', 'build_fail_reason') to fill in.
#
# The previous `sample_failed_pins.drop([...], axis=1)` call was removed: drop
# returns a new frame and its result was discarded, so it had no effect. The
# explicit column selection below already leaves out 'body', 'comment_url',
# 'comment_issue_id' and 'pin_status', and `sample_failed_pins` stays untouched.
for_csv = sample_failed_pins[['id', 'url', 'title', 'comment_body', 'html_url']].assign(
    notes='',
    build_fail_reason='',
)
for_csv.to_csv('./ma_sample_failed_pins.csv', index=False)

<h3>Check classified samples</h3>

In [133]:
# Load the manually labelled sheet; its first column is used as the index.
categorized_failed_pins = pd.read_excel('ma_sample_failed_pins.xlsx', index_col=0)

# Rows that already carry a label in 'build_fail_reason'.
are_classified = categorized_failed_pins[
    categorized_failed_pins['build_fail_reason'].notna()
]

# Rows labelled 'Build not available' are left out of the breakdown below.
# .copy() so the relabelling further down does not write into a slice.
useful_classified = are_classified[
    are_classified['build_fail_reason'] != 'Build not available'
].copy()

# Overarching category -> raw labels (as typed in the sheet) that roll up into it.
# Labels that appear in no list are left unchanged by the relabelling step.
error_maps = {
    'Linter/Project Guideline Error': [
        'Linter error',
        'Bundle Size Error',
        'Test coverage error',
    ],
    'Incompatible Node/npm/dependency error': [
        'Incompatible Node error',
        'Actual npm error',
        'npm install error',
        'Incompatible dependency error',
        'Missing dependency',
    ],
    'Timeout/Network Error': [
        'Build timeout',
        'Connection timeout',
        'Test timeout',
        'Test timeout error',
        'Network error',
        'Network error/404 file not found',
    ],
    'Security Error': [
        'Audit error',
        'Security error',
        'Authentication error',
    ],
    'Clone Error/Missing File': [
        'Clone error / Missing file',
        'Clone error',
    ],
    'Unrelated Test Failure': [
        'Unrelated test failure',
        'Browser testing error',
    ],
    'Lockfile Error': [
        'Lockfile error',
    ],
    'Syntax/Build Error': [
        'Syntax error',
        'Unrelated CI error',
        'Unrelated CI failure',
        'Build failed to start',
        'Master branch was broken',
        'Docker error',
        'Aborted due to warnings',
        'Cancelled Build',
    ],
}

# Flatten to raw label -> category and relabel in a single pass. A single lookup
# per row cannot re-map a value that was itself just produced by the mapping,
# which a sequence of in-place replacements could do if names ever overlapped.
label_to_category = {
    raw_label: category
    for category, raw_labels in error_maps.items()
    for raw_label in raw_labels
}
useful_classified['build_fail_reason'] = useful_classified['build_fail_reason'].map(
    lambda label: label_to_category.get(label, label)
)

# Per-category counts, computed once and reused for the proportions.
reason_counts = useful_classified['build_fail_reason'].value_counts()

print(f'{len(are_classified)} are classified')
print(reason_counts)
print(f'{len(useful_classified)} are classified and useful')

# Share of each category among the useful rows, rounded to three decimals.
proportions = (
    (reason_counts / useful_classified['build_fail_reason'].count())
    .round(3)
    .to_frame(name='proportion')
    .rename_axis('build_fail_reason')
    .reset_index()
)
proportions.head(20)

613 are classified
Syntax/Build Error                        83
Incompatible Node/npm/dependency error    67
Unrelated Test Failure                    56
Linter/Project Guideline Error            54
Clone Error/Missing File                  41
Lockfile Error                            37
Timeout/Network Error                     34
Security Error                             9
Name: build_fail_reason, dtype: int64
381 are classified and useful


Unnamed: 0,build_fail_reason,proportion
0,Syntax/Build Error,0.218
1,Incompatible Node/npm/dependency error,0.176
2,Unrelated Test Failure,0.147
3,Linter/Project Guideline Error,0.142
4,Clone Error/Missing File,0.108
5,Lockfile Error,0.097
6,Timeout/Network Error,0.089
7,Security Error,0.024


<h3>Write out samples for second author</h3>

In [146]:
# Draw a fixed-seed random sample of 58 useful classified rows and keep only the
# 'url' and 'html_url' columns; 'build_fail_reason' is not part of the export.
# to_excel is called with its defaults, so the frame's index is written as well.
to_write = useful_classified.sample(n=58, random_state=7)[['url', 'html_url']]

to_write.to_excel('sample_failed_pins_second_author.xlsx')