In [80]:
import os

import numpy as np
import pandas as pd

In [81]:
# Read all pk1 issues files from Data/is (sub)folders
pickled_dirs = []
repo_names = []
for folder in os.listdir('../Data/is'):
    try:
        pickled_dirs.append(f'../Data/is/{folder}/'+os.listdir('../Data/is/'+folder)[0])
        repo_names.append(folder)
    except:
        print('NO ISSUE DATA IN: ', folder)
        continue

print('\n', pickled_dirs[:20])
print('\n', repo_names[:20])

NO ISSUE DATA IN:  prisma-graphql-import

 ['../Data/is/AE9RB-ruby-redis/AE9RB_ruby-redis_is.pk1', '../Data/is/AigeStudio-Android/AigeStudio_Android_is.pk1', '../Data/is/AlburIvan-SlickForm/AlburIvan_SlickForm_is.pk1', '../Data/is/AlexLiuSheng-CheckVersionLib/AlexLiuSheng_CheckVersionLib_is.pk1', '../Data/is/AllenDowney-ThinkJavaCode/AllenDowney_ThinkJavaCode_is.pk1', '../Data/is/Aufree-ESTMusicIndicator/Aufree_ESTMusicIndicator_is.pk1', '../Data/is/Azure-azure-storage-fuse/Azure_azure-storage-fuse_is.pk1', '../Data/is/BMIRDS-deepslide/BMIRDS_deepslide_is.pk1', '../Data/is/BetterCloud-vault-java-driver/BetterCloud_vault-java-driver_is.pk1', '../Data/is/BreakingMalwareResearch-eleven/BreakingMalwareResearch_eleven_is.pk1', '../Data/is/BurntSushi-quickcheck/BurntSushi_quickcheck_is.pk1', '../Data/is/BycorSanchez-resources/BycorSanchez_resources_is.pk1', '../Data/is/CMU-Perceptual-Computing-Lab-openpose/CMU-Perceptual-Computing-Lab_openpose_is.pk1', '../Data/is/CVCalendar-CVCalendar/CVCal

In [82]:
# Read in all pickled files
df_list = [pd.read_pickle(file_dir) for file_dir in pickled_dirs]

In [83]:
def clean_issue_df(issue_df: pd.DataFrame) -> pd.DataFrame:
    """Takes a DataFrame of issue data and returns a cleaned DataFrame."""

    author_dict_list = issue_df['author'].values.tolist()
    unpacked_dict_list = []
        
    for item in author_dict_list:
        # If `item` is empty append dict of NaNs, else append data
        if item == None:
            unpacked_dict_list.append({'login': np.NaN, 'company': np.NaN})
        else:
            unpacked_dict_list.append(item)

    # Recombine issue_df and unpacked dicts
    author_issue_df = pd.DataFrame(unpacked_dict_list)
    clean_issue_df = issue_df.drop(labels=['author'], axis='columns')
    clean_issue_df['author'] = author_issue_df['login']
    clean_issue_df['company'] = author_issue_df['company']
    
    # Type conversion
    clean_issue_df['createdAt'] = clean_issue_df['createdAt'].apply(pd.Timestamp)
    clean_issue_df['updatedAt'] = clean_issue_df['updatedAt'].apply(pd.Timestamp)
    clean_issue_df['closedAt'] = clean_issue_df['closedAt'].apply(pd.Timestamp)
    
    return clean_issue_df

In [84]:
# Clean each issue DataFrame, record indices of 0-length DataFrames (for pickling)
cleaned_issues = []
empty_df_ind = []
for ind, df in enumerate(df_list):
    if len(df) > 0:
        cleaned_issues.append(clean_issue_df(df))
    else:
        empty_df_ind.append(ind)

In [85]:
cleaned_issues[6][:5]

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,number,state,title,updatedAt,author,company
0,NONE,Hi! Nice work! I would like to know if I could...,2019-04-16 22:18:11+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2019-03-06 03:15:26+00:00,1,CLOSED,Run the code on CPUs,2019-04-16 22:18:11+00:00,Tato14,
1,NONE,"Hi~I've read your paper, I'm really interested...",2019-03-08 12:34:36+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2019-03-08 08:58:12+00:00,2,CLOSED,Datasets request,2019-03-08 12:34:36+00:00,Daisy5566,
2,NONE,What would you recommend for splitting up sing...,2019-03-10 23:52:31+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2019-03-10 22:25:21+00:00,3,CLOSED,Single Layer TIF format,2019-03-10 23:52:31+00:00,jlevy44,
3,NONE,Hi!\nAs transfer learning seem to be quite a b...,2019-04-10 16:22:31+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2019-04-09 09:28:04+00:00,4,CLOSED,Use pre-trained ResNet with frozen layers,2019-04-10 16:22:32+00:00,Tato14,
4,NONE,"Sorry for the spam, I am now digging a little ...",2019-04-10 16:24:21+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2019-04-09 12:50:18+00:00,5,CLOSED,Attention based classification in ResNet18,2019-04-10 16:24:21+00:00,Tato14,


In [86]:
# Do the indices "add up"?
len(cleaned_issues), len(repo_names), len(empty_df_ind)

(705, 799, 94)

In [92]:
cleaned_dir = '../Data/cleaned_issues/'
for ind, df in enumerate(cleaned_issues):
    if ind not in empty_df_ind:
        df.to_pickle(cleaned_dir+repo_names[ind]+'_is.pk1')

In [94]:
# Test pickled files
pd.read_pickle(cleaned_dir+repo_names[5]+'_is.pk1')[:5]

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,number,state,title,updatedAt,author,company
0,NONE,Just cloned (fc7dddf) and this is the result o...,2017-11-06 23:18:26+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2017-11-06 19:14:06+00:00,13,CLOSED,Doesn't compile due to warnings treated as err...,2017-11-06 23:18:26+00:00,AaronFriel,
1,NONE,I can not compile on raspbian\n./build.sh: lin...,2017-12-10 21:24:22+00:00,"{'totalCount': 14, 'nodes': [{'author': {'logi...",2017-11-06 19:17:57+00:00,14,CLOSED,Does not compile on raspbian,2017-12-10 21:24:22+00:00,raphaelm22,
2,NONE,"Hi,\nI've been trying out the blobFUSE adapter...",2017-11-20 16:44:16+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2017-11-17 16:18:41+00:00,16,CLOSED,Missing file system information,2017-11-20 16:44:16+00:00,NillsF,@Microsoft
3,NONE,"Hello,\nI might be a temporary issue, but I am...",2017-12-02 05:28:25+00:00,"{'totalCount': 22, 'nodes': [{'author': {'logi...",2017-11-21 13:42:47+00:00,18,CLOSED,apt-get install blobfuse doesn't work anymore,2017-12-02 05:28:25+00:00,NillsF,@Microsoft
4,NONE,Hello\nI try to use fuse inside of a docker co...,2017-12-02 05:26:10+00:00,"{'totalCount': 4, 'nodes': [{'author': {'login...",2017-11-24 09:09:51+00:00,20,CLOSED,fuse: device not found,2019-01-04 20:56:57+00:00,TsuyoshiUshio,SimpleArchitect
