In [24]:
import pandas as pd
import numpy as np

In [25]:
# Load the raw commit frame of every repository from its pickle file.
# Paths are relative to the notebook's working directory and are read in the
# order listed, one frame per name on the left-hand side.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted run of the collection step.
(
    adamsmith_df,
    embulk_df,
    google_df,
    guillaume_df,
    iven_df,
    microsoft_df,
    oleiade_df,
    pandas_df,
    yang_df,
    zhn_df,
) = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'repos/adamsmith/adamsmith_game-theory-poker_cm.pk1',
        'repos/embulk/embulk_embulk_cm.pk1',
        'repos/google/google_iosched_cm.pk1',
        'repos/guillaumebort/guillaumebort_play20-spring-demo_cm.pk1',
        'repos/iven-he/iven-he_webspider_cm.pk1',
        'repos/microsoft/microsoft_Mesh-processing-library_cm.pk1',
        'repos/oleiade/oleiade_trousseau_cm.pk1',
        'repos/pandas-dev/df_commits_pandas.pk1',
        'repos/yangyangwithgnu/yangyangwithgnu_hardseed_cm.pk1',
        'repos/zhnnnnn/zhnnnnn_ZHNCosmos_cm.pk1',
    )
]

In [26]:
# Keys of each kind of nested record, in the column order used for the
# all-NaN placeholder row that stands in for a missing record.
_ASSOC_PR_KEYS = ('id', 'title', 'author', 'authorAssociation', 'createdAt',
                  'updatedAt', 'closedAt', 'number', 'state')
_COMMITTER_KEYS = ('login', 'company')
_STATUS_KEYS = ('id', 'state')


def _is_missing(value) -> bool:
    """Return True when `value` is None or a float (NaN placeholder) rather than a record."""
    return value is None or isinstance(value, float)


def _records_to_frame(records: list, keys: tuple) -> pd.DataFrame:
    """Build a DataFrame from a list of dicts, using an all-NaN row for each missing record."""
    placeholder = dict.fromkeys(keys, np.nan)
    rows = [placeholder if _is_missing(record) else record for record in records]
    if not rows:
        # pd.DataFrame([]) has no columns at all; keep the expected ones so the
        # renames and column look-ups in clean_commit_df still work on empty input.
        return pd.DataFrame(columns=list(keys))
    return pd.DataFrame(rows)


def clean_commit_df(commit_df: pd.DataFrame) -> pd.DataFrame:
    """
    Given a DataFrame of commit data, return the cleaned DataFrame.

    The nested columns of `commit_df` are flattened into plain columns:

    * ``associatedPullRequests`` -- each value is a dict with a ``nodes`` list;
      the first node (if any) becomes the ``assocPR_*`` columns and its
      ``author`` dict is reduced to the author's ``login``.
    * ``committer`` -- each value is a dict with a ``user`` entry; ``login`` and
      ``company`` become ``committer_author`` and ``committer_company``.
    * ``status`` -- each value is a dict or None; ``id`` and ``state`` become
      ``status_id`` and ``status_state``.

    Commits without a pull request, committer user or status get NaN in the
    corresponding columns. ``committedDate``, ``messageHeadline`` and ``oid`` are
    copied over unchanged, and the three ``assocPR_*At`` columns are converted
    with ``pd.Timestamp``.

    Parameters
    ----------
    commit_df : pd.DataFrame
        Raw commit data with the columns ``associatedPullRequests``,
        ``committer``, ``status``, ``committedDate``, ``messageHeadline`` and
        ``oid``. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with one row per row of `commit_df`, in the same order and
        with a fresh default index. An empty input yields an empty frame that
        still carries the expected columns.

    Raises
    ------
    KeyError
        If one of the required columns, or a key read from the nested
        dictionaries, is absent.
    """

    # Only the first associated pull request of a commit is kept.
    pr_records = [
        item['nodes'][0] if len(item['nodes']) > 0 else None
        for item in commit_df['associatedPullRequests'].values.tolist()
    ]
    committer_records = [item['user'] for item in commit_df['committer'].values.tolist()]
    status_records = commit_df['status'].values.tolist()

    assocPR_df = _records_to_frame(pr_records, _ASSOC_PR_KEYS)
    committer_df = _records_to_frame(committer_records, _COMMITTER_KEYS)
    status_df = _records_to_frame(status_records, _STATUS_KEYS)

    # Reduce each PR author record to its login; None and the NaN placeholder
    # (a float) both mean "no author".
    assocPR_df['author'] = [
        np.nan if _is_missing(author) else author['login']
        for author in assocPR_df['author'].values.tolist()
    ]

    # Prefix the columns so `id` / `state` of the PR and of the status do not
    # collide after concatenation. Only the known keys are renamed.
    assocPR_df = assocPR_df.rename(columns={key: 'assocPR_' + key for key in _ASSOC_PR_KEYS})
    committer_df = committer_df.rename(columns={'company': 'committer_company',
                                                'login': 'committer_author'})
    status_df = status_df.rename(columns={'id': 'status_id',
                                          'state': 'status_state'})

    commits_cleaned = pd.concat([assocPR_df, committer_df, status_df],
                                axis='columns', ignore_index=False)

    # `.values` copies by position, so the index of `commit_df` is irrelevant.
    for column in ('committedDate', 'messageHeadline', 'oid'):
        commits_cleaned[column] = commit_df[column].values

    # Convert times to pd.Timestamp (NaN placeholders become NaT)
    for column in ('assocPR_createdAt', 'assocPR_updatedAt', 'assocPR_closedAt'):
        commits_cleaned[column] = commits_cleaned[column].apply(pd.Timestamp)

    return commits_cleaned

In [29]:
# Replace every raw commit frame with its cleaned version, keeping the names.
# NOTE(review): the names are rebound, so re-running this cell without first
# re-running the loading cell presumably fails -- the cleaned frames no longer
# look like the raw input `clean_commit_df` expects.
(
    adamsmith_df,
    embulk_df,
    google_df,
    guillaume_df,
    iven_df,
    microsoft_df,
    oleiade_df,
    pandas_df,
    yang_df,
    zhn_df,
) = [
    clean_commit_df(_raw_commits)
    for _raw_commits in (
        adamsmith_df,
        embulk_df,
        google_df,
        guillaume_df,
        iven_df,
        microsoft_df,
        oleiade_df,
        pandas_df,
        yang_df,
        zhn_df,
    )
]

# Persist each cleaned frame next to the raw data, one pickle per repository.
for _cleaned_commits, _pickle_path in (
    (adamsmith_df, './repos/adamsmith_commits_cleaned.pk1'),
    (embulk_df, './repos/embulk_commits_cleaned.pk1'),
    (google_df, './repos/google_commits_cleaned.pk1'),
    (guillaume_df, './repos/guillaume_commits_cleaned.pk1'),
    (iven_df, './repos/iven_commits_cleaned.pk1'),
    (microsoft_df, './repos/microsoft_commits_cleaned.pk1'),
    (oleiade_df, './repos/oleiade_commits_cleaned.pk1'),
    (pandas_df, './repos/pandas_commits_cleaned.pk1'),
    (yang_df, './repos/yang_commits_cleaned.pk1'),
    (zhn_df, './repos/zhn_commits_cleaned.pk1'),
):
    _cleaned_commits.to_pickle(_pickle_path)