#### Imports and Data Ingestion

In [1]:
from pprint import pprint
import pandas as pd
import numpy as np

In [2]:
commits = pd.read_pickle('df_commits_pandas.pk1')

In [3]:
commits.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20036 entries, 0 to 35
Data columns (total 6 columns):
associatedPullRequests    20036 non-null object
committedDate             20036 non-null object
committer                 20036 non-null object
messageHeadline           20036 non-null object
oid                       20036 non-null object
status                    13672 non-null object
dtypes: object(6)
memory usage: 1.1+ MB


#### Inspect DataFrame

In [4]:
commits.head()

Unnamed: 0,associatedPullRequests,committedDate,committer,messageHeadline,oid,status
0,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMzY...,2019-08-27T21:50:22Z,"{'user': {'login': 'WillAyd', 'company': 'inno...",TYPING: change to FrameOrSeries Alias in panda...,d91ffa6407c1baf6afe7d0a1b9655f44da77ac24,{'id': 'MDY6U3RhdHVzODU4MTI3OmQ5MWZmYTY0MDdjMW...
1,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMTI...,2019-08-27T21:39:03Z,"{'user': {'login': 'jbrockmendel', 'company': ...","CLN: Use ABC classes for isinstance checks, re...",080d57ee9fef9275518908cb7665ea062684c29b,{'id': 'MDY6U3RhdHVzODU4MTI3OjA4MGQ1N2VlOWZlZj...
2,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzEwNjM...,2019-08-27T21:32:40Z,"{'user': {'login': 'WillAyd', 'company': 'inno...",TYPING: --check-untyped-defs util._decorators ...,bd8dbf906e4352567094637c9c824c350dae3ad2,{'id': 'MDY6U3RhdHVzODU4MTI3OmJkOGRiZjkwNmU0Mz...
3,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzAzNDE...,2019-08-27T14:09:41Z,"{'user': {'login': 'jschendel', 'company': None}}",Replace with nested dict raises for overlappin...,041b6b180f8175b642977852f01e9211983b46ce,{'id': 'MDY6U3RhdHVzODU4MTI3OjA0MWI2YjE4MGY4MT...
4,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzA3ODg...,2019-08-26T23:52:44Z,"{'user': {'login': 'WillAyd', 'company': 'inno...","CLN: internals.blocks cleanup, typing (#27941)",49d2019723b0089bd357adf6c936c5a82e0cc775,{'id': 'MDY6U3RhdHVzODU4MTI3OjQ5ZDIwMTk3MjNiMD...


In [5]:
commits.tail()

Unnamed: 0,associatedPullRequests,committedDate,committer,messageHeadline,oid,status
31,{'nodes': []},2009-08-05T03:30:16Z,"{'user': {'login': 'wesm', 'company': '@ursa-l...",first commit with cleaned up code,c6b236db73ff81007909be6406f0e484edc4a9eb,
32,{'nodes': []},2009-08-05T03:17:29Z,"{'user': {'login': 'wesm', 'company': '@ursa-l...",added svn:ignore,445114e1b20da8d4976c8d9050aa90c5bd508c54,
33,{'nodes': []},2009-08-05T02:33:13Z,"{'user': {'login': 'wesm', 'company': '@ursa-l...",oops,1eeadf4e401647faa20911f531bc05c1872262ea,
34,{'nodes': []},2009-08-05T02:32:49Z,"{'user': {'login': 'wesm', 'company': '@ursa-l...",adding trunk,ec1a0a2a2571dc2c1c26612b374d4a66b22f0938,
35,{'nodes': []},2009-07-31T15:07:16Z,"{'user': {'login': 'wesm', 'company': '@ursa-l...",Initial directory structure.,9d0080576446de475d34b0dbb58389b15cd4f529,


In [6]:
# Inspect the contents of the dict-type columns
print(commits['associatedPullRequests'].iloc[0], '\n')
print(commits['committer'].iloc[0], '\n')
print(commits['status'].iloc[0])

{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMzY3OTE4', 'title': 'TYPING: change to FrameOrSeries Alias in pandas._typing', 'author': {'login': 'simonjayhawkins'}, 'authorAssociation': 'MEMBER', 'createdAt': '2019-08-27T12:50:17Z', 'updatedAt': '2019-08-28T11:34:26Z', 'closedAt': '2019-08-27T21:50:23Z', 'number': 28173, 'state': 'MERGED'}]} 

{'user': {'login': 'WillAyd', 'company': 'innobi'}} 

{'id': 'MDY6U3RhdHVzODU4MTI3OmQ5MWZmYTY0MDdjMWJhZjZhZmU3ZDBhMWI5NjU1ZjQ0ZGE3N2FjMjQ=', 'state': 'SUCCESS'}


#### Extract Features from Dicts

Features to extract:
- All `nodes` values: id, title, author, authorAssociation, createdAt, updatedAt, closedAt, number, state
- `committer` login and company
- `status` ID and state

In [7]:
new_df = pd.DataFrame(commits['associatedPullRequests'])
new_df.head()

Unnamed: 0,associatedPullRequests
0,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMzY...
1,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMTI...
2,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzEwNjM...
3,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzAzNDE...
4,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzA3ODg...


In [8]:
assocPR_test_df = pd.DataFrame()
malformed_dicts = []
for (i, data) in new_df.iterrows():
    try:
        interim_df = pd.DataFrame.from_dict(data['associatedPullRequests']['nodes'][0])
        assocPR_test_df = assocPR_test_df.append(interim_df)
    except:
        malformed_dicts.append(data)
        pass

#### Next Step: Format the `malformed_dicts` and `.append()`

In [20]:
malformed_dicts_df = pd.DataFrame()

for obs in malformed_dicts:
    df = pd.DataFrame.from_dict(obs)
    malformed_dicts_df = malformed_dicts_df.append(df)

In [21]:
commits_assocPR = assocPR_test_df.append(malformed_dicts_df)

commits_assocPR.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,author,authorAssociation,closedAt,createdAt,id,number,state,title,updatedAt
login,,,,,,,,,,,...,,simonjayhawkins,MEMBER,2019-08-27T21:50:23Z,2019-08-27T12:50:17Z,MDExOlB1bGxSZXF1ZXN0MzExMzY3OTE4,28173.0,MERGED,TYPING: change to FrameOrSeries Alias in panda...,2019-08-28T11:34:26Z
login,,,,,,,,,,,...,,addisonlynch,CONTRIBUTOR,2019-08-27T21:39:04Z,2019-08-26T22:44:15Z,MDExOlB1bGxSZXF1ZXN0MzExMTI1MDc0,28158.0,MERGED,"CLN: Use ABC classes for isinstance checks, re...",2019-08-27T21:39:04Z
login,,,,,,,,,,,...,,simonjayhawkins,MEMBER,2019-08-27T21:32:41Z,2019-08-24T19:07:08Z,MDExOlB1bGxSZXF1ZXN0MzEwNjM0NDQ1,28128.0,MERGED,TYPING: check-untyped-defs for util._decorators,2019-08-28T11:35:15Z
login,,,,,,,,,,,...,,charlesdong1991,CONTRIBUTOR,2019-08-27T14:09:42Z,2019-08-01T14:26:59Z,MDExOlB1bGxSZXF1ZXN0MzAzNDExNDc4,27696.0,MERGED,Replace with nested dict raises for overlappin...,2019-08-27T14:09:55Z
login,,,,,,,,,,,...,,jbrockmendel,MEMBER,2019-08-26T23:52:45Z,2019-08-15T22:55:38Z,MDExOlB1bGxSZXF1ZXN0MzA3ODg1MDE2,27941.0,MERGED,"CLN: internals.blocks cleanup, typing",2019-08-27T20:48:21Z


#### Next Dict-type Column to Unpack from `commits` DataFrame: `committers`

- Join with `commits_assocPR` on `author`

In [11]:
committer_df = pd.DataFrame()

for comm in commits['committer'].values:
    # Impute NaNs
    if comm['user'] == None:
        comm['user'] = {'login': np.NaN, 'company': np.NaN}
    if comm['user']['login'] == None:
        comm['user']['login'] = np.NaN
    if comm['user']['company'] == None:
        comm['user']['company'] = np.NaN
    
    interim_df = pd.DataFrame.from_dict(comm, orient='index')
    committer_df = committer_df.append(interim_df)

committer_df = committer_df.reset_index().drop(axis=1, labels='index')
committer_df = committer_df.rename(columns={'login': 'author'})

In [12]:
committer_df.head()

Unnamed: 0,author,company
0,WillAyd,innobi
1,jbrockmendel,
2,WillAyd,innobi
3,jschendel,
4,WillAyd,innobi


In [13]:
committer_dict = committer_df.to_dict()
master_committer_dict = {}
for author, company in zip(committer_dict['author'].values(), committer_dict['company'].values()):
    master_committer_dict[author] = company

master_committer_dict

{'WillAyd': 'innobi',
 'jbrockmendel': nan,
 'jschendel': nan,
 'gfyoung': nan,
 'TomAugspurger': '@ContinuumIO',
 nan: nan,
 'datapythonista': nan,
 'jorisvandenbossche': nan,
 'jreback': nan,
 'topper-123': nan,
 'mroeschke': '@housecanary',
 'simonjayhawkins': nan,
 'toobaz': nan,
 'chris-b1': nan,
 'sinhrks': nan,
 'kawochen': nan,
 'wesm': '@ursa-labs / @rstudio',
 'HHammond': nan,
 'rockg': nan,
 'RahulHP': nan,
 'llllllllll': '@quantopian',
 'hayd': nan,
 'mortada': 'Tesla',
 'moutai': nan,
 'rcarneva': nan,
 'IamGianluca': nan,
 'Dr-Irv': nan,
 'varinf': nan,
 'grahamjeffries': nan,
 'bkasel': nan,
 'proinsias': 'Hospital IQ',
 'sxwang': nan,
 'embray': nan,
 'tacaswell': 'Brookhaven National Lab',
 'srib': nan,
 'ischwabacher': nan,
 'parsleyt': nan,
 'behzadnouri': nan,
 'aisipos': nan,
 'jlec': '@gentoo ',
 'khs26': 'Ensoft Ltd',
 'codypiersall': 'Advanced Radar Research Center (University of Oklahoma)',
 'jankatins': '@zenjob',
 'berendt': 'Betacloud Solutions GmbH / B1 Sys

In [14]:
test_df = pd.DataFrame()
test_df['author'] = commits_assocPR['author']
author_list = test_df['author'].tolist()
company_list = []
for author in author_list:
    try:
        company_list.append(master_committer_dict[author])
    except:
        company_list.append(np.NaN)

test_df['authorCompany'] = company_list

In [15]:
test_df.head(10)

Unnamed: 0,author,authorCompany
0,simonjayhawkins,
1,addisonlynch,
2,simonjayhawkins,
3,charlesdong1991,
4,jbrockmendel,
5,jbrockmendel,
6,simonjayhawkins,
7,TomAugspurger,@ContinuumIO
8,TomAugspurger,@ContinuumIO
9,TomAugspurger,@ContinuumIO


In [19]:
commits_assocPR['authorCompany'] = test_df['authorCompany']
commits_assocPR.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,author,authorAssociation,closedAt,createdAt,id,number,state,title,updatedAt,authorCompany
0,,,,,,,,,,,...,simonjayhawkins,MEMBER,2019-08-27T21:50:23Z,2019-08-27T12:50:17Z,MDExOlB1bGxSZXF1ZXN0MzExMzY3OTE4,28173.0,MERGED,TYPING: change to FrameOrSeries Alias in panda...,2019-08-28T11:34:26Z,
1,,,,,,,,,,,...,addisonlynch,CONTRIBUTOR,2019-08-27T21:39:04Z,2019-08-26T22:44:15Z,MDExOlB1bGxSZXF1ZXN0MzExMTI1MDc0,28158.0,MERGED,"CLN: Use ABC classes for isinstance checks, re...",2019-08-27T21:39:04Z,
2,,,,,,,,,,,...,simonjayhawkins,MEMBER,2019-08-27T21:32:41Z,2019-08-24T19:07:08Z,MDExOlB1bGxSZXF1ZXN0MzEwNjM0NDQ1,28128.0,MERGED,TYPING: check-untyped-defs for util._decorators,2019-08-28T11:35:15Z,
3,,,,,,,,,,,...,charlesdong1991,CONTRIBUTOR,2019-08-27T14:09:42Z,2019-08-01T14:26:59Z,MDExOlB1bGxSZXF1ZXN0MzAzNDExNDc4,27696.0,MERGED,Replace with nested dict raises for overlappin...,2019-08-27T14:09:55Z,
4,,,,,,,,,,,...,jbrockmendel,MEMBER,2019-08-26T23:52:45Z,2019-08-15T22:55:38Z,MDExOlB1bGxSZXF1ZXN0MzA3ODg1MDE2,27941.0,MERGED,"CLN: internals.blocks cleanup, typing",2019-08-27T20:48:21Z,


Notes:
- Fix the empty cols
- Figure out how to merge test_df and commits_assocPR