#### Imports

In [1]:
import json

import numpy as np
import pandas as pd

#### Read Data

In [2]:
commits = pd.read_csv('df_commits_pandas.csv')
commits.head()

Unnamed: 0,associatedPullRequests,committedDate,committer,messageHeadline,oid,status
0,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMzY...,2019-08-27T21:50:22Z,"{'user': {'login': 'WillAyd', 'company': 'inno...",TYPING: change to FrameOrSeries Alias in panda...,d91ffa6407c1baf6afe7d0a1b9655f44da77ac24,{'id': 'MDY6U3RhdHVzODU4MTI3OmQ5MWZmYTY0MDdjMW...
1,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMTI...,2019-08-27T21:39:03Z,"{'user': {'login': 'jbrockmendel', 'company': ...","CLN: Use ABC classes for isinstance checks, re...",080d57ee9fef9275518908cb7665ea062684c29b,{'id': 'MDY6U3RhdHVzODU4MTI3OjA4MGQ1N2VlOWZlZj...
2,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzEwNjM...,2019-08-27T21:32:40Z,"{'user': {'login': 'WillAyd', 'company': 'inno...",TYPING: --check-untyped-defs util._decorators ...,bd8dbf906e4352567094637c9c824c350dae3ad2,{'id': 'MDY6U3RhdHVzODU4MTI3OmJkOGRiZjkwNmU0Mz...
3,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzAzNDE...,2019-08-27T14:09:41Z,"{'user': {'login': 'jschendel', 'company': None}}",Replace with nested dict raises for overlappin...,041b6b180f8175b642977852f01e9211983b46ce,{'id': 'MDY6U3RhdHVzODU4MTI3OjA0MWI2YjE4MGY4MT...
4,{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzA3ODg...,2019-08-26T23:52:44Z,"{'user': {'login': 'WillAyd', 'company': 'inno...","CLN: internals.blocks cleanup, typing (#27941)",49d2019723b0089bd357adf6c936c5a82e0cc775,{'id': 'MDY6U3RhdHVzODU4MTI3OjQ5ZDIwMTk3MjNiMD...


#### What's Inside the Dicts?

In [3]:
print(commits.iloc[0]['associatedPullRequests'], '\n')
print(commits.iloc[0]['committer'], '\n')
print(commits.iloc[0]['messageHeadline'], '\n')
print(commits.iloc[0]['status'])

{'nodes': [{'id': 'MDExOlB1bGxSZXF1ZXN0MzExMzY3OTE4', 'title': 'TYPING: change to FrameOrSeries Alias in pandas._typing', 'author': {'login': 'simonjayhawkins'}, 'authorAssociation': 'MEMBER', 'createdAt': '2019-08-27T12:50:17Z', 'updatedAt': '2019-08-28T11:34:26Z', 'closedAt': '2019-08-27T21:50:23Z', 'number': 28173, 'state': 'MERGED'}]} 

{'user': {'login': 'WillAyd', 'company': 'innobi'}} 

TYPING: change to FrameOrSeries Alias in pandas._typing (#28173) 

{'id': 'MDY6U3RhdHVzODU4MTI3OmQ5MWZmYTY0MDdjMWJhZjZhZmU3ZDBhMWI5NjU1ZjQ0ZGE3N2FjMjQ=', 'state': 'SUCCESS'}


#### Convert `str` Representations to `dict`

In [4]:
# JSON can't handle single quotes, so convert all dicts to use double quotes
# Then convert all str reprs of dicts into actual dict type

In [5]:
commits['associatedPullRequests'] = commits['associatedPullRequests'].str.replace("'", "\"")
commits['committer'] = commits['committer'].str.replace("'", "\"")
commits['status'] = commits['status'].str.replace("'", "\"")

In [6]:
commits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20036 entries, 0 to 20035
Data columns (total 6 columns):
associatedPullRequests    20036 non-null object
committedDate             20036 non-null object
committer                 20036 non-null object
messageHeadline           20036 non-null object
oid                       20036 non-null object
status                    13672 non-null object
dtypes: object(6)
memory usage: 939.3+ KB


In [7]:
commits['committer'].value_counts()

{"user": {"login": "jreback", "company": None}}                                                                  8415
{"user": {"login": "wesm", "company": "@ursa-labs / @rstudio"}}                                                  3744
{"user": None}                                                                                                   2294
{"user": {"login": "jorisvandenbossche", "company": None}}                                                       1190
{"user": {"login": "TomAugspurger", "company": "@ContinuumIO"}}                                                   659
{"user": {"login": "cpcloud", "company": "@twosigma"}}                                                            596
{"user": {"login": "adamklein", "company": "TwoSigma Investments"}}                                               303
{"user": {"login": "jtratner", "company": None}}                                                                  248
{"user": {"login": "sinhrks", "company": None}}         

In [8]:
def convert_to_dict(series: pd.Series) -> pd.Series:
    dicts_array = []
    for observation in series:
        try:
            dicts_array.append(json.loads(observation))
        except:
            pass

    return pd.Series(dicts_array)

In [9]:
commits['associatedPullRequests'] = convert_to_dict(commits['associatedPullRequests'])
commits['committer'] = convert_to_dict(commits['committer'])
commits['status'] = convert_to_dict(commits['status'])

In [10]:
commits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20036 entries, 0 to 20035
Data columns (total 6 columns):
associatedPullRequests    19316 non-null object
committedDate             20036 non-null object
committer                 6352 non-null object
messageHeadline           20036 non-null object
oid                       20036 non-null object
status                    13672 non-null object
dtypes: object(6)
memory usage: 939.3+ KB


In [11]:
commits['committer'].value_counts()

{'user': {'login': 'wesm', 'company': '@ursa-labs / @rstudio'}}                                                                  3744
{'user': {'login': 'TomAugspurger', 'company': '@ContinuumIO'}}                                                                   659
{'user': {'login': 'cpcloud', 'company': '@twosigma'}}                                                                            596
{'user': {'login': 'adamklein', 'company': 'TwoSigma Investments'}}                                                               303
{'user': {'login': 'changhiskhan', 'company': '@Tubitv '}}                                                                        206
{'user': {'login': 'WillAyd', 'company': 'innobi'}}                                                                               175
{'user': {'login': 'orbitfold', 'company': 'Leibniz Rechenzentrum'}}                                                               98
{'user': {'login': 'shoyer', 'company': '@google '}}          

In [12]:
type(commits['associatedPullRequests'][0]) == dict

True

#### Remove all non-Dict (malformed) Entries

In [13]:
def remove_non_dicts(series) -> pd.Series:
    series_array = series.tolist()
    for observation in series_array:
        if type(observation) != dict:
            series.remove(observation)
        

In [14]:
(type(commits.associatedPullRequests.values) != dict)

True

In [15]:
# Filter non-dict types out of DF for all dict-type cols
pass

#### Convert Dict Keys to DataFrame Columns

In [None]:
commits['assoc_PR_id']
commits['assoc_PR_title']
commits['assoc_PR_author']
commits['assoc_PR_author_association']
commits['assoc_PR_createdAt']
commits['assoc_PR_updatedAt']
commits['assoc_PR_number']
commits['assoc_PR_state']