In [None]:
import pandas as pd
import pickle
import numpy as np

### Load responses dict into dataframe, preliminary processing, indexing

In [None]:
# Load the pickled response data from the current working directory.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open 'patch_responses.pickle' when it comes from a trusted source.
with open('patch_responses.pickle', 'rb') as handle:
    data = pickle.load(handle)
    

In [None]:
# Build the working frame from the unpickled object. Later cells read the
# columns 'cluster_id', 'patch_id', 'upstream' and 'responses' from it.
response_df = pd.DataFrame(data)

In [None]:
response_df.head()

##### Data sanity checks and tests, primarily checking the number of patch ids and upstream commits (among other tests on intermediate dataframes)

In [None]:
response_df.patch_id.nunique()

In [None]:
# Turn every 'upstream' cell into a plain list so that len() and the later
# apply(pd.Series) expansion can operate on it.
# NOTE(review): presumably the pickled values are sets/tuples of commit ids — confirm.
response_df['upstream'] = response_df['upstream'].map(list)

In [None]:
# Temporary helper column: number of upstream entries per row. It only feeds
# the value_counts() check below and is deleted again right after.
response_df['upstream_counts'] = response_df['upstream'].map(len)

In [None]:
response_df.upstream_counts.value_counts()

In [None]:
del response_df['upstream_counts']

In [None]:
def get_resp_msg_id(list_dicts):
    """Collect the ``resp_msg_id`` of every response dict in ``list_dicts``.

    Args:
        list_dicts: Iterable of dicts that each carry a ``'resp_msg_id'`` key,
            or a missing value (``None``, an empty container, or the float
            NaN that pandas uses for absent cells).

    Returns:
        list: The ``resp_msg_id`` values in input order; an empty list when
        there is nothing to collect.

    Raises:
        KeyError: If one of the dicts has no ``'resp_msg_id'`` key.
    """
    # NaN is truthy but not iterable, so it needs its own check next to the
    # falsy None / empty-container cases.
    if not list_dicts or isinstance(list_dicts, float):
        return []
    return [d['resp_msg_id'] for d in list_dicts]

In [None]:
# Temporary column holding the list of response message ids of each row;
# the function is passed directly, no lambda wrapper is needed.
response_df['response_ids'] = response_df['responses'].apply(get_resp_msg_id)

##### Some more checks: unique response message ids

In [None]:
# Flatten the per-row id lists into one flat Python list.
# explode() emits one row per list element (NaN for an empty list, which
# dropna() removes again) and avoids building a Series object for every row
# as apply(pd.Series).stack() did.
response_ids = response_df['response_ids'].explode().dropna().tolist()

In [None]:
total_unique_response_ids = len(set(response_ids))
total_unique_response_ids

In [None]:
del response_df['response_ids']

In [None]:
# Index the frame by (cluster_id, patch_id); the remaining columns are the
# payload that the following cells denormalize.
df_a = response_df.set_index(['cluster_id', 'patch_id'])
print(df_a.columns)
df_a.head()

In [None]:
df_a.reset_index(inplace=False).patch_id.nunique()

### Denormalize the dataframe by cluster id and patch id, duplicating corresponding rows for responses and upstream -- doing sanity checks all the way

In [None]:
# Long format for responses: spread each 'responses' list over columns, then
# melt back so every (cluster_id, patch_id, position) becomes its own row.
# Positions beyond a row's list length show up as missing values.
df_a1 = (
    df_a['responses']
    .apply(pd.Series)
    .reset_index()
    .melt(id_vars=['cluster_id', 'patch_id'], value_name='responses')
    .sort_index()
)

In [None]:
df_a1.patch_id.nunique()

In [None]:
df_a1.responses.count()

In [None]:
# Long format for upstream commits: same spread-then-melt step as used for
# the responses, applied to the 'upstream' lists.
df_a2 = (
    df_a['upstream']
    .apply(pd.Series)
    .reset_index()
    .melt(id_vars=['cluster_id', 'patch_id'], value_name='upstream')
    .sort_index()
)

In [None]:
df_a2.patch_id.nunique()

In [None]:
df_a2.upstream.count()

#### Merge and check

In [None]:
# Columns present only in the responses frame. Printed for inspection; the
# merge below does not use this value.
cols_to_use = df_a1.columns.difference(df_a2.columns)
print(cols_to_use)
# Join responses (left, suffix _x) with upstream commits (right, suffix _y)
# on cluster_id and keep only the right-hand patch_id / variable columns.
# NOTE(review): joining on cluster_id alone pairs every response of a cluster
# with every (patch, upstream) row of that cluster, and the response's own
# patch_id is dropped — confirm that cluster-level attribution is intended.
df_aa = df_a1.merge(df_a2, on='cluster_id').drop(columns=['patch_id_x', 'variable_x'])

In [None]:
df_aa.head(3)

In [None]:
df_aa.patch_id_y.nunique()

In [None]:
df_aa.upstream.nunique()

In [None]:
# Expand each response dict into its own set of columns (one column per key).
df_aaa = df_aa['responses'].apply(pd.Series)

In [None]:
df_aaa.resp_msg_id.nunique()

#### The next two shapes should have an equal number of rows

In [None]:
df_aaa.shape

In [None]:
df_aa.shape

In [None]:
# Put the expanded response fields next to the merged frame, aligned on the
# shared row index.
df_final = pd.concat([df_aa, df_aaa], axis='columns')

In [None]:
df_final.head(3)

#### Rename merge column

In [None]:
# Drop the merge suffix: the surviving patch id column is the right-hand one.
df_final = df_final.rename(columns={"patch_id_y": "patch_id"})

In [None]:
#### Some more random exploration

In [None]:
df_final.loc[:, 'resp_msg_id'][0]

In [None]:
# Pick the 'message' cell of the row labelled 0 as a sample for the
# exploration cells below.
message = df_final.loc[0, 'message']
message

In [None]:
import email

def _get_message_field(msg, field):
    if msg:
        return email.message_from_bytes(msg[0])[field]
    else:
        return None

In [None]:
_get_message_field(message, 'from')

In [None]:
# Copy of df_final in which missing cells are replaced by None, so the
# truthiness test in _get_message_field treats them as "no message".
df_final_prime = df_final.where(df_final.notna(), None)

In [None]:
# Extract the 'from' header of every response message; rows without a
# message get None.
df_final_prime['response_author'] = df_final_prime['message'].map(
    lambda raw: _get_message_field(raw, 'from')
)

In [None]:
df_final_prime['response_author'].value_counts()

In [None]:
# Number of unique authors
df_final_prime['response_author'].nunique()

In [None]:
df_final_prime.head()

In [None]:
# Number of unique patches
df_final_prime['patch_id'].nunique()

In [None]:
# Number of unique commits
df_final_prime['upstream'].nunique()

## Number of email responses linked to patches

In [None]:
# Number of rows with a response author per patch id (count() skips missing authors).
df_final_prime.groupby('patch_id')['response_author'].count()

# The above gives the total response count. For the number of unique authors use
# (on df_final_prime, since df_final has no 'response_author' column):
#df_final_prime.groupby('patch_id')['response_author'].nunique()

In [None]:
df_final_prime.groupby(['patch_id', 'upstream'])['response_author'].count()

In [None]:
from collections import Counter

# One summary row per upstream commit.
df_response_summary = df_final_prime.groupby(
    ['upstream']
).agg(
    # distinct patch ids linked to the commit (built-in aggregation, no lambda needed)
    related_patch_count=('patch_id', 'nunique'),
    # number of rows that have a response author
    num_responses=('response_author', 'count'),
    # {author: number of responses} for the commit, missing authors excluded
    responding_authors=(
        'response_author',
        lambda authors: dict(Counter(authors[authors.notna()].tolist())),
    ),
)

In [None]:
# Promote the 'upstream' group key from the index to a regular column.
# Guarded so that re-running this cell does not insert a spurious 'index'
# column into the already-reset frame.
if df_response_summary.index.name == 'upstream':
    df_response_summary = df_response_summary.reset_index()

In [None]:
df_response_summary.head()

In [None]:
df_response_summary[df_response_summary['num_responses'] > 0]

#### Example how message content (bytes) can be explored

In [None]:
# Parse the first element of the sample 'message' cell (raw bytes) into an
# email message object for interactive inspection.
msg = email.message_from_bytes(message[0])

In [None]:
msg.keys()

In [None]:
# Keep the body for the next cell, then end on the header lookup: only the
# last expression of a cell is displayed, so the lookup was silently
# discarded when it came first.
payload = msg.get_payload()
msg['From']

In [None]:
payload