In [None]:
import pandas as pd
import pickle
import numpy as np

### Load responses dict into dataframe, preliminary processing, indexing

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# 'patch_responses.pickle' when it comes from a trusted source.
with open('patch_responses.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
    

In [None]:
response_df = pd.DataFrame(data)

In [None]:
response_df.head()

##### Data sanity checks, primarily on the number of patch ids and upstream commits (plus checks on intermediate dataframes)

In [None]:
response_df.patch_id.nunique()

In [None]:
response_df['upstream'] = response_df['upstream'].map(list)

In [None]:
response_df['upstream_counts'] = response_df['upstream'].apply(lambda x: len(x))

In [None]:
response_df.upstream_counts.value_counts()

In [None]:
del response_df['upstream_counts']

In [None]:
def get_resp_msg_id(list_dicts):
    """Collect the 'resp_msg_id' value of every response dict.

    Parameters
    ----------
    list_dicts : iterable of dict, or a falsy value (e.g. None, empty list)
        Response records; each record must have a 'resp_msg_id' key.

    Returns
    -------
    list
        The 'resp_msg_id' values in input order, or an empty list when
        ``list_dicts`` is falsy.
    """
    if not list_dicts:
        return []
    return [response['resp_msg_id'] for response in list_dicts]

In [None]:
response_df['response_ids'] = response_df['responses'].apply(lambda x: get_resp_msg_id(x))

##### Some more checks: unique response message ids

In [None]:
# Flatten the per-patch id lists into one flat list. explode() yields one row per
# id (NaN for an empty list, removed by dropna()) without building the wide,
# NaN-padded intermediate frame that .apply(pd.Series).stack() requires.
response_ids = response_df['response_ids'].explode().dropna().tolist()

In [None]:
total_unique_response_ids = len(set(response_ids))
total_unique_response_ids

In [None]:
del response_df['response_ids']

In [None]:
response_df.set_index(['cluster_id', 'patch_id'], inplace=True)
response_df.columns
response_df.head()

In [None]:
response_df.reset_index(inplace=False).patch_id.nunique()

In [None]:
response_df.head()

### Denormalize the dataframe by cluster id and patch id, duplicating the corresponding rows for responses and upstream commits, with sanity checks along the way

In [None]:
# Expand each patch's responses into positional columns, bring cluster_id/patch_id
# back as regular columns, then melt to long form: one row per (patch, position).
# The position ends up in the 'variable' column, the entry itself in 'responses'.
df_melt_responses = (
    response_df.responses.apply(pd.Series)
    .reset_index()
    .melt(id_vars=['cluster_id', 'patch_id'], value_name='responses')
    .sort_index()
)

In [None]:
df_melt_responses.head()

In [None]:
df_melt_responses.patch_id.nunique()

In [None]:
df_melt_responses.responses.count()

In [None]:
# Expand each patch's upstream entries into positional columns, bring
# cluster_id/patch_id back as regular columns, then melt to long form: one row per
# (patch, position). The position ends up in 'variable', the entry in 'upstream'.
df_melt_upstream = (
    response_df.upstream.apply(pd.Series)
    .reset_index()
    .melt(id_vars=['cluster_id', 'patch_id'], value_name='upstream')
    .sort_index()
)

In [None]:
df_melt_upstream.patch_id.nunique()

In [None]:
df_melt_upstream.upstream.count()

#### Merge and check (in memory)

In [None]:
# Use this merge when working locally/ smaller dataframes
#df_combined = pd.merge(df_melt_responses, df_melt_upstream, how='left', on=['cluster_id', 'patch_id']).drop(['variable_x', 'variable_y'], axis=1)

In [None]:
#df_combined.head(3)

In [None]:
#df_combined.cluster_id.nunique()

In [None]:
#df_combined.patch_id.nunique()

In [None]:
#df_combined.upstream.nunique()

In [None]:
#df_combined_responses = df_aa.responses.apply(pd.Series)

In [None]:
#df_combined_responses.head()

#### Merge in chunks to avoid running out of memory (the responses are reloaded from CSV in chunks)

In [None]:
df_melt_responses.drop('variable', axis=1, inplace=True)

In [None]:
df_melt_responses.fillna({'patch_id':'missing_patch_id'}, inplace=True)

In [None]:
df_melt_upstream.drop('variable', axis=1, inplace=True)

In [None]:
df_melt_upstream.fillna({'patch_id':'missing_patch_id','upstream':'missing_upstream'}, inplace=True)

In [None]:
# Create an empty dataframe that only carries the combined column names and write
# it out, so that "df_chunk_combined.csv" starts with a header row.
combined_columns = df_melt_upstream.columns.append(df_melt_responses.columns).unique()
df_chunk_combined = pd.DataFrame(columns=combined_columns)
df_chunk_combined.to_csv("df_chunk_combined.csv", index_label=False)

In [None]:
df_melt_responses.to_csv("df_melt_responses.csv")

In [None]:
#df_melt_responses.dtypes

In [None]:
#df_melt_responses.tail()

In [None]:
del(df_melt_responses)

In [None]:
def process_chunk(x):
    """Merge one chunk with the upstream table and append the result to disk.

    Right-merges the module-level ``df_melt_upstream`` with ``x`` on
    ('cluster_id', 'patch_id'), drops duplicate rows and appends the result to
    "df_chunk_combined.csv" without header or index.

    Parameters
    ----------
    x : pandas.DataFrame
        Chunk that has 'cluster_id' and 'patch_id' columns.

    Notes
    -----
    A ValueError raised while merging or writing is not propagated; the patch
    and cluster ids of the chunk are printed instead.
    """
    try:
        merged_chunk = pd.merge(
            df_melt_upstream, x, how='right', on=['cluster_id', 'patch_id']
        )
        deduplicated_chunk = merged_chunk.drop_duplicates()
        deduplicated_chunk.to_csv(
            "df_chunk_combined.csv", mode="a", header=False, index=False
        )
    except ValueError:
        print("problem! at the following")
        print("patch_ids: {}".format(x.patch_id.tolist()))
        print("cluster_ids: {}".format(x.cluster_id.tolist()))

In [None]:
# Read csv in chunks
reader = pd.read_csv("df_melt_responses.csv", chunksize=25000, header=0, index_col=0) # chunksize depends on your colsize

In [None]:
df_melt_upstream.dtypes

In [None]:
for r in reader:
    process_chunk(r)

In [None]:
del(df_melt_upstream)

In [None]:
df_chunk_combined = pd.read_csv("df_chunk_combined.csv")

In [None]:
df_chunk_combined.head(5)

In [None]:
#df_chunk_combined[pd.isnull(df_chunk_combined.patch_id) == True]

In [None]:
# This includes the NaN patch_id: 'missing_patch_id'
df_chunk_combined.patch_id.nunique()

In [None]:
df_chunk_combined.drop_duplicates().shape

In [None]:
from ast import literal_eval

def try_literal_eval(s):
    """Parse ``s`` as a Python literal, falling back to ``s`` itself.

    Parameters
    ----------
    s : object
        Usually a string holding the repr of a Python literal (dict, list, ...).

    Returns
    -------
    object
        The evaluated literal, or ``s`` unchanged when it cannot be evaluated.
    """
    try:
        return literal_eval(s)
    except (ValueError, SyntaxError):
        # literal_eval raises ValueError for non-literal content and for
        # non-string input such as NaN, and SyntaxError for text that is not
        # valid Python (e.g. a truncated cell). Either way keep the raw value
        # rather than aborting the whole column conversion.
        return s
    
df_chunk_combined['responses'] = df_chunk_combined['responses'].map(try_literal_eval)

In [None]:
# Separate response dict keys
df_combined_responses = df_chunk_combined.responses.apply(pd.Series)

In [None]:
# Remove columns where all values are NaN/missing
df_combined_responses.dropna(how='all', axis='columns', inplace=True)

In [None]:
df_combined_responses.shape

In [None]:
df_chunk_combined.shape

In [None]:
df_final = pd.concat([df_chunk_combined, df_combined_responses], axis=1)

In [None]:
df_final.shape

In [None]:
df_final.head(3)

#### Rename merge column

In [None]:
#df_final.rename(columns={"patch_id_y": "patch_id"}, inplace=True)

In [None]:
#### Some more random exploration

In [None]:
#df_final.loc[:, 'resp_msg_id'][0]

In [None]:
#message = df_final.loc[:, 'message'][0]
#message

In [None]:
import email

def _get_message_field(msg, field):
    if msg:
        return email.message_from_bytes(msg[0])[field]
    else:
        return None

In [None]:
# `message` is only assigned in a commented-out cell, so referencing it fails with
# NameError on a fresh kernel. Use the first non-null 'message' entry instead.
sample_messages = df_final['message'].dropna()
_get_message_field(sample_messages.iloc[0] if len(sample_messages) else None, 'from')

In [None]:
df_final_prime = df_final.where(pd.notnull(df_final), None)

In [None]:
df_final_prime['response_author'] = df_final_prime['message'].apply(lambda x: _get_message_field(x, 'from'))

In [None]:
df_final_prime['response_author'].value_counts()

In [None]:
# Number of unique authors
df_final_prime['response_author'].nunique()

In [None]:
df_final_prime.head()

In [None]:
# Number of unique patches
df_final_prime['patch_id'].nunique()

In [None]:
# Number of unique commits
df_final_prime['upstream'].nunique()

In [None]:
df_final_prime.to_csv('df_final_prime.csv', encoding='utf-8', index=False)

## Number of email responses linked to patches

In [None]:
df_final_prime.groupby('patch_id')['response_author'].count()

# The above counts the non-null response authors (i.e. responses) per patch.
# For the number of unique authors per patch use the following instead
# ('response_author' only exists on df_final_prime):
#df_final_prime.groupby('patch_id')['response_author'].nunique()

In [None]:
df_final_prime.groupby(['patch_id', 'upstream'])['response_author'].count()

In [None]:
from collections import Counter

def process_patch_ids(x):
    """De-duplicate patch ids and drop the 'missing_patch_id' placeholder.

    Parameters
    ----------
    x : iterable of hashable
        Patch ids, possibly repeated.

    Returns
    -------
    list
        Unique ids in order of first appearance, without 'missing_patch_id'.
    """
    return [
        patch_id
        for patch_id in dict.fromkeys(x)
        if patch_id != 'missing_patch_id'
    ]

def process_responding_authors(x):
    """Count how often each non-null author occurs, most frequent first.

    Parameters
    ----------
    x : pandas.Series
        Author values; null entries are ignored.

    Returns
    -------
    list of (author, count) tuples
        Sorted by count in descending order; authors with equal counts keep
        their order of first appearance.
    """
    author_counts = Counter(x.dropna().tolist())
    return author_counts.most_common()

# Build one summary row per upstream commit.
df_response_summary = df_final_prime.groupby(['upstream']).agg(
    # list all related patches
    patches=pd.NamedAgg(
        column='patch_id',
        aggfunc=lambda patch_ids: process_patch_ids(patch_ids.tolist()),
    ),
    # get unique patch counts per upstream commit
    related_patch_count=pd.NamedAgg(
        column='patch_id',
        aggfunc=lambda patch_ids: len(process_patch_ids(patch_ids.tolist())),
    ),
    # get response counts per upstream commit
    num_responses=pd.NamedAgg(column='response_author', aggfunc="count"),
    # get responding authors per upstream commit, each with their number of responses
    responding_authors=pd.NamedAgg(
        column='response_author',
        aggfunc=lambda authors: process_responding_authors(authors),
    ),
)

In [None]:
df_response_summary.reset_index(inplace=True)

In [None]:
df_response_summary.to_csv('df_response_summary.csv', encoding='utf-8', index=False)

In [None]:
#df_response_summary.tail()

In [None]:
#df_response_summary[df_response_summary['num_responses'] > 0]

In [None]:
#df_response_summary.loc[233, 'responding_authors']

#### Example of how message content (bytes) can be explored

In [None]:
#msg = email.message_from_bytes(message[0])

In [None]:
#msg.keys()

In [None]:
#msg['From']
#payload = msg.get_payload()

In [None]:
#payload