In [None]:
# !git submodule update --recursive --remote

In [None]:
# The cells below shell out to the repository-local `pasta` executable, one
# sub-command per cell, in the order given here.
# NOTE(review): what each sub-command produces is not visible from this
# notebook — presumably they refresh the mailbox data and build the inputs
# that the Python cells further down load; confirm against the PaStA docs.
!./pasta sync -mbox

In [None]:
# First analysis pass (`analyse rep`) ...
!./pasta analyse rep

In [None]:
# ... followed by a `rate` run.
!./pasta rate

In [None]:
# Second analysis pass (`analyse upstream`) ...
!./pasta analyse upstream

In [None]:
# ... followed by a second `rate` run (same command as two cells above).
!./pasta rate

In [None]:
# `prepare_evaluation` is invoked twice: once with --review, once with --ignored.
!./pasta prepare_evaluation --review

In [None]:
!./pasta prepare_evaluation --ignored

In [None]:
import pandas as pd

from pypasta.Config import Config

# Project configuration for the 'linux' project; later cells read file paths
# (e.g. `f_responses_pkl`) and the repository handle from this object.
config = Config('linux')

In [None]:
import pickle

# Load the pickled responses from the path given by the configuration and wrap
# them in a DataFrame.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only run
# this against a pickle produced by a trusted run of the pipeline.
with open(config.f_responses_pkl, 'rb') as handle:
    response_df = pd.DataFrame(pickle.load(handle))
# Display the first rows as the cell output.
response_df.head()

In [None]:
# Convert every `upstream` entry to a list and name the row index `idx`, so
# that it shows up under that name after reset_index() below.
response_df = response_df.assign(upstream=response_df['upstream'].map(list)).rename_axis("idx")

# Rows without a patch id get the placeholder '_' (filtered out again later).
response_df = response_df.fillna({'patch_id': '_'})
print("Filled NA for patch_id")

# Index becomes the pair (idx, patch_id).
response_df = response_df.set_index(['patch_id'], append=True)
print("Done setting index for response_df")

# Denormalize: expand each `responses` entry into one column per element, then
# melt those columns back into rows keyed by (idx, patch_id).
df_melt_responses = (
    response_df['responses']
    .apply(pd.Series)
    .reset_index()
    .melt(id_vars=['idx', 'patch_id'], value_name='responses')
    .sort_index()
    .drop(columns='variable')
)

print("melt_responses_shape {}".format(df_melt_responses.shape))

In [None]:
import flat_table

# Flatten the dict/list payload of every response into plain columns, drop the
# helper `index` column that comes back from flat_table, and de-duplicate.
df_denorm_responses = (
    flat_table.normalize(df_melt_responses, expand_dicts=True, expand_lists=True)
    .drop(columns='index')
    .drop_duplicates()
)
print("Computed de-normalized responses, writing to disk...")
df_denorm_responses.to_csv('resources/linux/resources/df_denorm_responses.csv', index=False)
print("Processed responses!")

# Same reshaping for the upstream entries: one row per (idx, patch_id, upstream).
df_melt_upstream = (
    response_df['upstream']
    .apply(pd.Series)
    .reset_index()
    .melt(id_vars=['idx', 'patch_id'], value_name='upstream')
    .sort_index()
    .drop(columns='variable')
    .drop_duplicates()
)

df_melt_upstream.to_csv('resources/linux/resources/df_denorm_upstream.csv', index=False)
print("Processed upstream!")

In [None]:
import dask.dataframe as dd

def try_literal_eval(s):
    """Parse ``s`` as a Python literal, falling back to ``s`` itself.

    Args:
        s: Usually a string such as ``"[1, 2]"`` read back from a CSV file.

    Returns:
        The evaluated literal (list, dict, number, ...) when ``s`` is a valid
        Python literal, otherwise ``s`` unchanged.
    """
    # Imported here because no cell of the notebook imports `literal_eval`;
    # without this the function raised NameError on its first call.
    from ast import literal_eval

    try:
        return literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # ValueError: well-formed expression that is not a literal ("abc").
        # SyntaxError: text that is not an expression at all ("a b", "").
        # TypeError: e.g. a dict literal with an unhashable key.
        return s

def _get_message_field(msg, field):
    if not (np.all(pd.isnull(msg))):
        return email.message_from_bytes(msg)[field]
    else:
        return None

# Index both frames on `idx` so the merge below can join on the index.
dd1 = df_denorm_responses.set_index(['idx'])

dd2 = df_melt_upstream.set_index(['idx'])

# Left join: keep every response row and attach the upstream rows that share
# its idx. Both sides carry `patch_id`, hence the _x/_y suffixes; only the
# left one is kept.
df_dask_final = dd.merge(dd1, dd2, left_index=True, right_index=True, how='left') \
    .drop(['patch_id_y'], axis=1) \
    .reset_index(drop=True) \
    .rename(columns={"patch_id_x": "patch_id"})

df_dask_final.to_csv("resources/linux/resources/df_dask_final.csv")

# Read the merged data back through dask. The written index comes back as the
# 'Unnamed: 0' column and is dropped again.
# The dtype mapping used to contain the keys "idx " and "patch_id " (with a
# trailing space); they matched no CSV column and were removed as dead entries.
final = dd.read_csv("resources/linux/resources/df_dask_final.csv", blocksize=50e7,
                    dtype={"responses.resp_msg_id": "category",
                           "responses.parent": "category",
                           "upstream": "category"}).drop('Unnamed: 0', axis=1)

# `final.shape` on a dask frame only yields a lazy row count, so report the
# shape of the frame that was just written to the CSV instead.
print("Final shape with possible duplicate rows {}".format(df_dask_final.shape))

# dask frames are not modified in place: the de-duplicated frame has to be
# rebound, otherwise the duplicates survive.
final = final.drop_duplicates()

# Convert to pandas
df_pd_final = final.compute()

# Remove rows with no patch and other infos.
# A boolean mask is used instead of drop(index_labels): dask's read_csv restarts
# the index in every partition, so after compute() index labels can repeat and a
# label-based drop would also remove unrelated rows that share a label.
no_patch_no_upstream = (df_pd_final['patch_id'] == '_') & (df_pd_final['upstream'].isna())
df_pd_final = df_pd_final[~no_patch_no_upstream]

print("Final shape after removing duplicates {}".format(df_pd_final.shape))

# df_pd_final.to_csv(config.f_merged_responses_upstream, index=False)
# print("Finished writing de-duplicated pandas merged dataframe to disk")

final = dd.from_pandas(df_pd_final, npartitions=20)

# Display only: the result of this expression is the cell output and is not
# assigned to anything.
final.reset_index().rename(columns={'index': 'idx'}).compute()

In [None]:
# Persist the merged frame as one CSV file; a later cell re-reads it from this
# exact path.
final.to_csv('resources/linux/resources/final.csv', single_file=True)

In [None]:
# The following cells unbind the intermediate frames, which are not referenced
# again by the cells below.
del df_pd_final

In [None]:
del df_denorm_responses

In [None]:
del df_melt_upstream

In [None]:
del df_dask_final

In [None]:
import dask.dataframe as dd

# Re-read the merged frame written by the `final.to_csv(...)` cell above; the
# 'Unnamed: 0' column is dropped right after reading.
# NOTE(review): the dtype keys "idx " and "patch_id " end with a space, so they
# presumably match no column and have no effect. Confirm before "fixing" them:
# turning patch_id into a categorical would change how it compares against the
# categorical `responses.parent` column in a later cell.
final = dd.read_csv('resources/linux/resources/final.csv', blocksize=50e7,
                        dtype={"idx ": "int32",
                               "patch_id ": "category",
                               "responses.parent": "category",
                               "upstream": "category",
                               "response_author": "category"}).drop('Unnamed: 0', axis=1)
# Display the first rows as the cell output.
final.head()

In [None]:
from pypasta import Config
from pypasta.Repository.Mbox import Mbox

config = Config('linux')

# Attach the mailbox to the repository object and load its thread data.
repo = config.repo
repo.register_mbox(config)
repo.mbox.load_threads()

# Collect the distinct patch ids, leaving out the '_' placeholder (it comes
# from upstreams that were not mapped to any patch emails).
all_patch_ids = final['patch_id'].unique().compute()
unique_patches = set(all_patch_ids) - {'_'}

In [None]:
from fuzzywuzzy import fuzz
import email
import chardet

def get_relevant_patches(characteristics):
    """Return the ids of all messages that count as a 'relevant patch'.

    First, we have to define the term 'relevant patch'. For our analysis, we
    must only consider patches that either fulfil rule 1 or 2:

    1. Patch is the parent of a thread.
       This covers classic one-email patches

    2. Patch is the 1st level child of the parent of a thread
       In this case, the parent can either be a patch (e.g., a series w/o
       cover letter) or not a patch (e.g., parent is a cover letter)

    and that additionally satisfy:

    3. The patch must not be sent from a bot (e.g., tip-bot)

    4. Ignore stable review patches

    All other patches MUST be ignored. Rationale: Maintainers may re-send
    the patch as a reply of the discussion. Such patches must be ignored.
    Example: Look at the thread of
        <20190408072929.952A1441D3B@finisterre.ee.mobilebroadband>

    Furthermore, only consider patches that actually patch Linux (~14% of all
    patches on Linux MLs patch other projects). Then only consider patches
    that are not for next, not from bots (there are a lot of bots) and that
    are no 'process mails' (e.g., pull requests)

    Args:
        characteristics: Mapping of message id to an object exposing the
            boolean attributes ``is_patch``, ``patches_linux``,
            ``is_first_patch_in_thread``, ``is_from_bot``,
            ``is_stable_review``, ``process_mail`` and ``is_next``.

    Returns:
        The set of message ids for which none of the skip rules applies.
        A summary of all counters is printed as a side effect.
    """
    # (summary label, predicate that is true when the patch must be skipped).
    # Order matters only for the printed summary.
    skip_rules = (
        ('Not Linux', lambda c: not c.patches_linux),
        ('Not first patch in thread', lambda c: not c.is_first_patch_in_thread),
        ('Bot', lambda c: c.is_from_bot),
        ('Stable', lambda c: c.is_stable_review),
        ('Process mail', lambda c: c.process_mail),
        ('Next', lambda c: c.is_next),
    )
    skipped = {label: 0 for label, _ in skip_rules}

    relevant = set()
    all_messages = 0
    skipped_no_patch = 0

    for message_id, c in characteristics.items():
        all_messages += 1

        # Non-patches are counted separately and never checked against the rules.
        if not c.is_patch:
            skipped_no_patch += 1
            continue

        # Evaluate every rule (no short-circuit): one patch may be counted
        # under several skip reasons, so the counters can overlap.
        violated = [label for label, must_skip in skip_rules if must_skip(c)]
        for label in violated:
            skipped[label] += 1

        if not violated:
            relevant.add(message_id)

    print('')
    print('=== Calculation of relevant patches ===')
    print('All messages: %u' % all_messages)
    print('  No patches: %u' % skipped_no_patch)
    print('Skipped patches:')
    # Includes the 'Not first patch in thread' counter, which used to be
    # collected but was missing from the summary.
    for label, _ in skip_rules:
        print('  %s: %u' % (label, skipped[label]))
    print('Relevant patches: %u' % len(relevant))

    return relevant

def _is_response_from_bot(message):
    lmc = LinuxMailCharacteristics(repo, None, None, message)
    flag = lmc.is_from_bot
    return message, flag

def check_person_duplicates(patch_id, resp_msg_id, author1, author2):
    """Decide whether two ``(name, email)`` pairs denote the same person.

    The pairs match when the e-mail addresses are equal, when the names are
    equal, or when the names reach a fuzzy token-sort ratio of at least 80.
    ``patch_id`` and ``resp_msg_id`` are only used in the diagnostic message.

    Returns:
        ``True``/``False`` as described above; ``False`` (after printing the
        error) if anything goes wrong, e.g. an author that is not a pair.
    """
    try:
        (name1, email1), (name2, email2) = author1, author2
        if email1 == email2 or name1 == name2:
            return True
        similarity = fuzz.token_sort_ratio(name1, name2)
        return similarity >= 80
    except Exception as err:
        print(err)
        print("Error parsing authors for patch id {} and response {}: author1 {} and author2 {}"
                 .format(patch_id, resp_msg_id, author1, author2))
        return False

In [None]:
from pypasta.LinuxMailCharacteristics import load_linux_mail_characteristics, email_get_from, LinuxMailCharacteristics
from multiprocessing import Pool, cpu_count


# Load the cluster data and the per-patch mail characteristics for every
# patch id seen in `final`.
_, clustering = config.load_cluster()
clustering.optimize()

patch_characteristics = load_linux_mail_characteristics(config, None, clustering, unique_patches)

# Consider only relevant patches (as per given definition of relevance)
relevant_patches = get_relevant_patches(patch_characteristics)
is_relevant_patch = final['patch_id'].isin(relevant_patches)
final_filtered_1 = final[is_relevant_patch]

# Filter responses -- only responses to the patch itself count as a response,
# and not the rest of the thread emails
replies_to_patch_itself = final_filtered_1['patch_id'] == final_filtered_1['responses.parent']
final_filtered_2 = final_filtered_1[replies_to_patch_itself]

In [None]:
# Classify every distinct response message (bot / not bot) in parallel.
# maxtasksperchild=1 makes each worker process exit after one task (here: one
# chunk of up to 1000 message ids) and be replaced by a fresh one.
p1 = Pool(processes=cpu_count(), maxtasksperchild=1)
try:
    response_to_bot = p1.map(_is_response_from_bot,
                             list(final_filtered_2['responses.resp_msg_id'].unique().compute()),
                             chunksize=1000)
finally:
    # Always shut the pool down, also when computing the ids or one of the
    # workers raised; otherwise the worker processes are left behind.
    p1.close()
    p1.join()

In [None]:
import pandas as pd

# One row per response message id with the bot flag computed by the pool above.
response_bot_df = pd.DataFrame(response_to_bot, columns=['responses.resp_msg_id', 'response_is_bot'])

# Attach the flag to every response row.
final_filtered_2 = dd.merge(final_filtered_2, response_bot_df, how='left', on=['responses.resp_msg_id'])

# If the left frame already had a `response_is_bot` column (e.g. this cell was
# run before), the merge yields `_x`/`_y` variants; keep the freshly merged one.
if 'response_is_bot_x' in final_filtered_2.columns:
    final_filtered_2 = final_filtered_2.drop(['response_is_bot_x'], axis=1) \
        .rename(columns={"response_is_bot_y": "response_is_bot"})

# Remove duplicate rows with response message id, upstream, and patch_id (artifact of denormalization?)
final_dedup = final_filtered_2.drop_duplicates(subset=['responses.resp_msg_id', 'upstream', 'patch_id'],
                                               keep='first')

# Rename some columns, removing the 'responses.' prefix to simplify dataframe Series ops
# NOTE(review): the rename is positional — zip() pairs the current columns with
# `new_columns` in order and silently stops at the shorter of the two. It is
# only correct if the frame has exactly these six columns in this order; verify
# against the actual column layout.
new_columns = ['patch_id', 'response_author', 'resp_parent', 'resp_msg_id', 'upstream', 'response_is_bot']
final_dedup = final_dedup.rename(columns=dict(zip(final_dedup.columns, new_columns)))

In [None]:
def _decode_name_fragment(fragment, charset):
    """Turn one ``decode_header`` fragment into text, never raising on bad encodings."""
    if isinstance(fragment, str):
        # Header without any encoded word: already text.
        return fragment

    try:
        # Unencoded parts next to encoded words arrive as bytes with
        # charset None; they are expected to be plain ASCII.
        return fragment.decode(charset or 'ascii')
    except (UnicodeDecodeError, LookupError):
        # Wrong or unknown charset label: fall through to detection.
        pass

    guessed = chardet.detect(fragment)['encoding']
    if guessed:
        try:
            return fragment.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            pass

    # Last resort: keep whatever is valid UTF-8.
    return fragment.decode('utf-8', errors='ignore')


def parseaddr_unicode(addr) -> (str, str):
    """Return the display name of an address in unicode instead of RFC 2047 format.

    ('=?UTF-8?B?TmjGoW4gTmd1eeG7hW4=?=', 'abcd@gmail.com') -> ('Nhơn Nguyễn', 'abcd@gmail.com')

    Args:
        addr: A ``(name, e_mail)`` pair, i.e. an address that has already been
            split (for example by ``email.utils.parseaddr``).

    Returns:
        ``(decoded_name, e_mail)``. The e-mail part is passed through
        untouched; an empty or missing name yields ``''``.
    """
    # Import the submodule explicitly: `import email` alone does not guarantee
    # that `email.header` is loaded.
    from email.header import decode_header

    name, e_mail = addr
    if not name:
        return u'', e_mail

    fragments = decode_header(name.strip())
    final_name = u''.join(_decode_name_fragment(fragment, charset)
                          for fragment, charset in fragments)
    return final_name, e_mail

def get_patch_author(message, repo):
    """Return the sender of ``message`` as reported by ``email_get_from``.

    The first entry of ``repo.mbox.get_messages(message)`` is passed to
    ``email_get_from``. If either step fails, the error is printed and
    ``email_get_from`` is applied to ``message`` itself.
    """
    try:
        first_hit = repo.mbox.get_messages(message)[0]
        return email_get_from(first_hit)
    except Exception as err:
        print(err)
    # NOTE(review): this fallback hands the original `message` argument (not a
    # looked-up message) to email_get_from — confirm that is a supported input.
    return email_get_from(message)

def _author_of(message_id):
    """Look up the sender of ``message_id`` in the notebook-level ``repo``."""
    return get_patch_author(message_id, repo)


def _self_response_flags(partition):
    """Row-wise: does the responder appear to be the patch author?"""
    return partition.apply(
        lambda row: check_person_duplicates(row.patch_id, row.resp_msg_id,
                                            row.patch_author, row.responder),
        axis=1)


# Sender of the patch and sender of the response, one column each.
final_dedup['patch_author'] = final_dedup['patch_id'].map(
    _author_of, meta=pd.Series([], dtype=object, name='x'))

final_dedup['responder'] = final_dedup['resp_msg_id'].map(
    _author_of, meta=pd.Series([], dtype=object, name='x'))

# This flag could detect authors responding themselves to the patches, e.g.,
# responses to patches as rest of the patch series (spotted often this case)
final_dedup['self_response'] = final_dedup.map_partitions(
    _self_response_flags, meta=pd.Series([], dtype=object, name='row'))

final_dedup.to_csv('resources/linux/resources/filtered_responses.csv', single_file=True)

print("Written filtered response dataframe to disk, Done!")