## Establish Link Between MailingListDiscussion & Commit


### Imports and Database Connections

In [1]:
import re
from bson.objectid import ObjectId
from pymongo import MongoClient
from mongoengine import connect
from pycoshark.utils import create_mongodb_uri_string

# Connection settings. Both connections below are built from this dict (via
# `uri`), so this is the only place to edit to match your DB credentials.
credentials = {'db_user': '',
               'db_password': '',
               'db_hostname': 'localhost',
               'db_port': 27017,
               'db_authentication_database': '',
               'db_ssl_enabled': False}

# Name of the database used by both the mongoengine models and the raw
# pymongo collection accesses.
database_name = 'smartshark_2_1'

uri = create_mongodb_uri_string(**credentials)

# mongoengine connection: backs the Document models (Message, Commit, Issue, ...).
connect(database_name, host=uri)

# pymongo connection: used for the raw collection queries and updates.
client = MongoClient(uri)
# Accessing database
db = client[database_name]


### Models

In [2]:
from mongoengine import Document, StringField, ListField, DateTimeField, IntField, BooleanField, ObjectIdField, \
    DictField, DynamicField, LongField, EmbeddedDocument, EmbeddedDocumentField, FileField, FloatField
import hashlib


class Project(Document):
    """
    Project class.
    Inherits from :class:`mongoengine.Document`

    Index: #name (the leading ``#`` requests a hashed index on ``name``)

    ShardKey: name

    :property name: (:class:`~mongoengine.fields.StringField`) name of the project; required, unique, at most 200 characters
    """
    meta = {
        'indexes': [
            '#name'
        ],
        'shard_key': ('name', ),
    }

    # PK: name
    # Shard Key: hashed name
    name = StringField(max_length=200, required=True, unique=True)

class Message(Document):
    """
    Message class.
    Inherits from :class:`mongoengine.Document`

    Index: message_id

    ShardKey: message_id, mailing_list_id

    :property message_id: (:class:`~mongoengine.fields.StringField`) id of the message (worldwide unique); required, unique together with mailing_list_id
    :property mailing_list_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.MailingList` to which the message belongs; required
    :property reference_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of messages that are referenced by this message
    :property in_reply_to_id: (:class:`~mongoengine.fields.ObjectIdField`) id of a message to which this message is a reply
    :property from_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the person :class:`~pycoshark.mongomodels.People` who sent this message
    :property to_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of persons :class:`~pycoshark.mongomodels.People` to which this message was sent
    :property cc_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of persons :class:`~pycoshark.mongomodels.People` to which this message was sent (cc)
    :property subject: (:class:`~mongoengine.fields.StringField`) subject of the message
    :property body: (:class:`~mongoengine.fields.StringField`) message text
    :property date: (:class:`~mongoengine.fields.DateTimeField`)  date when the message was sent
    :property patches: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  if patches were applied to the message
    :property external_id: (:class:`~mongoengine.fields.StringField`) issue key (e.g. ``ZOOKEEPER-850``) extracted from the message body or subject; optional, written by ``add_external_id_to_messages``
    """

    meta = {
        'indexes': [
            'message_id'
        ],
        'shard_key': ('message_id', 'mailing_list_id'),
    }

    # PK: message_id
    # Shard Key: message_id, mailing_list_id

    message_id = StringField(required=True, unique_with=['mailing_list_id'])
    mailing_list_id = ObjectIdField(required=True)
    reference_ids = ListField(ObjectIdField())
    in_reply_to_id = ObjectIdField()
    from_id = ObjectIdField()
    to_ids = ListField(ObjectIdField())
    cc_ids = ListField(ObjectIdField())
    subject = StringField()
    body = StringField()
    date = DateTimeField()
    patches = ListField(StringField())
    # Added by this notebook: key of the issue the message talks about.
    # It is what links a message to the issues (and thereby commits).
    external_id = StringField()
    
    
class MailingList(Document):
    """
    MailingList class.
    Inherits from :class:`mongoengine.Document`

    Index: #name (the leading ``#`` requests a hashed index on ``name``)

    ShardKey: name

    :property project_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.Project` to which the mailing list belongs; required
    :property name: (:class:`~mongoengine.fields.StringField`) name of the mailing list; required
    :property last_updated: (:class:`~mongoengine.fields.DateTimeField`) date when the data of the mailing list was last updated in the database
    """
    meta = {
        'indexes': [
            '#name'
        ],
        'shard_key': ('name', ),
    }

    # PK: name
    # Shard Key: hashed name

    project_id = ObjectIdField(required=True)
    name = StringField(required=True)
    last_updated = DateTimeField()


class Commit(Document):
    """
    Commit class.

    Inherits from :class:`mongoengine.Document`.

    Index: vcs_system_id

    ShardKey: revision_hash, vcs_system_id

    :property vcs_system_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.VCSSystem` id to which this commit belongs
    :property revision_hash: (:class:`~mongoengine.fields.StringField`) revision hash for this commit
    :property branches: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of branches to which this commit belongs
    :property parents: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of parents (revision hashes) of this commit
    :property author_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.People` id of the person that authored this commit
    :property author_date: (:class:`~mongoengine.fields.DateTimeField`)  date of the authored commit
    :property author_date_offset: (:class:`~mongoengine.fields.IntField`)  offset for the author date
    :property committer_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.People` id of the person that committed this commit
    :property committer_date: (:class:`~mongoengine.fields.DateTimeField`)  date of the committed commit
    :property committer_date_offset: (:class:`~mongoengine.fields.IntField`)  offset for the committer date
    :property message: (:class:`~mongoengine.fields.StringField`) message of the commit
    :property linked_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`))  :class:`~pycoshark.mongomodels.Issue` ids linked to this commit
    :property code_entity_states: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`))  :class:`~pycoshark.mongomodels.CodeEntityState` code entity states for this commit
    :property labels: (:class:`~mongoengine.fields.DictField`) dictionary of different labels for this commit, is_bugfix etc.
    :property validations: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of different validations on this commit
    :property fixed_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) verified :class:`~pycoshark.mongomodels.Issue` ids linked to this commit
    :property szz_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) verified :class:`~pycoshark.mongomodels.Issue` issues linked by the SZZ algorithm
    :property linked_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of messages whose external_id equals the external_id of an issue in linked_issue_ids; written by ``link_commit_and_message``
    :property fixed_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of messages whose external_id equals the external_id of an issue in fixed_issue_ids; written by ``link_commit_and_message``
    :property szz_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of messages whose external_id equals the external_id of an issue in szz_issue_ids; written by ``link_commit_and_message``
    """

    meta = {
        'indexes': [
            'vcs_system_id',
        ],
        'shard_key': ('revision_hash', 'vcs_system_id'),
    }

    # PK: revision_hash, vcs_system_id
    # Shard Key: revision_hash, vcs_system_id

    vcs_system_id = ObjectIdField(required=True)
    revision_hash = StringField(max_length=50, required=True, unique_with=['vcs_system_id'])
    branches = ListField(StringField(max_length=500), null=True)
    parents = ListField(StringField(max_length=50))
    author_id = ObjectIdField()
    author_date = DateTimeField()
    author_date_offset = IntField()
    committer_id = ObjectIdField()
    committer_date = DateTimeField()
    committer_date_offset = IntField()
    message = StringField()
    linked_issue_ids = ListField(ObjectIdField())
    code_entity_states = ListField(ObjectIdField())
    labels = DictField()
    validations = ListField(StringField(max_length=50))
    fixed_issue_ids = ListField(ObjectIdField())
    szz_issue_ids = ListField(ObjectIdField())
    # Added by this notebook: message counterparts of the three *_issue_ids
    # lists above, filled in by link_commit_and_message().
    linked_message_ids = ListField(ObjectIdField())
    fixed_message_ids = ListField(ObjectIdField())
    szz_message_ids = ListField(ObjectIdField())


class Issue(Document):
    """
    Issue class.
    Inherits from :class:`mongoengine.Document`

    Index: issue_system_id

    ShardKey: external_id, issue_system_id

    :property external_id: (:class:`~mongoengine.fields.StringField`) id that was assigned from the issue system to this issue; unique together with issue_system_id
    :property issue_system_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.IssueSystem` to which this issue belongs; required
    :property title: (:class:`~mongoengine.fields.StringField`) title of the issue
    :property desc: (:class:`~mongoengine.fields.StringField`) description of the issue
    :property created_at: (:class:`~mongoengine.fields.DateTimeField`)  date, when this issue was created
    :property updated_at: (:class:`~mongoengine.fields.DateTimeField`)  date, when this issue was last updated
    :property creator_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document which created this issue
    :property reporter_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document which reported this issue
    :property issue_type: (:class:`~mongoengine.fields.StringField`) type of the issue
    :property priority: (:class:`~mongoengine.fields.StringField`) priority of the issue
    :property status: (:class:`~mongoengine.fields.StringField`) status of the issue
    :property affects_versions: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`)) list of affected versions by this issue
    :property components: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of components that are affected
    :property labels: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of labels for this issue
    :property issue_type_manual: (:class:`~mongoengine.fields.DictField`) for manual issue types for this issue, contains information about the issue_type and the author, the author is the key and the issue_type is the value
    :property issue_type_verified: (:class:`~mongoengine.fields.StringField`) verified issue_type of the issue; source is manual issue types
    :property resolution: (:class:`~mongoengine.fields.StringField`) resolution for this issue
    :property fix_versions: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of versions on which this issue is fixed
    :property assignee_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document to which this issue was assigned
    :property issue_links: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.DictField`)) to which this issue is linked
    :property parent_issue_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.Issue` document that is the parent of this issue
    :property original_time_estimate: (:class:`~mongoengine.fields.IntField`)  estimated time to solve this issue
    :property environment: (:class:`~mongoengine.fields.StringField`) environment that is affected by this issue
    :property platform: (:class:`~mongoengine.fields.StringField`) platform that is affected by this issue
    :property is_pull_request: (:class:`~mongoengine.fields.BooleanField`) true if this issue is a pull request, Github issues can be pull requests; defaults to False
    """
    meta = {
        'indexes': [
            'issue_system_id'
        ],
        'shard_key': ('external_id', 'issue_system_id'),
    }

    # PK: external_id, issue_system_id
    # Shard Key: external_id, issue_system_id

    # external_id is the value this notebook compares with Message.external_id
    # to connect mailing-list messages to issues.
    external_id = StringField(unique_with=['issue_system_id'])
    issue_system_id = ObjectIdField(required=True)
    title = StringField()
    desc = StringField()
    created_at = DateTimeField()
    updated_at = DateTimeField()
    creator_id = ObjectIdField()
    reporter_id = ObjectIdField()

    issue_type = StringField()
    priority = StringField()
    status = StringField()
    affects_versions = ListField(StringField())
    components = ListField(StringField())
    labels = ListField(StringField())
    issue_type_manual = DictField()
    issue_type_verified = StringField()
    resolution = StringField()
    fix_versions = ListField(StringField())
    assignee_id = ObjectIdField()
    issue_links = ListField(DictField())
    parent_issue_id = ObjectIdField()
    original_time_estimate = IntField()
    environment = StringField()
    platform = StringField()
    is_pull_request = BooleanField(default=False)

### Extract external_id from message body

In [3]:
# Issue key that directly follows "Key: " in a notification body,
# e.g. "Key: ZOOKEEPER-850" -> "ZOOKEEPER-850".
# Compiled once at import time instead of on every call; raw strings avoid
# invalid-escape warnings for the regex backslashes.
_BODY_KEY_PATTERN = re.compile(r'(?<=Key: )[A-Z]+-[0-9]+')
# Issue key enclosed in parentheses in a subject line,
# e.g. "[jira] Created: (ZOOKEEPER-850) ..." -> "ZOOKEEPER-850".
_SUBJECT_KEY_PATTERN = re.compile(r'(?<=\()[A-Z]+-[0-9]+(?=\))')


def extract_external_id(msg):
    """
    Find the issue key ('external_id') a mailing-list message refers to.

    The message body is searched first. Issue notifications usually contain
    a block like the one below, and the value after 'Key: ' is returned:

    --------------------------

                 Key: ZOOKEEPER-850
                 URL: https://issues.apache.org/jira/browse/ZOOKEEPER-850
             Project: Zookeeper
          Issue Type: Improvement
          Components: java client
            Reporter: Olaf Krische

    If the body has no such key, the subject is searched for a key in
    parentheses, e.g. '(ZOOKEEPER-850)'.

    A body or subject that is None is treated as "no match" instead of
    raising a TypeError.

    :param msg: object with ``body`` and ``subject`` attributes (str or None)
    :return: String -> external_id of the first match, or None if neither
        the body nor the subject contains an issue key
    """
    body = msg.body
    if body is not None:
        body_match = _BODY_KEY_PATTERN.search(body)
        if body_match is not None:
            return body_match.group(0)

    # The subject is only consulted when the body gave no result.
    subject = msg.subject
    if subject is not None:
        subject_match = _SUBJECT_KEY_PATTERN.search(subject)
        if subject_match is not None:
            return subject_match.group(0)

    return None

### Add external_id attribute to the message collection

In [4]:
def add_external_id_to_messages():
    """
    Add the attribute "external_id" to the documents of the message collection.

    Every message that has both a subject and a body is passed to
    ``extract_external_id``; when an issue key is found it is stored in the
    message document under "external_id". Messages without a key are left
    unchanged. Progress is printed every 250000 messages.

    :return: None
    """
    message_collection = db.get_collection('message')

    # only(): nothing but subject and body is needed to extract the key.
    # no_cache(): a plain QuerySet keeps every document it has yielded, which
    # would hold the whole message collection in memory during this loop.
    messages = Message.objects().only('id', 'subject', 'body').no_cache()

    for count, msg in enumerate(messages):
        if msg.subject is not None and msg.body is not None:
            # If we have body and subject in message
            ex_id = extract_external_id(msg)
            if ex_id is not None:
                # Update the document
                message_collection.update_one({"_id": msg.id},
                                              {'$set': {"external_id": ex_id}})
        if count % 250000 == 0:
            print("messages done: ", count)

### Extract {linked, fixed, szz} message external ids

In [5]:
def extract_related_external_ids_to_commit(commit):
    """
    Resolve the issue ids stored on a commit to the issues' external ids.

    The commit's linked_issue_ids, fixed_issue_ids and szz_issue_ids are
    looked up one by one in the Issue collection, and the external_id of
    every issue found is collected.
    :param commit: commit with linked_issue_ids, fixed_issue_ids and szz_issue_ids
    :return: Dictionary with one list of external ids per kind of issue link
    """

    def _external_ids_of(issue_ids):
        # One Issue query per id; ids without a matching issue add nothing.
        return [issue.external_id
                for issue_id in issue_ids
                for issue in Issue.objects(id=issue_id)]

    return {
        "linked_message_external_ids": _external_ids_of(commit.linked_issue_ids),
        "fixed_message_external_id": _external_ids_of(commit.fixed_issue_ids),
        "szz_message_external_id": _external_ids_of(commit.szz_issue_ids),
    }

### Extract {linked, fixed, szz} message ids

In [6]:
def find_related_messages_to_commit(external_ids: dict) -> dict:
    """
    Look up the messages that carry the given issue external ids.

    external_ids is in this format:

        {"linked_message_external_ids": linked_message_external_ids,
        "fixed_message_external_id": fixed_message_external_id,
        "szz_message_external_id": szz_message_external_id}

    For every external id in each of the three lists, the message collection
    is queried for documents whose "external_id" equals it, and the ids of
    the matching messages are collected (in query order, duplicates kept).

    :param external_ids: A dict with linked, fixed and szz message external_id
    :return: Dictionary of related messages ids with the keys
        'linked_message_ids', 'fixed_message_ids' and 'szz_message_ids'
    """
    message_collection = db.get_collection('message')

    def _message_ids_for(issue_keys):
        """Return the _id of every message whose external_id is in issue_keys."""
        found = []
        for issue_key in issue_keys:
            # Only _id is read, so project it instead of transferring whole
            # messages (including their bodies). find() always returns a
            # cursor, possibly empty, so no None check is needed.
            cursor = message_collection.find({'external_id': issue_key}, {'_id': 1})
            found.extend(ObjectId(doc['_id']) for doc in cursor)
        return found

    return {'linked_message_ids': _message_ids_for(external_ids['linked_message_external_ids']),
            'fixed_message_ids': _message_ids_for(external_ids['fixed_message_external_id']),
            'szz_message_ids': _message_ids_for(external_ids['szz_message_external_id'])}

### Add {linked, fixed, szz} message ids attribute to the commit collection

In [7]:
def link_commit_and_message():
    """
    Store on every commit the ids of the messages related to its issues.

    For each commit, the external ids of its linked, fixed and szz issues
    are resolved (``extract_related_external_ids_to_commit``), the messages
    carrying those external ids are looked up
    (``find_related_messages_to_commit``), and the resulting lists are
    written to the commit document as linked_message_ids, fixed_message_ids
    and szz_message_ids. Progress is printed every 25000 commits.

    :return: None
    """
    commit_collection = db.get_collection('commit')

    # no_cache(): a plain QuerySet keeps every document it has yielded, which
    # would hold all commits in memory for the duration of the loop.
    commits = Commit.objects().only('id', 'linked_issue_ids', 'fixed_issue_ids',
                                    'szz_issue_ids').no_cache()

    print('----------------------------------Commits----------------------------------')
    for count, commit in enumerate(commits):
        external_ids = extract_related_external_ids_to_commit(commit)
        message_ids = find_related_messages_to_commit(external_ids)
        # _id identifies exactly one document, so update_one is sufficient.
        commit_collection.update_one({'_id': commit.id},
                                     {'$set': message_ids})

        if count % 25000 == 0:
            print("commits done: ", count)

### Main procedure

In [9]:
# Step 1: tag every message with the issue key found in its body or subject.
print('starting extracting external_id from message ...')
add_external_id_to_messages()
print('extracting external_id from message finished!')
# Reminder only: nothing here creates the index. Step 2 queries the message
# collection by external_id once per issue key.
print('\nATTENTION ==> do NOT forget to INDEX external_id')
# Step 2: write the related message ids to every commit. This must run after
# step 1, because it looks messages up by the external_id set there.
print('starting linking commit to message ...')
link_commit_and_message()
print('linking commit to message finished!')

starting linking commit to message ...
----------------------------------Commits----------------------------------
commits done:  0
commits done:  25000
commits done:  50000
commits done:  75000
commits done:  100000
commits done:  125000
commits done:  150000
commits done:  175000
commits done:  200000
commits done:  225000
commits done:  250000
commits done:  275000
commits done:  300000
commits done:  325000
commits done:  350000
linking commit to message finished!
