# Statistics Related to Our Result

### Import and Making Database Connections

In [None]:
import re
from bson.objectid import ObjectId
from pymongo import MongoClient
from mongoengine import connect
from pycoshark.utils import create_mongodb_uri_string

# You may have to update this dict to match your DB credentials
# NOTE(review): in the visible part of this file `credentials` only feeds
# `uri`, and `uri` is never passed to `connect` or `MongoClient` below -- both
# use hard-coded localhost:27017. Editing this dict therefore does not change
# which server is contacted; confirm whether that is intended.
credentials = {'db_user': '',
               'db_password': '',
               'db_hostname': 'localhost',
               'db_port': 27017,
               'db_authentication_database': '',
               'db_ssl_enabled': False}

uri = create_mongodb_uri_string(**credentials)
# MongoEngine connection: used by the Document classes (Project, Commit, ...).
connect('smartshark_2_1', host='127.0.0.1', port=27017)

# Separate raw PyMongo client: used for the ad-hoc result collections that
# have no Document class (e.g. 'UniqueMessages', 'ProjectLinkDetails').
client = MongoClient('localhost', 27017)
# Accessing database
db = client['smartshark_2_1']

### Models

In [None]:
from mongoengine import Document, StringField, ListField, DateTimeField, IntField, BooleanField, ObjectIdField, \
    DictField, DynamicField, LongField, EmbeddedDocument, EmbeddedDocumentField, FileField, FloatField
import hashlib


class Project(Document):
    """
    Project class.
    Inherits from :class:`mongoengine.Document`

    Other documents in this file refer to a project through a ``project_id``
    field (e.g. :class:`MailingList`).

    Index: #name

    ShardKey: name

    :property name: (:class:`~mongoengine.fields.StringField`) name of the project (required, unique, at most 200 characters)
    """
    meta = {
        'indexes': [
            # the '#' prefix requests a hashed index on the field
            '#name'
        ],
        'shard_key': ('name', ),
    }

    # PK: name
    # Shard Key: hashed name
    name = StringField(max_length=200, required=True, unique=True)

class Message(Document):
    """
    Message class.
    Inherits from :class:`mongoengine.Document`

    Index: message_id

    ShardKey: message_id, mailing_list_id

    :property message_id: (:class:`~mongoengine.fields.StringField`) id of the message (worldwide unique); required and unique together with ``mailing_list_id``
    :property mailing_list_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.MailingList` to which the message belongs
    :property reference_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) id to messages that are referenced by this message
    :property in_reply_to_id: (:class:`~mongoengine.fields.ObjectIdField`) id of a message to which this message is a reply
    :property from_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the person :class:`~pycoshark.mongomodels.People` from which this message originates
    :property to_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of persons :class:`~pycoshark.mongomodels.People` to which this message was sent
    :property cc_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) ids of persons :class:`~pycoshark.mongomodels.People` to which this message was sent (cc)
    :property subject: (:class:`~mongoengine.fields.StringField`) subject of the message
    :property body: (:class:`~mongoengine.fields.StringField`) message text
    :property date: (:class:`~mongoengine.fields.DateTimeField`)  date when the message was sent
    :property patches: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  if patches were applied to the message
    :property external_id: (:class:`~mongoengine.fields.StringField`) additional string identifier of the message; its origin is not visible in this file -- TODO confirm
    """

    meta = {
        'indexes': [
            'message_id'
        ],
        'shard_key': ('message_id', 'mailing_list_id'),
    }

    # PK: message_id
    # Shard Key: message_id, mailing_list_id

    message_id = StringField(required=True, unique_with=['mailing_list_id'])
    mailing_list_id = ObjectIdField(required=True)
    reference_ids = ListField(ObjectIdField())
    in_reply_to_id = ObjectIdField()
    from_id = ObjectIdField()
    to_ids = ListField(ObjectIdField())
    cc_ids = ListField(ObjectIdField())
    subject = StringField()
    body = StringField()
    date = DateTimeField()
    patches = ListField(StringField())
    external_id = StringField()
    
    
class MailingList(Document):
    """
    MailingList class.
    Inherits from :class:`mongoengine.Document`

    Index: #name

    ShardKey: name

    :property project_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.Project` to which the mailing list belongs (required)
    :property name: (:class:`~mongoengine.fields.StringField`) name of the mailing list (required)
    :property last_updated: (:class:`~mongoengine.fields.DateTimeField`) date when the data of the mailing list was last updated in the database
    """
    meta = {
        'indexes': [
            # the '#' prefix requests a hashed index on the field
            '#name'
        ],
        'shard_key': ('name', ),
    }

    # PK: name
    # Shard Key: hashed name

    project_id = ObjectIdField(required=True)
    name = StringField(required=True)
    last_updated = DateTimeField()


class Commit(Document):
    """
    Commit class.

    Inherits from :class:`mongoengine.Document`.

    Index: vcs_system_id

    ShardKey: revision_hash, vcs_system_id

    :property vcs_system_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.VCSSystem` id to which this commit belongs
    :property revision_hash: (:class:`~mongoengine.fields.StringField`) revision hash for this commit; unique together with ``vcs_system_id``
    :property branches: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of branches to which this commit belongs
    :property parents: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of parents (revision hashes) of this commit
    :property author_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.People` id of the person that authored this commit
    :property author_date: (:class:`~mongoengine.fields.DateTimeField`)  date of the authored commit
    :property author_date_offset: (:class:`~mongoengine.fields.IntField`)  offset for the author date
    :property committer_id: (:class:`~mongoengine.fields.ObjectIdField`) :class:`~pycoshark.mongomodels.People` id of the person that committed this commit
    :property committer_date: (:class:`~mongoengine.fields.DateTimeField`)  date of the committed commit
    :property committer_date_offset: (:class:`~mongoengine.fields.IntField`)  offset for the committer date
    :property message: (:class:`~mongoengine.fields.StringField`) message of the commit
    :property linked_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`))  :class:`~pycoshark.mongomodels.Issue` ids linked to this commit
    :property code_entity_states: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`))  :class:`~pycoshark.mongomodels.CodeEntityState` code entity states for this commit
    :property labels: (:class:`~mongoengine.fields.DictField`) dictionary of different labels for this commit, is_bugfix etc.
    :property validations: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of different validations on this commit
    :property fixed_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) verified :class:`~pycoshark.mongomodels.Issue` ids linked to this commit
    :property szz_issue_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) verified :class:`~pycoshark.mongomodels.Issue` issues linked by the SZZ algorithm
    :property linked_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) message ids linked to this commit; presumably the message counterpart of ``linked_issue_ids`` -- TODO confirm how it is populated
    :property fixed_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) message ids; presumably the message counterpart of ``fixed_issue_ids`` -- TODO confirm how it is populated
    :property szz_message_ids: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.ObjectIdField`)) message ids; presumably the message counterpart of ``szz_issue_ids`` -- TODO confirm how it is populated
    """

    meta = {
        'indexes': [
            'vcs_system_id',
        ],
        'shard_key': ('revision_hash', 'vcs_system_id'),
    }

    # PK: revision_hash, vcs_system_id
    # Shard Key: revision_hash, vcs_system_id

    vcs_system_id = ObjectIdField(required=True)
    revision_hash = StringField(max_length=50, required=True, unique_with=['vcs_system_id'])
    branches = ListField(StringField(max_length=500), null=True)
    parents = ListField(StringField(max_length=50))
    author_id = ObjectIdField()
    author_date = DateTimeField()
    author_date_offset = IntField()
    committer_id = ObjectIdField()
    committer_date = DateTimeField()
    committer_date_offset = IntField()
    message = StringField()
    linked_issue_ids = ListField(ObjectIdField())
    code_entity_states = ListField(ObjectIdField())
    labels = DictField()
    validations = ListField(StringField(max_length=50))
    fixed_issue_ids = ListField(ObjectIdField())
    szz_issue_ids = ListField(ObjectIdField())
    # Message-link fields: the analysis cells below look these ids up by
    # ``_id`` in the 'message' collection.
    linked_message_ids = ListField(ObjectIdField())
    fixed_message_ids = ListField(ObjectIdField())
    szz_message_ids = ListField(ObjectIdField())


class Issue(Document):
    """
    Issue class.
    Inherits from :class:`mongoengine.Document`

    Index: issue_system_id

    ShardKey: external_id, issue_system_id

    :property external_id: (:class:`~mongoengine.fields.StringField`) id that was assigned from the issue system to this issue; unique together with ``issue_system_id``
    :property issue_system_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.IssueSystem` to which this issue belongs
    :property title: (:class:`~mongoengine.fields.StringField`) title of the issue
    :property desc: (:class:`~mongoengine.fields.StringField`) description of the issue
    :property created_at: (:class:`~mongoengine.fields.DateTimeField`)  date, when this issue was created
    :property updated_at: (:class:`~mongoengine.fields.DateTimeField`)  date, when this issue was last updated
    :property creator_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document which created this issue
    :property reporter_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document which reported this issue
    :property issue_type: (:class:`~mongoengine.fields.StringField`) type of the issue
    :property priority: (:class:`~mongoengine.fields.StringField`) priority of the issue
    :property status: (:class:`~mongoengine.fields.StringField`) status of the issue
    :property affects_versions: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`)) list of affected versions by this issue
    :property components: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list, which components are affected
    :property labels: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of labels for this issue
    :property issue_type_manual: (:class:`~mongoengine.fields.DictField`) for manual issue types for this issue, contains information about the issue_type and the author, the author is the key and the issue_type is the value
    :property issue_type_verified: (:class:`~mongoengine.fields.StringField`) verified issue_type of the issue; source is manual issue types
    :property resolution: (:class:`~mongoengine.fields.StringField`) resolution for this issue
    :property fix_versions: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.StringField`))  list of versions on which this issue is fixed
    :property assignee_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.People` document to which this issue was assigned
    :property issue_links: ((:class:`~mongoengine.fields.ListField` of (:class:`~mongoengine.fields.DictField`)) to which this issue is linked
    :property parent_issue_id: (:class:`~mongoengine.fields.ObjectIdField`) id of the :class:`~pycoshark.mongomodels.Issue` document that is the parent of this issue
    :property original_time_estimate: (:class:`~mongoengine.fields.IntField`)  estimated time to solve this issue
    :property environment: (:class:`~mongoengine.fields.StringField`) environment that is affected by this issue
    :property platform: (:class:`~mongoengine.fields.StringField`) platform that is affected by this issue
    :property is_pull_request: (:class:`~mongoengine.fields.BooleanField`) true if this issue is a pull request, Github issues can be pull requests; defaults to False
    """
    meta = {
        'indexes': [
            'issue_system_id'
        ],
        'shard_key': ('external_id', 'issue_system_id'),
    }

    # PK: external_id, issue_system_id
    # Shard Key: external_id, issue_system_id

    external_id = StringField(unique_with=['issue_system_id'])
    issue_system_id = ObjectIdField(required=True)
    title = StringField()
    desc = StringField()
    created_at = DateTimeField()
    updated_at = DateTimeField()
    creator_id = ObjectIdField()
    reporter_id = ObjectIdField()

    issue_type = StringField()
    priority = StringField()
    status = StringField()
    affects_versions = ListField(StringField())
    components = ListField(StringField())
    labels = ListField(StringField())
    issue_type_manual = DictField()
    issue_type_verified = StringField()
    resolution = StringField()
    fix_versions = ListField(StringField())
    assignee_id = ObjectIdField()
    issue_links = ListField(DictField())
    parent_issue_id = ObjectIdField()
    original_time_estimate = IntField()
    environment = StringField()
    platform = StringField()
    is_pull_request = BooleanField(default=False)

### Total Number of linked messages and Issues

In [None]:
# Gather the distinct message ids and issue ids referenced by any of the three
# link fields (linked / fixed / szz) over every commit in the database.
totalLinkedMessages = set()
totalLinkedIssues = set()

# Only the id and the six link lists are fetched for each commit.
link_fields = ('id', 'linked_message_ids', 'fixed_message_ids', 'szz_message_ids',
               'linked_issue_ids', 'fixed_issue_ids', 'szz_issue_ids')

for count, commit in enumerate(Commit.objects().only(*link_fields)):
    totalLinkedMessages.update(commit.linked_message_ids,
                               commit.fixed_message_ids,
                               commit.szz_message_ids)
    totalLinkedIssues.update(commit.linked_issue_ids,
                             commit.fixed_issue_ids,
                             commit.szz_issue_ids)

    # Progress output every 25000 commits (including the very first one).
    if not count % 25000:
        print("Iteration => ", count)

print("Total Issues => ", len(totalLinkedIssues))
print("Total Messages => ", len(totalLinkedMessages))

In [None]:
# Copy every message that is referenced by at least one commit link into the
# separate 'UniqueMessages' collection.
listMessageId = list(totalLinkedMessages)
listMessage = list()  # never filled in this cell; kept because other cells may reference the name

message_collection = db.get_collection('message')
unique_message_collection = db.get_collection('UniqueMessages')

for count, msgId in enumerate(listMessageId):
    # find_one() yields the document itself (or None). The previous code handed
    # a cursor to the legacy Collection.insert(), which PyMongo 4 removed; the
    # rest of this file already uses insert_one().
    msg = message_collection.find_one({'_id': msgId})
    if msg is not None:
        # Ids without a stored message are skipped instead of being inserted.
        unique_message_collection.insert_one(msg)

    # Progress output every 25000 ids.
    if count % 25000 == 0:
        print("Iteration => ", count)

### Accuracy of new links

In [None]:
# For every project, compare the number of its messages that were copied to
# 'UniqueMessages' with the number of all its messages, and store the ratio
# (as a percentage) in the 'ProjectLinkDetails' collection.
for count, project in enumerate(Project.objects()):
    uniqueCount = 0
    totalCount = 0
    for mailingList in MailingList.objects(project_id=project.id):
        list_filter = {'mailing_list_id': ObjectId(mailingList.id)}
        # count_documents() replaces the deprecated Cursor.count()
        uniqueCount += db.get_collection('UniqueMessages').count_documents(list_filter)
        totalCount += db.get_collection('message').count_documents(list_filter)

    # A project without any message has no meaningful ratio; store 0 instead of
    # dividing by zero (this used to be hidden behind a bare ``except:``).
    percentage = uniqueCount / totalCount * 100 if totalCount else 0

    projectDetails = {"project_id": ObjectId(project.id),
                      "name": project.name,
                      "number_of_unique_links_to_messages": uniqueCount,
                      "number_of_total_links_to_messages": totalCount,
                      "percentage": percentage}

    # insert_one() matches the other cells and replaces the legacy insert()
    db.get_collection('ProjectLinkDetails').insert_one(projectDetails)

In [None]:
# Print the per-project percentage of messages present in 'UniqueMessages' and
# the mean of these percentages over all projects.
avg = 0
projectCount = 0
for count, project in enumerate(Project.objects()):
    uniqueCount = 0
    totalCount = 0
    for mailingList in MailingList.objects(project_id=project.id):
        list_filter = {'mailing_list_id': ObjectId(mailingList.id)}
        # count_documents() replaces the deprecated Cursor.count()
        uniqueCount += db.get_collection('UniqueMessages').count_documents(list_filter)
        totalCount += db.get_collection('message').count_documents(list_filter)

    # Projects without messages contribute 0 (explicit check instead of a bare
    # ``except:`` around the division).
    percentage = uniqueCount / totalCount * 100 if totalCount else 0
    avg += percentage
    projectCount += 1

    projectDetails = {"name": project.name,
                      "percentage": percentage}
    print(projectDetails)

# NOTE: `avg` holds the accumulated SUM of the percentages at this point; the
# label is kept unchanged for output compatibility. The mean follows below.
print("Average: ", avg)
# Divide by the number of projects actually iterated instead of a hard-coded 77.
print(avg / projectCount if projectCount else 0)

### Determine overlaps between the 3 link types (linked, fixed, SZZ) for issues and messages in commits

In [None]:
import pandas as pd
from pycoshark.mongomodels import VCSSystem

def determine_overlapes_in_message_links(project):
    """
    Store, for every commit of a project, how its message links overlap.

    For each commit of the project's VCS system the message ids found in
    ``linked_message_ids``, ``fixed_message_ids`` and ``szz_message_ids`` are
    split into the seven possible membership combinations, and one document
    with the seven counts is inserted into ``ProjectMessagesOverlaps``.

    :param project: project document; its ``id`` and ``name`` are read
    """
    # .get() expects exactly one VCS system for the project.
    vcs_system = VCSSystem.objects(project_id=project.id).get()
    overlaps = db.get_collection('ProjectMessagesOverlaps')

    for commit in Commit.objects(vcs_system_id=vcs_system.id):
        linked = set(commit.linked_message_ids)
        fixed = set(commit.fixed_message_ids)
        szz = set(commit.szz_message_ids)

        # Each count is the size of one region of the three-set Venn diagram.
        record = {
            'project_name': project.name,
            'project_id': ObjectId(project.id),
            'commit_id': ObjectId(commit.id),
            'linked_fixed_szz': len(linked & fixed & szz),
            'linked_fixed_notSzz': len((linked & fixed) - szz),
            'linked_notFixed_szz': len((linked & szz) - fixed),
            'linked_notFixed_notSzz': len(linked - fixed - szz),
            'notLinked_fixed_szz': len((fixed & szz) - linked),
            'notLinked_fixed_notSzz': len(fixed - linked - szz),
            'notLinked_notFixed_szz': len(szz - linked - fixed),
        }

        overlaps.insert_one(record)

In [None]:
# Run the per-commit message-link overlap analysis for every project,
# printing the running index and the project name as progress output.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    determine_overlapes_in_message_links(project)

### Count the above, uniquely

In [None]:
import pandas as pd
from pycoshark.mongomodels import VCSSystem

def determine_overlapes_in_message_links_uniquely(project):
    """
    Store per-project counts of distinct message ids for each link combination.

    Every commit of the project's VCS system is inspected; the message ids in
    its ``linked_message_ids``, ``fixed_message_ids`` and ``szz_message_ids``
    are assigned to one of the seven membership combinations *of that commit*
    and collected in project-wide sets. Because the assignment happens per
    commit, one id can end up in several categories when different commits
    reference it through different fields. A single summary document holding
    the seven set sizes is inserted into ``ProjectMessagesOverlapsUniquely``.

    The ``commit_id`` entry holds the id of the last commit iterated, or
    ``None`` when the project has no commits (this case used to fail with an
    ``UnboundLocalError`` because the loop variable was read after the loop).

    :param project: project document; its ``id`` and ``name`` are read
    """
    # .get() expects exactly one VCS system for the project.
    vcs_system = VCSSystem.objects(project_id=project.id).get()

    # Insertion order defines the field order of the stored document.
    buckets = {
        'linked_fixed_szz': set(),
        'linked_fixed_notSzz': set(),
        'linked_notFixed_szz': set(),
        'linked_notFixed_notSzz': set(),
        'notLinked_fixed_szz': set(),
        'notLinked_fixed_notSzz': set(),
        'notLinked_notFixed_szz': set(),
    }
    last_commit_id = None

    for commit in Commit.objects(vcs_system_id=vcs_system.id):
        last_commit_id = commit.id

        linked = set(commit.linked_message_ids)
        fixed = set(commit.fixed_message_ids)
        szz = set(commit.szz_message_ids)

        # Each right-hand side is one region of this commit's Venn diagram.
        buckets['linked_fixed_szz'] |= linked & fixed & szz
        buckets['linked_fixed_notSzz'] |= (linked & fixed) - szz
        buckets['linked_notFixed_szz'] |= (linked & szz) - fixed
        buckets['linked_notFixed_notSzz'] |= linked - fixed - szz
        buckets['notLinked_fixed_szz'] |= (fixed & szz) - linked
        buckets['notLinked_fixed_notSzz'] |= fixed - linked - szz
        buckets['notLinked_notFixed_szz'] |= szz - linked - fixed

    data = {'project_name': project.name,
            'project_id': ObjectId(project.id),
            'commit_id': last_commit_id}
    data.update((label, len(ids)) for label, ids in buckets.items())

    db.get_collection('ProjectMessagesOverlapsUniquely').insert_one(data)

In [None]:
# Run the per-project (distinct ids) message-link overlap analysis for every
# project, printing the running index and the project name as progress output.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    determine_overlapes_in_message_links_uniquely(project)

### Count message links uniquely for all commits

In [None]:
import pandas as pd


def determine_overlapes_in_message_links_uniquely_all_commits():
    """
    Count distinct message ids per link combination over all commits.

    The ``linked_message_ids``, ``fixed_message_ids`` and ``szz_message_ids``
    of every commit are merged into three global sets first; every message id
    is then assigned to exactly one of the seven membership combinations of
    these global sets.

    :return: dict mapping the seven combination names to the number of
        distinct message ids in that combination
    """
    linked, fixed, szz = set(), set(), set()

    for commit in Commit.objects():
        linked.update(commit.linked_message_ids)
        fixed.update(commit.fixed_message_ids)
        szz.update(commit.szz_message_ids)

    # Each value is the size of one region of the three-set Venn diagram.
    return {
        'linked_fixed_szz': len(linked & fixed & szz),
        'linked_fixed_notSzz': len((linked & fixed) - szz),
        'linked_notFixed_szz': len((linked & szz) - fixed),
        'linked_notFixed_notSzz': len(linked - fixed - szz),
        'notLinked_fixed_szz': len((fixed & szz) - linked),
        'notLinked_fixed_notSzz': len(fixed - linked - szz),
        'notLinked_notFixed_szz': len(szz - linked - fixed),
    }

In [None]:
# Compute the message-link overlap counts over all commits; the returned dict
# is not assigned to a variable here.
determine_overlapes_in_message_links_uniquely_all_commits()

### Determine overlaps in commits for issue links

In [None]:
import pandas as pd
from pycoshark.mongomodels import VCSSystem

def determine_overlapes_in_issue_links(project):
    """
    Store, for every commit of a project, how its issue links overlap.

    For each commit of the project's VCS system the issue ids found in
    ``linked_issue_ids``, ``fixed_issue_ids`` and ``szz_issue_ids`` are split
    into the seven possible membership combinations, and one document with
    the seven counts is inserted into ``ProjectIssuesOverlaps``.

    :param project: project document; its ``id`` and ``name`` are read
    """
    # (in linked, in fixed, in szz) -> name of the counter to increase.
    # The insertion order also fixes the field order of the stored document.
    category_of = {
        (True, True, True): 'linked_fixed_szz',
        (True, True, False): 'linked_fixed_notSzz',
        (True, False, True): 'linked_notFixed_szz',
        (True, False, False): 'linked_notFixed_notSzz',
        (False, True, True): 'notLinked_fixed_szz',
        (False, True, False): 'notLinked_fixed_notSzz',
        (False, False, True): 'notLinked_notFixed_szz',
    }

    # .get() expects exactly one VCS system for the project.
    vcs_system = VCSSystem.objects(project_id=project.id).get()

    for commit in Commit.objects(vcs_system_id=vcs_system.id):
        linked = set(commit.linked_issue_ids)
        fixed = set(commit.fixed_issue_ids)
        szz = set(commit.szz_issue_ids)

        tally = dict.fromkeys(category_of.values(), 0)
        # Every id of the union is in at least one set, so the lookup key is
        # never (False, False, False).
        for issue_id in linked | fixed | szz:
            membership = (issue_id in linked, issue_id in fixed, issue_id in szz)
            tally[category_of[membership]] += 1

        record = {'project_name': project.name,
                  'project_id': ObjectId(project.id),
                  'commit_id': ObjectId(commit.id)}
        record.update(tally)

        db.get_collection('ProjectIssuesOverlaps').insert_one(record)

In [None]:
# Run the per-commit issue-link overlap analysis for every project, printing
# the running index and the project name as progress output.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    determine_overlapes_in_issue_links(project)

### Count the above, uniquely

In [None]:
def determine_overlapes_in_issue_links_uniquely(project):
    """
    Store per-project counts of distinct issue ids for each link combination.

    Every commit of the project's VCS system is inspected; the issue ids in
    its ``linked_issue_ids``, ``fixed_issue_ids`` and ``szz_issue_ids`` are
    assigned to one of the seven membership combinations *of that commit* and
    collected in project-wide sets. Because the assignment happens per commit,
    one id can end up in several categories when different commits reference
    it through different fields. A single summary document holding the seven
    set sizes is inserted into ``ProjectIssuesOverlapsUniquely``.

    The ``commit_id`` entry holds the id of the last commit iterated, or
    ``None`` when the project has no commits (this case used to fail with an
    ``UnboundLocalError`` because the loop variable was read after the loop).

    :param project: project document; its ``id`` and ``name`` are read
    """
    # (in linked, in fixed, in szz) -> name of the set that receives the id.
    # The insertion order also fixes the field order of the stored document.
    category_of = {
        (True, True, True): 'linked_fixed_szz',
        (True, True, False): 'linked_fixed_notSzz',
        (True, False, True): 'linked_notFixed_szz',
        (True, False, False): 'linked_notFixed_notSzz',
        (False, True, True): 'notLinked_fixed_szz',
        (False, True, False): 'notLinked_fixed_notSzz',
        (False, False, True): 'notLinked_notFixed_szz',
    }
    buckets = {label: set() for label in category_of.values()}

    # .get() expects exactly one VCS system for the project.
    vcs_system = VCSSystem.objects(project_id=project.id).get()

    last_commit_id = None
    for commit in Commit.objects(vcs_system_id=vcs_system.id):
        last_commit_id = commit.id

        linked = set(commit.linked_issue_ids)
        fixed = set(commit.fixed_issue_ids)
        szz = set(commit.szz_issue_ids)

        # Every id of the union is in at least one set, so the lookup key is
        # never (False, False, False).
        for issue_id in linked | fixed | szz:
            membership = (issue_id in linked, issue_id in fixed, issue_id in szz)
            buckets[category_of[membership]].add(issue_id)

    data = {'project_name': project.name,
            'project_id': ObjectId(project.id),
            'commit_id': last_commit_id}
    data.update((label, len(ids)) for label, ids in buckets.items())

    db.get_collection('ProjectIssuesOverlapsUniquely').insert_one(data)

In [None]:
# Run the per-project (distinct ids) issue-link overlap analysis for every
# project, printing the running index and the project name as progress output.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    determine_overlapes_in_issue_links_uniquely(project)

### Count issue links for all commits

In [None]:
def determine_overlapes_in_issue_links_uniquely_all_commits():
    """
    Count distinct issue ids per link combination over all commits.

    The ``linked_issue_ids``, ``fixed_issue_ids`` and ``szz_issue_ids`` of
    every commit are merged into three global sets first; every issue id is
    then assigned to exactly one of the seven membership combinations of
    these global sets.

    :return: dict mapping the seven combination names to the number of
        distinct issue ids in that combination
    """
    # (in linked, in fixed, in szz) -> name of the counter to increase.
    # The insertion order also fixes the key order of the returned dict.
    category_of = {
        (True, True, True): 'linked_fixed_szz',
        (True, True, False): 'linked_fixed_notSzz',
        (True, False, True): 'linked_notFixed_szz',
        (True, False, False): 'linked_notFixed_notSzz',
        (False, True, True): 'notLinked_fixed_szz',
        (False, True, False): 'notLinked_fixed_notSzz',
        (False, False, True): 'notLinked_notFixed_szz',
    }

    linked, fixed, szz = set(), set(), set()
    for commit in Commit.objects():
        linked.update(commit.linked_issue_ids)
        fixed.update(commit.fixed_issue_ids)
        szz.update(commit.szz_issue_ids)

    tally = dict.fromkeys(category_of.values(), 0)
    # Distinct ids are iterated once each, so counting equals the size of the
    # corresponding set of ids.
    for issue_id in linked | fixed | szz:
        membership = (issue_id in linked, issue_id in fixed, issue_id in szz)
        tally[category_of[membership]] += 1

    return tally


In [None]:
# Compute the issue-link overlap counts over all commits; the returned dict is
# not assigned to a variable here.
determine_overlapes_in_issue_links_uniquely_all_commits()

### Find issues & Messages that are not in commit links

In [None]:
from pycoshark.mongomodels import IssueSystem


def issues_not_linked(project):
    """
    Store how many issues of a project are not referenced by any of its commits.

    Every issue id found in the ``linked_issue_ids``, ``fixed_issue_ids`` and
    ``szz_issue_ids`` lists of the project's commits is collected, then removed
    from the set of ids of all issues belonging to the project's issue systems.
    The size of what remains is written to the field
    ``Issues.sum_notLinked_notFixed_notSzz`` of the project's ``ProjectSum``
    document.

    :param project: project document; only ``project.id`` is read
    :raises LookupError: if no ``ProjectSum`` document exists for the project
    """
    vcs_system = VCSSystem.objects(project_id=project.id).get()

    # Only the three link lists are read, so do not fetch the other commit fields.
    commits = Commit.objects(vcs_system_id=vcs_system.id).only(
        'linked_issue_ids', 'fixed_issue_ids', 'szz_issue_ids')

    links_exists_in_commits = set()
    for commit in commits:
        links_exists_in_commits.update(commit.linked_issue_ids)
        links_exists_in_commits.update(commit.fixed_issue_ids)
        links_exists_in_commits.update(commit.szz_issue_ids)

    all_issue_links = set()
    for issue_system in IssueSystem.objects(project_id=project.id):
        all_issue_links.update(
            issue.id for issue in Issue.objects(issue_system_id=issue_system.id).only('id'))

    all_issue_links.difference_update(links_exists_in_commits)

    # A dotted-path $set touches just this one nested field; there is no need to
    # read the whole document back through an aggregation and rewrite it.
    result = db.get_collection("ProjectSum").update_one(
        {"project_id": project.id},
        {"$set": {"Issues.sum_notLinked_notFixed_notSzz": len(all_issue_links)}})

    if result.matched_count == 0:
        # Fail loudly (as before), but with a message that names the cause.
        raise LookupError('No ProjectSum document found for project %s' % project.id)



In [None]:
from pycoshark.mongomodels import MailingList

def messages_not_linked(project):
    """
    Store how many messages of a project are not referenced by any of its commits.

    Every message id found in the ``linked_message_ids``, ``fixed_message_ids``
    and ``szz_message_ids`` lists of the project's commits is collected, then
    removed from the set of ids of all messages belonging to the project's
    mailing lists. The size of what remains is written to the field
    ``Messages.sum_notLinked_notFixed_notSzz`` of the project's ``ProjectSum``
    document.

    :param project: project document; only ``project.id`` is read
    :raises LookupError: if no ``ProjectSum`` document exists for the project
    """
    vcs_system = VCSSystem.objects(project_id=project.id).get()

    # Only the three link lists are read, so do not fetch the other commit fields.
    commits = Commit.objects(vcs_system_id=vcs_system.id).only(
        'linked_message_ids', 'fixed_message_ids', 'szz_message_ids')

    links_exists_in_commits = set()
    for commit in commits:
        links_exists_in_commits.update(commit.linked_message_ids)
        links_exists_in_commits.update(commit.fixed_message_ids)
        links_exists_in_commits.update(commit.szz_message_ids)

    all_message_links = set()
    for mailing_list in MailingList.objects(project_id=project.id):
        all_message_links.update(
            message.id for message in Message.objects(mailing_list_id=mailing_list.id).only('id'))

    all_message_links.difference_update(links_exists_in_commits)

    # A dotted-path $set touches just this one nested field; there is no need to
    # read the whole document back through an aggregation and rewrite it.
    result = db.get_collection("ProjectSum").update_one(
        {"project_id": project.id},
        {"$set": {"Messages.sum_notLinked_notFixed_notSzz": len(all_message_links)}})

    if result.matched_count == 0:
        # Fail loudly (as before), but with a message that names the cause.
        raise LookupError('No ProjectSum document found for project %s' % project.id)



In [None]:
# For every project: print a progress line, then run both "not linked" computations.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    issues_not_linked(project)
    messages_not_linked(project)

### Commits with no issue links

In [None]:
def commits_with_no_issue_links(project):
    """
    Record every commit of a project that carries no issue link at all.

    A commit qualifies when its ``linked_issue_ids``, ``fixed_issue_ids`` and
    ``szz_issue_ids`` lists are all empty. One document per qualifying commit
    is inserted into the ``CommitsWithNoIssueLinks`` collection.

    :param project: project document; ``project.id`` and ``project.name`` are read
    """
    vcs = VCSSystem.objects(project_id=project.id).get()
    target = db.get_collection('CommitsWithNoIssueLinks')

    for commit in Commit.objects(vcs_system_id=vcs.id):
        id_lists = (commit.linked_issue_ids, commit.fixed_issue_ids, commit.szz_issue_ids)

        # Skip the commit as soon as one of its lists holds an id.
        if any(len(ids) != 0 for ids in id_lists):
            continue

        target.insert_one({
            'project_id': project.id,
            'project_name': project.name,
            'commit_id': commit.id,
        })


In [None]:
# For every project: print a progress line, then record its commits without issue links.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    commits_with_no_issue_links(project)

### Commits with no link to messages

In [None]:
def commits_with_no_message_links(project):
    """
    Record every commit of a project that carries no message link at all.

    A commit qualifies when its ``linked_message_ids``, ``fixed_message_ids``
    and ``szz_message_ids`` lists are all empty. One document per qualifying
    commit is inserted into the ``CommitsWithNoMessageLinks`` collection.

    :param project: project document; ``project.id`` and ``project.name`` are read
    """
    vcs = VCSSystem.objects(project_id=project.id).get()
    target = db.get_collection('CommitsWithNoMessageLinks')

    for commit in Commit.objects(vcs_system_id=vcs.id):
        id_lists = (commit.linked_message_ids, commit.fixed_message_ids, commit.szz_message_ids)

        # Skip the commit as soon as one of its lists holds an id.
        if any(len(ids) != 0 for ids in id_lists):
            continue

        target.insert_one({
            'project_id': project.id,
            'project_name': project.name,
            'commit_id': commit.id,
        })

In [None]:
# For every project: print a progress line, then record its commits without message links.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    commits_with_no_message_links(project)

### Quartiles of the projects' accuracy percentages

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Read every document of the ProjectsPercentageAccuracy collection and keep
# only its 'percentage' value.
projects_percentage = db.get_collection('ProjectsPercentageAccuracy').find({})
projects_percentage_values =  [project['percentage'] for project in projects_percentage]
# sns.set_theme(style="whitegrid")
# Draw a box plot and then a swarm plot of the same values.
ax = sns.boxplot(x=projects_percentage_values, palette="Pastel1")
ax = sns.swarmplot(x=projects_percentage_values, color=".30")

# Write the figure to a PNG file at 500 dpi.
plt.savefig('projects_accuracy_percentages.png', dpi=500)

### Find all unique issue links

In [None]:
# Fetch only the three issue-link lists of every commit.
commits = Commit.objects().only('linked_issue_ids', 'fixed_issue_ids', 'szz_issue_ids')

# A set keeps each issue id once, no matter how many commits or lists mention it.
issue_ids = set()

for commit in commits:
    for id_list in (commit.linked_issue_ids, commit.fixed_issue_ids, commit.szz_issue_ids):
        issue_ids.update(id_list)

print(len(issue_ids))

### Find all unique message links

In [None]:
# Fetch only the three message-link lists of every commit.
commits = Commit.objects().only('linked_message_ids', 'fixed_message_ids', 'szz_message_ids')

# A set keeps each message id once, no matter how many commits or lists mention it.
message_ids = set()

for commit in commits:
    message_ids.update(commit.linked_message_ids)
    message_ids.update(commit.fixed_message_ids)
    message_ids.update(commit.szz_message_ids)

# Report the message count (the cell previously printed the issue count by mistake).
print(len(message_ids))

### Standard deviation

In [None]:
def standard_deviation(project):
    """
    Compute and store the per-column standard deviation of a project's overlap data.

    Reads the project's documents from ``ProjectIssuesOverlaps`` and
    ``ProjectMessagesOverlaps``, computes the sample standard deviation of
    every numeric column, and inserts one document holding both results into
    ``ProjectOverlapsSTD``.

    :param project: project document; ``project.id`` and ``project.name`` are read
    """
    issue_rows = db.get_collection('ProjectIssuesOverlaps').find({'project_id': project.id})
    message_rows = db.get_collection('ProjectMessagesOverlaps').find({'project_id': project.id})

    # The documents also carry non-numeric fields (at least `_id` and
    # `project_id`); numeric_only=True skips them instead of letting pandas >= 2
    # raise a TypeError on them.
    issue_links_std = pd.DataFrame(list(issue_rows)).std(numeric_only=True)
    message_links_std = pd.DataFrame(list(message_rows)).std(numeric_only=True)

    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    # Series.to_dict() yields plain Python floats for storage.
    db.get_collection('ProjectOverlapsSTD').insert_one({
        'project_name': project.name,
        'project_id': project.id,
        'issue_links': issue_links_std.to_dict(),
        'message_links': message_links_std.to_dict(),
    })
    
    

In [None]:
# For every project: print a progress line, then compute and store its standard deviations.
for count, project in enumerate(Project.objects()):
    print('Project Number ', count, ' => ', project.name)
    standard_deviation(project)


#### Standard Deviation for all Commits together

In [None]:
def standard_deviation_all_commits():
    """
    Print the per-column standard deviation of the overlap data of all projects.

    Reads every document of ``ProjectIssuesOverlaps`` and
    ``ProjectMessagesOverlaps``, computes the sample standard deviation of
    every numeric column, and prints the two resulting series.
    """
    issue_rows = db.get_collection('ProjectIssuesOverlaps').find()
    message_rows = db.get_collection('ProjectMessagesOverlaps').find()

    # The documents also carry non-numeric fields (at least `_id`);
    # numeric_only=True skips them instead of letting pandas >= 2 raise a
    # TypeError on them.
    issue_links_std = pd.DataFrame(list(issue_rows)).std(numeric_only=True)
    message_links_std = pd.DataFrame(list(message_rows)).std(numeric_only=True)

    print(issue_links_std)
    print(message_links_std)


standard_deviation_all_commits()

#### Find standard deviation of project percentages

In [None]:
# Single-group aggregation: sample standard deviation of the 'percentage'
# field over all documents of the collection.
std_pipeline = [
    {'$group': {'_id': None, 'STD': {'$stdDevSamp': '$percentage'}}},
]
x = db.get_collection('ProjectLinkDetails').aggregate(std_pipeline)

print(list(x))

# Load the same collection into a DataFrame; describe() is the cell's last
# expression, so its summary table is displayed.
data = pd.DataFrame(db.get_collection('ProjectLinkDetails').find({}))
data.describe()

### Count the unique issues and messages referenced by commits, per link type

In [None]:
# One set per link type, so an id is counted once however many commits carry it.
linked_issue_ids, fixed_issue_ids, szz_issue_ids = set(), set(), set()
linked_message_ids, fixed_message_ids, szz_messsage_ids = set(), set(), set()

link_fields = ('id', 'linked_message_ids', 'fixed_message_ids', 'szz_message_ids',
               'linked_issue_ids', 'fixed_issue_ids', 'szz_issue_ids')

for count, commit in enumerate(Commit.objects().only(*link_fields)):
    # Pair every accumulator with the commit list that feeds it.
    for collected, found in ((linked_issue_ids, commit.linked_issue_ids),
                             (fixed_issue_ids, commit.fixed_issue_ids),
                             (szz_issue_ids, commit.szz_issue_ids),
                             (linked_message_ids, commit.linked_message_ids),
                             (fixed_message_ids, commit.fixed_message_ids),
                             (szz_messsage_ids, commit.szz_message_ids)):
        collected.update(found)

    # Progress marker every 25000 commits.
    if not count % 25000:
        print("Iteration => ", count)

for label, collected in (('linked_issue_ids: ', linked_issue_ids),
                         ('fixed_issue_ids: ', fixed_issue_ids),
                         ('szz_issue_ids: ', szz_issue_ids),
                         ('linked_message_ids: ', linked_message_ids),
                         ('fixed_message_ids: ', fixed_message_ids),
                         ('szz_messsage_ids: ', szz_messsage_ids)):
    print(label, len(collected))
    

### Count number of commits that have at least one related issue or one related message

In [None]:
# Counters: commits with at least one issue link / at least one message link.
commitIssueCount = 0
commitMessageCount = 0

commits_with_link_fields = Commit.objects().only(
    'linked_message_ids', 'fixed_message_ids', 'szz_message_ids',
    'linked_issue_ids', 'fixed_issue_ids', 'szz_issue_ids')

for count, commit in enumerate(commits_with_link_fields):
    issue_lists = (commit.linked_issue_ids, commit.fixed_issue_ids, commit.szz_issue_ids)
    message_lists = (commit.linked_message_ids, commit.fixed_message_ids, commit.szz_message_ids)

    # A commit counts once per category when any of that category's lists is non-empty.
    if any(len(ids) != 0 for ids in issue_lists):
        commitIssueCount += 1
    if any(len(ids) != 0 for ids in message_lists):
        commitMessageCount += 1

    # Progress marker every 25000 commits.
    if not count % 25000:
        print("Iteration => ", count)

print('commitIssueCount: ', commitIssueCount)
print('commitMessageCount: ', commitMessageCount)