In [None]:
%matplotlib inline

import glob
import copy
import pprint
import logging

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import networkx as nx
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression

from collections import deque

from datetime import datetime
from bson.objectid import ObjectId
from mongoengine import connect
from pycoshark.mongomodels import Project, VCSSystem, Commit, Tag, File, CodeEntityState, FileAction, People, IssueSystem, Issue, Message, MailingList, Event, MynbouData, Identity, Hunk, Branch, Refactoring
from pycoshark.utils import jira_is_resolved_and_fixed
# Connection settings for the MongoDB instance on localhost; every entry is
# forwarded unchanged as a keyword argument to mongoengine's connect().
# NOTE(review): host and credentials are hardcoded here -- consider reading
# them from environment variables before sharing this notebook.
loc = dict(
    host='127.0.0.1',
    port=27017,
    db='smartshark',
    username='root',
    password='',
    authentication_source='smartshark',
    connect=False,
)
connect(**loc)

In [None]:
def calculate_defect_density(dfall, project_name):
    """Collect yearly bug counts and code size for one project.

    For every year that appears in ``dfall`` for ``project_name`` (processed in
    chronological order) this counts

    * the bug issues created in that calendar year in the project's issue system, and
    * the bug issues that are resolved and fixed and linked (via ``linked_issue_ids``)
      to one of the revisions listed in ``dfall`` for that year.

    A year is only reported once the issue tracker is considered to be in use:
    if the first year already has fixed linked bugs, reporting starts with that
    first year; otherwise reporting starts with the year *after* the first year
    that has fixed linked bugs.

    Parameters
    ----------
    dfall : pandas.DataFrame
        Must contain the columns ``project``, ``year``, ``revision``,
        ``code_klloc`` and ``effective_code_klloc``.
    project_name : str
        Name of the project, used both to filter ``dfall`` and to look up the
        project in the database.

    Returns
    -------
    list of dict
        One dict per reported year with the keys ``project``, ``year``,
        ``code_klloc``, ``effective_klloc``, ``issues_created`` and
        ``issues_fixed``.
    """
    pr = Project.objects.get(name=project_name)
    vcs = VCSSystem.objects.get(project_id=pr.id)
    its = IssueSystem.objects.get(project_id=pr.id)

    # filter once instead of re-filtering the full frame for every lookup
    project_rows = dfall[dfall['project'] == project_name]

    dd = []
    first_use = False
    first_year = True
    # unique() yields values in order of appearance; the first-year logic below
    # needs chronological order, so sort explicitly
    for year in sorted(project_rows['year'].unique()):
        year_rows = project_rows[project_rows['year'] == year]

        # half-open interval [Jan 1st, Jan 1st of the next year) so that issues
        # created in the very first or very last second of the year are counted
        start = datetime(year=int(year), month=1, day=1)
        end = datetime(year=int(year) + 1, month=1, day=1)

        # take average of lloc for defect density
        lloc = year_rows['code_klloc'].mean()
        elloc = year_rows['effective_code_klloc'].mean()

        issues_created = set()
        for i in Issue.objects.filter(issue_system_id=its.id, created_at__gte=start, created_at__lt=end):
            if str(i.issue_type).lower() == 'bug':
                issues_created.add(i)

        # this only works with linked_issue_ids (we could use fixed_issue_ids if we use validated data)
        issues_fixed = set()
        for revision_hash in year_rows['revision'].unique():
            c = Commit.objects.get(vcs_system_id=vcs.id, revision_hash=revision_hash)
            for fi in c.linked_issue_ids:
                i = Issue.objects.get(id=fi)
                if str(i.issue_type).lower() == 'bug' and jira_is_resolved_and_fixed(i):
                    issues_fixed.add(i)

        has_fixed_issues = len(issues_fixed) > 0

        # if we are on the first year in our data and we already have fixed issues in our
        # linked issues (linked to commits) we assume the issue tracker is used consistently
        if first_year and has_fixed_issues:
            first_use = True

        if first_use:
            dd.append({'project': project_name, 'year': year, 'code_klloc': lloc, 'effective_klloc': elloc, 'issues_created': len(issues_created), 'issues_fixed': len(issues_fixed)})
            print('{} in {} defect density: {}, ({} / {}) issues fixed: {}, fix density: {}'.format(project_name, year, len(issues_created) / lloc, len(issues_created), lloc, len(issues_fixed), len(issues_fixed) / lloc))
        elif has_fixed_issues:
            # the year in which fixed linked issues first show up is itself skipped because
            # the ITS may not have been used completely during it; reporting starts next year
            first_use = True

        first_year = False
    return dd

In [None]:
# Load the aggregated data frame from disk and show its number of rows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
dfall = pd.read_pickle('../data/aggregated_full.pickle')
dfall.shape[0]

In [None]:
# `dd` must be defined at notebook level for this cell to survive
# "Restart Kernel -> Run All"; compute the yearly records for every project in dfall.
# Note: calculate_defect_density queries the database for every project and year,
# so this step can take a while.
# NOTE(review): assumes all projects in dfall should be processed -- confirm.
dd = [calculate_defect_density(dfall, project_name) for project_name in dfall['project'].unique()]

# one DataFrame per project, then stack them and persist the result
dfs = [pd.DataFrame(d) for d in dd]
df = pd.concat(dfs)
df.to_csv('../data/defect_density.csv', index=False)

In [None]:
# Read the stored defect density table back in and report which projects it covers.
ddall = pd.read_csv('../data/defect_density.csv')
project_column = ddall['project']
print(project_column.nunique(), 'projects')
print(project_column.unique())