In [None]:
%matplotlib inline

import glob
import copy
import pprint
import os
import itertools
import datetime
import math
import statistics

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
from matplotlib.lines import Line2D
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error, r2_score

from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category10, Pastel1, Reds, Dark2
from bokeh.models import HoverTool, PanTool, ResetTool, WheelZoomTool
from bokeh.models import LinearAxis
from bokeh.models.ranges import Range1d

# Names of the projects analysed in this notebook (matched against the
# 'project' column of the loaded data). Order here is arbitrary; the list is
# sorted further down in this cell.
PROJECTS = [
    'calcite', 'cayenne',
    'commons-bcel', 'commons-beanutils', 'commons-codec', 'commons-collections',
    'commons-compress', 'commons-configuration', 'commons-dbcp', 'commons-digester',
    'commons-imaging', 'commons-io', 'commons-jcs', 'commons-jexl',
    'commons-lang', 'commons-math', 'commons-net', 'commons-rdf',
    'commons-scxml', 'commons-validator', 'commons-vfs',
    'eagle', 'falcon', 'flume', 'giraph', 'gora',
    'jspwiki', 'knox', 'kylin', 'lens', 'mahout',
    'manifoldcf', 'opennlp', 'parquet-mr', 'pdfbox', 'phoenix',
    'ranger', 'santuario-java', 'storm', 'struts', 'systemml',
    'tez', 'tika', 'wss4j', 'zeppelin', 'helix',
    'httpcomponents-client', 'archiva', 'httpcomponents-core', 'jena',
    'streams', 'mina-sshd', 'roller', 'nifi',
]

# PMD Rules from Sourcemeter homepage https://www.sourcemeter.com/resources/java/ 2018-07-24
#
# The catalogue is kept as one compact table per rule type, in the original
# order: (rule type, [(rule name, abbreviation, severity), ...]).
# Every string literal sits on a single physical line; a literal that is split
# across lines is a SyntaxError in Python.
_PMD_RULES_BY_TYPE = [
    ('Basic Rules', [
        ('Avoid Branching Statement As Last In Loop', 'PMD_ABSALIL', 'Major'),
        ('Avoid Decimal Literals In Big Decimal Constructor', 'PMD_ADLIBDC', 'Critical'),
        ('Avoid Multiple Unary Operators', 'PMD_AMUO', 'Major'),
        ('Avoid Thread Group', 'PMD_ATG', 'Critical'),
        ('Avoid Using Hard Coded IP', 'PMD_AUHCIP', 'Major'),
        ('Avoid Using Octal Values', 'PMD_AUOV', 'Critical'),
        ('Big Integer Instantiation', 'PMD_BII', 'Minor'),
        ('Boolean Instantiation', 'PMD_BI', 'Minor'),
        ('Broken Null Check', 'PMD_BNC', 'Critical'),
        ('Check Result Set', 'PMD_CRS', 'Critical'),
        ('Check Skip Result', 'PMD_CSR', 'Critical'),
        ('Class Cast Exception With To Array', 'PMD_CCEWTA', 'Critical'),
        ('Collapsible If Statements', 'PMD_CIS', 'Minor'),
        ('Dont Call Thread Run', 'PMD_DCTR', 'Critical'),
        ('Dont Use Float Type For Loop Indices', 'PMD_DUFTFLI', 'Critical'),
        ('Double Checked Locking', 'PMD_DCL', 'Critical'),
        ('Empty Catch Block', 'PMD_ECB', 'Critical'),
        ('Empty Finally Block', 'PMD_EFB', 'Minor'),
        ('Empty If Stmt', 'PMD_EIS', 'Major'),
        ('Empty Statement Block', 'PMD_EmSB', 'Minor'),
        ('Empty Statement Not In Loop', 'PMD_ESNIL', 'Minor'),
        ('Empty Static Initializer', 'PMD_ESI', 'Minor'),
        ('Empty Switch Statements', 'PMD_ESS', 'Major'),
        ('Empty Synchronized Block', 'PMD_ESB', 'Major'),
        ('Empty Try Block', 'PMD_ETB', 'Major'),
        ('Empty While Stmt', 'PMD_EWS', 'Critical'),
        ('Extends Object', 'PMD_EO', 'Minor'),
        ('For Loop Should Be While Loop', 'PMD_FLSBWL', 'Minor'),
        ('Jumbled Incrementer', 'PMD_JI', 'Critical'),
        ('Misplaced Null Check', 'PMD_MNC', 'Critical'),
        ('Override Both Equals And Hashcode', 'PMD_OBEAH', 'Critical'),
        ('Return From Finally Block', 'PMD_RFFB', 'Critical'),
        ('Unconditional If Statement', 'PMD_UIS', 'Major'),
        ('Unnecessary Conversion Temporary', 'PMD_UCT', 'Minor'),
        ('Unused Null Check In Equals', 'PMD_UNCIE', 'Critical'),
        ('Useless Operation On Immutable', 'PMD_UOOI', 'Critical'),
        ('Useless Overriding Method', 'PMD_UOM', 'Minor'),
    ]),
    ('Brace Rules', [
        ('For Loops Must Use Braces', 'PMD_FLMUB', 'Minor'),
        ('If Else Stmts Must Use Braces', 'PMD_IESMUB', 'Minor'),
        ('If Stmts Must Use Braces', 'PMD_ISMUB', 'Minor'),
        ('While Loops Must Use Braces', 'PMD_WLMUB', 'Minor'),
    ]),
    ('Clone Implementation Rules', [
        ('Clone Throws Clone Not Supported Exception', 'PMD_CTCNSE', 'Major'),
        ('Proper Clone Implementation', 'PMD_PCI', 'Critical'),
    ]),
    ('Controversial Rules', [
        ('Assignment In Operand', 'PMD_AIO', 'Minor'),
        ('Avoid Accessibility Alteration', 'PMD_AAA', 'Major'),
        ('Avoid Prefixing Method Parameters', 'PMD_APMP', 'Minor'),
        ('Avoid Using Native Code', 'PMD_AUNC', 'Major'),
        ('Default Package', 'PMD_DP', 'Minor'),
        ('Do Not Call Garbage Collection Explicitly', 'PMD_DNCGCE', 'Major'),
        ('Dont Import Sun', 'PMD_DIS', 'Major'),
        ('One Declaration Per Line', 'PMD_ODPL', 'Minor'),
        ('Suspicious Octal Escape', 'PMD_SOE', 'Major'),
        ('Unnecessary Constructor', 'PMD_UC', 'Minor'),
    ]),
    ('Design Rules', [
        ('Abstract Class Without Abstract Method', 'PMD_ACWAM', 'Minor'),
        ('Abstract Class Without Any Method', 'PMD_AbCWAM', 'Minor'),
        ('Assignment To Non Final Static', 'PMD_ATNFS', 'Critical'),
        ('Avoid Constants Interface', 'PMD_ACI', 'Minor'),
        ('Avoid Instanceof Checks In Catch Clause', 'PMD_AICICC', 'Major'),
        ('Avoid Protected Field In Final Class', 'PMD_APFIFC', 'Minor'),
        ('Avoid Protected Method In Final Class Not Extending', 'PMD_APMIFCNE', 'Minor'),
        ('Avoid Reassigning Parameters', 'PMD_ARP', 'Minor'),
        ('Avoid Synchronized At Method Level', 'PMD_ASAML', 'Minor'),
        ('Bad Comparison', 'PMD_BC', 'Critical'),
        ('Class With Only Private Constructors Should Be Final', 'PMD_CWOPCSBF', 'Minor'),
        ('Close Resource', 'PMD_ClR', 'Critical'),
        ('Constructor Calls Overridable Method', 'PMD_CCOM', 'Critical'),
        ('Default Label Not Last In Switch Stmt', 'PMD_DLNLISS', 'Minor'),
        ('Empty Method In Abstract Class Should Be Abstract', 'PMD_EMIACSBA', 'Major'),
        ('Equals Null', 'PMD_EN', 'Critical'),
        ('Field Declarations Should Be At Start Of Class', 'PMD_FDSBASOC', 'Minor'),
        ('Final Field Could Be Static', 'PMD_FFCBS', 'Minor'),
        ('Idempotent Operations', 'PMD_IO', 'Major'),
        ('Immutable Field', 'PMD_IF', 'Minor'),
        ('Instantiation To Get Class', 'PMD_ITGC', 'Major'),
        ('Logic Inversion', 'PMD_LI', 'Minor'),
        ('Missing Break In Switch', 'PMD_MBIS', 'Critical'),
        ('Missing Static Method In Non Instantiatable Class', 'PMD_MSMINIC', 'Minor'),
        ('Non Case Label In Switch Statement', 'PMD_NCLISS', 'Critical'),
        ('Non Static Initializer', 'PMD_NSI', 'Critical'),
        ('Non Thread Safe Singleton', 'PMD_NTSS', 'Critical'),
        ('Optimizable To Array Call', 'PMD_OTAC', 'Major'),
        ('Position Literals First In Case Insensitive Comparisons', 'PMD_PLFICIC', 'Critical'),
        ('Position Literals First In Comparisons', 'PMD_PLFIC', 'Critical'),
        ('Preserve Stack Trace', 'PMD_PST', 'Major'),
        ('Return Empty Array Rather Than Null', 'PMD_REARTN', 'Major'),
        ('Simple Date Format Needs Locale', 'PMD_SDFNL', 'Minor'),
        ('Simplify Boolean Expressions', 'PMD_SBE', 'Minor'),
        ('Simplify Boolean Returns', 'PMD_SBR', 'Minor'),
        ('Simplify Conditional', 'PMD_SC', 'Minor'),
        ('Singular Field', 'PMD_SF', 'Major'),
        ('Switch Stmts Should Have Default', 'PMD_SSSHD', 'Major'),
        ('Too Few Branches For ASwitch Statement', 'PMD_TFBFASS', 'Minor'),
        ('Uncommented Empty Constructor', 'PMD_UEC', 'Minor'),
        ('Uncommented Empty Method', 'PMD_UEM', 'Minor'),
        ('Unnecessary Local Before Return', 'PMD_ULBR', 'Minor'),
        ('Unsynchronized Static Date Formatter', 'PMD_USDF', 'Critical'),
        ('Use Collection Is Empty', 'PMD_UCIE', 'Major'),
        ('Use Locale With Case Conversions', 'PMD_ULWCC', 'Critical'),
        ('Use Notify All Instead Of Notify', 'PMD_UNAION', 'Critical'),
        ('Use Varargs', 'PMD_UV', 'Minor'),
    ]),
    ('Finalizer Rules', [
        ('Avoid Calling Finalize', 'PMD_ACF', 'Major'),
        ('Empty Finalizer', 'PMD_EF', 'Minor'),
        ('Finalize Does Not Call Super Finalize', 'PMD_FDNCSF', 'Critical'),
        ('Finalize Only Calls Super Finalize', 'PMD_FOCSF', 'Minor'),
        ('Finalize Overloaded', 'PMD_FO', 'Critical'),
        ('Finalize Should Be Protected', 'PMD_FSBP', 'Critical'),
    ]),
    ('Import Statement Rules', [
        ('Dont Import Java Lang', 'PMD_DIJL', 'Minor'),
        ('Duplicate Imports', 'PMD_DI', 'Minor'),
        ('Import From Same Package', 'PMD_IFSP', 'Minor'),
        ('Too Many Static Imports', 'PMD_TMSI', 'Major'),
        ('Unnecessary Fully Qualified Name', 'PMD_UFQN', 'Minor'),
    ]),
    ('J2EE Rules', [
        ('Do Not Call System Exit', 'PMD_DNCSE', 'Critical'),
        ('Local Home Naming Convention', 'PMD_LHNC', 'Major'),
        ('Local Interface Session Naming Convention', 'PMD_LISNC', 'Major'),
        ('MDBAnd Session Bean Naming Convention', 'PMD_MDBASBNC', 'Major'),
        ('Remote Interface Naming Convention', 'PMD_RINC', 'Major'),
        ('Remote Session Interface Naming Convention', 'PMD_RSINC', 'Major'),
        ('Static EJBField Should Be Final', 'PMD_SEJBFSBF', 'Critical'),
    ]),
    ('JUnit Rules', [
        ('JUnit Assertions Should Include Message', 'PMD_JUASIM', 'Minor'),
        ('JUnit Spelling', 'PMD_JUS', 'Critical'),
        ('JUnit Static Suite', 'PMD_JUSS', 'Critical'),
        ('JUnit Test Contains Too Many Asserts', 'PMD_JUTCTMA', 'Minor'),
        ('JUnit Tests Should Include Assert', 'PMD_JUTSIA', 'Major'),
        ('Simplify Boolean Assertion', 'PMD_SBA', 'Minor'),
        ('Test Class Without Test Cases', 'PMD_TCWTC', 'Minor'),
        ('Unnecessary Boolean Assertion', 'PMD_UBA', 'Minor'),
        ('Use Assert Equals Instead Of Assert True', 'PMD_UAEIOAT', 'Major'),
        ('Use Assert Null Instead Of Assert True', 'PMD_UANIOAT', 'Minor'),
        ('Use Assert Same Instead Of Assert True', 'PMD_UASIOAT', 'Minor'),
        ('Use Assert True Instead Of Assert Equals', 'PMD_UATIOAE', 'Minor'),
    ]),
    ('Jakarta Commons Logging Rules', [
        ('Guard Debug Logging', 'PMD_GDL', 'Major'),
        ('Guard Log Statement', 'PMD_GLS', 'Minor'),
        ('Proper Logger', 'PMD_PL', 'Minor'),
        ('Use Correct Exception Logging', 'PMD_UCEL', 'Major'),
    ]),
    ('Java Logging Rules', [
        ('Avoid Print Stack Trace', 'PMD_APST', 'Major'),
        ('Guard Log Statement Java Util', 'PMD_GLSJU', 'Minor'),
        ('Logger Is Not Static Final', 'PMD_LINSF', 'Minor'),
        ('More Than One Logger', 'PMD_MTOL', 'Major'),
        ('System Println', 'PMD_SP', 'Major'),
    ]),
    ('JavaBean Rules', [
        ('Missing Serial Version UID', 'PMD_MSVUID', 'Major'),
    ]),
    ('Naming Rules', [
        ('Avoid Dollar Signs', 'PMD_ADS', 'Minor'),
        ('Avoid Field Name Matching Method Name', 'PMD_AFNMMN', 'Minor'),
        ('Avoid Field Name Matching Type Name', 'PMD_AFNMTN', 'Minor'),
        ('Boolean Get Method Name', 'PMD_BGMN', 'Minor'),
        ('Class Naming Conventions', 'PMD_CNC', 'Minor'),
        ('Generics Naming', 'PMD_GN', 'Minor'),
        ('Method Naming Conventions', 'PMD_MeNC', 'Minor'),
        ('Method With Same Name As Enclosing Class', 'PMD_MWSNAEC', 'Minor'),
        ('No Package', 'PMD_NP', 'Minor'),
        ('Package Case', 'PMD_PC', 'Minor'),
        ('Short Class Name', 'PMD_SCN', 'Minor'),
        ('Short Method Name', 'PMD_SMN', 'Minor'),
        ('Suspicious Constant Field Name', 'PMD_SCFN', 'Minor'),
        ('Suspicious Equals Method Name', 'PMD_SEMN', 'Critical'),
        ('Suspicious Hashcode Method Name', 'PMD_SHMN', 'Critical'),
        ('Variable Naming Conventions', 'PMD_VNC', 'Minor'),
    ]),
    ('Optimization Rules', [
        ('Add Empty String', 'PMD_AES', 'Minor'),
        ('Avoid Array Loops', 'PMD_AAL', 'Major'),
        ('Redundant Field Initializer', 'PMD_RFI', 'Minor'),
        ('Unnecessary Wrapper Object Creation', 'PMD_UWOC', 'Major'),
        ('Use Array List Instead Of Vector', 'PMD_UALIOV', 'Minor'),
        ('Use Arrays As List', 'PMD_UAAL', 'Major'),
        ('Use String Buffer For String Appends', 'PMD_USBFSA', 'Major'),
    ]),
    ('Security Code Guideline Rules', [
        ('Array Is Stored Directly', 'PMD_AISD', 'Major'),
        ('Method Returns Internal Array', 'PMD_MRIA', 'Major'),
    ]),
    ('Strict Exception Rules', [
        ('Avoid Catching Generic Exception', 'PMD_ACGE', 'Major'),
        ('Avoid Catching NPE', 'PMD_ACNPE', 'Critical'),
        ('Avoid Catching Throwable', 'PMD_ACT', 'Major'),
        ('Avoid Losing Exception Information', 'PMD_ALEI', 'Major'),
        ('Avoid Rethrowing Exception', 'PMD_ARE', 'Minor'),
        ('Avoid Throwing New Instance Of Same Exception', 'PMD_ATNIOSE', 'Minor'),
        ('Avoid Throwing Null Pointer Exception', 'PMD_ATNPE', 'Critical'),
        ('Avoid Throwing Raw Exception Types', 'PMD_ATRET', 'Major'),
        ('Do Not Extend Java Lang Error', 'PMD_DNEJLE', 'Critical'),
        ('Do Not Throw Exception In Finally', 'PMD_DNTEIF', 'Critical'),
        ('Exception As Flow Control', 'PMD_EAFC', 'Major'),
    ]),
    ('String and StringBuffer Rules', [
        ('Avoid Duplicate Literals', 'PMD_ADL', 'Major'),
        ('Avoid String Buffer Field', 'PMD_ASBF', 'Minor'),
        ('Consecutive Appends Should Reuse', 'PMD_CASR', 'Minor'),
        ('Consecutive Literal Appends', 'PMD_CLA', 'Minor'),
        ('Inefficient String Buffering', 'PMD_ISB', 'Minor'),
        ('String Buffer Instantiation With Char', 'PMD_SBIWC', 'Critical'),
        ('String Instantiation', 'PMD_StI', 'Minor'),
        ('String To String', 'PMD_STS', 'Minor'),
        ('Unnecessary Case Change', 'PMD_UCC', 'Minor'),
        ('Use Equals To Compare Strings', 'PMD_UETCS', 'Critical'),
    ]),
    ('Type Resolution Rules', [
        ('Clone Method Must Implement Cloneable', 'PMD_ClMMIC', 'Major'),
        ('Loose Coupling', 'PMD_LoC', 'Major'),
        ('Signature Declare Throws Exception', 'PMD_SiDTE', 'Major'),
        ('Unused Imports', 'PMD_UnI', 'Minor'),
    ]),
    ('Unnecessary and Unused Code Rules', [
        ('Unused Local Variable', 'PMD_ULV', 'Major'),
        ('Unused Private Field', 'PMD_UPF', 'Major'),
        ('Unused Private Method', 'PMD_UPM', 'Major'),
    ]),
]

# Flat list of rule records, one dict per rule with the keys
# 'type', 'rule', 'abbrev' and 'severity' (in that order).
PMD_RULES = [
    {'type': rule_type, 'rule': rule_name, 'abbrev': abbrev, 'severity': severity}
    for rule_type, type_rules in _PMD_RULES_BY_TYPE
    for rule_name, abbrev, severity in type_rules
]
# Distinct severity labels found in PMD_RULES. Sorted so the list has the same
# order on every kernel start: the iteration order of a set of strings depends
# on Python's per-process hash randomisation.
PMD_SEVERITIES = sorted({d['severity'] for d in PMD_RULES})
# Lookup tables keyed by rule abbreviation: abbreviation -> severity / rule type.
PMD_SEVERITY_MATCH = {d['abbrev']: d['severity'] for d in PMD_RULES}
PMD_GROUP_MATCH = {d['abbrev']: d['type'] for d in PMD_RULES}
# Maps each PMD rule abbreviation to a "since" date string in YYYY-MM-DD form.
# The aggregate_* helpers iterate over this dict and, when called with
# time_correction=True, only count a rule for rows dated after its value.
# NOTE(review): presumably the release date of the PMD version that first
# shipped each rule -- the source of these dates is not recorded here; confirm.
PMD_SINCE = {'PMD_ABSALIL': '2012-05-01', 'PMD_ADLIBDC': '2005-11-30', 'PMD_AMUO': '2008-03-25', 'PMD_ATG': '2006-03-29', 'PMD_AUHCIP': '2007-11-17', 'PMD_AUOV': '2006-12-19', 'PMD_BII': '2006-12-19', 'PMD_BI': '2003-07-30', 'PMD_BNC': '2006-10-04', 'PMD_CRS': '2007-11-17', 'PMD_CSR': '2012-05-01', 'PMD_CCEWTA': '2005-11-30', 'PMD_CIS': '2005-05-10', 'PMD_DCTR': '2011-11-04', 'PMD_DUFTFLI': '2011-11-04', 'PMD_DCL': '2003-03-21', 'PMD_ECB': '2002-06-25', 'PMD_EFB': '2002-07-10', 'PMD_EIS': '2002-06-25', 'PMD_EmSB': '2012-05-01', 'PMD_ESNIL': '2004-02-02', 'PMD_ESI': '2004-02-02', 'PMD_ESS': '2002-11-04', 'PMD_ESB': '2003-10-23', 'PMD_ETB': '2002-07-10', 'PMD_EWS': '2002-06-27', 'PMD_EO': '2012-05-01', 'PMD_FLSBWL': '2003-01-22', 'PMD_JI': '2002-11-04', 'PMD_MNC': '2006-01-25', 'PMD_OBEAH': '2002-07-10', 'PMD_RFFB': '2003-04-17', 'PMD_UIS': '2004-02-02', 'PMD_UCT': '2002-06-25', 'PMD_UNCIE': '2006-01-25', 'PMD_UOOI': '2006-01-25', 'PMD_UOM': '2005-09-15', 'PMD_FLMUB': '2002-07-25', 'PMD_IESMUB': '2002-06-27', 'PMD_ISMUB': '2002-11-04', 'PMD_WLMUB': '2002-07-25', 'PMD_CTCNSE': '2004-07-14', 'PMD_PCI': '2004-01-07', 'PMD_AIO': '2003-02-11', 'PMD_AAA': '2007-11-17', 'PMD_APMP': '2012-05-01', 'PMD_AUNC': '2007-11-17', 'PMD_DP': '2005-11-30', 'PMD_DNCGCE': '2008-03-25', 'PMD_DIS': '2004-02-02', 'PMD_ODPL': '2012-05-01', 'PMD_SOE': '2004-02-02', 'PMD_UC': '2002-11-04', 'PMD_ACWAM': '2005-03-23', 'PMD_AbCWAM': '2008-03-25', 'PMD_ATNFS': '2005-01-31', 'PMD_ACI': '2006-01-25', 'PMD_AICICC': '2005-03-23', 'PMD_APFIFC': '2004-12-15', 'PMD_APMIFCNE': '2014-02-11', 'PMD_ARP': '2002-11-04', 'PMD_ASAML': '2005-03-23', 'PMD_BC': '2004-05-19', 'PMD_CWOPCSBF': '2007-11-17', 'PMD_ClR': '2003-10-06', 'PMD_CCOM': '2003-03-21', 'PMD_DLNLISS': '2004-02-02', 'PMD_EMIACSBA': '2007-11-17', 'PMD_EN': '2004-07-14', 'PMD_FDSBASOC': '2012-05-01', 'PMD_FFCBS': '2003-06-19', 'PMD_IO': '2004-10-19', 'PMD_IF': '2004-10-19', 'PMD_ITGC': '2004-10-19', 'PMD_LI': '2012-05-01', 'PMD_MBIS': '2005-03-23', 
'PMD_MSMINIC': '2005-03-23', 'PMD_NCLISS': '2004-02-02', 'PMD_NSI': '2004-02-02', 'PMD_NTSS': '2005-11-30', 'PMD_OTAC': '2004-05-19', 'PMD_PLFICIC': '2014-02-11', 'PMD_PLFIC': '2005-09-15', 'PMD_PST': '2006-06-01', 'PMD_REARTN': '2008-03-25', 'PMD_SDFNL': '2004-10-19', 'PMD_SBE': '2003-04-17', 'PMD_SBR': '2002-08-16', 'PMD_SC': '2005-05-10', 'PMD_SF': '2005-05-10', 'PMD_SSSHD': '2002-11-04', 'PMD_TFBFASS': '2008-03-25', 'PMD_UEC': '2005-11-30', 'PMD_UEM': '2005-11-30', 'PMD_ULBR': '2005-09-15', 'PMD_USDF': '2006-03-29', 'PMD_UCIE': '2006-12-19', 'PMD_ULWCC': '2004-10-19', 'PMD_UNAION': '2005-03-23', 'PMD_UV': '2012-05-01', 'PMD_ACF': '2005-03-23', 'PMD_EF': '2004-02-02', 'PMD_FDNCSF': '2004-02-02', 'PMD_FOCSF': '2004-02-02', 'PMD_FO': '2004-02-02', 'PMD_FSBP': '2003-06-19', 'PMD_DIJL': '2002-07-15', 'PMD_DI': '2002-07-15', 'PMD_IFSP': '2003-01-22', 'PMD_TMSI': '2007-11-17', 'PMD_UFQN': '2012-05-01', 'PMD_DNCSE': '2007-11-17', 'PMD_LHNC': '2007-07-20', 'PMD_LISNC': '2007-07-20', 'PMD_MDBASBNC': '2007-07-20', 'PMD_RINC': '2007-07-20', 'PMD_RSINC': '2007-07-20', 'PMD_SEJBFSBF': '2007-11-17', 'PMD_JUASIM': '2003-03-21', 'PMD_JUS': '2002-11-04', 'PMD_JUSS': '2002-11-04', 'PMD_JUTCTMA': '2012-05-01', 'PMD_JUTSIA': '2004-10-19', 'PMD_SBA': '2006-03-29', 'PMD_TCWTC': '2005-03-23', 'PMD_UBA': '2005-03-23', 'PMD_UAEIOAT': '2005-05-10', 'PMD_UANIOAT': '2006-01-25', 'PMD_UASIOAT': '2005-05-10', 'PMD_UATIOAE': '2012-05-01', 'PMD_GDL': '2011-11-04', 'PMD_GLS': '2014-02-11', 'PMD_PL': '2005-09-15', 'PMD_UCEL': '2005-06-21', 'PMD_APST': '2005-06-21', 'PMD_GLSJU': '2014-02-11', 'PMD_LINSF': '2004-10-19', 'PMD_MTOL': '2004-10-19', 'PMD_SP': '2004-12-15', 'PMD_MSVUID': '2005-03-23', 'PMD_ADS': '2004-02-02', 'PMD_AFNMMN': '2005-03-23', 'PMD_AFNMTN': '2005-03-23', 'PMD_BGMN': '2007-07-20', 'PMD_CNC': '2003-07-30', 'PMD_GN': '2011-09-14', 'PMD_MeNC': '2003-07-30', 'PMD_MWSNAEC': '2004-02-02', 'PMD_NP': '2005-09-15', 'PMD_PC': '2005-09-15', 'PMD_SCN': '2012-05-01', 'PMD_SMN': 
'2002-07-03', 'PMD_SCFN': '2004-10-19', 'PMD_SEMN': '2004-10-19', 'PMD_SHMN': '2004-02-02', 'PMD_VNC': '2003-07-30', 'PMD_AES': '2007-07-20', 'PMD_AAL': '2006-01-25', 'PMD_RFI': '2012-05-01', 'PMD_UWOC': '2006-10-04', 'PMD_UALIOV': '2005-03-23', 'PMD_UAAL': '2006-01-25', 'PMD_USBFSA': '2005-05-10', 'PMD_AISD': '2005-01-31', 'PMD_MRIA': '2005-01-31', 'PMD_ACGE': '2011-09-14', 'PMD_ACNPE': '2004-05-19', 'PMD_ACT': '2003-07-30', 'PMD_ALEI': '2011-09-14', 'PMD_ARE': '2006-10-04', 'PMD_ATNIOSE': '2009-02-08', 'PMD_ATNPE': '2004-05-19', 'PMD_ATRET': '2004-05-19', 'PMD_DNEJLE': '2007-07-20', 'PMD_DNTEIF': '2008-03-25', 'PMD_EAFC': '2004-05-19', 'PMD_ADL': '2002-11-04', 'PMD_ASBF': '2008-03-25', 'PMD_CASR': '2014-02-11', 'PMD_CLA': '2006-01-25', 'PMD_ISB': '2005-11-30', 'PMD_SBIWC': '2006-12-19', 'PMD_StI': '2002-11-04', 'PMD_STS': '2002-11-04', 'PMD_UCC': '2005-09-15', 'PMD_UETCS': '2007-11-17', 'PMD_ClMMIC': '2006-12-19', 'PMD_LoC': '2006-12-19', 'PMD_SiDTE': '2007-07-20', 'PMD_UnI': '2007-07-20', 'PMD_ULV': '2002-06-25', 'PMD_UPF': '2002-06-25', 'PMD_UPM': '2002-07-25'}

# Subset of PMD rule abbreviations.
# NOTE(review): judging by the name this is the default rule set of the Maven
# PMD plugin after some clean-up -- confirm where the list comes from.
DEFAULT_RULES_MAVEN_CLEANED = [
    'PMD_AUHCIP', 'PMD_CRS', 'PMD_UnI', 'PMD_ULV', 'PMD_UPF',
    'PMD_UPM', 'PMD_DIJL', 'PMD_DI', 'PMD_EO', 'PMD_FLSBWL',
    'PMD_TMSI', 'PMD_UFQN', 'PMD_CIS', 'PMD_UOM', 'PMD_ABSALIL',
    'PMD_ADLIBDC', 'PMD_AMUO', 'PMD_AUOV', 'PMD_BNC', 'PMD_CSR',
    'PMD_CCEWTA', 'PMD_DUFTFLI', 'PMD_ECB', 'PMD_EFB', 'PMD_EIS',
    'PMD_EmSB', 'PMD_ESNIL', 'PMD_ESS', 'PMD_ESB', 'PMD_ETB',
    'PMD_EWS', 'PMD_IFSP', 'PMD_JI', 'PMD_MNC', 'PMD_OBEAH',
    'PMD_RFFB', 'PMD_UIS', 'PMD_UCT', 'PMD_UNCIE', 'PMD_UOOI',
    'PMD_ATG', 'PMD_DCTR', 'PMD_DCL', 'PMD_BII', 'PMD_BI',
]

# PMD rules treated as overlapping with Checkstyle by the overlap_* helpers.
OVERLAP_CHECKSTYLE = [
    'PMD_BI', 'PMD_UnI', 'PMD_CNC', 'PMD_FDNCSF', 'PMD_PCI',
    'PMD_UETCS', 'PMD_VNC', 'PMD_SBE', 'PMD_SBR', 'PMD_DIJL',
    'PMD_DI', 'PMD_IFSP', 'PMD_ECB', 'PMD_DLNLISS', 'PMD_PLFIC',
    'PMD_OBEAH', 'PMD_CWOPCSBF', 'PMD_ULV', 'PMD_ARP', 'PMD_NP',
    'PMD_FLMUB', 'PMD_IESMUB', 'PMD_ISMUB', 'PMD_WLMUB', 'PMD_SSSHD',
    'PMD_GN', 'PMD_MeNC', 'PMD_MWSNAEC', 'PMD_ACI', 'PMD_AIO',
    'PMD_LoC', 'PMD_ATRET', 'PMD_DIS', 'PMD_ACGE', 'PMD_ACT',
]
# PMD rules treated as overlapping with FindBugs by the overlap_* helpers.
OVERLAP_FINDBUGS = [
    'PMD_PCI', 'PMD_ClMMIC', 'PMD_UETCS', 'PMD_EF', 'PMD_ACF',
    'PMD_FDNCSF', 'PMD_OBEAH', 'PMD_CNC', 'PMD_MeNC', 'PMD_VNC',
    'PMD_ClR', 'PMD_MSVUID', 'PMD_ADLIBDC', 'PMD_EN', 'PMD_JUSS',
    'PMD_TCWTC', 'PMD_SEMN', 'PMD_SHMN', 'PMD_MWSNAEC', 'PMD_FSBP',
    'PMD_UNAION', 'PMD_DCTR', 'PMD_USDF', 'PMD_BI', 'PMD_ADL',
    'PMD_UPM', 'PMD_UPF', 'PMD_ULV', 'PMD_APMIFCNE', 'PMD_REARTN',
    'PMD_ACGE', 'PMD_MBIS', 'PMD_SSSHD',
]

# Replace the project list with an alphabetically sorted copy so that loops
# over PROJECTS run in a stable, predictable order.
PROJECTS = sorted(PROJECTS)

# Directory locations, given relative to the notebook's working directory.
# NOTE(review): TABLES_PATH / FIGURES_PATH look like output folders for
# generated tables and figures -- they are not used in the cells shown here.
TABLES_PATH = '../tables/'
FIGURES_PATH = '../figures/'
# Input folder: the '*_pmd_states6.csv' files are globbed from here.
DATA_PATH = '../asatlib/data/'

In [None]:
# Concatenate all PMD state files into one DataFrame (`df`); it is used below to
# extract the introduction of PMD, Checkstyle, FindBugs, etc.
# glob returns matches in filesystem-dependent order, so sort them to get the
# same row order in `df` on every machine and every run.
state_files = sorted(glob.glob(DATA_PATH + '*_pmd_states6.csv'))
if not state_files:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # say which folder was searched instead.
    raise FileNotFoundError('no *_pmd_states6.csv files found in ' + DATA_PATH)

dfs = [pd.read_csv(state_file) for state_file in state_files]
# sort=False keeps the column order of the first file; columns that are missing
# in some files are filled with NaN.
df = pd.concat(dfs, sort=False, ignore_index=True)

In [None]:
# Find, for every project, the commits on which each ASAT (Checkstyle, FindBugs,
# SpotBugs, PMD) was introduced to or removed from the build configuration.
#
# Result: final_<tool>[project] = {'introduced': [revision, ...],
#                                  'removed': [revision, ...]}
# final_pmd[project] additionally has 'pom_introduced', the 'revision' of the
# first row of that project (first truthy value).
final_checkstyle = {}
final_findbugs = {}
final_spotbugs = {}
final_pmd = {}

for project_name in PROJECTS:
    # Filter the project's rows once; they are walked in DataFrame order.
    project_rows = df[df['project'] == project_name]
    if project_rows.empty:
        print('missing', project_name)
        continue

    checkstyle = {'introduced': [], 'removed': []}
    findbugs = {'introduced': [], 'removed': []}
    spotbugs = {'introduced': [], 'removed': []}
    pmd = {'introduced': [], 'removed': [], 'pom_introduced': None}

    trackers = [(checkstyle, 'use_checkstyle'), (findbugs, 'use_findbugs'),
                (spotbugs, 'use_spotbugs'), (pmd, 'use_pmd')]
    # Every tool starts out as "not in use"; a flip of the flag is an event.
    current_state = {column: False for _, column in trackers}

    for _, row in project_rows.iterrows():
        if not pmd['pom_introduced']:
            pmd['pom_introduced'] = row['revision']

        for tracker, column in trackers:
            flag = row[column]
            # A missing flag (NaN, e.g. a CSV without this column) carries no
            # information. NaN is truthy and never equal to itself, so without
            # this guard it would be recorded as "introduced" on every row.
            if pd.isna(flag):
                continue
            if flag != current_state[column]:
                tracker['introduced' if flag else 'removed'].append(row['revision'])
                current_state[column] = flag

    final_checkstyle[project_name] = checkstyle
    final_findbugs[project_name] = findbugs
    final_spotbugs[project_name] = spotbugs
    final_pmd[project_name] = pmd

In [None]:
# Expose the per-project introduction/removal results under short module-level names.
PMD = final_pmd
# Lower-case alias kept so existing references to `pmd` keep working. Note that
# this rebinds the per-project scratch dict `pmd` used inside the loop above.
pmd = PMD
CHECKSTYLE = final_checkstyle
FINDBUGS = final_findbugs
SPOTBUGS = final_spotbugs

In [None]:
# Helper functions for use with `apply`: each one condenses the data of several columns of a row into a single value.

def overlap_other_asats(row, rule):
    """Tell whether `rule` overlaps with another ASAT that is in use for `row`.

    Parameters
    ----------
    row : mapping with the keys 'use_checkstyle' and 'use_findbugs'
    rule : rule abbreviation that is looked up in OVERLAP_CHECKSTYLE / OVERLAP_FINDBUGS

    Returns
    -------
    bool
        True if Checkstyle is in use and `rule` is in OVERLAP_CHECKSTYLE, or
        FindBugs is in use and `rule` is in OVERLAP_FINDBUGS.
    """
    # aggregate_project stores the usage flags as integers 0/1, so an identity
    # test against the `True` singleton never matches; compare by value instead
    # (this accepts 1, True and their numpy equivalents).
    uses_checkstyle = row['use_checkstyle'] == 1
    uses_findbugs = row['use_findbugs'] == 1
    return bool((uses_checkstyle and rule in OVERLAP_CHECKSTYLE)
                or (uses_findbugs and rule in OVERLAP_FINDBUGS))

def overlap_other_asats_remove(row, rule):
    """Tell whether `rule` overlaps with Checkstyle or FindBugs, regardless of use.

    `row` is not consulted; it is accepted so the signature matches
    overlap_other_asats. Returns True if `rule` is contained in
    OVERLAP_CHECKSTYLE or OVERLAP_FINDBUGS, otherwise False.
    """
    for overlapping_rules in (OVERLAP_CHECKSTYLE, OVERLAP_FINDBUGS):
        if rule in overlapping_rules:
            return True
    return False

def aggregate_type(row, rule_type='brace rules', time_correction=False, lloc_type='effective_', filter_overlap2=False):
    """Sum the warning counts of all rules that belong to one rule group.

    Intended for ``df.apply(aggregate_type, axis=1, ...)``.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S') and one
        ``lloc_type + 'code_' + rule`` entry per rule in PMD_SINCE
    rule_type : group name, compared case-insensitively with PMD_GROUP_MATCH[rule]
    time_correction : if true, a rule only counts when the row's date is later
        than the rule's date in PMD_SINCE ('%Y-%m-%d')
    lloc_type : column prefix, 'effective_' or '' in this notebook
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    The sum of the matching positive counts (0 if nothing matches).
    """
    wanted_type = rule_type.lower()
    # the row's date does not depend on the rule: parse it once, and only if needed
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S') if time_correction else None

    agg = 0
    for rule, since in PMD_SINCE.items():
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        warnings = row[lloc_type + 'code_' + rule]
        # keep this order: the group is only looked up for rules that have warnings
        if not (warnings > 0 and PMD_GROUP_MATCH[rule].lower() == wanted_type):
            continue
        if not time_correction or commit_date > datetime.strptime(since, '%Y-%m-%d'):
            agg += warnings
    return agg

def aggregate_severity(row, severity='minor', time_correction=False, lloc_type='effective_', filter_overlap2=False):
    """Sum the warning counts of all rules that have one severity.

    Intended for ``df.apply(aggregate_severity, axis=1, ...)``.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S') and one
        ``lloc_type + 'code_' + rule`` entry per rule in PMD_SINCE
    severity : severity name, compared case-insensitively with PMD_SEVERITY_MATCH[rule]
    time_correction : if true, a rule only counts when the row's date is later
        than the rule's date in PMD_SINCE ('%Y-%m-%d')
    lloc_type : column prefix, 'effective_' or '' in this notebook
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    The sum of the matching positive counts (0 if nothing matches).
    """
    wanted_severity = severity.lower()
    # the row's date does not depend on the rule: parse it once, and only if needed
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S') if time_correction else None

    agg = 0
    for rule, since in PMD_SINCE.items():
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        warnings = row[lloc_type + 'code_' + rule]
        # keep this order: the severity is only looked up for rules that have warnings
        if not (warnings > 0 and PMD_SEVERITY_MATCH[rule].lower() == wanted_severity):
            continue
        if not time_correction or commit_date > datetime.strptime(since, '%Y-%m-%d'):
            agg += warnings
    return agg

def aggregate_effective(row, time_correction=False, filtered=False, filter_overlap=False, filter_overlap2=False):
    """Sum the 'effective_code_<rule>' counts of the rules listed in row['effective_rules'].

    Intended for ``df.apply(aggregate_effective, axis=1, ...)``.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S'), 'effective_rules'
        (comma-separated rule abbreviations) and one 'effective_code_<rule>'
        entry per rule in PMD_SINCE
    time_correction : if true, a rule only counts when the row's date is later
        than the rule's date in PMD_SINCE ('%Y-%m-%d')
    filtered : if true, skip rules of the groups 'brace rules' and 'naming rules'
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    The sum of the matching positive counts (0 if nothing matches).
    """
    # both values are the same for every rule, so compute them once per row
    effective_rules = set(row['effective_rules'].split(','))
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S') if time_correction else None

    agg = 0
    for rule, since in PMD_SINCE.items():
        if filtered and PMD_GROUP_MATCH[rule].lower() in ('brace rules', 'naming rules'):
            continue
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        warnings = row['effective_code_' + rule]
        if not (warnings > 0 and rule in effective_rules):
            continue
        if not time_correction or commit_date > datetime.strptime(since, '%Y-%m-%d'):
            agg += warnings
    return agg

def aggregate_defaults(row, time_correction=False, filtered=False, filter_overlap=False, filter_overlap2=False):
    """Sum the 'effective_code_<rule>' counts of the rules in DEFAULT_RULES_MAVEN_CLEANED.

    Intended for ``df.apply(aggregate_defaults, axis=1, ...)``.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S') and one
        'effective_code_<rule>' entry per rule in PMD_SINCE
    time_correction : if true, a rule only counts when the row's date is later
        than the rule's date in PMD_SINCE ('%Y-%m-%d')
    filtered : if true, skip rules of the groups 'brace rules' and 'naming rules'
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    The sum of the selected counts (0 if no rule is selected).
    """
    # the row's date does not depend on the rule: parse it once, and only if needed
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S') if time_correction else None

    agg = 0
    for rule, since in PMD_SINCE.items():
        if rule not in DEFAULT_RULES_MAVEN_CLEANED:
            continue
        if filtered and PMD_GROUP_MATCH[rule].lower() in ('brace rules', 'naming rules'):
            continue
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        if not time_correction or commit_date > datetime.strptime(since, '%Y-%m-%d'):
            agg += row['effective_code_' + rule]
    return agg

def aggregate_all(row, time_correction=False, filtered=False, filter_overlap=False, filter_overlap2=False):
    """Sum the 'effective_code_<rule>' counts over all rules in PMD_SINCE.

    Intended for ``df.apply(aggregate_all, axis=1, ...)``.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S') and one
        'effective_code_<rule>' entry per rule in PMD_SINCE
    time_correction : if true, a rule only counts when the row's date is later
        than the rule's date in PMD_SINCE ('%Y-%m-%d')
    filtered : if true, skip rules of the groups 'brace rules' and 'naming rules'
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    The sum of the selected counts (0 if no rule is selected).
    """
    # the row's date does not depend on the rule: parse it once, and only if needed
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S') if time_correction else None

    agg = 0
    for rule, since in PMD_SINCE.items():
        if filtered and PMD_GROUP_MATCH[rule].lower() in ('brace rules', 'naming rules'):
            continue
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        if not time_correction or commit_date > datetime.strptime(since, '%Y-%m-%d'):
            agg += row['effective_code_' + rule]
    return agg

def count_all_time_corrected(row, filter_overlap=False, filter_overlap2=False):
    """Count the rules in PMD_SINCE whose date lies before the row's date.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S')
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    int
        Number of remaining rules whose PMD_SINCE date ('%Y-%m-%d') is
        strictly earlier than the row's date.
    """
    # the row's date does not depend on the rule: parse it once
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')

    count = 0
    for rule, since in PMD_SINCE.items():
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        if commit_date > datetime.strptime(since, '%Y-%m-%d'):
            count += 1
    return count

def count_defaults_time_corrected(row, filter_overlap=False, filter_overlap2=False):
    """Count the rules in DEFAULT_RULES_MAVEN_CLEANED whose PMD_SINCE date lies before the row's date.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S')
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    int
        Number of remaining default rules whose PMD_SINCE date ('%Y-%m-%d') is
        strictly earlier than the row's date.
    """
    # the row's date does not depend on the rule: parse it once
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')

    count = 0
    for rule, since in PMD_SINCE.items():
        if rule not in DEFAULT_RULES_MAVEN_CLEANED:
            continue
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        if commit_date > datetime.strptime(since, '%Y-%m-%d'):
            count += 1
    return count

def count_effective_time_corrected(row, filter_overlap=False, filter_overlap2=False):
    """Count the rules listed in row['effective_rules'] whose PMD_SINCE date lies before the row's date.

    Parameters
    ----------
    row : mapping with 'date' ('%Y-%m-%d %H:%M:%S') and 'effective_rules'
        (comma-separated rule abbreviations)
    filter_overlap : if true, skip rules for which overlap_other_asats is true
    filter_overlap2 : if true, skip rules for which overlap_other_asats_remove is true

    Returns
    -------
    int
        Number of remaining effective rules whose PMD_SINCE date ('%Y-%m-%d')
        is strictly earlier than the row's date.
    """
    # both values are the same for every rule, so compute them once per row
    effective_rules = set(row['effective_rules'].split(','))
    commit_date = datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')

    count = 0
    for rule, since in PMD_SINCE.items():
        if filter_overlap and overlap_other_asats(row, rule):
            continue
        if filter_overlap2 and overlap_other_asats_remove(row, rule):
            continue
        if rule in effective_rules and commit_date > datetime.strptime(since, '%Y-%m-%d'):
            count += 1
    return count

def _revision_index(df, revision):
    """Return the index label of the first row of `df` whose 'revision' equals `revision`.

    Raises IndexError with a message naming the revision when no row matches.
    """
    matches = df.index[df['revision'] == revision]
    if len(matches) == 0:
        raise IndexError('revision {!r} not found in the coarse data'.format(revision))
    return matches[0]


def aggregate_project(project_name, year=None):
    """Load the coarse per-commit data of one project and add all aggregate columns.

    Reads ``DATA_PATH + '<project_name>_coarse6.csv'``, marks ASAT usage per
    commit (use_pmd / use_checkstyle / use_findbugs) from the module-level
    `pmd`, `CHECKSTYLE` and `FINDBUGS` lookups, adds warning sums, warning
    densities per KLLOC ('*_ratio') and rule counts, and finally cleans the data:
    commits after 2017 are dropped, use_maven is set, the first year present
    in the data is dropped, and commits without code are dropped.

    Parameters
    ----------
    project_name : str
        Project whose coarse CSV is read; must be a key of `pmd`, `CHECKSTYLE`
        and `FINDBUGS`.
    year : unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per remaining commit with the original and the added columns.

    Raises
    ------
    IndexError
        If an introduced/removed/pom_introduced revision is not present in the
        coarse data (for pom_introduced: after the post-2017 commits were dropped).
    """
    df = pd.read_csv(DATA_PATH + '{}_coarse6.csv'.format(project_name))
    df = df.drop(columns=['test_loc', 'test_lloc', 'test_mccc'])

    df['datetime'] = pd.to_datetime(df['date'])

    # we now have the effective rules in the coarse file
    # set to '' if NaN is there (an empty string saved to CSV is read back as a missing value)
    df.loc[df['effective_rules'].isnull(), 'effective_rules'] = ''

    # determine ASAT usage via the commits on which it was introduced / removed
    df['use_pmd'] = 0
    df['use_checkstyle'] = 0
    df['use_findbugs'] = 0

    # include introduced, removed markers into the coarse data
    for usage, column in [(pmd, 'use_pmd'), (CHECKSTYLE, 'use_checkstyle'), (FINDBUGS, 'use_findbugs')]:
        introduced = usage[project_name]['introduced']
        # removed is always shorter or equal to introduced; pad a *copy* so the
        # module-level lookup is not modified by this call
        removed = list(usage[project_name]['removed'])
        if len(removed) < len(introduced):
            removed.append(None)

        for start, end in zip(introduced, removed):
            start_idx = _revision_index(df, start)
            if not end:
                # never removed again: in use from start to the end of df
                df.loc[start_idx:, column] = 1
            else:
                # in use from start up to and including the removing commit (.loc is end inclusive)
                df.loc[start_idx:_revision_index(df, end), column] = 1

    # code_lloc is everything except /test/
    df['code_klloc'] = df['code_lloc'] / 1000

    # effective_code_lloc is everything in source directories as defined in pom.xml
    df['effective_code_klloc'] = df['effective_code_lloc'] / 1000

    # all full not effective for analysis before Maven (no pom.xml to extract source directories)
    df['all_full'] = df[['code_' + r['abbrev'] for r in PMD_RULES]].sum(axis=1)
    df['all_full_ratio'] = df['all_full'] / df['code_klloc']

    severities = ('minor', 'major', 'critical')
    # PMD_GROUP_MATCH has one entry per rule, so its values repeat every group
    # name many times; aggregate each distinct group only once (first-seen order)
    rule_types = list(dict.fromkeys(r.lower() for r in PMD_GROUP_MATCH.values()))

    # severities and types on the full code, first with all rules, then without
    # the rules that overlap with other ASATs
    for suffix, extra in [('_full', {}), ('_full_overlap2', {'filter_overlap2': True})]:
        for severity in severities:
            column = severity + suffix
            df[column] = df.apply(aggregate_severity, axis=1, severity=severity, time_correction=False, lloc_type='', **extra)
            df[column + '_ratio'] = df[column] / df['code_klloc']
        for rule_type in rule_types:
            column = rule_type + suffix
            df[column] = df.apply(aggregate_type, axis=1, rule_type=rule_type, time_correction=False, lloc_type='', **extra)
            df[column + '_ratio'] = df[column] / df['code_klloc']

    # new aggregates
    df['all'] = df[['effective_code_' + r['abbrev'] for r in PMD_RULES]].sum(axis=1)
    df['all_ratio'] = df['all'] / df['effective_code_klloc']

    effective_aggregates = [
        ('all_time', aggregate_all, {'time_correction': True}),
        ('default', aggregate_defaults, {'time_correction': False}),
        ('default_time', aggregate_defaults, {'time_correction': True}),
        ('effective', aggregate_effective, {'time_correction': False}),
        ('effective_time', aggregate_effective, {'time_correction': True}),
        # removed overlap rules depending on current use (if checkstyle is in use remove overlapping rules with checkstyle)
        ('all_overlap', aggregate_all, {'time_correction': False, 'filter_overlap': True}),
        ('all_time_overlap', aggregate_all, {'time_correction': True, 'filter_overlap': True}),
        ('default_time_overlap', aggregate_defaults, {'time_correction': True, 'filter_overlap': True}),
        ('effective_time_overlap', aggregate_effective, {'time_correction': True, 'filter_overlap': True}),
        # removed overlap rules regardless of actual use
        ('all_overlap2', aggregate_all, {'time_correction': False, 'filter_overlap2': True}),
        ('all_time_overlap2', aggregate_all, {'time_correction': True, 'filter_overlap2': True}),
        ('default_time_overlap2', aggregate_defaults, {'time_correction': True, 'filter_overlap2': True}),
        ('effective_time_overlap2', aggregate_effective, {'time_correction': True, 'filter_overlap2': True}),
    ]
    for column, aggregate, kwargs in effective_aggregates:
        df[column] = df.apply(aggregate, axis=1, **kwargs)
        df[column + '_ratio'] = df[column] / df['effective_code_klloc']

    # counts
    df['count_all_time'] = df.apply(count_all_time_corrected, axis=1)
    df['count_default_time'] = df.apply(count_defaults_time_corrected, axis=1)
    df['count_effective_time'] = df.apply(count_effective_time_corrected, axis=1)

    df['year'] = df['datetime'].dt.year

    # -- cleaning --

    # drop everything after 2017 because we started collecting the data in 2018
    df = df.drop(df[df['datetime'] > datetime(2017, 12, 31, 23, 59, 59)].index)

    df['use_maven'] = 1

    # set use_maven to 0 for every entry before pom.xml was introduced
    # (.loc is end inclusive, so the slice has to stop one label earlier)
    pom_idx = _revision_index(df, pmd[project_name]['pom_introduced'])
    df.loc[:pom_idx - 1, 'use_maven'] = 0

    # drop first year because it will be useless for complete year analysis
    first_year = df['year'].unique()[0]
    df = df.drop(df[df['year'] == first_year].index)

    # drop revisions without code
    no_code = df[(df['effective_code_lloc'].isnull()) & (df['code_lloc'].isnull())]
    print('dropping {} commits without code'.format(len(no_code)))
    df = df.drop(no_code.index)

    return df

In [None]:
import timeit

# Aggregate every project that has no cached pickle yet and report the running
# average of the time one project takes.
dfs = []
estimate = []
for project_name in PROJECTS:
    start = timeit.default_timer()
    pickle_path = '../data/{}_aggregated2.pickle'.format(project_name)
    if not os.path.isfile(pickle_path):
        tmp = aggregate_project(project_name)
        tmp.to_pickle(pickle_path)
        dfs.append(tmp)
        # seconds spent on this project
        end = timeit.default_timer() - start
        estimate.append(end)
        # mean duration over the projects aggregated in this run, converted to hours
        est = sum(estimate) / len(estimate) / 60 / 60
        print(project_name, end, 'estimated per project:', est, 'hours')
    else:
        print(project_name, 'already exists, skipping')

In [None]:
# Load the per-project pickles, stack them into one frame and save the result.
dfs = [pd.read_pickle('../data/{}_aggregated2.pickle'.format(project_name)) for project_name in PROJECTS]
dfall = pd.concat(dfs, ignore_index=True)

dfall.to_pickle('../data/aggregated2_full.pickle')

print('commits', len(dfall))
print('projects', dfall['project'].nunique())