In [None]:
%matplotlib inline

import os
import itertools
import pprint
import copy
import datetime
import math
import statistics

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
from matplotlib.lines import Line2D
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error, r2_score

# PMD Rules from Sourcemeter homepage https://www.sourcemeter.com/resources/java/ 2018-07-24
# One dict per rule with the keys 'type' (rule set), 'rule' (rule name),
# 'abbrev' (PMD_* identifier) and 'severity' ('Minor', 'Major' or 'Critical').
# Kept one rule per line so that no string literal is ever split by line wrapping.
PMD_RULES = [
    {'type': 'Basic Rules', 'rule': 'Avoid Branching Statement As Last In Loop', 'abbrev': 'PMD_ABSALIL', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Avoid Decimal Literals In Big Decimal Constructor', 'abbrev': 'PMD_ADLIBDC', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Avoid Multiple Unary Operators', 'abbrev': 'PMD_AMUO', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Avoid Thread Group', 'abbrev': 'PMD_ATG', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Avoid Using Hard Coded IP', 'abbrev': 'PMD_AUHCIP', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Avoid Using Octal Values', 'abbrev': 'PMD_AUOV', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Big Integer Instantiation', 'abbrev': 'PMD_BII', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Boolean Instantiation', 'abbrev': 'PMD_BI', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Broken Null Check', 'abbrev': 'PMD_BNC', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Check Result Set', 'abbrev': 'PMD_CRS', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Check Skip Result', 'abbrev': 'PMD_CSR', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Class Cast Exception With To Array', 'abbrev': 'PMD_CCEWTA', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Collapsible If Statements', 'abbrev': 'PMD_CIS', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Dont Call Thread Run', 'abbrev': 'PMD_DCTR', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Dont Use Float Type For Loop Indices', 'abbrev': 'PMD_DUFTFLI', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Double Checked Locking', 'abbrev': 'PMD_DCL', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Empty Catch Block', 'abbrev': 'PMD_ECB', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Empty Finally Block', 'abbrev': 'PMD_EFB', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Empty If Stmt', 'abbrev': 'PMD_EIS', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Empty Statement Block', 'abbrev': 'PMD_EmSB', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Empty Statement Not In Loop', 'abbrev': 'PMD_ESNIL', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Empty Static Initializer', 'abbrev': 'PMD_ESI', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Empty Switch Statements', 'abbrev': 'PMD_ESS', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Empty Synchronized Block', 'abbrev': 'PMD_ESB', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Empty Try Block', 'abbrev': 'PMD_ETB', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Empty While Stmt', 'abbrev': 'PMD_EWS', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Extends Object', 'abbrev': 'PMD_EO', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'For Loop Should Be While Loop', 'abbrev': 'PMD_FLSBWL', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Jumbled Incrementer', 'abbrev': 'PMD_JI', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Misplaced Null Check', 'abbrev': 'PMD_MNC', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Override Both Equals And Hashcode', 'abbrev': 'PMD_OBEAH', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Return From Finally Block', 'abbrev': 'PMD_RFFB', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Unconditional If Statement', 'abbrev': 'PMD_UIS', 'severity': 'Major'},
    {'type': 'Basic Rules', 'rule': 'Unnecessary Conversion Temporary', 'abbrev': 'PMD_UCT', 'severity': 'Minor'},
    {'type': 'Basic Rules', 'rule': 'Unused Null Check In Equals', 'abbrev': 'PMD_UNCIE', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Useless Operation On Immutable', 'abbrev': 'PMD_UOOI', 'severity': 'Critical'},
    {'type': 'Basic Rules', 'rule': 'Useless Overriding Method', 'abbrev': 'PMD_UOM', 'severity': 'Minor'},
    {'type': 'Brace Rules', 'rule': 'For Loops Must Use Braces', 'abbrev': 'PMD_FLMUB', 'severity': 'Minor'},
    {'type': 'Brace Rules', 'rule': 'If Else Stmts Must Use Braces', 'abbrev': 'PMD_IESMUB', 'severity': 'Minor'},
    {'type': 'Brace Rules', 'rule': 'If Stmts Must Use Braces', 'abbrev': 'PMD_ISMUB', 'severity': 'Minor'},
    {'type': 'Brace Rules', 'rule': 'While Loops Must Use Braces', 'abbrev': 'PMD_WLMUB', 'severity': 'Minor'},
    {'type': 'Clone Implementation Rules', 'rule': 'Clone Throws Clone Not Supported Exception', 'abbrev': 'PMD_CTCNSE', 'severity': 'Major'},
    {'type': 'Clone Implementation Rules', 'rule': 'Proper Clone Implementation', 'abbrev': 'PMD_PCI', 'severity': 'Critical'},
    {'type': 'Controversial Rules', 'rule': 'Assignment In Operand', 'abbrev': 'PMD_AIO', 'severity': 'Minor'},
    {'type': 'Controversial Rules', 'rule': 'Avoid Accessibility Alteration', 'abbrev': 'PMD_AAA', 'severity': 'Major'},
    {'type': 'Controversial Rules', 'rule': 'Avoid Prefixing Method Parameters', 'abbrev': 'PMD_APMP', 'severity': 'Minor'},
    {'type': 'Controversial Rules', 'rule': 'Avoid Using Native Code', 'abbrev': 'PMD_AUNC', 'severity': 'Major'},
    {'type': 'Controversial Rules', 'rule': 'Default Package', 'abbrev': 'PMD_DP', 'severity': 'Minor'},
    {'type': 'Controversial Rules', 'rule': 'Do Not Call Garbage Collection Explicitly', 'abbrev': 'PMD_DNCGCE', 'severity': 'Major'},
    {'type': 'Controversial Rules', 'rule': 'Dont Import Sun', 'abbrev': 'PMD_DIS', 'severity': 'Major'},
    {'type': 'Controversial Rules', 'rule': 'One Declaration Per Line', 'abbrev': 'PMD_ODPL', 'severity': 'Minor'},
    {'type': 'Controversial Rules', 'rule': 'Suspicious Octal Escape', 'abbrev': 'PMD_SOE', 'severity': 'Major'},
    {'type': 'Controversial Rules', 'rule': 'Unnecessary Constructor', 'abbrev': 'PMD_UC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Abstract Class Without Abstract Method', 'abbrev': 'PMD_ACWAM', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Abstract Class Without Any Method', 'abbrev': 'PMD_AbCWAM', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Assignment To Non Final Static', 'abbrev': 'PMD_ATNFS', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Avoid Constants Interface', 'abbrev': 'PMD_ACI', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Avoid Instanceof Checks In Catch Clause', 'abbrev': 'PMD_AICICC', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Avoid Protected Field In Final Class', 'abbrev': 'PMD_APFIFC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Avoid Protected Method In Final Class Not Extending', 'abbrev': 'PMD_APMIFCNE', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Avoid Reassigning Parameters', 'abbrev': 'PMD_ARP', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Avoid Synchronized At Method Level', 'abbrev': 'PMD_ASAML', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Bad Comparison', 'abbrev': 'PMD_BC', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Class With Only Private Constructors Should Be Final', 'abbrev': 'PMD_CWOPCSBF', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Close Resource', 'abbrev': 'PMD_ClR', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Constructor Calls Overridable Method', 'abbrev': 'PMD_CCOM', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Default Label Not Last In Switch Stmt', 'abbrev': 'PMD_DLNLISS', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Empty Method In Abstract Class Should Be Abstract', 'abbrev': 'PMD_EMIACSBA', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Equals Null', 'abbrev': 'PMD_EN', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Field Declarations Should Be At Start Of Class', 'abbrev': 'PMD_FDSBASOC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Final Field Could Be Static', 'abbrev': 'PMD_FFCBS', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Idempotent Operations', 'abbrev': 'PMD_IO', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Immutable Field', 'abbrev': 'PMD_IF', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Instantiation To Get Class', 'abbrev': 'PMD_ITGC', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Logic Inversion', 'abbrev': 'PMD_LI', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Missing Break In Switch', 'abbrev': 'PMD_MBIS', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Missing Static Method In Non Instantiatable Class', 'abbrev': 'PMD_MSMINIC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Non Case Label In Switch Statement', 'abbrev': 'PMD_NCLISS', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Non Static Initializer', 'abbrev': 'PMD_NSI', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Non Thread Safe Singleton', 'abbrev': 'PMD_NTSS', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Optimizable To Array Call', 'abbrev': 'PMD_OTAC', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Position Literals First In Case Insensitive Comparisons', 'abbrev': 'PMD_PLFICIC', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Position Literals First In Comparisons', 'abbrev': 'PMD_PLFIC', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Preserve Stack Trace', 'abbrev': 'PMD_PST', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Return Empty Array Rather Than Null', 'abbrev': 'PMD_REARTN', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Simple Date Format Needs Locale', 'abbrev': 'PMD_SDFNL', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Simplify Boolean Expressions', 'abbrev': 'PMD_SBE', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Simplify Boolean Returns', 'abbrev': 'PMD_SBR', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Simplify Conditional', 'abbrev': 'PMD_SC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Singular Field', 'abbrev': 'PMD_SF', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Switch Stmts Should Have Default', 'abbrev': 'PMD_SSSHD', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Too Few Branches For ASwitch Statement', 'abbrev': 'PMD_TFBFASS', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Uncommented Empty Constructor', 'abbrev': 'PMD_UEC', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Uncommented Empty Method', 'abbrev': 'PMD_UEM', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Unnecessary Local Before Return', 'abbrev': 'PMD_ULBR', 'severity': 'Minor'},
    {'type': 'Design Rules', 'rule': 'Unsynchronized Static Date Formatter', 'abbrev': 'PMD_USDF', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Use Collection Is Empty', 'abbrev': 'PMD_UCIE', 'severity': 'Major'},
    {'type': 'Design Rules', 'rule': 'Use Locale With Case Conversions', 'abbrev': 'PMD_ULWCC', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Use Notify All Instead Of Notify', 'abbrev': 'PMD_UNAION', 'severity': 'Critical'},
    {'type': 'Design Rules', 'rule': 'Use Varargs', 'abbrev': 'PMD_UV', 'severity': 'Minor'},
    {'type': 'Finalizer Rules', 'rule': 'Avoid Calling Finalize', 'abbrev': 'PMD_ACF', 'severity': 'Major'},
    {'type': 'Finalizer Rules', 'rule': 'Empty Finalizer', 'abbrev': 'PMD_EF', 'severity': 'Minor'},
    {'type': 'Finalizer Rules', 'rule': 'Finalize Does Not Call Super Finalize', 'abbrev': 'PMD_FDNCSF', 'severity': 'Critical'},
    {'type': 'Finalizer Rules', 'rule': 'Finalize Only Calls Super Finalize', 'abbrev': 'PMD_FOCSF', 'severity': 'Minor'},
    {'type': 'Finalizer Rules', 'rule': 'Finalize Overloaded', 'abbrev': 'PMD_FO', 'severity': 'Critical'},
    {'type': 'Finalizer Rules', 'rule': 'Finalize Should Be Protected', 'abbrev': 'PMD_FSBP', 'severity': 'Critical'},
    {'type': 'Import Statement Rules', 'rule': 'Dont Import Java Lang', 'abbrev': 'PMD_DIJL', 'severity': 'Minor'},
    {'type': 'Import Statement Rules', 'rule': 'Duplicate Imports', 'abbrev': 'PMD_DI', 'severity': 'Minor'},
    {'type': 'Import Statement Rules', 'rule': 'Import From Same Package', 'abbrev': 'PMD_IFSP', 'severity': 'Minor'},
    {'type': 'Import Statement Rules', 'rule': 'Too Many Static Imports', 'abbrev': 'PMD_TMSI', 'severity': 'Major'},
    {'type': 'Import Statement Rules', 'rule': 'Unnecessary Fully Qualified Name', 'abbrev': 'PMD_UFQN', 'severity': 'Minor'},
    {'type': 'J2EE Rules', 'rule': 'Do Not Call System Exit', 'abbrev': 'PMD_DNCSE', 'severity': 'Critical'},
    {'type': 'J2EE Rules', 'rule': 'Local Home Naming Convention', 'abbrev': 'PMD_LHNC', 'severity': 'Major'},
    {'type': 'J2EE Rules', 'rule': 'Local Interface Session Naming Convention', 'abbrev': 'PMD_LISNC', 'severity': 'Major'},
    {'type': 'J2EE Rules', 'rule': 'MDBAnd Session Bean Naming Convention', 'abbrev': 'PMD_MDBASBNC', 'severity': 'Major'},
    {'type': 'J2EE Rules', 'rule': 'Remote Interface Naming Convention', 'abbrev': 'PMD_RINC', 'severity': 'Major'},
    {'type': 'J2EE Rules', 'rule': 'Remote Session Interface Naming Convention', 'abbrev': 'PMD_RSINC', 'severity': 'Major'},
    {'type': 'J2EE Rules', 'rule': 'Static EJBField Should Be Final', 'abbrev': 'PMD_SEJBFSBF', 'severity': 'Critical'},
    {'type': 'JUnit Rules', 'rule': 'JUnit Assertions Should Include Message', 'abbrev': 'PMD_JUASIM', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'JUnit Spelling', 'abbrev': 'PMD_JUS', 'severity': 'Critical'},
    {'type': 'JUnit Rules', 'rule': 'JUnit Static Suite', 'abbrev': 'PMD_JUSS', 'severity': 'Critical'},
    {'type': 'JUnit Rules', 'rule': 'JUnit Test Contains Too Many Asserts', 'abbrev': 'PMD_JUTCTMA', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'JUnit Tests Should Include Assert', 'abbrev': 'PMD_JUTSIA', 'severity': 'Major'},
    {'type': 'JUnit Rules', 'rule': 'Simplify Boolean Assertion', 'abbrev': 'PMD_SBA', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'Test Class Without Test Cases', 'abbrev': 'PMD_TCWTC', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'Unnecessary Boolean Assertion', 'abbrev': 'PMD_UBA', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'Use Assert Equals Instead Of Assert True', 'abbrev': 'PMD_UAEIOAT', 'severity': 'Major'},
    {'type': 'JUnit Rules', 'rule': 'Use Assert Null Instead Of Assert True', 'abbrev': 'PMD_UANIOAT', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'Use Assert Same Instead Of Assert True', 'abbrev': 'PMD_UASIOAT', 'severity': 'Minor'},
    {'type': 'JUnit Rules', 'rule': 'Use Assert True Instead Of Assert Equals', 'abbrev': 'PMD_UATIOAE', 'severity': 'Minor'},
    {'type': 'Jakarta Commons Logging Rules', 'rule': 'Guard Debug Logging', 'abbrev': 'PMD_GDL', 'severity': 'Major'},
    {'type': 'Jakarta Commons Logging Rules', 'rule': 'Guard Log Statement', 'abbrev': 'PMD_GLS', 'severity': 'Minor'},
    {'type': 'Jakarta Commons Logging Rules', 'rule': 'Proper Logger', 'abbrev': 'PMD_PL', 'severity': 'Minor'},
    {'type': 'Jakarta Commons Logging Rules', 'rule': 'Use Correct Exception Logging', 'abbrev': 'PMD_UCEL', 'severity': 'Major'},
    {'type': 'Java Logging Rules', 'rule': 'Avoid Print Stack Trace', 'abbrev': 'PMD_APST', 'severity': 'Major'},
    {'type': 'Java Logging Rules', 'rule': 'Guard Log Statement Java Util', 'abbrev': 'PMD_GLSJU', 'severity': 'Minor'},
    {'type': 'Java Logging Rules', 'rule': 'Logger Is Not Static Final', 'abbrev': 'PMD_LINSF', 'severity': 'Minor'},
    {'type': 'Java Logging Rules', 'rule': 'More Than One Logger', 'abbrev': 'PMD_MTOL', 'severity': 'Major'},
    {'type': 'Java Logging Rules', 'rule': 'System Println', 'abbrev': 'PMD_SP', 'severity': 'Major'},
    {'type': 'JavaBean Rules', 'rule': 'Missing Serial Version UID', 'abbrev': 'PMD_MSVUID', 'severity': 'Major'},
    {'type': 'Naming Rules', 'rule': 'Avoid Dollar Signs', 'abbrev': 'PMD_ADS', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Avoid Field Name Matching Method Name', 'abbrev': 'PMD_AFNMMN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Avoid Field Name Matching Type Name', 'abbrev': 'PMD_AFNMTN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Boolean Get Method Name', 'abbrev': 'PMD_BGMN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Class Naming Conventions', 'abbrev': 'PMD_CNC', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Generics Naming', 'abbrev': 'PMD_GN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Method Naming Conventions', 'abbrev': 'PMD_MeNC', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Method With Same Name As Enclosing Class', 'abbrev': 'PMD_MWSNAEC', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'No Package', 'abbrev': 'PMD_NP', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Package Case', 'abbrev': 'PMD_PC', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Short Class Name', 'abbrev': 'PMD_SCN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Short Method Name', 'abbrev': 'PMD_SMN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Suspicious Constant Field Name', 'abbrev': 'PMD_SCFN', 'severity': 'Minor'},
    {'type': 'Naming Rules', 'rule': 'Suspicious Equals Method Name', 'abbrev': 'PMD_SEMN', 'severity': 'Critical'},
    {'type': 'Naming Rules', 'rule': 'Suspicious Hashcode Method Name', 'abbrev': 'PMD_SHMN', 'severity': 'Critical'},
    {'type': 'Naming Rules', 'rule': 'Variable Naming Conventions', 'abbrev': 'PMD_VNC', 'severity': 'Minor'},
    {'type': 'Optimization Rules', 'rule': 'Add Empty String', 'abbrev': 'PMD_AES', 'severity': 'Minor'},
    {'type': 'Optimization Rules', 'rule': 'Avoid Array Loops', 'abbrev': 'PMD_AAL', 'severity': 'Major'},
    {'type': 'Optimization Rules', 'rule': 'Redundant Field Initializer', 'abbrev': 'PMD_RFI', 'severity': 'Minor'},
    {'type': 'Optimization Rules', 'rule': 'Unnecessary Wrapper Object Creation', 'abbrev': 'PMD_UWOC', 'severity': 'Major'},
    {'type': 'Optimization Rules', 'rule': 'Use Array List Instead Of Vector', 'abbrev': 'PMD_UALIOV', 'severity': 'Minor'},
    {'type': 'Optimization Rules', 'rule': 'Use Arrays As List', 'abbrev': 'PMD_UAAL', 'severity': 'Major'},
    {'type': 'Optimization Rules', 'rule': 'Use String Buffer For String Appends', 'abbrev': 'PMD_USBFSA', 'severity': 'Major'},
    {'type': 'Security Code Guideline Rules', 'rule': 'Array Is Stored Directly', 'abbrev': 'PMD_AISD', 'severity': 'Major'},
    {'type': 'Security Code Guideline Rules', 'rule': 'Method Returns Internal Array', 'abbrev': 'PMD_MRIA', 'severity': 'Major'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Catching Generic Exception', 'abbrev': 'PMD_ACGE', 'severity': 'Major'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Catching NPE', 'abbrev': 'PMD_ACNPE', 'severity': 'Critical'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Catching Throwable', 'abbrev': 'PMD_ACT', 'severity': 'Major'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Losing Exception Information', 'abbrev': 'PMD_ALEI', 'severity': 'Major'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Rethrowing Exception', 'abbrev': 'PMD_ARE', 'severity': 'Minor'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Throwing New Instance Of Same Exception', 'abbrev': 'PMD_ATNIOSE', 'severity': 'Minor'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Throwing Null Pointer Exception', 'abbrev': 'PMD_ATNPE', 'severity': 'Critical'},
    {'type': 'Strict Exception Rules', 'rule': 'Avoid Throwing Raw Exception Types', 'abbrev': 'PMD_ATRET', 'severity': 'Major'},
    {'type': 'Strict Exception Rules', 'rule': 'Do Not Extend Java Lang Error', 'abbrev': 'PMD_DNEJLE', 'severity': 'Critical'},
    {'type': 'Strict Exception Rules', 'rule': 'Do Not Throw Exception In Finally', 'abbrev': 'PMD_DNTEIF', 'severity': 'Critical'},
    {'type': 'Strict Exception Rules', 'rule': 'Exception As Flow Control', 'abbrev': 'PMD_EAFC', 'severity': 'Major'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Avoid Duplicate Literals', 'abbrev': 'PMD_ADL', 'severity': 'Major'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Avoid String Buffer Field', 'abbrev': 'PMD_ASBF', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Consecutive Appends Should Reuse', 'abbrev': 'PMD_CASR', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Consecutive Literal Appends', 'abbrev': 'PMD_CLA', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Inefficient String Buffering', 'abbrev': 'PMD_ISB', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'String Buffer Instantiation With Char', 'abbrev': 'PMD_SBIWC', 'severity': 'Critical'},
    {'type': 'String and StringBuffer Rules', 'rule': 'String Instantiation', 'abbrev': 'PMD_StI', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'String To String', 'abbrev': 'PMD_STS', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Unnecessary Case Change', 'abbrev': 'PMD_UCC', 'severity': 'Minor'},
    {'type': 'String and StringBuffer Rules', 'rule': 'Use Equals To Compare Strings', 'abbrev': 'PMD_UETCS', 'severity': 'Critical'},
    {'type': 'Type Resolution Rules', 'rule': 'Clone Method Must Implement Cloneable', 'abbrev': 'PMD_ClMMIC', 'severity': 'Major'},
    {'type': 'Type Resolution Rules', 'rule': 'Loose Coupling', 'abbrev': 'PMD_LoC', 'severity': 'Major'},
    {'type': 'Type Resolution Rules', 'rule': 'Signature Declare Throws Exception', 'abbrev': 'PMD_SiDTE', 'severity': 'Major'},
    {'type': 'Type Resolution Rules', 'rule': 'Unused Imports', 'abbrev': 'PMD_UnI', 'severity': 'Minor'},
    {'type': 'Unnecessary and Unused Code Rules', 'rule': 'Unused Local Variable', 'abbrev': 'PMD_ULV', 'severity': 'Major'},
    {'type': 'Unnecessary and Unused Code Rules', 'rule': 'Unused Private Field', 'abbrev': 'PMD_UPF', 'severity': 'Major'},
    {'type': 'Unnecessary and Unused Code Rules', 'rule': 'Unused Private Method', 'abbrev': 'PMD_UPM', 'severity': 'Major'},
]
# Lookup helpers derived from PMD_RULES.
# Distinct severity labels. Sorted so the order is the same on every run:
# a plain list(set(...)) of strings can come out in a different order each
# time the interpreter starts, because string hashing is randomized.
PMD_SEVERITIES = sorted({d['severity'] for d in PMD_RULES})
# Rule abbreviation -> severity label.
PMD_SEVERITY_MATCH = {d['abbrev']: d['severity'] for d in PMD_RULES}
# Rule abbreviation -> rule set ('type') the rule belongs to.
PMD_GROUP_MATCH = {d['abbrev']: d['type'] for d in PMD_RULES}
# Rule abbreviation -> ISO date string (YYYY-MM-DD).
# NOTE(review): presumably the date since which PMD ships the rule - confirm against the data source.
# Entries keep the order of PMD_RULES; the comments name the rule 'type' of each group.
PMD_SINCE = {
    # Basic Rules
    'PMD_ABSALIL': '2012-05-01', 'PMD_ADLIBDC': '2005-11-30', 'PMD_AMUO': '2008-03-25',
    'PMD_ATG': '2006-03-29', 'PMD_AUHCIP': '2007-11-17', 'PMD_AUOV': '2006-12-19',
    'PMD_BII': '2006-12-19', 'PMD_BI': '2003-07-30', 'PMD_BNC': '2006-10-04',
    'PMD_CRS': '2007-11-17', 'PMD_CSR': '2012-05-01', 'PMD_CCEWTA': '2005-11-30',
    'PMD_CIS': '2005-05-10', 'PMD_DCTR': '2011-11-04', 'PMD_DUFTFLI': '2011-11-04',
    'PMD_DCL': '2003-03-21', 'PMD_ECB': '2002-06-25', 'PMD_EFB': '2002-07-10',
    'PMD_EIS': '2002-06-25', 'PMD_EmSB': '2012-05-01', 'PMD_ESNIL': '2004-02-02',
    'PMD_ESI': '2004-02-02', 'PMD_ESS': '2002-11-04', 'PMD_ESB': '2003-10-23',
    'PMD_ETB': '2002-07-10', 'PMD_EWS': '2002-06-27', 'PMD_EO': '2012-05-01',
    'PMD_FLSBWL': '2003-01-22', 'PMD_JI': '2002-11-04', 'PMD_MNC': '2006-01-25',
    'PMD_OBEAH': '2002-07-10', 'PMD_RFFB': '2003-04-17', 'PMD_UIS': '2004-02-02',
    'PMD_UCT': '2002-06-25', 'PMD_UNCIE': '2006-01-25', 'PMD_UOOI': '2006-01-25',
    'PMD_UOM': '2005-09-15',
    # Brace Rules
    'PMD_FLMUB': '2002-07-25', 'PMD_IESMUB': '2002-06-27', 'PMD_ISMUB': '2002-11-04',
    'PMD_WLMUB': '2002-07-25',
    # Clone Implementation Rules
    'PMD_CTCNSE': '2004-07-14', 'PMD_PCI': '2004-01-07',
    # Controversial Rules
    'PMD_AIO': '2003-02-11', 'PMD_AAA': '2007-11-17', 'PMD_APMP': '2012-05-01',
    'PMD_AUNC': '2007-11-17', 'PMD_DP': '2005-11-30', 'PMD_DNCGCE': '2008-03-25',
    'PMD_DIS': '2004-02-02', 'PMD_ODPL': '2012-05-01', 'PMD_SOE': '2004-02-02',
    'PMD_UC': '2002-11-04',
    # Design Rules
    'PMD_ACWAM': '2005-03-23', 'PMD_AbCWAM': '2008-03-25', 'PMD_ATNFS': '2005-01-31',
    'PMD_ACI': '2006-01-25', 'PMD_AICICC': '2005-03-23', 'PMD_APFIFC': '2004-12-15',
    'PMD_APMIFCNE': '2014-02-11', 'PMD_ARP': '2002-11-04', 'PMD_ASAML': '2005-03-23',
    'PMD_BC': '2004-05-19', 'PMD_CWOPCSBF': '2007-11-17', 'PMD_ClR': '2003-10-06',
    'PMD_CCOM': '2003-03-21', 'PMD_DLNLISS': '2004-02-02', 'PMD_EMIACSBA': '2007-11-17',
    'PMD_EN': '2004-07-14', 'PMD_FDSBASOC': '2012-05-01', 'PMD_FFCBS': '2003-06-19',
    'PMD_IO': '2004-10-19', 'PMD_IF': '2004-10-19', 'PMD_ITGC': '2004-10-19',
    'PMD_LI': '2012-05-01', 'PMD_MBIS': '2005-03-23', 'PMD_MSMINIC': '2005-03-23',
    'PMD_NCLISS': '2004-02-02', 'PMD_NSI': '2004-02-02', 'PMD_NTSS': '2005-11-30',
    'PMD_OTAC': '2004-05-19', 'PMD_PLFICIC': '2014-02-11', 'PMD_PLFIC': '2005-09-15',
    'PMD_PST': '2006-06-01', 'PMD_REARTN': '2008-03-25', 'PMD_SDFNL': '2004-10-19',
    'PMD_SBE': '2003-04-17', 'PMD_SBR': '2002-08-16', 'PMD_SC': '2005-05-10',
    'PMD_SF': '2005-05-10', 'PMD_SSSHD': '2002-11-04', 'PMD_TFBFASS': '2008-03-25',
    'PMD_UEC': '2005-11-30', 'PMD_UEM': '2005-11-30', 'PMD_ULBR': '2005-09-15',
    'PMD_USDF': '2006-03-29', 'PMD_UCIE': '2006-12-19', 'PMD_ULWCC': '2004-10-19',
    'PMD_UNAION': '2005-03-23', 'PMD_UV': '2012-05-01',
    # Finalizer Rules
    'PMD_ACF': '2005-03-23', 'PMD_EF': '2004-02-02', 'PMD_FDNCSF': '2004-02-02',
    'PMD_FOCSF': '2004-02-02', 'PMD_FO': '2004-02-02', 'PMD_FSBP': '2003-06-19',
    # Import Statement Rules
    'PMD_DIJL': '2002-07-15', 'PMD_DI': '2002-07-15', 'PMD_IFSP': '2003-01-22',
    'PMD_TMSI': '2007-11-17', 'PMD_UFQN': '2012-05-01',
    # J2EE Rules
    'PMD_DNCSE': '2007-11-17', 'PMD_LHNC': '2007-07-20', 'PMD_LISNC': '2007-07-20',
    'PMD_MDBASBNC': '2007-07-20', 'PMD_RINC': '2007-07-20', 'PMD_RSINC': '2007-07-20',
    'PMD_SEJBFSBF': '2007-11-17',
    # JUnit Rules
    'PMD_JUASIM': '2003-03-21', 'PMD_JUS': '2002-11-04', 'PMD_JUSS': '2002-11-04',
    'PMD_JUTCTMA': '2012-05-01', 'PMD_JUTSIA': '2004-10-19', 'PMD_SBA': '2006-03-29',
    'PMD_TCWTC': '2005-03-23', 'PMD_UBA': '2005-03-23', 'PMD_UAEIOAT': '2005-05-10',
    'PMD_UANIOAT': '2006-01-25', 'PMD_UASIOAT': '2005-05-10', 'PMD_UATIOAE': '2012-05-01',
    # Jakarta Commons Logging Rules
    'PMD_GDL': '2011-11-04', 'PMD_GLS': '2014-02-11', 'PMD_PL': '2005-09-15',
    'PMD_UCEL': '2005-06-21',
    # Java Logging Rules
    'PMD_APST': '2005-06-21', 'PMD_GLSJU': '2014-02-11', 'PMD_LINSF': '2004-10-19',
    'PMD_MTOL': '2004-10-19', 'PMD_SP': '2004-12-15',
    # JavaBean Rules
    'PMD_MSVUID': '2005-03-23',
    # Naming Rules
    'PMD_ADS': '2004-02-02', 'PMD_AFNMMN': '2005-03-23', 'PMD_AFNMTN': '2005-03-23',
    'PMD_BGMN': '2007-07-20', 'PMD_CNC': '2003-07-30', 'PMD_GN': '2011-09-14',
    'PMD_MeNC': '2003-07-30', 'PMD_MWSNAEC': '2004-02-02', 'PMD_NP': '2005-09-15',
    'PMD_PC': '2005-09-15', 'PMD_SCN': '2012-05-01', 'PMD_SMN': '2002-07-03',
    'PMD_SCFN': '2004-10-19', 'PMD_SEMN': '2004-10-19', 'PMD_SHMN': '2004-02-02',
    'PMD_VNC': '2003-07-30',
    # Optimization Rules
    'PMD_AES': '2007-07-20', 'PMD_AAL': '2006-01-25', 'PMD_RFI': '2012-05-01',
    'PMD_UWOC': '2006-10-04', 'PMD_UALIOV': '2005-03-23', 'PMD_UAAL': '2006-01-25',
    'PMD_USBFSA': '2005-05-10',
    # Security Code Guideline Rules
    'PMD_AISD': '2005-01-31', 'PMD_MRIA': '2005-01-31',
    # Strict Exception Rules
    'PMD_ACGE': '2011-09-14', 'PMD_ACNPE': '2004-05-19', 'PMD_ACT': '2003-07-30',
    'PMD_ALEI': '2011-09-14', 'PMD_ARE': '2006-10-04', 'PMD_ATNIOSE': '2009-02-08',
    'PMD_ATNPE': '2004-05-19', 'PMD_ATRET': '2004-05-19', 'PMD_DNEJLE': '2007-07-20',
    'PMD_DNTEIF': '2008-03-25', 'PMD_EAFC': '2004-05-19',
    # String and StringBuffer Rules
    'PMD_ADL': '2002-11-04', 'PMD_ASBF': '2008-03-25', 'PMD_CASR': '2014-02-11',
    'PMD_CLA': '2006-01-25', 'PMD_ISB': '2005-11-30', 'PMD_SBIWC': '2006-12-19',
    'PMD_StI': '2002-11-04', 'PMD_STS': '2002-11-04', 'PMD_UCC': '2005-09-15',
    'PMD_UETCS': '2007-11-17',
    # Type Resolution Rules
    'PMD_ClMMIC': '2006-12-19', 'PMD_LoC': '2006-12-19', 'PMD_SiDTE': '2007-07-20',
    'PMD_UnI': '2007-07-20',
    # Unnecessary and Unused Code Rules
    'PMD_ULV': '2002-06-25', 'PMD_UPF': '2002-06-25', 'PMD_UPM': '2002-07-25',
}

# NOTE(review): presumably the PMD rules that are active in the default Maven
# PMD configuration (after some clean-up) - confirm where this list comes from.
DEFAULT_RULES_MAVEN_CLEANED = [
    'PMD_AUHCIP', 'PMD_CRS', 'PMD_UnI', 'PMD_ULV', 'PMD_UPF',
    'PMD_UPM', 'PMD_DIJL', 'PMD_DI', 'PMD_EO', 'PMD_FLSBWL',
    'PMD_TMSI', 'PMD_UFQN', 'PMD_CIS', 'PMD_UOM', 'PMD_ABSALIL',
    'PMD_ADLIBDC', 'PMD_AMUO', 'PMD_AUOV', 'PMD_BNC', 'PMD_CSR',
    'PMD_CCEWTA', 'PMD_DUFTFLI', 'PMD_ECB', 'PMD_EFB', 'PMD_EIS',
    'PMD_EmSB', 'PMD_ESNIL', 'PMD_ESS', 'PMD_ESB', 'PMD_ETB',
    'PMD_EWS', 'PMD_IFSP', 'PMD_JI', 'PMD_MNC', 'PMD_OBEAH',
    'PMD_RFFB', 'PMD_UIS', 'PMD_UCT', 'PMD_UNCIE', 'PMD_UOOI',
    'PMD_ATG', 'PMD_DCTR', 'PMD_DCL', 'PMD_BII', 'PMD_BI',
]

# NOTE(review): presumably the PMD rules that have a counterpart check in
# Checkstyle - confirm against the source of this mapping.
OVERLAP_CHECKSTYLE = [
    'PMD_BI', 'PMD_UnI', 'PMD_CNC', 'PMD_FDNCSF', 'PMD_PCI',
    'PMD_UETCS', 'PMD_VNC', 'PMD_SBE', 'PMD_SBR', 'PMD_DIJL',
    'PMD_DI', 'PMD_IFSP', 'PMD_ECB', 'PMD_DLNLISS', 'PMD_PLFIC',
    'PMD_OBEAH', 'PMD_CWOPCSBF', 'PMD_ULV', 'PMD_ARP', 'PMD_NP',
    'PMD_FLMUB', 'PMD_IESMUB', 'PMD_ISMUB', 'PMD_WLMUB', 'PMD_SSSHD',
    'PMD_GN', 'PMD_MeNC', 'PMD_MWSNAEC', 'PMD_ACI', 'PMD_AIO',
    'PMD_LoC', 'PMD_ATRET', 'PMD_DIS', 'PMD_ACGE', 'PMD_ACT',
]
# NOTE(review): presumably the PMD rules that have a counterpart detector in
# FindBugs - confirm against the source of this mapping.
OVERLAP_FINDBUGS = [
    'PMD_PCI', 'PMD_ClMMIC', 'PMD_UETCS', 'PMD_EF', 'PMD_ACF',
    'PMD_FDNCSF', 'PMD_OBEAH', 'PMD_CNC', 'PMD_MeNC', 'PMD_VNC',
    'PMD_ClR', 'PMD_MSVUID', 'PMD_ADLIBDC', 'PMD_EN', 'PMD_JUSS',
    'PMD_TCWTC', 'PMD_SEMN', 'PMD_SHMN', 'PMD_MWSNAEC', 'PMD_FSBP',
    'PMD_UNAION', 'PMD_DCTR', 'PMD_USDF', 'PMD_BI', 'PMD_ADL',
    'PMD_UPM', 'PMD_UPF', 'PMD_ULV', 'PMD_APMIFCNE', 'PMD_REARTN',
    'PMD_ACGE', 'PMD_MBIS', 'PMD_SSSHD',
]

# Names of the projects that are part of the analysis, in processing order.
PROJECTS = [
    'calcite', 'cayenne', 'commons-bcel', 'commons-beanutils',
    'commons-codec', 'commons-collections', 'commons-compress',
    'commons-configuration', 'commons-dbcp', 'commons-digester',
    'commons-imaging', 'commons-io', 'commons-jcs', 'commons-jexl',
    'commons-lang', 'commons-math', 'commons-net', 'commons-rdf',
    'commons-scxml', 'commons-validator', 'commons-vfs',
    'eagle', 'falcon', 'flume', 'giraph', 'gora', 'jspwiki', 'knox',
    'kylin', 'lens', 'mahout', 'manifoldcf', 'opennlp', 'parquet-mr',
    'pdfbox', 'phoenix', 'ranger', 'santuario-java', 'storm', 'struts',
    'systemml', 'tez', 'tika', 'wss4j', 'zeppelin', 'helix',
    'httpcomponents-client', 'archiva', 'httpcomponents-core', 'jena',
    'streams', 'mina-sshd', 'roller', 'nifi',
]

# Directory locations, all relative to the current working directory.
DATA_PATH = '../data/'
ASAT_DATA_PATH = '../asatlib/data/'
TABLES_PATH = '../tables/'
FIGURES_PATH = '../figures/'

In [None]:
def get_slope(df, kind):
    """Fit a linear regression of ``df[kind]`` over time and return its statistics.

    Rows where ``df[kind]`` is null are ignored. For kinds whose name contains
    ``'effective'`` the series is additionally cut so that it starts at the
    first row with ``use_pmd == 1`` (if there is one); earlier rows would skew
    the trend.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``kind`` and ``'datetime'``, plus ``'use_pmd'`` when
        ``kind`` contains ``'effective'``.
    kind : str
        Name of the column to regress against the date.

    Returns
    -------
    tuple
        ``(slope, f_val, p_val, coef)``:

        * ``slope`` - change of the fitted line between first and last point
          divided by the elapsed time in ``mdates.date2num`` units; NaN when
          fewer than two points remain.
        * ``f_val``, ``p_val`` - result of ``f_regression``; NaN when all
          values are identical.
        * ``coef`` - coefficient of the fitted ``LinearRegression``.

        All four are NaN when no non-null row exists.
    """
    # np.nan instead of the alias np.NaN, which NumPy 2.0 removed.
    slope = f_val = p_val = np.nan

    # only non NaNs
    df = df[df[kind].notnull()]
    if df.empty:
        return slope, f_val, p_val, np.nan

    if 'effective' in kind:
        pmd_rows = df.index[df['use_pmd'] == 1]
        # Compare the label count instead of testing the label's truthiness:
        # an index label of 0 is a valid start row.
        if len(pmd_rows) > 0:
            df = df.loc[pmd_rows[0]:]

    values = df[kind].values
    x = mdates.date2num(df['datetime'].values)
    features = x.reshape(-1, 1)

    reg = LinearRegression()
    reg.fit(features, values.reshape(-1, 1))
    y_pred = reg.predict(features)
    coef = reg.coef_[0][0]

    # f_regression is skipped for a constant series
    if not np.all(values == values[0]):
        f_vals, p_vals = f_regression(features, values)
        f_val, p_val = f_vals[0], p_vals[0]

    if len(x) > 1:
        slope = ((y_pred[-1] - y_pred[0]) / (x[-1] - x[0]))[0]

    return slope, f_val, p_val, coef


def yearly_slopes(df):
    """Compute trend statistics per project and per project/year.

    For every project the trend of a fixed set of metrics is computed with
    ``get_slope`` over all rows of the project, and again for every year in
    which the project has rows.  The yearly records additionally carry how
    many rows of that year have each ``use_<asat>`` flag set to 1 / 0.

    Args:
        df: DataFrame with ``project``, ``year``, ``use_pmd``,
            ``use_checkstyle``, ``use_findbugs``, ``use_maven`` and the
            metric columns listed below.

    Returns:
        Tuple ``(years, complete)`` of DataFrames: one row per project and
        year, and one row per project.
    """
    lifetime_kinds = ['all', 'all_ratio', 'all_time_ratio', 'default_time_ratio', 'default_ratio', 'effective_time_ratio', 'effective_ratio',
                      'all_overlap2', 'all_overlap2_ratio', 'all_time_overlap2_ratio', 'default_time_overlap2_ratio', 'effective_time_overlap2_ratio', 'all_full_ratio']
    # 'all_overlap2_ratio' used to be listed twice, which only repeated the
    # same regression; the resulting columns are unchanged.
    yearly_kinds = ['all_ratio', 'all_time_ratio', 'default_time_ratio', 'default_ratio', 'effective_time_ratio', 'effective_ratio',
                    'all_time_overlap_ratio', 'default_time_overlap_ratio', 'effective_time_overlap_ratio', 'all_overlap_ratio',
                    'all_overlap2_ratio', 'all_time_overlap2_ratio', 'default_time_overlap2_ratio', 'effective_time_overlap2_ratio',
                    'effective_time', 'default_time', 'all_time', 'default', 'all', 'effective', 'all_full', 'all_full_ratio',
                    'all_overlap']

    def add_trend(record, frame, kind):
        (record[kind + '_slope'], record[kind + '_f_val'],
         record[kind + '_p_val'], record[kind + '_coef']) = get_slope(frame, kind)

    def is_complete(yes, no):
        # NOTE(review): the "never used" side requires more than one row
        # (no > 1) while the "always used" side accepts a single row - confirm
        # that this asymmetry is intended.
        return (yes > 0 and no == 0) or (yes == 0 and no > 1)

    years = []
    complete = []

    for project_name in df['project'].unique():
        project_rows = df[df['project'] == project_name]

        pr = {'project': project_name}
        for kind in lifetime_kinds:
            add_trend(pr, project_rows, kind)
        complete.append(pr)

        for ye in df['year'].unique():
            tmp = project_rows[project_rows['year'] == ye]

            # we may not have this year for this project
            if len(tmp) == 0:
                continue

            counts = {}
            for asat in ('pmd', 'checkstyle', 'findbugs', 'maven'):
                flag = tmp['use_' + asat]
                counts['yes_' + asat] = int((flag == 1).sum())
                counts['no_' + asat] = int((flag == 0).sum())

            pry = {'project': project_name,
                   'year': ye,
                   'complete_pmd': is_complete(counts['yes_pmd'], counts['no_pmd']),
                   'complete_checkstyle': is_complete(counts['yes_checkstyle'], counts['no_checkstyle']),
                   'complete_findbugs': is_complete(counts['yes_findbugs'], counts['no_findbugs']),
                   'yes_pmd': counts['yes_pmd'],
                   'no_pmd': counts['no_pmd'],
                   'yes_checkstyle': counts['yes_checkstyle'],
                   'no_checkstyle': counts['no_checkstyle'],
                   'yes_findbugs': counts['yes_findbugs'],
                   'no_findbugs': counts['no_findbugs'],
                   'yes_maven': counts['yes_maven'],
                   'no_maven': counts['no_maven'],
                   'use_maven': counts['yes_maven'] > 0 and counts['no_maven'] == 0}
            for kind in yearly_kinds:
                add_trend(pry, tmp, kind)
            years.append(pry)

    return pd.DataFrame(years), pd.DataFrame(complete)

def save_full_trends_after_maven_table(df):
    """Write one row of trend arrows per project to ``full_trends_table.tex``.

    ``df`` is expected to hold exactly one row per project with
    ``<metric>_slope``, ``<metric>_p_val`` and ``<metric>_f_val`` columns for
    the overlap2 metrics.  A trend is drawn as falling or rising only when
    p < 0.05 and F > 1; everything else is drawn as flat.
    """
    metrics = ['all_overlap2', 'all_overlap2_ratio', 'all_time_overlap2_ratio', 'default_time_overlap2_ratio', 'effective_time_overlap2_ratio']

    table = """
    """

    for project_name in sorted(df['project'].unique()):
        project_rows = df[df['project'] == project_name]
        n_rows = len(project_rows)
        cells = [project_name]

        for metric in metrics:
            if n_rows == 0:
                cells.append(' ')
                continue
            if n_rows != 1:
                # more than one row per project: nothing is added for this metric
                print('THIS IS NOT SUPPOSED TO HAPPEN NJIAAANAAAA!')
                continue

            slope = project_rows[metric + '_slope'].values[0]
            significant = (project_rows[metric + '_p_val'].values[0] < 0.05
                           and project_rows[metric + '_f_val'].values[0] > 1)

            if significant and slope < 0:
                cells.append('$\\searrow$')
            elif significant and slope > 0:
                cells.append('$\\nearrow$')
            else:
                cells.append('$\\rightarrow$')

        table += ' & '.join(cells) + '\\\\' + '\n'

    table += """
    """

    with open(TABLES_PATH + 'full_trends_table.tex', 'w') as f:
        f.write(table)

def save_trends_table(df, ttype='all_time_ratio'):
    """Write a project x year table of trend arrows to ``full_table_years_<ttype>.tex``.

    Only rows with ``use_maven == 1`` are considered.  Each cell shows a
    TikZ arrow: falling or rising when the yearly trend has p < 0.05 and
    F > 1, flat otherwise.  The arrow is red when PMD was not used in any
    row of that year, green when it was used in all rows, and uncoloured
    for mixed years.  For ``ttype == 'all_overlap_ratio'`` the ``overlap2``
    variant of the metric is used whenever Checkstyle or FindBugs usage was
    not complete for the year.
    """
    df = df[df['use_maven'] == 1]

    years = list(range(df['year'].min(), df['year'].max() + 1))

    table = """\\begin{tabular}{l|""" + '|'.join(['c'] * len(years)) + '}\n'
    table += 'Project & ' + ' & '.join([str(year)[2:] for year in years]) + '\\\\' + '\n' + '\\hline' + '\n'

    for project_name in sorted(df['project'].unique()):
        cells = [project_name]
        for year in years:
            cell = df[(df['year'] == year) & (df['project'] == project_name)]

            if len(cell) == 0:
                cells.append(' ')
                continue
            if len(cell) != 1:
                print('THIS IS NOT SUPPOSED TO HAPPEN NJIAAANAAAA!')
                print(cell)
                continue

            # overlap can only be used if the overlapping ASATs were used
            # for the complete year, otherwise fall back to overlap2
            metric = ttype
            if ttype == 'all_overlap_ratio':
                others_complete = ((cell['complete_findbugs'] == True).all()
                                   and (cell['complete_checkstyle'] == True).all())
                if not others_complete:
                    metric = ttype.replace('overlap', 'overlap2')

            yes_pmd = cell['yes_pmd'].values[0]
            no_pmd = cell['no_pmd'].values[0]
            if yes_pmd == 0 and no_pmd > 0:
                color = ', color=red!80'
            elif yes_pmd > 0 and no_pmd == 0:
                color = ', color=green!80'
            else:
                color = ''

            slope = cell[metric + '_slope'].values[0]
            significant = (cell[metric + '_p_val'].values[0] < 0.05
                           and cell[metric + '_f_val'].values[0] > 1)

            if significant and slope < 0:
                arrow = r'\tikz{\draw[-stealth,rotate=-45,semithick' + color + r'] (0,0)--(0.3,0);}'
            elif significant and slope > 0:
                arrow = r'\tikz{\draw[-stealth,rotate=+45,semithick' + color + r'] (0,0)--(0.3,0);}'
            else:
                arrow = r'\tikz{\draw[-stealth,semithick' + color + r'] (0,0)--(0.3,0);}'

            cells.append(arrow)

        table += ' & '.join(cells) + '\\\\' + '\n'

    table += """\\end{tabular}"""

    with open(TABLES_PATH + 'full_table_years_{}.tex'.format(ttype), 'w') as f:
        f.write(table)

def print_project_full_year(df):
    """Plot five warning metrics of one project with their linear trends.

    Rows with a null ``all_ratio`` are dropped.  One panel is drawn for each
    of ``all``, ``all_ratio``, ``all_time_ratio``, ``default_time_ratio``
    and ``effective_time_ratio``: the metric, its linear regression line,
    and on a second y-axis a count column in red.  The regression of
    ``effective_time_ratio`` only covers the rows from the first
    ``use_pmd == 1`` row onwards and is omitted when no such row exists.
    The coefficient of the ``all`` regression is printed and the figure is
    saved as ``all_years_example_<project>.pdf`` in ``FIGURES_PATH``.

    Args:
        df: Rows of a single project; the caller's frame is not modified.
    """
    # Copy the filtered rows: a helper column is added below and must neither
    # raise a SettingWithCopyWarning nor end up in the caller's frame.
    df = df[~df['all_ratio'].isnull()].copy()

    def fit_trend(dates, values):
        x_num = mdates.date2num(dates).reshape(-1, 1)
        reg = LinearRegression()
        reg.fit(x_num, values.reshape(-1, 1))
        return reg, reg.predict(x_num)

    dates = df['datetime'].values

    # column -> (dates, predicted values) of the regression line
    trends = {}
    for column in ('all', 'all_ratio', 'all_time_ratio', 'default_time_ratio'):
        reg, prediction = fit_trend(dates, df[column].values)
        trends[column] = (dates, prediction)
        if column == 'all':
            print(reg.coef_)

    # we need to discard every value before the first(!) row that uses PMD;
    # the row is looked up by position so an index label of 0 is handled too
    pmd_rows = df.index[df['use_pmd'] == 1]
    if len(pmd_rows) > 0:
        since_pmd = df.loc[pmd_rows[0]:]
        since_dates = since_pmd['datetime'].values
        _, prediction = fit_trend(since_dates, since_pmd['effective_time_ratio'].values)
        trends['effective_time_ratio'] = (since_dates, prediction)

    df['count_all'] = len(PMD_SINCE)

    # (title, metric column, count column on the secondary axis)
    # NOTE(review): the 'R+t' panel overlays 'count_default_time', the same
    # column as the 'R+d+t' panel - confirm that this is intended.
    panels = [
        ('Sum (S)', 'all', 'count_all'),
        ('Warning density (R)', 'all_ratio', 'count_all'),
        ('R+t', 'all_time_ratio', 'count_default_time'),
        ('R+d+t', 'default_time_ratio', 'count_default_time'),
        ('R+e+t', 'effective_time_ratio', 'count_effective_time'),
    ]

    f, axarr = plt.subplots(1, len(panels), figsize=(14, 4))

    for ax, (title, column, count_column) in zip(axarr, panels):
        ax.set_title(title)
        # Locators and formatters keep a reference to the axis they are
        # attached to, so every axis gets its own instances.
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
        ax.xaxis.set_major_locator(mdates.DayLocator())
        ax.plot(dates, df[column].values)
        if column in trends:
            trend_dates, trend_values = trends[column]
            ax.plot(trend_dates, trend_values)
        ax.set_yticks([])

        overlay = ax.twinx()
        overlay.plot(dates, df[count_column].values, color='r', alpha=0.7)
        overlay.set_yticks([])

    for ax in axarr:
        plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='center')

    plt.tight_layout()
    plt.savefig(FIGURES_PATH + 'all_years_example_{}.pdf'.format(df['project'].values[0]))


def save_reporting_table(reporting):
    """Write the group sizes and medians of the PMD / no-PMD samples to ``mwu_reporting.tex``.

    Args:
        reporting: Mapping from metric name to a tuple
            ``(yes_pmd_len, yes_pmd_median, no_pmd_len, no_pmd_median)``.

    Each metric yields a "No PMD" row and a "PMD" row; metrics are separated
    by ``\\hline``.  Metric names without a dedicated label are printed
    verbatim with their underscores escaped instead of failing on an unbound
    label or silently reusing the label of the previous metric.
    """
    display_names = {
        'all': 'Sum (\\emph{S})',
        'all_time_ratio': r'\emph{R}+\emph{t}',
        'default_time_ratio': r'\emph{R}+\emph{d}+\emph{t}',
        'effective_time_ratio': r'\emph{R}+\emph{e}+\emph{t}',
        'effective_time_overlap_ratio': r'\emph{R}+\emph{e}+\emph{t}+\emph{o}',
    }

    table = []
    for k, (yes_pmd_len, yes_pmd_median, no_pmd_len, no_pmd_median) in reporting.items():
        name = display_names.get(k)
        if name is None:
            name = str(k).replace('_', r'\_')

        table.append(r'\multirow{2}{*}{' + name + r'}' + '& No PMD & {} & {:.5f}'.format(no_pmd_len, no_pmd_median) + r'\\')
        table.append(r'& PMD & {} & {:.5f}'.format(yes_pmd_len, yes_pmd_median) + r'\\')
        table.append(r'\hline')

    # the trailing \hline is not written
    with open(TABLES_PATH + 'mwu_reporting.tex', 'w') as f:
        f.write('\n'.join(table[0:-1]))

def save_stats_table3(stats, ttype):
    """Write the statistical test results for one metric to ``<ttype>_results.tex``.

    Args:
        stats: Mapping with the keys ``shapiro_wilk_yes_pmd``,
            ``shapiro_wilk_no_pmd``, ``levene`` and ``mwu`` (each a
            ``(statistic, p-value)`` pair) and ``effect_size``.  Note that
            this parameter shadows the module-level ``scipy.stats`` import
            inside the function.
        ttype: Metric name, used for the file name.

    The effect size row is only written when the Mann-Whitney-U p-value is
    below 0.05.
    """
    # The "No PMD" row reports the no_pmd sample and the "PMD" row the
    # yes_pmd sample (the two results used to be printed under swapped labels).
    lines = [
        'Shapiro-Wilk & No PMD & {:.5} & {:.5}\\\\\n'.format(*stats['shapiro_wilk_no_pmd']),
        'Shapiro-Wilk & PMD & {:.5} & {:.5}\\\\\n'.format(*stats['shapiro_wilk_yes_pmd']),
        '\\hline\n',
        'Levene & Both & {:.5} & {:.5}\\\\\n'.format(*stats['levene']),
        '\\hline\n',
        'Mann-Whitney-U & Both & {:.5} & {:.5}\\\\\n'.format(*stats['mwu']),
    ]
    if stats['mwu'][1] < 0.05:
        lines.append('Effect size & Both & {:.5} & -\\\\\n'.format(stats['effect_size']))

    with open(TABLES_PATH + '{}_results.tex'.format(ttype), 'w') as f:
        f.write(''.join(lines))

def get_stats(df, ttype, use_f=False, use_coef=True):
    """Run the statistical tests between PMD years and non-PMD years, including all prerequisites.

    Only rows with ``use_maven == 1`` are used.  The PMD sample consists of
    the years in which every row used PMD (``no_pmd == 0`` and
    ``yes_pmd != 0``), the other sample of the years in which no row did.
    For ``effective`` metrics the non-PMD sample uses the corresponding
    ``default`` metric.

    Args:
        df: Yearly slopes as produced by ``yearly_slopes``.
        ttype: Metric name without the ``_slope`` / ``_coef`` suffix.
        use_f: Keep only years whose trend has p < 0.05 and F > 1.
        use_coef: Compare the ``_coef`` column instead of ``_slope``.

    Returns:
        Tuple of a dict with the Shapiro-Wilk, Levene and one-sided
        (``'less'``) Mann-Whitney-U results, effect size and confidence
        interval, and a tuple ``(n_pmd, median_pmd, n_no_pmd, median_no_pmd)``.
        The effect size is derived from the statistic of a two-sided
        Mann-Whitney-U test.
    """
    df = df[df['use_maven'] == 1]

    # years without PMD are compared on the default rules
    no_ttype = ttype.replace('effective', 'default')

    # if we have overlap rules active we need to discard partial uses because
    # otherwise the slope would be biased if, e.g., overlap was active at the
    # beginning of the year but not at the end
    if 'overlap' in ttype:
        for column in ('complete_checkstyle', 'complete_findbugs'):
            df = df[df[column]]

    with_pmd = (df['no_pmd'] == 0) & (df['yes_pmd'] != 0)
    without_pmd = (df['no_pmd'] != 0) & (df['yes_pmd'] == 0)
    if use_f:
        with_pmd = with_pmd & (df[ttype + '_p_val'] < 0.05) & (df[ttype + '_f_val'] > 1)
        without_pmd = without_pmd & (df[no_ttype + '_p_val'] < 0.05) & (df[no_ttype + '_f_val'] > 1)
    tmp1 = df[with_pmd]
    tmp2 = df[without_pmd]

    desc = '_coef' if use_coef else '_slope'

    # we need to drop NaN if we dont use f
    if not use_f:
        tmp1 = tmp1.dropna(subset=[ttype + desc])
        tmp2 = tmp2.dropna(subset=[no_ttype + desc])

    yes_pmd = tmp1[ttype + desc].values
    no_pmd = tmp2[no_ttype + desc].values

    stat, _ = stats.mannwhitneyu(yes_pmd, no_pmd, alternative='two-sided')
    eff = effsize(stat, len(yes_pmd), len(no_pmd))
    conf = confidence(yes_pmd, no_pmd)

    results = {
        'shapiro_wilk_yes_pmd': stats.shapiro(yes_pmd),
        'shapiro_wilk_no_pmd': stats.shapiro(no_pmd),
        'levene': stats.levene(yes_pmd, no_pmd),
        'mwu': stats.mannwhitneyu(yes_pmd, no_pmd, alternative='less'),
        'effect_size': eff,
        'confidence_interval_95': conf,
    }
    sample_summary = (len(yes_pmd), np.median(yes_pmd), len(no_pmd), np.median(no_pmd))

    return results, sample_summary

def save_config_changes_table():
    """Count PMD rule changes per project and write them to ``rule_changes.tex``.

    For every project in ``PROJECTS`` the file
    ``<project>_pmd_states6.csv`` is read from ``ASAT_DATA_PATH``.  The
    distinct ``effective_rules`` values are walked in order of first
    appearance; whenever the rule set differs from the previous one, the
    size of the symmetric difference is added to the project's rule change
    count and to the counter of each affected rule's group.

    NOTE(review): because only distinct values are walked, a configuration
    that is switched back to an earlier state is not counted again -
    confirm that this is intended.

    Returns:
        List with one dict per project containing ``project``, one counter
        per PMD rule group, ``rule_changes`` and ``all_changes`` (number of
        distinct revisions in the file).
    """
    cchanges = []
    for project_name in sorted(PROJECTS):
        pdf = pd.read_csv(ASAT_DATA_PATH + '{}_pmd_states6.csv'.format(project_name))
        # count per revision otherwise all poms are included everytime
        all_config_changes = len(pdf['revision'].unique())

        tmp = {'project': project_name}
        for rt in PMD_GROUP_MATCH.values():
            tmp[rt] = 0

        current_rules = set()  # state of current rules
        rule_changes = 0
        for rule_change in pdf['effective_rules'].unique():
            if rule_change and isinstance(rule_change, str):
                new_rules = set(rule_change.split(','))
            else:
                # NaN or empty string means no rules
                new_rules = set()

            if new_rules != current_rules:
                # symmetric difference: rules that are in exactly one of the two sets
                changed = current_rules ^ new_rules
                rule_changes += len(changed)
                for rule in changed:
                    tmp[PMD_GROUP_MATCH[rule]] += 1
                current_rules = new_rules

        tmp['rule_changes'] = rule_changes
        tmp['all_changes'] = all_config_changes
        cchanges.append(tmp)

    table = ''.join(
        p['project'] + ' & ' + str(p['rule_changes']) + ' & ' + str(p['all_changes']) + '\\\\' + '\n'
        for p in cchanges
    )

    with open(TABLES_PATH + 'rule_changes.tex', 'w') as f:
        f.write(table)

    return cchanges


def save_correlation_lloc_table(dfall):
    """Correlate the warning sum (``all``) with the code size (``code_klloc``).

    Kendall, Pearson and Spearman results are printed; Kendall's tau and
    Spearman's rho are written to ``correlation_lloc_warnings.tex``.
    Not restricted by effective code via maven usage.
    """
    klloc = dfall['code_klloc'].values
    warning_counts = dfall['all'].values

    k = stats.kendalltau(klloc, warning_counts)
    print(k)
    print(stats.pearsonr(klloc, warning_counts))
    s = stats.spearmanr(klloc, warning_counts)
    print(s)

    table = """\\begin{tabular}{l|c|c}
    Method & Value & P-Value\\\\
    \\hline
    """

    for label, result in (('Kendall\'s $\\tau$', k), ('Spearman\'s $\\rho$', s)):
        table += label + ' & {:.5} & {:.5}\\\\\n'.format(result[0], result[1])

    table += """\\end{tabular}"""

    with open(TABLES_PATH + 'correlation_lloc_warnings.tex', 'w') as f:
        f.write(table)

def save_slope_sums_table(years):
    """Write the mean yearly change of ``all_overlap_ratio`` per project to ``slope_sums.tex``.

    Only maven years in which Checkstyle and FindBugs usage is complete are
    used, which is why overlap can be used instead of overlap2.  Per project
    the coefficients of all years that used PMD throughout and have a trend
    with p < 0.05 and F > 1 are scaled by 365 and averaged; projects without
    such years, with a NaN mean or with a mean of 0 are left out.  The last
    row holds the mean over the listed projects.
    """
    years = years[years['use_maven'] == 1]
    for column in ('complete_checkstyle', 'complete_findbugs'):
        years = years[years[column]]

    lines = []
    sums = []
    for project in sorted(years['project'].unique()):
        selected = ((years['project'] == project)
                    & (years['yes_pmd'] > 0)
                    & (years['no_pmd'] == 0)
                    & (years['all_overlap_ratio_p_val'] < 0.05)
                    & (years['all_overlap_ratio_f_val'] > 1))
        coefs = years[selected]['all_overlap_ratio_coef'].values

        # projects that do not have full PMD years
        if len(coefs) == 0:
            continue

        mean_coef = np.mean(coefs)
        if np.isnan(mean_coef) or mean_coef == 0:
            continue

        yearly_change = np.mean([coef * 365 for coef in coefs])
        lines.append('{} & {:.5}\\\\\n'.format(project, yearly_change))
        sums.append(yearly_change)

    lines.append('\\hline\n')
    lines.append('Mean & {:.5}\\\\\n'.format(np.mean(sums)))

    with open(TABLES_PATH + 'slope_sums.tex', 'w') as f:
        f.write(''.join(lines))

def save_rule_changes_correlation_table(dfall, years):
    """Correlate the number of rule changes per year with the yearly warning density slope.

    For every project the file ``<project>_pmd_states6.csv`` is read from
    ``ASAT_DATA_PATH``.  The revisions of each year are walked in the order
    they appear in ``dfall``; whenever the merged effective rules of a
    revision differ from the previous state, the size of the symmetric
    difference is added to that year's count.  The counts are merged with
    ``years``, rows with ``no_pmd > 0`` and rows containing NaN are removed,
    and Kendall's tau and Spearman's rho between ``all_ratio_slope`` and
    ``rule_changes`` are written to ``correlation_rule_changes_warnings.tex``.

    Returns:
        The merged and filtered DataFrame the correlation was computed on.
    """
    cchanges = []
    for project_name in sorted(dfall['project'].unique()):
        pdf = pd.read_csv(ASAT_DATA_PATH + '{}_pmd_states6.csv'.format(project_name))
        project_rows = dfall[dfall['project'] == project_name]

        current_rules = set()  # rule state carried over between years
        for year in sorted(project_rows['year'].unique()):
            changes_in_year = 0
            for revision in project_rows[project_rows['year'] == year]['revision'].values:
                effective_rules = pdf[pdf['revision'] == revision]['effective_rules'].values
                # most revisions have no entry in the states file
                if len(effective_rules) == 0:
                    continue

                # merge the rules of all maven modules of this revision
                full_rules = set()
                for er in effective_rules:
                    if isinstance(er, str):
                        full_rules.update(er.split(','))

                if full_rules != current_rules:
                    # symmetric difference: rules that are in exactly one of the two sets
                    changes_in_year += len(current_rules ^ full_rules)
                    current_rules = full_rules

            cchanges.append({'project': project_name, 'year': year, 'rule_changes': changes_in_year})

    y2 = years.merge(pd.DataFrame(cchanges), on=['project', 'year'])

    # remove all years with rows that do not use PMD
    y2 = y2.drop(index=y2.index[y2['no_pmd'] > 0])

    # we may have NaN in the slope values
    # NOTE(review): this drops rows with a NaN in any column, not only in the
    # two correlated ones - confirm that this is intended.
    y2 = y2.dropna()

    slopes = y2['all_ratio_slope'].values
    rule_changes = y2['rule_changes'].values
    k = stats.kendalltau(slopes, rule_changes)
    s = stats.spearmanr(slopes, rule_changes)

    table = """\\begin{tabular}{l|c|c}
    Method & Value & P-Value\\\\
    \\hline
    """

    for label, result in (('Kendall\'s $\\tau$', k), ('Spearman\'s $\\rho$', s)):
        table += label + ' & {:.5} & {:.5}\\\\\n'.format(result[0], result[1])

    table += """\\end{tabular}"""

    with open(TABLES_PATH + 'correlation_rule_changes_warnings.tex', 'w') as f:
        f.write(table)

    return y2

def effsize(u, n1, n2):
    """Return the effect size |z| / sqrt(n1 + n2) for a Mann-Whitney-U statistic.

    ``u`` is standardised with the mean ``n1 * n2 / 2`` and the standard
    deviation ``sqrt(n1 * n2 * (n1 + n2 + 1) / 12)``; ``n1`` and ``n2`` are
    the two sample sizes.
    """
    pairs = n1 * n2
    total = n1 + n2
    u_mean = pairs / 2
    u_std = math.sqrt((pairs * (total + 1)) / 12)
    z = (u - u_mean) / u_std
    return abs(z / math.sqrt(total))

def _lifetime_trend(frame, column):
    """Fit ``column`` over time; return (slope, f_val, p_val, delta) or None if no trend can be tested."""
    x = mdates.date2num(frame['datetime'].values)
    y = frame[column].values

    # fewer than two rows or only identical values: nothing to fit or test
    if len(x) <= 1 or np.all(y == y[0], axis=0):
        return None

    reg = LinearRegression()
    reg.fit(x.reshape(-1, 1), y.reshape(-1, 1))
    y_pred = reg.predict(x.reshape(-1, 1))

    slope = (y_pred[-1] - y_pred[0]) / (x[-1] - x[0])
    f_val, p_val = f_regression(x.reshape(-1, 1), y)
    return (slope[0], f_val[0], p_val[0], y_pred[0] - y_pred[-1])


def _trend_arrow(slope, f_val, p_val):
    """Return the LaTeX arrow for a trend; it only rises or falls if p < 0.05 and F > 1."""
    if p_val < 0.05 and f_val > 1:
        if slope < 0:
            return '$\\searrow$'
        if slope > 0:
            return '$\\nearrow$'
    return '$\\rightarrow$'


def save_full_trends_table(dfall, yearfull):
    """Write the overall trend of ASAT warnings per project and year to ``full_year_trends.tex``.

    The trends incorporate the complete source code without the build
    system.  For each project in ``PROJECTS`` the table shows one arrow per
    year (trend of ``all_full_ratio`` within that year), one arrow for the
    warning sum (``all_full``) and one for the warning density
    (``all_full_ratio``) over all rows of the project, and the mean change
    of the density per year.  The last row shows the direction of the mean
    yearly slope over all projects and the mean of the per-project changes.

    Args:
        dfall: One row per project and revision with ``project``, ``year``,
            ``datetime``, ``all_full`` and ``all_full_ratio``.
        yearfull: One row per project with ``all_full_ratio_coef``,
            ``all_full_ratio_f_val`` and ``all_full_ratio_p_val``.

    A trend that cannot be determined (fewer than two rows or constant
    values) is drawn as flat, as is the mean of a year for which no slope
    could be determined at all.
    """
    undetermined = (np.nan, np.nan, np.nan, np.nan)

    all_slopes = []  # significant mean changes per year, one per project
    full_slope = {}  # coefficient / slope over all time per project
    full_sum = {}  # full trend over project lifetime with sum only
    full = {}  # full trend over project lifetime
    projects = {}  # per project per year
    years = {}  # determined slopes per year over all projects active that year

    for project in sorted(PROJECTS):
        project_rows = dfall[dfall['project'] == project]

        if project not in projects:
            projects[project] = {}

            # coefficient / slope = mean average change per year
            summary = yearfull[yearfull['project'] == project]
            mean_change_year = summary['all_full_ratio_coef'].values[0] * 365
            f_val = summary['all_full_ratio_f_val'].values[0]
            p_val = summary['all_full_ratio_p_val'].values[0]

            if f_val < 1 or p_val > 0.05:
                mean_change_year = 0
            full_slope[project] = mean_change_year

            if mean_change_year != 0:
                all_slopes.append(mean_change_year)

            lifetime = project_rows[~project_rows['all_full_ratio'].isnull()]

            trend = _lifetime_trend(lifetime, 'all_full_ratio')
            full[project] = undetermined if trend is None else trend

            trend = _lifetime_trend(lifetime, 'all_full')
            full_sum[project] = undetermined if trend is None else trend

        for year in project_rows['year'].unique():
            year_key = str(year)
            year_slopes = years.setdefault(year_key, [])

            rows = project_rows[project_rows['year'] == year]
            rows = rows[~rows['all_full_ratio'].isnull()]  # null if we do not have any source code

            trend = _lifetime_trend(rows, 'all_full_ratio')
            if trend is None:
                projects[project][year_key] = undetermined
            else:
                projects[project][year_key] = trend
                year_slopes.append(trend[0])

    table = r'\begin{tabular}{l' + '|c' * len(years) + '|c|c|r}\n'

    # header row
    year_vals = sorted(years)
    table += 'Project & ' + " & ".join(year_vals) + r' & \emph{S} & \emph{R} & \emph{R} p.a.' + r'\\' + '\n'
    table += r'\hline' + '\n'

    # per project
    for project in sorted(PROJECTS):
        row = [project]
        for year in year_vals:
            if year in projects[project]:
                slope, f_val, p_val, _ = projects[project][year]
                row.append(_trend_arrow(slope, f_val, p_val))
            else:
                row.append(' ')

        # full project lifetime sum
        slope, f_val, p_val, _ = full_sum[project]
        row.append(_trend_arrow(slope, f_val, p_val))

        # full project lifetime
        slope, f_val, p_val, _ = full[project]
        row.append(_trend_arrow(slope, f_val, p_val))

        # numeric warning density change per year
        if full_slope[project] != 0:
            row.append('{:.4f}'.format(full_slope[project]))
        else:
            row.append('-')

        table += ' & '.join(row) + r'\\' + '\n'

    table += r'\hline' + '\n'

    # mean over all projects
    row = ['mean']
    for year in year_vals:
        # a year without any determined slope has no mean and is drawn as flat
        slope_mean = statistics.mean(years[year]) if years[year] else np.nan
        if slope_mean < 0:
            row.append('$\\searrow$')
        elif slope_mean > 0:
            row.append('$\\nearrow$')
        else:
            row.append('$\\rightarrow$')
    table += ' & '.join(row) + ' & - & - & {:.4f}'.format(np.mean(all_slopes)) + r'\\' + '\n'
    table += r'\end{tabular}' + '\n'

    with open(TABLES_PATH + 'full_year_trends.tex', 'w') as f:
        f.write(table)

def _significant_yearly_change(frame, column):
    """Return the regression coefficient of ``column`` over time scaled by 365, or None unless p < 0.05 and F > 1."""
    x = mdates.date2num(frame['datetime'].values)
    y = frame[column].values

    # identical values cannot be tested
    if np.all(y == y[0], axis=0):
        return None

    f_val, p_val = f_regression(x.reshape(-1, 1), y)
    if not (f_val[0] > 1 and p_val[0] < 0.05):
        return None

    reg = LinearRegression()
    reg.fit(x.reshape(-1, 1), y.reshape(-1, 1))
    return reg.coef_[0][0] * 365


def save_group_table(dfall):
    """Write the changes per warning group and severity over the project lifetimes to ``full_trend_deltas_sum_only.tex``.

    For every project in ``PROJECTS`` and every severity / PMD rule group
    the column ``<group>_full_ratio`` yields three values: the mean change
    per year (only if the trend has p < 0.05 and F > 1), the delta between
    the first and the last value, and the last (remaining) value.  The table
    lists the mean of these values over all projects; the rule groups are
    sorted by their mean change per year.

    The values in the ``Sum`` row are taken from ``all_full_ratio``, i.e.
    from the sum of all warnings; especially for the slope this is different
    from the sum of the slopes of the warning types.
    """
    severities = ['minor', 'major', 'critical']
    groups = severities + [r.lower() for r in sorted(set(PMD_GROUP_MATCH.values()))]

    project_sum = []
    project_remaining = []
    project_slope = []
    slope_sum = {group: [] for group in groups}
    delta_sum = {group: [] for group in groups}
    remaining = {group: [] for group in groups}

    for project in PROJECTS:
        # this yields the trend per project over all of its years
        project_rows = dfall[dfall['project'] == project]
        project_rows = project_rows[~project_rows['all_full_ratio'].isnull()]

        # sum all warning density
        density = project_rows['all_full_ratio'].values
        project_sum.append(density[0] - density[-1])
        project_remaining.append(density[-1])

        yearly_change = _significant_yearly_change(project_rows, 'all_full_ratio')
        if yearly_change is not None:
            project_slope.append(yearly_change)

        # warning density per warning group and severity
        for group in groups:
            name = group + '_full_ratio'

            # Filter into a separate frame: the null rows of one group must
            # not remove rows from the groups handled after it.
            group_rows = project_rows[~project_rows[name].isnull()]
            if len(group_rows) == 0:
                continue

            yearly_change = _significant_yearly_change(group_rows, name)
            if yearly_change is not None:
                slope_sum[group].append(yearly_change)

            values = group_rows[name].values
            delta_sum[group].append(values[0] - values[-1])
            remaining[group].append(values[-1])

    table3 = r'\begin{tabular}{l|r|r|r}' + '\n'
    table3 += r'ASAT group / severity & Mean change per year & Delta & Remaining\\' + '\n'
    table3 += r'\hline' + '\n'

    # rows
    for severity in severities:
        table3 += '{} & {:.4f} & {:.4} & {:.4}'.format(severity, np.mean(slope_sum[severity]), np.mean(delta_sum[severity]), np.mean(remaining[severity])) + r'\\' + '\n'

    table3 += r'\hline' + '\n'

    sortable = {}
    for k, v in slope_sum.items():
        sortable[k] = np.mean(v)

    for key, value in sorted(sortable.items(), key=lambda kv: kv[1]):
        if key not in severities:
            table3 += '{} & {:.4f} & {:.4} & {:.4}'.format(key, value, np.mean(delta_sum[key]), np.mean(remaining[key])) + r'\\' + '\n'

    table3 += r'\hline' + '\n'
    table3 += 'Sum & {:.4f} & {:.4} & {:.4}'.format(np.mean(project_slope), np.mean(project_sum), np.mean(project_remaining)) + r'\\' + '\n'
    table3 += r'\end{tabular}'

    file_table3 = TABLES_PATH + 'full_trend_deltas_sum_only.tex'
    with open(file_table3, 'w') as f:
        f.write(table3)


def save_boxplot(years, use_f=True, use_coef=False):
    """Draw PMD vs. no-PMD boxplots of the yearly trend values and test them.

    For each handled kind a boxplot is written to FIGURES_PATH as PDF and
    PNG, and a one-sided Mann-Whitney U test (alternative='less') together
    with its effect size is printed.

    Args:
        years: DataFrame that provides 'complete_pmd', 'yes_pmd', 'no_pmd'
            and, per kind, the '<kind>_slope' / '<kind>_coef' columns
            (plus '<kind>_f_val' / '<kind>_p_val' when ``use_f`` is set).
        use_f: keep only rows whose regression has F > 1 and p < 0.05.
        use_coef: compare the '_coef' columns instead of '_slope'.
    """
    # the compared column suffix is the same for every kind
    suffix = '_coef' if use_coef else '_slope'

    # only the full-history kinds are plotted here
    for kind in ['all_full', 'all_full_ratio']:
        used_key = kind
        # an 'effective' kind is compared against its 'default' counterpart;
        # for any other kind the replace is a no-op
        unused_key = kind.replace('effective', 'default')

        frame = years.copy()
        if use_f:
            significant = (
                (frame[used_key + '_f_val'] > 1)
                & (frame[used_key + '_p_val'] < 0.05)
                & (frame[unused_key + '_f_val'] > 1)
                & (frame[unused_key + '_p_val'] < 0.05)
            )
            frame = frame[significant].copy()

        complete = frame['complete_pmd'] == True
        group_used = frame[complete & (frame['yes_pmd'] > 0)].copy()
        group_unused = frame[complete & (frame['no_pmd'] > 0)].copy()

        # helper columns: rows outside a group stay NaN and are ignored by boxplot
        frame.loc[group_used.index, 'pmd_used'] = group_used[used_key + suffix]
        frame.loc[group_unused.index, 'pmd_not_used'] = group_unused[unused_key + suffix]

        # fresh figure so the pandas boxplot does not draw into a previous one
        plt.figure()
        axes = frame.boxplot(column=['pmd_used', 'pmd_not_used'])
        axes.set_xticklabels(['PMD', 'No PMD'])
        plt.tight_layout()
        for pattern in ('boxplot_{}.pdf', 'boxplot_{}.png'):
            plt.savefig(FIGURES_PATH + pattern.format(kind))

        sample_used = group_used[used_key + suffix].values
        sample_unused = group_unused[unused_key + suffix].values
        sample_used = sample_used[~np.isnan(sample_used)]
        sample_unused = sample_unused[~np.isnan(sample_unused)]

        stat, p_val = stats.mannwhitneyu(sample_used, sample_unused, alternative='less')
        print('n1: {} median1: {}, n2: {} median2 {}'.format(
            len(sample_used), np.median(sample_used),
            len(sample_unused), np.median(sample_unused)))
        print('mwu: {}, {}'.format(stat, p_val))
        print('effsize: {}'.format(effsize(stat, len(sample_used), len(sample_unused))))


def save_dd_boxplot(dddf, years):
    """Compare the defect density of PMD and non-PMD project-years.

    For every (project, year) pair in ``years`` the first matching row of
    ``dddf`` supplies ``issues_created / code_klloc``. That value is stored
    in the 'dd' column and, depending on the group of the row, also in
    'pmd_used' or 'pmd_not_used'. A boxplot of both groups is written to
    FIGURES_PATH, the test results are written to
    TABLES_PATH + 'dd_results.tex', and summary statistics are printed.

    Args:
        dddf: DataFrame with 'project', 'year', 'issues_created' and
            'code_klloc' columns.
        years: DataFrame with 'project', 'year', 'complete_pmd', 'yes_pmd'
            and 'no_pmd' columns; it is not modified.

    Returns:
        A copy of ``years`` extended by the 'dd', 'pmd_used' and
        'pmd_not_used' columns.
    """
    tmp = years.copy()

    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    tmp['pmd_used'] = np.nan
    tmp['pmd_not_used'] = np.nan
    tmp['dd'] = np.nan

    # group membership does not change inside the loop, so compute it once
    complete = tmp['complete_pmd'] == True
    uses_pmd = complete & (tmp['yes_pmd'] > 0)
    lacks_pmd = complete & (tmp['no_pmd'] > 0)

    for project_name in tmp['project'].unique():
        in_project = tmp['project'] == project_name
        project_dd = dddf[dddf['project'] == project_name]

        for year in tmp[in_project]['year'].unique():
            dd_rows = project_dd[project_dd['year'] == year]

            # this may not select anything
            if len(dd_rows) == 0:
                continue

            # only the first matching row is used
            dd = (dd_rows['issues_created'] / dd_rows['code_klloc']).values[0]

            in_year = in_project & (tmp['year'] == year)
            tmp.loc[in_year, 'dd'] = dd
            tmp.loc[in_year & uses_pmd, 'pmd_used'] = dd
            tmp.loc[in_year & lacks_pmd, 'pmd_not_used'] = dd

    # fresh figure so the pandas boxplot does not draw into a previous one
    plt.figure()
    ax = tmp.boxplot(column=['pmd_used', 'pmd_not_used'])
    ax.set_xticklabels(['PMD', 'No PMD'])
    plt.tight_layout()
    plt.savefig(FIGURES_PATH + 'boxplot_dd.pdf')
    plt.savefig(FIGURES_PATH + 'boxplot_dd.png')

    yes_pmd = tmp[uses_pmd]['pmd_used'].values
    no_pmd = tmp[lacks_pmd]['pmd_not_used'].values

    # project-years without a defect density entry carry NaN and are dropped
    yes_pmd = yes_pmd[~np.isnan(yes_pmd)]
    no_pmd = no_pmd[~np.isnan(no_pmd)]

    # one-sided test: is the PMD sample stochastically smaller than the other?
    stat, p_val = stats.mannwhitneyu(yes_pmd, no_pmd, alternative='less')
    print('n1: {} median1: {}, n2: {} median2 {}'.format(len(yes_pmd), np.median(yes_pmd), len(no_pmd), np.median(no_pmd)))
    print('mwu: {}, {}'.format(stat, p_val))

    # LaTeX rows: normality per sample, variance homogeneity, then the test itself
    table = ''
    table += 'Shapiro-Wilk & No PMD & {:.5} & {:.5}\\\\\n'.format(*stats.shapiro(no_pmd))
    table += 'Shapiro-Wilk & PMD & {:.5} & {:.5}\\\\\n'.format(*stats.shapiro(yes_pmd))
    table += '\\hline\n'

    table += 'Levene & Both & {:.5} & {:.5}\\\\\n'.format(*stats.levene(yes_pmd, no_pmd))
    table += '\\hline\n'

    eff = effsize(stat, len(yes_pmd), len(no_pmd))

    table += 'Mann-Whitney-U & Both & {:.5} & {:.5}\\\\\n'.format(stat, p_val)
    table += 'Effect size & Both & {:.5} & -\\\\\n'.format(eff)

    print('confidence', confidence(yes_pmd, no_pmd))

    with open(TABLES_PATH + 'dd_results.tex', 'w') as f:
        f.write(table)

    return tmp
        
def confidence(yes_pmd, no_pmd, z=1.96):
    """Return a distribution-free confidence interval for the sample shift.

    All pairwise differences ``i - j`` (``i`` from ``yes_pmd``, ``j`` from
    ``no_pmd``) are sorted; the interval runs from the K-th smallest to the
    K-th largest difference, where K comes from the normal approximation
    ``n1*n2/2 - z * sqrt(n1*n2*(n1+n2+1)/12)``.

    Args:
        yes_pmd: sequence of numbers (first sample).
        no_pmd: sequence of numbers (second sample).
        z: standard-normal quantile for the desired level; the default 1.96
            corresponds to a two-sided 95% interval.

    Returns:
        Tuple ``(lower, upper)`` taken from the sorted pairwise differences.

    Raises:
        ValueError: if either sample is empty.
    """
    n1 = len(yes_pmd)
    n2 = len(no_pmd)
    if n1 == 0 or n2 == 0:
        raise ValueError('confidence() needs at least one observation in each sample')

    deltas = sorted(i - j for i in yes_pmd for j in no_pmd)
    K = (n1 * n2) / 2 - (z * math.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12))

    # For very small samples K rounds to zero or below; fall back to the
    # widest available interval (smallest and largest difference).
    k = max(1, int(round(K)))

    # K-th smallest is index k-1, K-th largest is index len-k: both bounds
    # sit the same number of positions in from their end of the list.
    CI = (deltas[k - 1], deltas[len(deltas) - k])
    return CI


In [None]:
# Load the aggregated data set (pickled DataFrame) and the defect density CSV.
# NOTE(review): read_pickle can execute code embedded in the file; only load trusted data.
dfall = pd.read_pickle('../data/aggregated_full.pickle')
ddf = pd.read_csv('../data/defect_density.csv')
# Sanity check: number of distinct projects and number of rows in dfall
# (printed as 'commits' -- presumably one row per commit; confirm against the aggregation step).
print('projects', dfall['project'].nunique())
print('commits', len(dfall))

In [None]:
# Compute the trend data consumed by the table and plot cells below.
# NOTE(review): `years` presumably holds one row per project and year, `full` the
# trends over each project's whole history -- confirm in yearly_slopes.
years, full = yearly_slopes(dfall)

In [None]:
# save MWU stats tables and reporting table
table = {}
reporting = {}
for kind in ['all', 'all_time_ratio', 'default_time_ratio', 'effective_time_ratio', 'effective_time_overlap_ratio']:
    # get_stats returns two values per kind; the first is saved right away,
    # the second is collected for the combined reporting table
    table[kind], reporting[kind] = get_stats(years, kind)
    
    save_stats_table3(table[kind], kind)

# one reporting table covering every kind collected above
save_reporting_table(reporting)

In [None]:
# correlation between lloc and warnings
save_correlation_lloc_table(dfall)

# trends table for the 'all_overlap_ratio' kind
save_trends_table(years, 'all_overlap_ratio')

# yearly trends without maven
save_full_trends_table(dfall, full)

# changes of warnings per severity and group
save_group_table(dfall)

# number of changes for build configuration and custom rules
save_config_changes_table()

# trends for the number of ASAT warnings over all development years of the project
# without overlapping rules and for warning sum, warning density, time correction, defaults, effective
save_full_trends_after_maven_table(full)

# sum of slopes after ASAT introduction
save_slope_sums_table(years)

# correlation between slope per year (mean warning change per year) and rule changes per year
save_rule_changes_correlation_table(dfall, years)

# boxplots for sum of warnings for pmd/no pmd samples
save_boxplot(years)

# and defect density (dd) for pmd/no pmd samples
save_dd_boxplot(ddf, years)

# example project: commons-lang
print_project_full_year(dfall[dfall['project'] == 'commons-lang'])