In [1]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [2]:
loc_df = pd.read_csv('./metrics/loc.csv', index_col=0)
loc_df.index = [item + '_loc' for item in loc_df.index]

noc_df = pd.read_csv('./metrics/noc.csv', index_col=0)
noc_df.index = [item + '_noc' for item in noc_df.index]

log_df = pd.read_csv('./metrics/logStatementsQty.csv', index_col=0)
log_df.index = [item + '_log' for item in log_df.index]

try_df = pd.read_csv('./metrics/tryCatchQty.csv', index_col=0)
try_df.index = [item + '_try' for item in try_df.index] 


fanin_df = pd.read_csv('./metrics/fanin.csv', index_col=0)
fanin_df.index = [item + '_fanin' for item in fanin_df.index]

fanout_df = pd.read_csv('./metrics/fanout.csv', index_col=0)
fanout_df.index = [item + '_fanout' for item in fanout_df.index]

dit_df = pd.read_csv('./metrics/dit.csv', index_col=0)
dit_df.index = [item + '_dit' for item in dit_df.index]

lcc_df = pd.read_csv('./metrics/lcc.csv', index_col=0)
lcc_df.index = [item + '_lcc' for item in lcc_df.index]

In [3]:
## normalize all dataframes to 0-1
def normalize_df(df):
    return (df - df.min()) / (df.max() - df.min())

log_df = normalize_df(log_df)
try_df = normalize_df(try_df)  
loc_df = normalize_df(loc_df)
noc_df = normalize_df(noc_df)
fanin_df = normalize_df(fanin_df)
fanout_df = normalize_df(fanout_df)
dit_df = normalize_df(dit_df)

In [4]:
full_df = pd.concat([loc_df, noc_df, fanin_df, fanout_df, dit_df], axis=0)

In [5]:
full_df.dropna(thresh=300, inplace=True)
full_df.dropna(axis=1, inplace=True)

In [6]:
projects_dict = {}

projects = ['neo4j', 'tomcat', 'jitsi',
'eclipse_jdt_core', 'spring-security', 'Arduino',
'eclipse_pde_ui', 'hibernate-orm', 'jabref', 'jenkins',
'pmd', 'rt_equinox_framework']

metrics = ['loc', 'noc', 'fanin', 'fanout', 'dit']

for project in projects:
    project_dfs = []
    for metric in metrics:
        project_dfs.append(full_df.T[[project + '_' + metric]])
    
    projects_dict[project] = pd.concat(project_dfs, axis=1)
    projects_dict[project].columns = metrics

In [7]:
try_df.dropna(thresh=300, inplace=True)
try_df.dropna(axis=1, inplace=True)

try_df.drop(['pentaho-kettle_try'], axis=0, inplace=True)

In [8]:
log_df.dropna(thresh=300, inplace=True)
log_df.dropna(axis=1, inplace=True)

log_df.drop(['pentaho-kettle_log'], axis=0, inplace=True)

In [9]:
def grangers_causation_matrix(data, variables, maxlag, test='ssr_ftest'):    
    """Check Granger Causality of all possible combinations of the Time series.
    The rows are the response variable, columns are predictors. The values in the table 
    are the P-Values. P-Values lesser than the significance level (0.05), implies 
    the Null Hypothesis that the coefficients of the corresponding past values is 
    zero, that is, the X does not cause Y can be rejected.

    data      : pandas dataframe containing the time series variables
    variables : list containing names of the time series variables.
    """
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            # if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

In [18]:
corr_matrices = {}

for proj in projects:

    corr_df = pd.concat([projects_dict[proj], try_df.loc[proj + '_try'], log_df.loc[proj + '_log']], axis=1)

    ## drop all constant columns and keep track of them
    corr_df = corr_df.loc[:, (corr_df != corr_df.iloc[0]).any()]

    corr_matrix = grangers_causation_matrix(corr_df, variables = corr_df.columns, maxlag=3)


    corr_matrix = corr_matrix[[proj + '_try_x',proj + '_log_x']].drop([proj + '_try_y',proj + '_log_y'])
    
    corr_matrix.columns = ['try', 'log']
    corr_matrix.index = ['loc', 'noc', 'fanin', 'fanout', 'dit']
    
    corr_matrices[proj] = corr_matrix
    
    




In [39]:
## plot the correlation matrices for each project, colouring the cell blue if the p-value is less than 0.05, and red otherwise. Leave the text black.
for proj in projects:
    fig, ax = plt.subplots(figsize=(10,10))
    ax.matshow(corr_matrices[proj], cmap=plt.cm.Blues)
    ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
    ax.set_yticklabels([''] + list(corr_matrices[proj].index))
    ax.set_title(proj)
    for (i, j), z in np.ndenumerate(corr_matrices[proj]):
        ax.text(j, i, '{:0.2f}'.format(z), ha='center', va='center', color='black')
    plt.savefig('./plots/' + proj + '_corr.png')
    plt.close()

  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set_xticklabels([''] + list(corr_matrices[proj].columns))
  ax.set_yticklabels([''] + list(corr_matrices[proj].index))
  ax.set

In [66]:
neo4j_log_corr

Unnamed: 0,complexity_x,logqty_x
complexity_y,1.0,0.0204
logqty_y,0.0002,1.0


In [67]:
neo4j_try_corr

Unnamed: 0,complexity_x,tryqty_x
complexity_y,1.0,0.0005
tryqty_y,0.0,1.0


In [21]:
def sumdf(df, project):
    
    df = df[df != -1].sum()

    df = pd.DataFrame(df).T

    df.index = [project]
    df.columns = list(np.arange(1, len(df.columns)+1))
    
    return df

def iqrdf(df, project):
    q1 = df[df!=-1].quantile(0.25)
    q3 = df[df!=-1].quantile(0.75)
    iqr = q3 - q1

    df = pd.DataFrame(iqr).T

    df.index = [project]
    df.columns = list(np.arange(1, len(df.columns)+1))
    
    return df

def meandf(df, project):
    
    df = df[df != -1].mean()

    df = pd.DataFrame(df).T

    df.index = [project]
    df.columns = list(np.arange(1, len(df.columns)+1))
    
    return df

def mediandf(df, project):
    
    df = df[df != -1].median()

    df = pd.DataFrame(df).T

    df.index = [project]
    df.columns = list(np.arange(1, len(df.columns)+1))
    
    return df

def maxdf(df, project):
    
    df = df[df != -1].max()

    df = pd.DataFrame(df).T

    df.index = [project]
    df.columns = list(np.arange(1, len(df.columns)+1))
    
    return df

In [8]:
for metric in ['loc', 'noc', 'tryCatchQty', 'logStatementsQty']:

    all = []
    for project in os.listdir('./data/'):

        try:
            df = pd.read_csv(f'./data/{project}/{metric}.csv', header=None)
            df.set_index(0, inplace=True)
        except:
            continue

        df = df.query('~index.str.lower().str.contains("test")')

        df = sumdf(df, project)
        
        all.append(df)
        
    new = pd.concat(all, axis=0)
    new.to_csv(f'./metrics/{metric}.csv', index=True)

In [112]:
for metric in ['dit', 'fanin', 'fanout', 'lcc']:
 
    all = []
    for project in os.listdir('./data/'): 
        
        try:
            df = pd.read_csv(f'./data/{project}/{metric}.csv', header=None)
        except:
            continue   
        df.set_index(0, inplace=True)

        df = df.query('~index.str.lower().str.contains("test")')
        df = iqrdf(df, project)
        
        all.append(df)
        
    new = pd.concat(all, axis=0)
    new.to_csv(f'./metrics/{metric}.csv', index=True)

In [22]:
# for metric in ['dit', 'fanin', 'fanout', 'lcc']:
for metric in ['dit']:
 
    all = []
    for project in os.listdir('./data/'): 
        
        try:
            df = pd.read_csv(f'./data/{project}/{metric}.csv', header=None)
        except:
            continue   
        df.set_index(0, inplace=True)

        df = df.query('~index.str.lower().str.contains("test")')
        df = meandf(df, project)
        
        all.append(df)
        
    new = pd.concat(all, axis=0)
    new.to_csv(f'./metrics/{metric}_mean.csv', index=True)

In [67]:
for metric in ['dit', 'fanin', 'fanout']:
 
    all = []
    for project in os.listdir('./data/'): 
        
        try:
            df = pd.read_csv(f'./data/{project}/{metric}.csv', header=None)
        except:
            continue   
        df.set_index(0, inplace=True)

        df = df.query('~index.str.lower().str.contains("test")')
        df = mediandf(df, project)
        
        all.append(df)
        
    new = pd.concat(all, axis=0)
    new.to_csv(f'./metrics/{metric}_median.csv', index=True)

In [72]:
for metric in ['dit', 'fanin', 'fanout']:
 
    all = []
    for project in os.listdir('./data/'): 
        
        try:
            df = pd.read_csv(f'./data/{project}/{metric}.csv', header=None)
        except:
            continue   
        df.set_index(0, inplace=True)

        df = df.query('~index.str.lower().str.contains("test")')
        df = maxdf(df, project)
        
        all.append(df)
        
    new = pd.concat(all, axis=0)
    new.to_csv(f'./metrics/{metric}_max.csv', index=True)