In [2]:
import pandas as pd
import numpy as np

# Load the pairwise relation overview and the per-variable metadata.
# Both files are tab-separated.
relations = pd.read_csv('input/relation_overview.csv', sep='\t')
variables = pd.read_csv('input/variables_overview.csv', sep='\t')

# Name of the column that holds the pair key; the later cells read this column
# when they collect pairs into sets.
variable_column = 'Variable'

# Build the pair key by plain concatenation of the two variable names.
# Write through `variable_column` rather than a second hard-coded literal so
# the column created here is always the one the later lookups read.
relations[variable_column] = relations['Variable A'] + relations['Variable B']

In [3]:
# Preview the first five rows of the relations table.
relations.head(n=5)

Unnamed: 0,A1,A2,B1,B2,Variable A,Variable B,Group,Group (base),Method,Combination,ID A,ID B,Variable
0,B,1.0,C,3.0,B1 - Build tools,C3 - # Files changed,3.1 (pd),3,boxplots,BC,B1,C3,B1 - Build toolsC3 - # Files changed
1,B,1.0,C,4.0,B1 - Build tools,C4 - Commit file bugginess,32,32,boxplots,BC,B1,C4,B1 - Build toolsC4 - Commit file bugginess
2,B,1.0,C,5.0,B1 - Build tools,C5 - Commit total line bugginess,32,32,boxplots,BC,B1,C5,B1 - Build toolsC5 - Commit total line bugginess
3,B,1.0,C,6.0,B1 - Build tools,C6 - Commit file line bugginess,32,32,boxplots,BC,B1,C6,B1 - Build toolsC6 - Commit file line bugginess
4,B,1.0,F,4.0,B1 - Build tools,F4 - # Bug discussants,32,32,boxplots,BF,B1,F4,B1 - Build toolsF4 - # Bug discussants


In [4]:
# Group codes in 'Group (base)' that this notebook counts as data sparsity.
# NOTE(review): the meaning of the individual codes is not documented here.
SPARSITY_GROUPS = ['31', '32', 'E']


def flagged_ids(flag_column):
    """Return the 'ID' values of the variables whose `flag_column` equals True."""
    # Explicit elementwise comparison: entries that are missing or not True
    # are simply not selected.
    return variables.loc[variables[flag_column] == True, 'ID']  # noqa: E712


def pairs_involving(ids):
    """Return the set of pair keys where either side of the pair is in `ids`."""
    touches = relations['ID A'].isin(ids) | relations['ID B'].isin(ids)
    return set(relations.loc[touches, variable_column])


# Variable IDs per limitation. These get their own names so that the
# `project_exclusive` / `incomparable` / `project_dependent` names below only
# ever refer to sets of pair keys.
project_exclusive_ids = flagged_ids('Project exclusive')
incomparable_ids = flagged_ids('Incomparable')
project_dependent_ids = flagged_ids('Project dependent')

# Pairs whose base group is one of the sparsity codes.
in_sparsity_group = relations['Group (base)'].isin(SPARSITY_GROUPS)
sparsity = set(relations.loc[in_sparsity_group, variable_column])
print(f'Data sparsity: {len(sparsity)}')

# Pairs in which at least one variable carries the respective flag.
project_exclusive = pairs_involving(project_exclusive_ids)
print(f'Project exclusive: {len(project_exclusive)}')

incomparable = pairs_involving(incomparable_ids)
print(f'Incomparable: {len(incomparable)}')

project_dependent = pairs_involving(project_dependent_ids)
print(f'Project dependent: {len(project_dependent)}')

Data sparsity: 2490
Project exclusive: 1608
Incomparable: 1676
Project dependent: 410


In [5]:
# Number of pairs that are both data-sparse and project-exclusive.
len(sparsity.intersection(project_exclusive))

1126

In [6]:
# Pairwise overlaps between the four limitation sets.
overlap_sparsity_exclusive = sparsity & project_exclusive
overlap_sparsity_incomparable = sparsity & incomparable
overlap_sparsity_dependent = sparsity & project_dependent

# Former, misspelled name of `overlap_sparsity_incomparable`; kept as an alias
# in case other cells still reference it.
overlat_parcity_incomparable = overlap_sparsity_incomparable

overlap_exclusive_incomparable = project_exclusive & incomparable
overlap_exclusive_dependent = project_exclusive & project_dependent
overlap_incomparable_dependent = incomparable & project_dependent

# Pairs affected by all four limitations at once (computed, not printed here).
overlap_all = sparsity & project_exclusive & incomparable & project_dependent

# Report the size of every pairwise overlap, one line each.
for description, overlap in (
    ("sparsity exclusive", overlap_sparsity_exclusive),
    ("sparsity incomparable", overlap_sparsity_incomparable),
    ("sparsity dependent", overlap_sparsity_dependent),
    ("exclusive incomparable", overlap_exclusive_incomparable),
    ("exclusive dependent", overlap_exclusive_dependent),
    ("incomparable dependent", overlap_incomparable_dependent),
):
    print(f"{description}: {len(overlap)}")


sparsity exclusive: 1126
sparsity incomparable: 1676
sparsity dependent: 236
exclusive incomparable: 546
exclusive dependent: 110
incomparable dependent: 172


In [7]:
sparsity_label = 'Data Sparsity'
incomparable_label = 'Incomparability'
project_exclusive_label = 'Project exclusivity'
project_dependent_label = 'Project dependence'
# Former, misspelled name of `sparsity_label`; kept as an alias in case other
# cells still reference it.
sparsity_lable = sparsity_label

# Single source of truth for the table: the order here fixes both the row and
# the column order, and pairs each label with its set of pair keys.
limitations = [
    (sparsity_label, sparsity),
    (incomparable_label, incomparable),
    (project_exclusive_label, project_exclusive),
    (project_dependent_label, project_dependent),
]
limitation_labels = [label for label, _ in limitations]

# Lower-triangular overlap matrix, built column by column. The diagonal holds
# each set's own size (a set intersected with itself), cells below it hold the
# pairwise overlap, and cells above it are filled with '~', which is emitted
# unescaped (escape=False) and so renders as a blank cell in LaTeX.
tbl = {
    col_label: [
        '~' if row_pos < col_pos else len(col_pairs & row_pairs)
        for row_pos, (_, row_pairs) in enumerate(limitations)
    ]
    for col_pos, (col_label, col_pairs) in enumerate(limitations)
}

overlap_df = pd.DataFrame(tbl, index=limitation_labels)

# One right-aligned column for the row labels, a rule, then one right-aligned
# column per limitation.
latex_table = overlap_df.to_latex(
    column_format='r|' + 'r' * len(overlap_df.columns),
    label='tab:data-limitations',
    caption='Number of pairs for which limitations were observed, and the overlap between these limitations.',
    escape=False,
)

print(latex_table)

\begin{table}
\caption{Number of pairs for which limitations were observed, and the overlap between these limitations.}
\label{tab:data-limitations}
\begin{tabular}{r|rrrr}
\toprule
 & Data Sparsity & Incomparability & Project exclusivity & Project dependence \\
\midrule
Data Sparsity & 2490 & ~ & ~ & ~ \\
Incomparability & 1676 & 1676 & ~ & ~ \\
Project exclusivity & 1126 & 546 & 1608 & ~ \\
Project dependence & 236 & 172 & 110 & 410 \\
\bottomrule
\end{tabular}
\end{table}



