# Setup

In [33]:
# imports
import pandas as pd

In [34]:
# constants
# Versions whose metrics CSVs are read and written by the cells below,
# grouped here by release line.
RELEASED_VERSIONS = [
    # 2.0.x
    "2.0.0", "2.0.1",
    # 2.1.x and 2.2.x
    "2.1.0", "2.1.1", "2.2.0",
    # 2.3.x
    "2.3.0", "2.3.1", "2.3.2", "2.3.3", "2.3.4",
    "2.3.5", "2.3.6", "2.3.7", "2.3.8", "2.3.9",
    # 3.x
    "3.0.0", "3.1.0", "3.1.1", "3.1.2", "3.1.3",
]

# Relative directory prefixes; both keep a trailing slash because file names
# are appended with plain string concatenation.
INPUT_DIR = "../../data/input/"
OUTPUT_DIR = "../../data/output/"

# Load data

We first take a look at the raw metrics data to decide how to process it.

In [35]:
# Load the raw metrics export of a single version (2.2.0) to inspect its layout.
sample_csv = INPUT_DIR + "hive-metrics-2.2.0.csv"
df = pd.read_csv(sample_csv)
df.head()

Unnamed: 0,Kind,Name,AltAvgLineBlank,AltAvgLineCode,AltAvgLineComment,AltCountLineBlank,AltCountLineCode,AltCountLineComment,AvgCyclomatic,AvgCyclomaticModified,...,MaxInheritanceTree,MaxNesting,MinEssentialKnots,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
0,File,ACLConfigurationParser.java,,,,,,,2.0,2.0,...,,3.0,,,,0.16,29.0,29.0,32.0,22.0
1,File,AMReporter.java,,,,,,,1.0,1.0,...,,3.0,,,,0.13,65.0,65.0,68.0,36.0
2,File,ASTBuilder.java,,,,,,,5.0,1.0,...,,2.0,,,,0.09,90.0,32.0,90.0,22.0
3,File,ASTConverter.java,,,,,,,3.0,3.0,...,,5.0,,,,0.15,144.0,144.0,157.0,44.0
4,File,ASTErrorNode.java,,,,,,,1.0,1.0,...,,0.0,,,,0.95,5.0,5.0,5.0,5.0


We can see that the dataset contains metrics not only for files but also for packages, classes, and functions. It also contains many more metrics than we need.
Therefore, we need to select precisely what we want from the dataset.

Below, we can see that class-level metrics have no values for rows of kind `File`.

In [16]:
# Columns holding class-level metrics (plus the entity name).
classes_metrics_cols = [
    "Name",
    "CountClassBase",
    "CountClassCoupled",
    "CountClassDerived",
    "MaxInheritanceTree",
    "PercentLackOfCohesion",
]

# Count the non-null values of each of those columns among File rows.
is_file = df["Kind"] == "File"
df.loc[is_file, classes_metrics_cols].describe().loc["count"]

CountClassBase           0.0
CountClassCoupled        0.0
CountClassDerived        0.0
MaxInheritanceTree       0.0
PercentLackOfCohesion    0.0
Name: count, dtype: float64

The same holds for method-level metrics.

In [18]:
# Columns inspected for the method-level check (plus the entity name).
# NOTE(review): apart from "CountInput", these columns repeat the class-level
# list checked in the previous cell -- confirm whether other method-level
# metrics were intended here.
methods_metrics_cols = [
    "Name",
    "CountInput",
    "CountClassCoupled",
    "CountClassDerived",
    "MaxInheritanceTree",
    "PercentLackOfCohesion",
]

# Count the non-null values of each of those columns among File rows.
is_file = df["Kind"] == "File"
df.loc[is_file, methods_metrics_cols].describe().loc["count"]

CountInput               0.0
CountClassCoupled        0.0
CountClassDerived        0.0
MaxInheritanceTree       0.0
PercentLackOfCohesion    0.0
Name: count, dtype: float64

### Metrics selection

We therefore choose to collect only file-level metrics, since we are trying to predict bugs at the file level.

In [36]:
# Columns kept for every file: the file name followed by the selected metrics.
metrics_cols = [
    "Name",
    # per-file averages
    "AvgCyclomatic", "AvgCyclomaticModified", "AvgCyclomaticStrict", "AvgEssential",
    "AvgLine", "AvgLineBlank", "AvgLineCode", "AvgLineComment",
    # declaration counts
    "CountDeclClass", "CountDeclClassMethod", "CountDeclClassVariable",
    "CountDeclFunction", "CountDeclInstanceMethod", "CountDeclInstanceVariable",
    "CountDeclMethod", "CountDeclMethodDefault", "CountDeclMethodPrivate",
    "CountDeclMethodProtected", "CountDeclMethodPublic",
    # line counts
    "CountLine", "CountLineBlank", "CountLineCode", "CountLineCodeDecl",
    "CountLineCodeExe", "CountLineComment",
    # statement counts
    "CountSemicolon", "CountStmt", "CountStmtDecl", "CountStmtExe",
    # maxima, ratio and sums
    "MaxCyclomatic", "MaxCyclomaticModified", "MaxCyclomaticStrict",
    "RatioCommentToCode",
    "SumCyclomatic", "SumCyclomaticModified", "SumCyclomaticStrict", "SumEssential",
]

# Keep only rows of kind "File", restricted to the selected columns.
is_file = df["Kind"] == "File"
data = df.loc[is_file, metrics_cols]
data.describe()

Unnamed: 0,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountDeclClass,CountDeclClassMethod,...,CountStmtDecl,CountStmtExe,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
count,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4953.0,...,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0
mean,1.755008,1.63141,1.896835,1.028646,12.165264,0.654647,10.082933,1.047877,2.701723,1.792449,...,65.978566,112.604768,5.886819,5.330128,7.014824,0.877326,43.287861,40.936498,46.558093,25.011218
std,1.878251,1.484832,2.07734,0.728791,13.920479,2.024822,11.074217,2.309787,35.769781,18.347462,...,485.99738,1252.434005,8.452574,7.227726,10.92831,2.68148,539.98628,510.093969,585.422186,324.414761
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,4.0,0.0,4.0,0.0,1.0,0.0,...,12.0,7.0,1.0,1.0,1.0,0.21,4.0,4.0,4.0,3.0
50%,1.0,1.0,1.0,1.0,8.0,0.0,7.0,0.0,1.0,0.0,...,26.0,25.0,3.0,3.0,3.0,0.41,10.0,10.0,11.0,7.0
75%,2.0,2.0,2.0,1.0,15.0,1.0,13.0,1.0,2.0,1.0,...,60.0,78.0,7.0,7.0,9.0,0.88,28.0,27.0,30.0,16.0
max,35.0,17.0,35.0,13.0,354.0,66.0,260.0,38.0,2409.0,902.0,...,29959.0,76450.0,145.0,96.0,152.0,144.0,35520.0,33530.0,38848.0,21642.0


In [37]:
# Tag every row with the version it was measured on, right after "Name".
# DataFrame.insert raises ValueError if the column already exists, so the
# guard keeps this cell safe to re-run without restarting the kernel.
if "Version" not in data.columns:
    data.insert(1, "Version", "2.2.0")
data.head()

Unnamed: 0,Name,Version,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,...,CountStmtDecl,CountStmtExe,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
0,ACLConfigurationParser.java,2.2.0,2.0,2.0,3.0,2.0,10.0,0.0,10.0,0.0,...,37.0,58.0,11.0,11.0,13.0,0.16,29.0,29.0,32.0,22.0
1,AMReporter.java,2.2.0,1.0,1.0,2.0,1.0,11.0,0.0,10.0,0.0,...,135.0,155.0,7.0,7.0,9.0,0.13,65.0,65.0,68.0,36.0
2,ASTBuilder.java,2.2.0,5.0,1.0,5.0,1.0,16.0,0.0,14.0,0.0,...,60.0,120.0,64.0,9.0,64.0,0.09,90.0,32.0,90.0,22.0
3,ASTConverter.java,2.2.0,3.0,3.0,4.0,1.0,18.0,1.0,15.0,1.0,...,249.0,345.0,21.0,21.0,23.0,0.15,144.0,144.0,157.0,44.0
4,ASTErrorNode.java,2.2.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,...,13.0,5.0,1.0,1.0,1.0,0.95,5.0,5.0,5.0,5.0


We have a proper dataset to work on. 

Now, let us repeat this process for every released version.

In [None]:
# Repeat the single-version extraction for every released version: keep only
# File rows and the selected metric columns, tag them with the version, and
# write one CSV per version to OUTPUT_DIR.
for version in RELEASED_VERSIONS:
    # Distinct names are used so the exploratory `df` loaded earlier in the
    # notebook is not overwritten by the last version processed here.
    version_raw = pd.read_csv(INPUT_DIR + f"hive-metrics-{version}.csv")
    # Single .loc call selects rows and columns at once; .copy() makes the
    # result an independent frame before a column is inserted into it.
    version_files = version_raw.loc[version_raw["Kind"] == "File", metrics_cols].copy()
    version_files.insert(1, "Version", version)
    version_files.to_csv(OUTPUT_DIR + f"hive-files-metrics-{version}.csv", index=False)
    

We can now concatenate all the files generated above to obtain the unlabeled dataset.

In [None]:
# Read back the per-version files written to OUTPUT_DIR, in release order.
versions_data = [
    pd.read_csv(OUTPUT_DIR + f"hive-files-metrics-{version}.csv")
    for version in RELEASED_VERSIONS
]

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own 0-based index; the CSV written below is unaffected
# because the index is not saved.
unlabeled_data = pd.concat(versions_data, ignore_index=True)
unlabeled_data.to_csv(OUTPUT_DIR + "hive-unlabeled-data.csv", index=False)