# Setup

In [1]:
# imports
import pandas as pd
import seaborn as sns

In [2]:
# constants
RELEASED_VERSIONS = ["2.0.0", "2.0.1",
                     "2.1.0", "2.1.1", "2.2.0",
                     "2.3.0", "2.3.1", "2.3.2", "2.3.3", "2.3.4", "2.3.5", "2.3.6", "2.3.7", "2.3.8", "2.3.9",
                     "3.0.0", "3.1.0", "3.1.1", "3.1.2", "3.1.3"]

INPUT_DIR = "../../data/input/"
OUTPUT_DIR = "../../data/output/"

# Load data

We take a look at the raw metrics data to know how to process it 

In [3]:
df = pd.read_csv(INPUT_DIR + "hive-metrics-v2/2.0.0.csv")
df.head()

Unnamed: 0,Kind,Name,File,AltAvgLineBlank,AltAvgLineCode,AltAvgLineComment,AltCountLineBlank,AltCountLineCode,AltCountLineComment,AvgCyclomatic,...,MaxInheritanceTree,MaxNesting,MinEssentialKnots,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
0,Class,Apache::Hadoop::Hive::AbortTxnRequest,hive\metastore\src\gen\thrift\gen-cpp\hive_met...,0.0,7.0,0.0,16.0,88.0,0.0,1.0,...,0.0,3.0,,27.0,,0.0,16.0,16.0,16.0,12.0
1,Public Function,Apache::Hadoop::Hive::AbortTxnRequest::AbortTx...,hive\metastore\src\gen\thrift\gen-cpp\hive_met...,,,,0.0,3.0,0.0,,...,,0.0,0.0,,,0.0,,,,
2,Public Function,Apache::Hadoop::Hive::AbortTxnRequest::AbortTx...,hive\metastore\src\gen\thrift\gen-cpp\hive_met...,,,,0.0,2.0,0.0,,...,,0.0,0.0,,,0.0,,,,
3,Public Function,Apache::Hadoop::Hive::AbortTxnRequest::__set_t...,hive\metastore\src\gen\thrift\gen-cpp\hive_met...,,,,0.0,3.0,0.0,,...,,0.0,0.0,,,0.0,,,,
4,Public Const Function,Apache::Hadoop::Hive::AbortTxnRequest::operator!=,hive\metastore\src\gen\thrift\gen-cpp\hive_met...,,,,0.0,3.0,0.0,,...,,0.0,0.0,,,0.0,,,,


We can see that we have metrics for not only files but also for packages, classes, and functions. Also, there are a lot more metrics than we want.
Therefore, we need to select precisely what we want from the dataset.

Below we can observe that classes' metrics don't have any value for files.

In [4]:
classes_metrics_cols = ["Name", "CountClassBase", "CountClassCoupled", "CountClassDerived", "MaxInheritanceTree", "PercentLackOfCohesion"]
df.loc[df["Kind"] == "File"][classes_metrics_cols].describe().loc["count"]

CountClassBase           0.0
CountClassCoupled        0.0
CountClassDerived        0.0
MaxInheritanceTree       0.0
PercentLackOfCohesion    0.0
Name: count, dtype: float64

Same for methods' metrics

In [5]:
methods_metrics_cols = ["Name", "CountInput", "CountClassCoupled", "CountClassDerived", "MaxInheritanceTree", "PercentLackOfCohesion"]
df.loc[df["Kind"] == "File"][methods_metrics_cols].describe().loc["count"]

CountInput               0.0
CountClassCoupled        0.0
CountClassDerived        0.0
MaxInheritanceTree       0.0
PercentLackOfCohesion    0.0
Name: count, dtype: float64

### Metrics selection

We therefore choose to only collect files' metrics as we are trying to predict bugs at files level.

In [6]:
metrics_cols = ["Name", "AvgCyclomatic", "AvgCyclomaticModified", "AvgCyclomaticStrict", "AvgEssential", "AvgLine", "AvgLineBlank", "AvgLineCode", 
    "AvgLineComment", "CountDeclClass", "CountDeclClassMethod", "CountDeclClassVariable", "CountDeclFunction", "CountDeclInstanceMethod",
    "CountDeclInstanceVariable", "CountDeclMethod", "CountDeclMethodDefault", "CountDeclMethodPrivate", "CountDeclMethodProtected",
    "CountDeclMethodPublic", "CountLine", "CountLineBlank", "CountLineCode", "CountLineCodeDecl", "CountLineCodeExe", "CountLineComment", 
    "CountSemicolon", "CountStmt", "CountStmtDecl", "CountStmtExe", "MaxCyclomatic", "MaxCyclomaticModified", "MaxCyclomaticStrict", 
    "RatioCommentToCode", "SumCyclomatic", "SumCyclomaticModified", "SumCyclomaticStrict", "SumEssential"]

data = df.loc[df["Kind"] == "File"][metrics_cols]
data.describe()

Unnamed: 0,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountDeclClass,CountDeclClassMethod,...,CountStmtDecl,CountStmtExe,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
count,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4953.0,...,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0,4992.0
mean,1.755008,1.63141,1.896835,1.028646,12.165264,0.654647,10.082933,1.047877,2.701723,1.792449,...,65.978566,112.604768,5.886819,5.330128,7.014824,0.877326,43.287861,40.936498,46.558093,25.011218
std,1.878251,1.484832,2.07734,0.728791,13.920479,2.024822,11.074217,2.309787,35.769781,18.347462,...,485.99738,1252.434005,8.452574,7.227726,10.92831,2.68148,539.98628,510.093969,585.422186,324.414761
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,4.0,0.0,4.0,0.0,1.0,0.0,...,12.0,7.0,1.0,1.0,1.0,0.21,4.0,4.0,4.0,3.0
50%,1.0,1.0,1.0,1.0,8.0,0.0,7.0,0.0,1.0,0.0,...,26.0,25.0,3.0,3.0,3.0,0.41,10.0,10.0,11.0,7.0
75%,2.0,2.0,2.0,1.0,15.0,1.0,13.0,1.0,2.0,1.0,...,60.0,78.0,7.0,7.0,9.0,0.88,28.0,27.0,30.0,16.0
max,35.0,17.0,35.0,13.0,354.0,66.0,260.0,38.0,2409.0,902.0,...,29959.0,76450.0,145.0,96.0,152.0,144.0,35520.0,33530.0,38848.0,21642.0


In [7]:
data.insert(1, "Version", "2.0.0")
data.head()

Unnamed: 0,Name,Version,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,...,CountStmtDecl,CountStmtExe,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
7639,hive\accumulo-handler\src\java\org\apache\hado...,2.0.0,1.0,1.0,1.0,1.0,9.0,0.0,6.0,1.0,...,56.0,61.0,4.0,4.0,4.0,0.35,29.0,29.0,30.0,17.0
7640,hive\accumulo-handler\src\java\org\apache\hado...,2.0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,5.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0
7641,hive\accumulo-handler\src\java\org\apache\hado...,2.0.0,2.0,2.0,2.0,1.0,8.0,0.0,7.0,0.0,...,62.0,93.0,10.0,10.0,10.0,0.14,44.0,44.0,46.0,38.0
7642,hive\accumulo-handler\src\java\org\apache\hado...,2.0.0,2.0,2.0,2.0,1.0,14.0,1.0,11.0,1.0,...,100.0,121.0,9.0,9.0,9.0,0.16,57.0,57.0,58.0,26.0
7643,hive\accumulo-handler\src\java\org\apache\hado...,2.0.0,2.0,2.0,2.0,1.0,29.0,1.0,13.0,14.0,...,68.0,60.0,5.0,5.0,6.0,0.92,20.0,20.0,21.0,10.0


We have a proper dataset to work on. 

Now, let us replicate this process on all versions of files.

In [8]:
for version in RELEASED_VERSIONS:
    df = pd.read_csv(INPUT_DIR + f"hive-metrics-{version}.csv")
    df = df.loc[df["Kind"] == "File"][metrics_cols]
    df.insert(1, "Version", version)
    df.to_csv(OUTPUT_DIR + f"hive-files-metrics-{version}.csv", index=False)
    

FileNotFoundError: [Errno 2] No such file or directory: '../../data/input/hive-metrics-2.0.0.csv'

We can now concatenate all the generated files above, to have the unlabeled dataset.

In [None]:
versions_data = []

for version in RELEASED_VERSIONS:
    df = pd.read_csv(OUTPUT_DIR + f"hive-files-metrics-{version}.csv")
    versions_data.append(df)

unlabeled_data = pd.concat(versions_data, ignore_index=True)
unlabeled_data = unlabeled_data.rename(columns={"Name": "File"})
unlabeled_data.to_csv(OUTPUT_DIR + f"hive-unlabeled-data.csv", index=False)
unlabeled_data

### Adding Filetypes 

Let:
- all files with the *.h* extention be labeled as **headers** files,
- all files with the *.cpp* extention be labeled as **cpp** files, and
- the remaining files are considered **java** files but won't be assigned a dummy variable to avoid a colinear relationships between the 3 variables.

In [None]:
unlabeled_data["IsHeader"] = unlabeled_data["File"].str.endswith(".h")
unlabeled_data["IsCpp"]  = unlabeled_data["File"].str.endswith(".cpp")

### Data labelling

We load labels

In [None]:
labels = pd.read_csv(OUTPUT_DIR + "hive-bugs-files.csv")
labels.head()

In [None]:
labels.shape

Let us add `Bug` column and set its value to 1 for all file/version in labels dataset

In [None]:
labels["Bug"] = 1

Merge unlabeled data to labels

In [None]:
dataset = pd.merge(left=unlabeled_data, right=labels, on=["File", "Version"], how="left")
dataset

Remove duplicated data

In [None]:
dataset.drop_duplicates(keep="first", inplace=True)
dataset.shape

Remove unnecessary columns

In [None]:
dataset.drop(columns=["IssueId", "CommitId", "File", "Version"], inplace=True)
dataset["Bug"] = dataset["Bug"].fillna(0).astype(int)


In [None]:
dataset.to_csv(OUTPUT_DIR + "dataset.csv", index=False)
dataset

### Reload data

In [None]:
dataset = pd.read_csv(OUTPUT_DIR + "dataset.csv")
dataset

In [None]:
dataset.describe()

In [None]:
sns.countplot(dataset, x="Bug")

In [None]:
dataset.to_csv(OUTPUT_DIR + "dataset.csv", index=False)

# Preprocess data

### Correlation Matrix of Raw Data

In [None]:
sns.heatmap(dataset.corr())

The heatmap above shows a highs correlations between most of the metrics. We will need to generate uncorrelated features highlighting attributes of interest from these metrics.

### Feature Generation

#### Generating Relative Metrics

In [None]:
dataset['AvgCyclomaticModifiedRatio'] = dataset['AvgCyclomaticModified'] / dataset['AvgCyclomatic']
dataset['AvgCyclomaticStrictRatio'] = dataset['AvgCyclomaticStrict'] / dataset['AvgCyclomatic']

dataset['AvgLineBlankRatio'] = dataset['AvgLineBlank'] / dataset['AvgLine']
dataset['AvgLineCodeRatio'] = dataset['AvgLineCode'] / dataset['AvgLine']
dataset['AvgLineCommentRatio'] = dataset['AvgLineComment'] / dataset['AvgLine']

dataset['CountDeclClassMethodRatio'] = dataset['CountDeclClassMethod'] / dataset['CountDeclClass']
dataset['CountDeclClassVariableRatio'] = dataset['CountDeclClassVariable'] / dataset['CountDeclClass']

dataset['CountDeclMethodDefaultRatio'] = dataset['CountDeclMethodDefault'] / dataset['CountDeclMethod']
dataset['CountDeclMethodPrivateRatio'] = dataset['CountDeclMethodPrivate'] / dataset['CountDeclMethod']
dataset['CountDeclMethodProtectedRatio'] = dataset['CountDeclMethodProtected'] / dataset['CountDeclMethod']
dataset['CountDeclMethodPublicRatio'] = dataset['CountDeclMethodPublic'] / dataset['CountDeclMethod']

dataset['CountLineBlankRatio'] = dataset['CountLineBlank'] / dataset['CountLine']
dataset['CountLineCodeRatio'] = dataset['CountLineCode'] / dataset['CountLine']
dataset['CountLineCodeDeclRatio'] = dataset['CountLineCodeDecl'] / dataset['CountLine']
dataset['CountLineCodeExeRatio'] = dataset['CountLineCodeExe'] / dataset['CountLine']
dataset['CountLineCommentRatio'] = dataset['CountLineComment'] / dataset['CountLine']
dataset['CountSemicolonRatio'] = dataset['CountSemicolon'] / dataset['CountLine']

dataset['CountStmtDeclRatio'] = dataset['CountStmtDecl'] / dataset['CountStmt']
dataset['CountStmtExeRatio'] = dataset['CountStmtExe'] / dataset['CountStmt']

dataset['MaxCyclomaticModifiedRatio'] = dataset['MaxCyclomaticModified'] / dataset['MaxCyclomatic']
dataset['MaxCyclomaticStrictRatio'] = dataset['MaxCyclomaticStrict'] / dataset['MaxCyclomatic']

dataset['SumCyclomaticModifiedRatio'] = dataset['SumCyclomaticModified'] / dataset['SumCyclomatic']
dataset['SumCyclomaticStrictRatio'] = dataset['SumCyclomaticStrict'] / dataset['SumCyclomatic']

preprocessed_data = dataset[

    ['AvgCyclomatic', 'AvgCyclomaticModifiedRatio', 'AvgCyclomaticStrictRatio', 'AvgEssential', 
     'AvgLine', 'AvgLineBlankRatio', 'AvgLineCodeRatio', 'AvgLineCommentRatio', 'CountDeclClass', 
     'CountDeclClassMethodRatio', 'CountDeclClassVariable', 'CountDeclFunction', 'CountDeclInstanceMethod', 
     'CountDeclInstanceVariable','CountDeclMethod', 'CountDeclMethodDefaultRatio', 'CountDeclMethodPrivateRatio', 
     'CountDeclMethodProtectedRatio', 'CountDeclMethodPublicRatio', 'CountLine', 'CountLineBlankRatio',
     'CountLineCodeRatio', 'CountLineCodeDeclRatio', 'CountLineCodeExeRatio', 'CountLineCommentRatio',
     'CountSemicolonRatio', 'CountStmt', 'CountStmtDeclRatio', 'CountStmtExeRatio', 'MaxCyclomatic',
     'MaxCyclomaticModifiedRatio', 'MaxCyclomaticStrictRatio', 'RatioCommentToCode', 'SumCyclomatic',
     'SumCyclomaticModifiedRatio', 'SumCyclomaticStrictRatio', 'SumEssential', 'IsHeader', 'IsCpp', 'Bug']]

### Correlation Matrix

In [None]:
sns.heatmap(preprocessed_data.corr())

### Retained Uncorrelated Dataset

In [None]:
preprocessed_data = dataset[
    ['AvgCyclomatic', 'AvgCyclomaticModifiedRatio', 'AvgCyclomaticStrictRatio', 'AvgEssential', 
     'AvgLineBlankRatio', 'AvgLineCommentRatio', 'CountDeclClass', 'CountDeclMethodDefaultRatio', 
     'CountDeclMethodPrivateRatio', 'CountDeclMethodProtectedRatio', 'CountDeclMethodPublicRatio', 
     'CountLineBlankRatio', 'MaxCyclomaticStrictRatio', 'RatioCommentToCode', 'IsHeader', 'IsCpp', 'Bug']]

In [None]:
preprocessed_data.corr().to_csv("../../../corr.csv")