# Matching Hive bugs with files

Author: Henri A.

## Setup

In [1]:
# imports
import pandas as pd

## Data load

In [2]:
# Read the Hive bugs CSV from the input folder, then preview the first rows
# (5 is the default number of rows shown by head()).
bugs_commits = pd.read_csv(filepath_or_buffer="../../data/input/hive-bugs.csv")
bugs_commits.head(n=5)

Unnamed: 0,Summary,Issue key,Issue id,Issue Type,Status,Project key,Project name,Project type,Project lead,Project description,...,Comment.80,Comment.81,Comment.82,Comment.83,Comment.84,Comment.85,Comment.86,Comment.87,Comment.88,Comment.89
0,Parse Exception : character '@' not supported ...,HIVE-4413,12644338,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
1,Hive on tez: memory manager for grace hash join,HIVE-10233,12818760,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
2,ORC concatenation of old files can fail while ...,HIVE-11031,12838341,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
3,Avoid recursive listing to check if a director...,HIVE-22054,13247273,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
4,Unbalanced calls to openTransaction/commitTran...,HIVE-16839,13077858,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,


In [6]:
# Keep only the issue key and the "Affects Version/s" columns, and rename the
# key to "IssueId" so it matches the column name used for the join with
# issues_files.
affected_version_columns = [
    "Affects Version/s",
    "Affects Version/s.1",
    "Affects Version/s.2",
    "Affects Version/s.3",
    "Affects Version/s.4",
    "Affects Version/s.5",
]
bugs_commits_ = (
    bugs_commits
    .loc[:, ["Issue key", *affected_version_columns]]
    .rename(columns={"Issue key": "IssueId"})
)
bugs_commits_

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5
0,HIVE-4413,0.10.0,0.14.0,1.2.0,1.2.1,2.0.0,
1,HIVE-10233,2.0.0,llap,,,,
2,HIVE-11031,0.13.0,0.14.0,1.0.0,1.1.0,1.2.0,2.0.0
3,HIVE-22054,0.13.0,1.2.0,2.1.0,2.3.5,3.1.1,
4,HIVE-16839,0.13.1,1.1.0,2.3.4,3.0.0,,
...,...,...,...,...,...,...,...
712,HIVE-26396,3.1.3,,,,,
713,HIVE-26447,3.1.3,4.0.0,,,,
714,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,
715,HIVE-26197,3.1.3,4.0.0,4.0.0-alpha-1,,,


In [4]:
# Count rows whose issue key already appeared earlier in the frame; a
# non-zero value would mean the same issue is listed on several rows.
n_duplicated_keys = bugs_commits["Issue key"].duplicated().sum()
print(f"There are {n_duplicated_keys} duplicated issue keys.")

There are 0 duplicated issue keys.


In [5]:
# Read the issue-to-file table from the output folder (columns shown in the
# preview: CommitId, IssueId, Filename), then preview the first rows.
issues_files = pd.read_csv(filepath_or_buffer="../../data/output/hive-issues-files.csv")
issues_files.head(n=5)

Unnamed: 0,CommitId,IssueId,Filename
0,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
1,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
2,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
3,8741b620ba020dfbe472b3822674d74c779f1215,HIVE-26583,iceberg/iceberg-catalog/src/test/java/org/apac...
4,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,iceberg/iceberg-handler/src/main/java/org/apac...


Let us join the bug IDs to their associated files. A left join is used, so bugs without any matching file are kept for now.

In [11]:
# Left join on IssueId: every bug row is kept, and bugs with no match in
# issues_files end up with missing CommitId / Filename values.
bugs_files = bugs_commits_.merge(issues_files, how="left", on="IssueId")
bugs_files

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5,CommitId,Filename
0,HIVE-4413,0.10.0,0.14.0,1.2.0,1.2.1,2.0.0,,,
1,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,common/src/java/org/apache/hadoop/hive/conf/Hi...
2,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/exec/tez...
3,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/optimize...
4,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/parse/Te...
...,...,...,...,...,...,...,...,...,...
2023,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,itests/hive-unit/src/test/java/org/apache/hado...
2024,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,ql/src/java/org/apache/hadoop/hive/ql/exec/DDL...
2025,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,ql/src/java/org/apache/hadoop/hive/ql/txn/comp...
2026,HIVE-26197,3.1.3,4.0.0,4.0.0-alpha-1,,,,,


We check that the joined table still contains every bug identified in the input.

In [18]:
# Compare the number of bug rows before the join with the number of distinct
# issue ids after it (dropna=False counts a missing id as a value, exactly as
# len(unique()) would).
n_expected_bugs = len(bugs_commits_)
n_joined_bugs = bugs_files["IssueId"].nunique(dropna=False)
print(f"{n_expected_bugs} expected, {n_joined_bugs} bugs got.")

717 expected, 717 bugs got.


Now we drop the rows that have no associated filename.

In [21]:
# Keep only the rows of the joined table whose Filename is present.
has_filename = bugs_files["Filename"].notna()
bugs_files_ = bugs_files.loc[has_filename]
bugs_files_

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5,CommitId,Filename
1,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,common/src/java/org/apache/hadoop/hive/conf/Hi...
2,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/exec/tez...
3,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/optimize...
4,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/parse/Te...
5,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,ql/src/java/org/apache/hadoop/hive/ql/plan/Abs...
...,...,...,...,...,...,...,...,...,...
2021,HIVE-26396,3.1.3,,,,,,3ae189e6a47559ab98f19ecf845138be0e6b8f5d,ql/src/java/org/apache/hadoop/hive/ql/udf/gene...
2022,HIVE-26447,3.1.3,4.0.0,,,,,4b1f01f5b97b0028047b63d1922db335dbaf5d8d,ql/src/java/org/apache/hadoop/hive/ql/exec/vec...
2023,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,itests/hive-unit/src/test/java/org/apache/hado...
2024,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,ql/src/java/org/apache/hadoop/hive/ql/exec/DDL...


In [25]:
# Number of distinct issue ids that still have at least one file row.
n_bugs_with_files = bugs_files_["IssueId"].nunique(dropna=False)
print(f"Only {n_bugs_with_files} bugs have files associated.")

Only 354 bugs have files associated.


Here is the expected result at step 2.a

In [26]:
# Show only the bug id / filename pairs.
bugs_files_.loc[:, ["IssueId", "Filename"]]

Unnamed: 0,IssueId,Filename
1,HIVE-10233,common/src/java/org/apache/hadoop/hive/conf/Hi...
2,HIVE-10233,ql/src/java/org/apache/hadoop/hive/ql/exec/tez...
3,HIVE-10233,ql/src/java/org/apache/hadoop/hive/ql/optimize...
4,HIVE-10233,ql/src/java/org/apache/hadoop/hive/ql/parse/Te...
5,HIVE-10233,ql/src/java/org/apache/hadoop/hive/ql/plan/Abs...
...,...,...
2021,HIVE-26396,ql/src/java/org/apache/hadoop/hive/ql/udf/gene...
2022,HIVE-26447,ql/src/java/org/apache/hadoop/hive/ql/exec/vec...
2023,HIVE-26374,itests/hive-unit/src/test/java/org/apache/hado...
2024,HIVE-26374,ql/src/java/org/apache/hadoop/hive/ql/exec/DDL...


We export it to the output folder as a CSV file.

In [27]:
# Export the bug id / filename pairs to the output folder.
# index=False: the row labels left over from the merge are non-contiguous once
# rows without a filename are dropped and carry no meaning, so they are not
# written; otherwise they would appear as an unnamed first column in the CSV.
bugs_files_[["IssueId", "Filename"]].to_csv(
    "../../data/output/hive-bugs-files.csv",
    index=False,
)