# Matching Hive bugs with files

Author: Groupe 4, MGL869

## Setup

In [1]:
# imports
import pandas as pd

In [2]:
# constants
RELEASED_VERSIONS = ["2.0.0", "2.0.1",
                     "2.1.0", "2.1.1", "2.2.0",
                     "2.3.0", "2.3.1", "2.3.2", "2.3.3", "2.3.4", "2.3.5", "2.3.6", "2.3.7", "2.3.8", "2.3.9",
                     "3.0.0", "3.1.0", "3.1.1", "3.1.2", "3.1.3"]

## Data load

In [3]:
bugs_commits = pd.read_csv("../../data/input/hive-bugs.csv")
bugs_commits.head()

Unnamed: 0,Summary,Issue key,Issue id,Issue Type,Status,Project key,Project name,Project type,Project lead,Project description,...,Comment.80,Comment.81,Comment.82,Comment.83,Comment.84,Comment.85,Comment.86,Comment.87,Comment.88,Comment.89
0,Parse Exception : character '@' not supported ...,HIVE-4413,12644338,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
1,Hive on tez: memory manager for grace hash join,HIVE-10233,12818760,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
2,ORC concatenation of old files can fail while ...,HIVE-11031,12838341,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
3,Avoid recursive listing to check if a director...,HIVE-22054,13247273,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,
4,Unbalanced calls to openTransaction/commitTran...,HIVE-16839,13077858,Bug,Resolved,HIVE,Hive,software,ashutoshc,Hive is a data warehouse infrastructure built ...,...,,,,,,,,,,


In [4]:
bugs_commits_ = bugs_commits[["Issue key", "Affects Version/s", "Affects Version/s.1", "Affects Version/s.2", "Affects Version/s.3", "Affects Version/s.4", "Affects Version/s.5"]]
bugs_commits_ = bugs_commits_.rename(columns={"Issue key": "IssueId"})
bugs_commits_

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5
0,HIVE-4413,0.10.0,0.14.0,1.2.0,1.2.1,2.0.0,
1,HIVE-10233,2.0.0,llap,,,,
2,HIVE-11031,0.13.0,0.14.0,1.0.0,1.1.0,1.2.0,2.0.0
3,HIVE-22054,0.13.0,1.2.0,2.1.0,2.3.5,3.1.1,
4,HIVE-16839,0.13.1,1.1.0,2.3.4,3.0.0,,
...,...,...,...,...,...,...,...
712,HIVE-26396,3.1.3,,,,,
713,HIVE-26447,3.1.3,4.0.0,,,,
714,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,
715,HIVE-26197,3.1.3,4.0.0,4.0.0-alpha-1,,,


In [5]:
print(f"There are {sum(bugs_commits['Issue key'].duplicated())} duplicated issue keys.")

There are 0 duplicated issue keys.


### Impacted files by commits

In [6]:
issues_files = pd.read_csv("../../data/output/hive-issues-files.csv")
issues_files.head()

Unnamed: 0,CommitId,IssueId,Filename
0,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
1,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
2,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
3,8741b620ba020dfbe472b3822674d74c779f1215,HIVE-26583,iceberg/iceberg-catalog/src/test/java/org/apac...
4,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,iceberg/iceberg-handler/src/main/java/org/apac...


Let us remove all files located in test paths

In [7]:
issues_files = issues_files[issues_files["Filename"].str.contains("test", case=False) == False]
issues_files

Unnamed: 0,CommitId,IssueId,Filename
0,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
1,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
2,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,standalone-metastore/metastore-server/src/main...
4,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,iceberg/iceberg-handler/src/main/java/org/apac...
5,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,iceberg/iceberg-handler/src/main/java/org/apac...
...,...,...,...
110183,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,serde/src/java/org/apache/hadoop/hive/serde/si...
110184,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,serde/src/java/org/apache/hadoop/hive/serde/th...
110186,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,serde/src/java/org/apache/hadoop/hive/serde/th...
110187,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,serde/src/java/org/apache/hadoop/hive/serde/th...


Let us refactor filepath to get filenames

In [8]:
import os

issues_files["File"] = issues_files["Filename"].apply(os.path.basename)
issues_files.drop(columns=["Filename"], inplace=True)
issues_files

Unnamed: 0,CommitId,IssueId,File
0,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,AcidMetricService.java
1,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,CompactionMetricData.java
2,1789489c67e5131f3a444cf7b9ac039f276ab6c4,HIVE-26471,MetricsConstants.java
4,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,HiveIcebergMetaHook.java
5,d12db58d5351b297519bf431087d48b01b44b73d,HIVE-26596,HiveIcebergStorageHandler.java
...,...,...,...
110183,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,MetadataTypedSerDeField.java
110184,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,TCTLSeparatedProtocol.java
110186,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,ThriftSerDe.java
110187,245075d5b91864e169feefef37f73b22904ccf59,HIVE-54,ThriftSerDeField.java


## Match

Let us join bugs ids to associated files

In [9]:
bugs_files = pd.merge(left=bugs_commits_, right=issues_files, on="IssueId", how="left")
bugs_files

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5,CommitId,File
0,HIVE-4413,0.10.0,0.14.0,1.2.0,1.2.1,2.0.0,,,
1,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HiveConf.java
2,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HashTableLoader.java
3,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,MemoryDecider.java
4,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,TezCompiler.java
...,...,...,...,...,...,...,...,...,...
1690,HIVE-26447,3.1.3,4.0.0,,,,,4b1f01f5b97b0028047b63d1922db335dbaf5d8d,VectorUDFMapIndexStringScalar.java
1691,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,DDLPlanUtils.java
1692,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,CompactionQueryBuilder.java
1693,HIVE-26197,3.1.3,4.0.0,4.0.0-alpha-1,,,,,


We check if bugs files contains the number of bugs identified

In [10]:
print(f"{len(bugs_commits_)} expected, {len(bugs_files['IssueId'].unique())} bugs got.")

717 expected, 717 bugs got.


Now we drop those lines or instances where no filename is associated

In [11]:
bugs_files_ = bugs_files.dropna(subset=["File"])
bugs_files_

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5,CommitId,File
1,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HiveConf.java
2,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HashTableLoader.java
3,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,MemoryDecider.java
4,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,TezCompiler.java
5,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,AbstractOperatorDesc.java
...,...,...,...,...,...,...,...,...,...
1688,HIVE-26036,3.1.2,4.0.0,,,,,aa09f429edb632ad8bfcce3515458f17449b11cc,ObjectStore.java
1689,HIVE-26396,3.1.3,,,,,,3ae189e6a47559ab98f19ecf845138be0e6b8f5d,GenericUDFTrunc.java
1690,HIVE-26447,3.1.3,4.0.0,,,,,4b1f01f5b97b0028047b63d1922db335dbaf5d8d,VectorUDFMapIndexStringScalar.java
1691,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,DDLPlanUtils.java


In [12]:
bugs_files_.sort_values(by="IssueId", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bugs_files_.sort_values(by="IssueId", inplace=True)


Let us check for duplicated (CommitId, Filename)

In [13]:
sum(bugs_files_[["CommitId", "File"]].duplicated())

48

We remove them

In [14]:

bugs_files_ = bugs_files_.drop_duplicates(subset=["CommitId", "File"])
bugs_files_

Unnamed: 0,IssueId,Affects Version/s,Affects Version/s.1,Affects Version/s.2,Affects Version/s.3,Affects Version/s.4,Affects Version/s.5,CommitId,File
1,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HiveConf.java
2,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,HashTableLoader.java
3,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,MemoryDecider.java
4,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,TezCompiler.java
5,HIVE-10233,2.0.0,llap,,,,,749b286867f314813de501efe9dca9d3c92fd219,AbstractOperatorDesc.java
...,...,...,...,...,...,...,...,...,...
1256,HIVE-26184,2.3.8,3.1.3,,,,,c4f6eb2f91478152c89070a7455df9a1b8980c75,GenericUDAFMkCollectionEvaluator.java
1691,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,DDLPlanUtils.java
1692,HIVE-26374,3.1.3,4.0.0-alpha-1,,,,,e3751ab545370f9b252d0b4a07bc315037541a95,CompactionQueryBuilder.java
1689,HIVE-26396,3.1.3,,,,,,3ae189e6a47559ab98f19ecf845138be0e6b8f5d,GenericUDFTrunc.java


In [15]:
print(f"Only {len(bugs_files_['IssueId'].unique())} bugs have files associated.")

Only 341 bugs have files associated.


From here, we choose to create a dataframe based on the one above, that presents on each line one File and one Version affected by a bug

In [16]:
labels = bugs_files_.melt(id_vars=["IssueId", "CommitId", "File"], var_name="Affected", value_name="Version")
labels

Unnamed: 0,IssueId,CommitId,File,Affected,Version
0,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,HiveConf.java,Affects Version/s,2.0.0
1,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,HashTableLoader.java,Affects Version/s,2.0.0
2,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,MemoryDecider.java,Affects Version/s,2.0.0
3,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,TezCompiler.java,Affects Version/s,2.0.0
4,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,AbstractOperatorDesc.java,Affects Version/s,2.0.0
...,...,...,...,...,...
7621,HIVE-26184,c4f6eb2f91478152c89070a7455df9a1b8980c75,GenericUDAFMkCollectionEvaluator.java,Affects Version/s.5,
7622,HIVE-26374,e3751ab545370f9b252d0b4a07bc315037541a95,DDLPlanUtils.java,Affects Version/s.5,
7623,HIVE-26374,e3751ab545370f9b252d0b4a07bc315037541a95,CompactionQueryBuilder.java,Affects Version/s.5,
7624,HIVE-26396,3ae189e6a47559ab98f19ecf845138be0e6b8f5d,GenericUDFTrunc.java,Affects Version/s.5,


We remove the 'Affected' column and all version other than the released ones

In [17]:
labels.drop(columns=["Affected"], inplace=True)
labels.dropna(subset=["Version"], inplace=True)
labels = labels[labels["Version"].str.contains("|".join(RELEASED_VERSIONS))]
labels

Unnamed: 0,IssueId,CommitId,File,Version
0,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,HiveConf.java,2.0.0
1,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,HashTableLoader.java,2.0.0
2,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,MemoryDecider.java,2.0.0
3,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,TezCompiler.java,2.0.0
4,HIVE-10233,749b286867f314813de501efe9dca9d3c92fd219,AbstractOperatorDesc.java,2.0.0
...,...,...,...,...
6364,HIVE-11031,3f8b0ef87dfb374038c7170dc8f94c52974872ca,OrcFileKeyWrapper.java,2.0.0
6365,HIVE-11031,3f8b0ef87dfb374038c7170dc8f94c52974872ca,OrcFileStripeMergeRecordReader.java,2.0.0
6366,HIVE-11035,9a511eb97270a3a0d8bf3504ff3e884d5871628f,ColumnProjectionUtils.java,2.0.0
7169,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.3.2


A little check for bug [HIVE-21009 on Jira](https://issues.apache.org/jira/browse/HIVE-21009?jql=project%20%3D%20HIVE%20AND%20issuetype%20%3D%20Bug%20AND%20status%20%3D%20Resolved%20AND%20affectedVersion%20in%20(2.0.0%2C%202.0.1%2C%202.1.0%2C%202.1.1%2C%202.2.0%2C%202.3.0%2C%202.3.1%2C%202.3.2%2C%202.3.3%2C%202.3.4%2C%202.3.5%2C%202.3.6%2C%202.3.7%2C%202.3.8%2C%202.4.0%2C%203.0.0%2C%203.1.0%2C%203.1.1%2C%203.1.2%2C%203.1.3)%20AND%20id%20%3D%20HIVE-21009%20ORDER%20BY%20affectedVersion%20ASC%2C%20priority%20DESC%2C%20updated%20DESC)

Impacted files are checked with this command:
```bash
git log 0e4d16b462bf9abd7ec58e60936e24ee4302736c --name-only -1
```

In [18]:
labels[labels["IssueId"] == "HIVE-21009"]

Unnamed: 0,IssueId,CommitId,File,Version
814,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.1.0
815,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,LdapAuthenticationProviderImpl.java,2.1.0
2085,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.1.1
2086,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,LdapAuthenticationProviderImpl.java,2.1.1
3356,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.2.0
3357,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,LdapAuthenticationProviderImpl.java,2.2.0
4627,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.3.0
4628,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,LdapAuthenticationProviderImpl.java,2.3.0
5898,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,HiveConf.java,2.3.1
5899,HIVE-21009,0e4d16b462bf9abd7ec58e60936e24ee4302736c,LdapAuthenticationProviderImpl.java,2.3.1


Here is the expected result at step 2.a. We export it into the output folder

In [19]:
labels.to_csv("../../data/output/hive-bugs-files.csv", index=False)