Browse files

IMPALA-4705, IMPALA-4779, IMPALA-4780: Fix some Expr bugs with codegen

This change fixes to work with codegen as it's
originally intended. Fixing it uncovers a couple of bugs fixed
in this patch:

IMPALA-4705: When an IR function is materialized, its
function body is parsed to find all its callee functions
to be materialized too. However, the old code doesn't
detect callee fnctions referenced indirectly (e.g. a
callee function passed as argument to another function).

This change fixes the problem above inspecting the use
lists of llvm::Function objects. When parsing the bitcode
module into memory, LLVM already establishes a use list
for each llvm::Value object which llvm::Function is a
subclass of. A use list contains all the locations in
the module in which the Value is referenced. For a
llvm::Function object, that would be its call sites and
constant expressions referencing the functions. By using
the use lists of llvm::Function in the module, a global
map is established at Impala initialization time to map
functions to their corresponding callee functions. This
map is then used when materializing a function to ensure
all its callee functions are also materialized recursively.

IMPALA-4779: conditional function isfalse(), istrue(),
isnotfalse(), isnotrue() aren't cross-compiled so they
will lead to unexpected query failure when codegen is enabled.
This change will cross-compile these functions.

IMPALA-4780: next_day() always returns NULL when codegen
is enabled. The bound checks for next_day() use some class
static variables initialized in the global constructors
(@llvm.global_ctors). However, we never execute the global
constructors before calling the JIT compiled functions.
This causes these variables to remain as zero, causing all
executions of next_day() to fail the bound checks. The reason
why these class static variables aren't compiled as global
constants in LLVM IR is that TimestampFunctions::MIN_YEAR is
not a compile time constant. This change fixes the problem
above by setting TimestampFunctions::MIN_YEAR to a known constant
value. A DCHECK is added to verify that it matches the value
defined in the boost library.

Change-Id: I40fdb035a565ae2f9c9fbf4db48a548653ef7608
Reviewed-by: Michael Ho <>
Tested-by: Impala Public Jenkins
  • Loading branch information...
michaelhkw authored and Impala Public Jenkins committed Jan 16, 2017
1 parent 933f2ce commit 1d933919ee964d8766ba028623d66ec20cd123ac
@@ -55,7 +55,8 @@ class LlvmCodeGenTest : public testing:: Test {
// Wrapper to call private test-only methods on LlvmCodeGen object
static Status CreateFromFile(
ObjectPool* pool, const string& filename, scoped_ptr<LlvmCodeGen>* codegen) {
return LlvmCodeGen::CreateFromFile(pool, NULL, filename, "test", codegen);
RETURN_IF_ERROR(LlvmCodeGen::CreateFromFile(pool, NULL, filename, "test", codegen));
return (*codegen)->MaterializeModule();
static LlvmCodeGen* CreateCodegen(ObjectPool* pool) {
@@ -101,7 +101,8 @@ bool LlvmCodeGen::llvm_initialized_ = false;
string LlvmCodeGen::cpu_name_;
vector<string> LlvmCodeGen::cpu_attrs_;
unordered_set<string> LlvmCodeGen::gv_ref_ir_fns_;
unordered_set<string> LlvmCodeGen::fns_to_always_materialize_;
FnRefsMap LlvmCodeGen::fn_refs_map_;
[[noreturn]] static void LlvmCodegenHandleError(
void* user_data, const std::string& reason, bool gen_crash_diag) {
@@ -115,62 +116,29 @@ bool LlvmCodeGen::IsDefinedInImpalad(const string& fn_name) {
return status.ok();
void LlvmCodeGen::ParseGlobalConstant(Value* val, unordered_set<string>* ref_fns) {
// Parse constants to find any referenced functions.
vector<string> fn_names;
if (isa<Function>(val)) {
} else if (isa<BlockAddress>(val)) {
const BlockAddress *ba = cast<BlockAddress>(val);
} else if (isa<GlobalAlias>(val)) {
GlobalAlias* alias = cast<GlobalAlias>(val);
ParseGlobalConstant(alias->getAliasee(), ref_fns);
} else if (isa<ConstantExpr>(val)) {
const ConstantExpr* ce = cast<ConstantExpr>(val);
if (ce->isCast()) {
for (User::const_op_iterator oi=ce->op_begin(); oi != ce->op_end(); ++oi) {
Function* fn = dyn_cast<Function>(*oi);
if (fn != NULL) fn_names.push_back(fn->getName().str());
void LlvmCodeGen::FindGlobalUsers(User* val, vector<GlobalObject*>* users) {
for (Use& u: val->uses()) {
User* user = u.getUser();
if (isa<Instruction>(user)) {
Instruction* inst = dyn_cast<Instruction>(u.getUser());
} else if (isa<GlobalVariable>(user)) {
GlobalVariable* gv = cast<GlobalVariable>(user);
string val_name = gv->getName();
// We strip global ctors and dtors out of the modules as they are not run.
if (val_name.find("llvm.global_ctors") == string::npos &&
val_name.find("llvm.global_dtors") == string::npos) {
} else if (isa<ConstantStruct>(val) || isa<ConstantArray>(val) ||
isa<ConstantDataArray>(val)) {
const Constant* val_constant = cast<Constant>(val);
for (int i = 0; i < val_constant->getNumOperands(); ++i) {
ParseGlobalConstant(val_constant->getOperand(i), ref_fns);
} else if (isa<ConstantVector>(val) || isa<ConstantDataVector>(val)) {
const Constant* val_const = cast<Constant>(val);
for (int i = 0; i < val->getType()->getVectorNumElements(); ++i) {
ParseGlobalConstant(val_const->getAggregateElement(i), ref_fns);
} else {
// Ignore constants which cannot contain function pointers. Ignore other global
// variables referenced by this global variable as InitializeLlvm() will parse
// all global variables.
DCHECK(isa<UndefValue>(val) || isa<ConstantFP>(val) || isa<ConstantInt>(val) ||
isa<GlobalVariable>(val) || isa<ConstantTokenNone>(val) ||
isa<ConstantPointerNull>(val) || isa<ConstantAggregateZero>(val) ||
// Adds all functions not defined in Impalad native binary.
for (const string& fn_name: fn_names) {
if (!IsDefinedInImpalad(fn_name)) ref_fns->insert(fn_name);
void LlvmCodeGen::ParseGVForFunctions(Module* module, unordered_set<string>* ref_fns) {
for (GlobalVariable& gv: module->globals()) {
if (gv.hasInitializer() && gv.isConstant()) {
Constant* val = gv.getInitializer();
if (val->getNumOperands() > 0) ParseGlobalConstant(val, ref_fns);
} else if (isa<Constant>(user)) {
FindGlobalUsers(user, users);
} else {
DCHECK(false) << "Unknown user's types for " << val->getName().str();
void LlvmCodeGen::InitializeLlvm(bool load_backend) {
Status LlvmCodeGen::InitializeLlvm(bool load_backend) {
@@ -201,17 +169,43 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) {
ObjectPool init_pool;
scoped_ptr<LlvmCodeGen> init_codegen;
Status status = LlvmCodeGen::CreateFromMemory(&init_pool, NULL, "init", &init_codegen);
ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_);
RETURN_IF_ERROR(LlvmCodeGen::CreateFromMemory(&init_pool, NULL, "init", &init_codegen));
// LLVM will construct "use" lists only when the entire module is materialized.
// Validate the module by verifying that functions for all IRFunction::Type
// can be found.
for (int i = IRFunction::FN_START; i < IRFunction::FN_END; ++i) {
const string& fn_name = FN_MAPPINGS[i].fn_name;
DCHECK(init_codegen->module_->getFunction(fn_name) != NULL)
<< "Failed to find function " << fn_name;
if (init_codegen->module_->getFunction(fn_name) == nullptr) {
return Status(Substitute("Failed to find function $0", fn_name));
// Create a mapping of functions to their referenced functions.
for (Function& fn: init_codegen->module_->functions()) {
if (fn.isIntrinsic() || fn.isDeclaration()) continue;
string fn_name = fn.getName();
vector<GlobalObject*> users;
FindGlobalUsers(&fn, &users);
for (GlobalValue* val: users) {
string key = val->getName();
DCHECK(isa<GlobalVariable>(val) || isa<Function>(val));
// 'fn_refs_map_' contains functions which need to be materialized when a certain
// IR Function is materialized. We choose to include functions referenced by
// another IR function in the map even if it's defined in Impalad binary so it
// can be inlined for further optimization. This is not applicable for functions
// referenced by global variables only.
if (isa<GlobalVariable>(val)) {
if (IsDefinedInImpalad(fn_name)) continue;
} else {
return Status::OK();
@@ -329,19 +323,17 @@ Status LlvmCodeGen::LinkModule(const string& file) {
// The module data layout must match the one selected by the execution engine.
// Record all IR functions in 'new_module' referenced by the module's global variables
// if they are not defined in the Impalad native code. They must be materialized to
// avoid linking error.
unordered_set<string> ref_fns;
ParseGVForFunctions(new_module.get(), &ref_fns);
// Record all the materializable functions in the new module before linking.
// Linking the new module to the main module (i.e. 'module_') may materialize
// functions in the new module. These materialized functions need to be parsed
// to materialize any functions they call in 'module_'.
unordered_set<string> materializable_fns;
// Parse all functions' names from the new module and find those which also exist in
// the main module. They are declarations in the new module or duplicated definitions
// of the same function in both modules. For the latter case, it's unclear which one
// the linker will choose. Materialize these functions in the main module in case they
// are chosen by the linker or referenced by functions in the new module. Note that
// linkModules() will materialize functions defined only in the new module.
for (Function& fn: new_module->functions()) {
if (fn.isMaterializable()) materializable_fns.insert(fn.getName().str());
if (fn_refs_map_.find(fn.getName()) != fn_refs_map_.end()) {
Function* local_fn = module_->getFunction(fn.getName());
bool error = Linker::linkModules(*module_, std::move(new_module));
@@ -352,23 +344,6 @@ Status LlvmCodeGen::LinkModule(const string& file) {
for (const string& fn_name: ref_fns) {
Function* fn = module_->getFunction(fn_name);
// The global variable from source module which references 'fn' can have private
// linkage and it may not be linked into 'module_'.
if (fn != NULL && fn->isMaterializable()) {
// Parse functions in the source module materialized during linking and materialize
// their callees. Do it after linking so LLVM has "merged" functions defined in both
// modules. LLVM may not link in functions (and their callees) from source module if
// they're defined in destination module already.
for (const string& fn_name: materializable_fns) {
Function* fn = module_->getFunction(fn_name);
if (fn != NULL && !fn->isMaterializable()) RETURN_IF_ERROR(MaterializeCallees(fn));
return Status::OK();
@@ -403,11 +378,11 @@ Status LlvmCodeGen::CreateImpalaCodegen(ObjectPool* pool, MemTracker* parent_mem
return Status("Could not create llvm struct type for StringVal");
// Materialize functions implicitly referenced by the global variables.
for (const string& fn_name : gv_ref_ir_fns_) {
// Materialize functions referenced by the global variables.
for (const string& fn_name : fns_to_always_materialize_) {
Function* fn = codegen->module_->getFunction(fn_name);
DCHECK(fn != nullptr);
return Status::OK();
@@ -642,22 +617,6 @@ void LlvmCodeGen::CreateIfElseBlocks(Function* fn, const string& if_name,
*else_block = BasicBlock::Create(context(), else_name, fn, insert_before);
Status LlvmCodeGen::MaterializeCallees(Function* fn) {
for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) {
Instruction* instr = &*iter;
Function* called_fn = NULL;
if (isa<CallInst>(instr)) {
CallInst* call_instr = reinterpret_cast<CallInst*>(instr);
called_fn = call_instr->getCalledFunction();
} else if (isa<InvokeInst>(instr)) {
InvokeInst* invoke_instr = reinterpret_cast<InvokeInst*>(instr);
called_fn = invoke_instr->getCalledFunction();
if (called_fn != NULL) RETURN_IF_ERROR(MaterializeFunctionHelper(called_fn));
return Status::OK();
Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) {
if (fn->isIntrinsic() || !fn->isMaterializable()) return Status::OK();
@@ -670,7 +629,12 @@ Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) {
// Materialized functions are marked as not materializable by LLVM.
const unordered_set<string>& callees = fn_refs_map_[fn->getName().str()];
for (const string& callee: callees) {
Function* callee_fn = module_->getFunction(callee);
DCHECK(callee_fn != nullptr);
return Status::OK();
@@ -886,13 +850,11 @@ Function* LlvmCodeGen::FinalizeFunction(Function* function) {
return function;
Status LlvmCodeGen::MaterializeModule(Module* module) {
std::error_code err = module->materializeAll();
Status LlvmCodeGen::MaterializeModule() {
std::error_code err = module_->materializeAll();
if (UNLIKELY(err)) {
stringstream err_msg;
err_msg << "Failed to complete materialization of module " << module->getName().str()
<< ": " << err.message();
return Status(err_msg.str());
return Status(Substitute("Failed to materialize module $0: $1",
module_->getName().str(), err.message()));
return Status::OK();
@@ -919,7 +881,7 @@ Status LlvmCodeGen::FinalizeLazyMaterialization() {
// All unused functions are now not materializable so it should be quick to call
// materializeAll(). We need to call this function in order to destroy the
// materializer so that DCE will not assert fail.
return MaterializeModule(module_);
return MaterializeModule();
Status LlvmCodeGen::FinalizeModule() {
Oops, something went wrong.

0 comments on commit 1d93391

Please sign in to comment.