Skip to content

Commit

Permalink
Fixes for merging native frames
Browse files Browse the repository at this point in the history
There were two main error cases remaining when I test on linux:

1) Failed to get a native stack

I've seen cases where we can only get 1-2 native frames, even though there is a
valid python stack being returned. This seems to be in libmkl_avx512.so on anaconda
3.7.2 - and I failed to unwind with libunwind/gdb and the gimli based unwinder.

GDB in this case returned a message complaining about a 'corrupt stack trace'.

Rather than error out, just insert the native frames w/ the python stack.

2) 1 more native python frame

The other error seems to be where we have exactly 1 more python PyEval_Frame* function
in the native stack than we have frames for in the python stack. This seems to happen
when the python function is finished, and the frame structure has been updated in
python but the native function hasn't exitted yet.  Just allow this for now.

Also add an example program to stress test native unwinding.
  • Loading branch information
benfred committed Jul 8, 2019
1 parent 842d35b commit 1303122
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 7 deletions.
61 changes: 61 additions & 0 deletions examples/native_stress_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// This example loops over native stack traces until it fails to get one for any reason
extern crate remoteprocess;
extern crate env_logger;
#[macro_use]
extern crate log;
extern crate failure;
extern crate py_spy;

fn native_stress_test(pid: remoteprocess::Pid) -> Result<(), failure::Error> {

let config = py_spy::Config{native: true, ..Default::default() };
let mut spy = py_spy::PythonSpy::retry_new(pid, &config, 3)?;


let mut success = 0;
let mut failed = 0;
loop {
match spy.get_stack_traces() {
Ok(_) => {
success += 1;
if success % 1000 == 0 {
info!("Success {} fail {}", success, failed)
}
},
Err(e) => {
error!("Failed to get stack traces: {:#?}", e);
for (_i, suberror) in e.iter_chain().enumerate() {
eprintln!("Reason: {:?}", suberror);
}

failed += 1;
info!("Success {} fail {}", success, failed);
}
}
}
Ok(())
}


#[cfg(unwind)]
fn main() {
env_logger::init();

let args: Vec<String> = std::env::args().collect();

let pid = if args.len() > 1 {
args[1].parse::<remoteprocess::Pid>().expect("invalid pid")
} else {
error!("must specify a pid!");
return;
};

if let Err(e) = native_stress_test(pid) {
println!("Failed to get backtrace {:?}", e);
}
}

#[cfg(not(unwind))]
fn main() {
panic!("unwind not supported!");
}
2 changes: 1 addition & 1 deletion remoteprocess/examples/validate_libunwind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fn libunwind_compare(pid: remoteprocess::Pid) -> Result<(), remoteprocess::Error

let _lock = process.lock()?;

let thread = remoteprocess::Thread::new(pid);
let thread = remoteprocess::Thread::new(pid)?;

let mut gimli_cursor = unwinder.cursor(&thread)?;
let mut libunwind_cursor = libunwinder.cursor(pid)?;
Expand Down
11 changes: 9 additions & 2 deletions remoteprocess/src/linux/gimli_unwinder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ impl Unwinder {
}

pub fn reload(&mut self) -> Result<(), Error> {
info!("reloading process binaries");

// Get shared libraries from virtual memory mapped files
let maps = &proc_maps::get_process_maps(self.pid)?;
let shared_maps = maps.iter().filter(|m| m.is_exec() && !m.is_write() && m.is_read());
Expand Down Expand Up @@ -77,8 +79,13 @@ impl Unwinder {
vdso_data = self.process.copy(m.start(), m.size())?;
&vdso_data
} else {
// vsyscall region, can be ignored
debug!("skipping {} region", filename);
// vsyscall region, can be ignored, but lets not keep on trying to do this
info!("skipping {} region", filename);

// insert a stub for [vsyscall] so that we don't continually try to load it etc
self.binaries.insert(address_key,
BinaryInfo{unwind_info: None, offset: 0, address: m.start() as u64, size: m.size() as u64,
filename: filename.to_string(), symbols: RefCell::new(None)});
continue;
};

Expand Down
29 changes: 25 additions & 4 deletions src/native_stack_trace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ impl NativeStack {
}
}).unwrap_or_else(|e| {
if let remoteprocess::Error::NoBinaryForAddress(_) = e {
debug!("don't have a binary for symbols at 0x{:x} - reloading", addr);
self.should_reload = true;
}
// if we can't symbolicate, just insert a stub here.
Expand All @@ -165,8 +166,27 @@ impl NativeStack {
}

if python_frame_index != frames.len() {
return Err(format_err!("Failed to merge native and python frames (Have {} native and {} python",
python_frame_index, frames.len()));
if python_frame_index == 0 {
// I've seen a problem come up a bunch where we only get 1-2 native stack traces and then it fails
// (with a valid python stack trace on top of that). both the gimli and libunwind unwinder don't
// return the full stack, and connecting up to the process with GDB brings a corrupt stack error:
// from /home/ben/anaconda3/lib/python3.7/site-packages/numpy/core/../../../../libmkl_avx512.so
// Backtrace stopped: previous frame inner to this frame (corrupt stack?)
//
// rather than fail here, lets just insert the python frames after the native frames
for frame in frames {
merged.push(frame.clone());
}
} else if python_frame_index == frames.len() + 1 {
// if we have seen exactly one more python frame in the native stack than the python stack - let it go.
// (can happen when the python stack has been unwound, but haven't exitted the PyEvalFrame function
// yet)
info!("Have {} native and {} python threads in stack - allowing for now",
python_frame_index, frames.len());
} else {
return Err(format_err!("Failed to merge native and python frames (Have {} native and {} python)",
python_frame_index, frames.len()));
}
}

for frame in merged.iter_mut() {
Expand All @@ -182,8 +202,9 @@ impl NativeStack {
let mut cursor = self.unwinder.cursor(thread)?;

while let Some(ip) = cursor.next() {
if let Err(remoteprocess::Error::NoBinaryForAddress(_)) = ip {
// self.should_reload = true;
if let Err(remoteprocess::Error::NoBinaryForAddress(addr)) = ip {
debug!("don't have a binary for 0x{:x} - reloading", addr);
self.should_reload = true;
}
stack.push(ip?);
}
Expand Down

0 comments on commit 1303122

Please sign in to comment.