
Race condition when using wait/cont on target in combination with hooks #54

Closed
redfast00 opened this issue Jul 13, 2020 · 20 comments

@redfast00

Circumstances

I am using the GDBTarget to connect to QEMU. I can't use the built-in QEMU target because QEMU gets spawned from another process. There seems to be an issue with synchronous hooks behaving asynchronously. I have the following testcase to reproduce the issue:

import avatar2
import sys
import time


def test_race():

    def hook_callback(avatar, *args, **kwargs):
        gdb = avatar.targets['gdbtest']
        pc = gdb.read_register("pc")
        assert pc is not None, f"ILLEGAL STATE {gdb.get_status()}"
        print(pc)

        if pc == 0x1000:
            sys.exit()

    avatar = avatar2.Avatar(arch=avatar2.ARM)
    gdb = avatar.add_target(avatar2.GDBTarget,
                              name='gdbtest',
                              gdb_port=1234,
                              gdb_verbose_mi=True,
                              gdb_executable='/usr/bin/gdb-multiarch'
                            )

    # add breakpoint callback
    avatar.watchmen.add('BreakpointHit', when='after', callback=hook_callback, is_async=False)


    print("Init avatar targets...")
    avatar.init_targets()

    gdb.set_breakpoint(0x4)

    gdb.write_register('pc', 0)
    # Start running
    gdb.cont()

    # wait until we hit a breakpoint; once it is hit, this Python script continues
    print("waiting until we hit a breakpoint")
    gdb.wait()
    # add two breakpoints
    gdb.set_breakpoint(0x8)
    gdb.set_breakpoint(0xc)

    gdb.set_breakpoint(0x1000)

    # Continue executing from main
    gdb.cont()
    time.sleep(1)
    while True:
        # Wait until we hit a breakpoint, then immediately resume
        gdb.wait()
        gdb.cont()

    # Never reached: the hook calls sys.exit() once pc == 0x1000
    avatar.shutdown()

if __name__ == '__main__':
    test_race()

This results in the following crash:

redacted@5d755f437052:~/avatar_bug$ python3 target_wait.py 
Init avatar targets...
waiting until we hit a breakpoint
4
8
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.8/dist-packages/avatar2/avatar2.py", line 400, in run
    handler(message)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 80, in watchtrigger
    cb_ret = avatar.watchmen.t(watched_type, AFTER, *args, **cb_kwargs)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 185, in trigger
    watchman.react(self._avatar, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 126, in react
    ret = self._callback(avatar, *args, **kwargs)
  File "target_wait.py", line 18, in hook_callback
    assert pc is not None, f"ILLEGAL STATE {gdb.get_status()}"
AssertionError: ILLEGAL STATE {'state': <TargetStates.RUNNING: 8>}

See also the following trace with gdb_verbose_mi enabled (around messages 15-16):

writing: 13-data-list-register-values x 15
{'message': 'done',
 'payload': {'register-values': [{'number': '15', 'value': '0x8'}]},
 'stream': 'stdout',
 'token': 13,
 'type': 'result'}
8
writing: 14-exec-continue
{'message': 'running',
 'payload': None,
 'stream': 'stdout',
 'token': 14,
 'type': 'result'}
{'message': 'running',
 'payload': {'thread-id': 'all'},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
{'message': 'breakpoint-modified',
 'payload': {'bkpt': {'addr': '0x0000000c',
                      'disp': 'keep',
                      'enabled': 'y',
                      'number': '3',
                      'original-location': '*0xc',
                      'thread-groups': ['i1'],
                      'times': '1',
                      'type': 'breakpoint'}},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
{'message': None, 'payload': '\\n', 'stream': 'stdout', 'type': 'console'}
{'message': None,
 'payload': 'Breakpoint 3, 0x0000000c in ?? ()\\n',
 'stream': 'stdout',
 'type': 'console'}
{'message': 'stopped',
 'payload': {'bkptno': '3',
             'disp': 'keep',
             'frame': {'addr': '0x0000000c',
                       'arch': 'arm',
                       'args': [],
                       'func': '??'},
             'reason': 'breakpoint-hit',
             'stopped-threads': 'all',
             'thread-id': '1'},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
writing: 15-exec-continue
writing: 16-data-list-register-values x 15
{'message': 'running',
 'payload': None,
 'stream': 'stdout',
 'token': 15,
 'type': 'result'}
{'message': 'running',
 'payload': {'thread-id': 'all'},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
{'message': 'error',
 'payload': {'msg': 'Selected thread is running.'},
 'stream': 'stdout',
 'token': 16,
 'type': 'result'}
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
{'message': 'breakpoint-modified',
 'payload': {'bkpt': {'addr': '0x00000008',
                      'disp': 'keep',
                      'enabled': 'y',
                      'number': '2',
                      'original-location': '*0x8',
                      'thread-groups': ['i1'],
                      'times': '2',
                      'type': 'breakpoint'}},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
    self.run()
  File "/usr/local/lib/python3.8/dist-packages/avatar2/avatar2.py", line 400, in run
{'message': None, 'payload': '\\n', 'stream': 'stdout', 'type': 'console'}
{'message': None,
 'payload': 'Breakpoint 2, 0x00000008 in ?? ()\\n',
 'stream': 'stdout',
 'type': 'console'}
{'message': 'stopped',
 'payload': {'bkptno': '2',
             'disp': 'keep',
             'frame': {'addr': '0x00000008',
                       'arch': 'arm',
                       'args': [],
                       'func': '??'},
             'reason': 'breakpoint-hit',
             'stopped-threads': 'all',
             'thread-id': '1'},
 'stream': 'stdout',
 'token': None,
 'type': 'notify'}
    handler(message)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 80, in watchtrigger
    cb_ret = avatar.watchmen.t(watched_type, AFTER, *args, **cb_kwargs)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 185, in trigger
    watchman.react(self._avatar, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/avatar2/watchmen.py", line 126, in react
    ret = self._callback(avatar, *args, **kwargs)
  File "target_wait.py", line 15, in hook_callback
    assert pc is not None, f"ILLEGAL STATE {gdb.get_status()}"
AssertionError: ILLEGAL STATE {'state': <TargetStates.RUNNING: 8>}

Steps to reproduce this

  1. Run qemu-system-arm -machine virt -gdb tcp::1234 -S -nographic -bios qemu_arm_test in a separate terminal. The qemu_arm_test binary is in avatar2/tests/binaries of this repository.
  2. Run the Python script; for me, it crashes every time (I have 4 CPU cores).

Expected behaviour

The machine state is STOPPED inside synchronous hooks.

Actual behaviour

Synchronous hooks act like async hooks?

@mariusmue mariusmue self-assigned this Jul 13, 2020
@redfast00

@mariusmue I see you self-assigned this issue. I have a deadline coming up where I need this code to be working. Of course, I can't and don't have any expectations of you (either fixing this within a certain time period or fixing it at all): this is an open-source project and is delivered as-is. However, I still need this, and thus would be willing to fix it myself. Could you please give me some pointers so I can look into fixing this myself? Where do you think the issue is? Could you describe in a high-level way how to fix it?

@mariusmue

Hi @redfast00.
I already spent a couple of hours yesterday debugging this issue, and will throw in some more today.
Thank you in any case for this detailed issue, with a testcase reproducing the failure! This really made it easier for me; it's just that the bug seems to be a bit more subtle, in the synchronization locking between state transitions (from what I can tell so far).

In any case, if you need a hotfix to get the code working for your assignment, here is what you can do:
set the handler to async and call continue directly from inside the handler, rather than in the main loop. The reason we had async handlers in the first place is that they can change the execution state directly from inside the handler.
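
Applied to the testcase from the issue description, this hotfix would look roughly like this (a sketch only, reusing the names and addresses from that script):

    # Sketch of the suggested hotfix, based on the testcase above:
    # the hook is registered as async and drives execution itself.
    def hook_callback(avatar, *args, **kwargs):
        gdb = avatar.targets['gdbtest']
        pc = gdb.read_register("pc")
        print(pc)
        if pc == 0x1000:
            sys.exit()
        gdb.cont()  # continue from inside the handler, not from the main loop

    # Note is_async=True instead of is_async=False:
    avatar.watchmen.add('BreakpointHit', when='after',
                        callback=hook_callback, is_async=True)

The main loop then only needs to keep the script alive instead of calling wait()/cont().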

@redfast00

Will try that, thanks for the quick reply!

@mariusmue

How did it go? I just realized that a workaround requiring even fewer changes in your program is the insertion of a small sleep after the cont() or the wait(). (time.sleep(.01) seemed to be enough in my case.)

While this is certainly no solution to the underlying problem, it may help in your specific case. Meanwhile, I'm trying to figure out how to fix this behaviour.
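
In the testcase's main loop, that would look roughly like this (a sketch; the value is the one suggested above):

    while True:
        gdb.wait()
        gdb.cont()
        time.sleep(.01)  # crude guard against the race; no hard guarantee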

@mariusmue mariusmue mentioned this issue Jul 14, 2020
mariusmue added a commit that referenced this issue Jul 14, 2020
* guard waiting for watchmen via events

On rare occasions, a race can occur where watchmen callbacks are
running while the main thread already consumes the update-state message.
This commit guards those callbacks with an additional event on which the
target's wait() is waiting.
However, there may still be a small TOCTOU window on that event, as the
waiting uses a timeout (for state updates which do not generate
watchmen callbacks).

Hence, in the long run, the full code dealing with state updates could
benefit from refactoring.

* added smoketest for #54 (thanks @redfast00)

* Remove manual keystone-library copy from CI

It seems that keystone version 0.9.2 (as in pypi) has fixed the issues
regarding the installation path of keystone.

* Various fixes found by CI

* Add processing_callbacks to dictify-ignore list
Event() objects cannot be serialized, so it should either be private or
part of the ignore list. As the watchmen are independently accessing
this Event, it is added to the ignore list

* Fix assert statement in smoketest to be py2-compliant

* Set environment variables for the smoketest
@mariusmue

I rolled out a fix with #55. Let me know if you still encounter any problems. For now, I'm closing this issue.

@redfast00

@mariusmue The issue is still present in my codebase; unfortunately, I can't share the binary and code needed to reproduce it (the binary is proprietary, and the code contains parts of the proprietary binary). However, this line c03fc2b#diff-ac6fa11e7e5c294778098e85088305f3R456 looks suspicious: the return value of the wait is not checked, so even if we are still waiting, it will progress past it anyway after the timeout.
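
For reference, threading.Event.wait() returns False when the timeout expires without the event being set, so a caller that ignores the return value cannot distinguish "callbacks finished" from "timed out" (a minimal standalone illustration):

    import threading

    ev = threading.Event()

    # Event.wait() returns True if the event was set, False on timeout.
    if ev.wait(timeout=0.1):
        print("event set: the callbacks have finished")
    else:
        print("timed out: proceeding here would race with the callbacks")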

@redfast00

redfast00 commented Jul 15, 2020

> How did it go? I just realized that a workaround requiring even fewer changes in your program is the insertion of a small sleep after the cont() or the wait(). (time.sleep(.01) seemed to be enough in my case.)
>
> While this is certainly no solution to the underlying problem, it may help in your specific case. Meanwhile, I'm trying to figure out how to fix this behaviour.

This seems to work, but I'm scared of it breaking unexpectedly: with 0.01 after cont() as suggested, it crashes; with 0.1, it works sometimes but is slow.

@redfast00

I modified the code to be:

    @watch('TargetWait')
    def wait(self, state=TargetStates.STOPPED|TargetStates.EXITED):
        while True:
            self.processing_callbacks.wait(.1)
            if (self.state & state != 0) and self.processing_callbacks.is_set():
                break
            elif (self.state & state != 0) and not self.processing_callbacks.is_set():
                print("not set, but we would continue")

It then gets stuck in an infinite loop, where it keeps printing:

not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
not set, but we would continue
...

This shows that this Event.wait() does not actually wait on an event at all, but is just a glorified sleep statement, causing the software to fail intermittently. The problem with these race conditions is that they don't always occur, sometimes making you think you fixed the problem when in fact your change had no effect on the issue at all.

@mariusmue mariusmue reopened this Jul 15, 2020
@redfast00

I can confirm that, with the original testcase I published and the modified code from the previous post, it also keeps printing.

@mariusmue

mariusmue commented Jul 15, 2020

It's not a glorified-sleep issue. If you check the commit message, I am well aware that there is still a risk of a race.

The current problem is that not all events causing a state transition are handled via the watchmen state-transition guard. Hence, the event would not be set/cleared in all cases, which is of course suboptimal - that is why I went with a timeout, which, as we see, does not produce the intended behavior in your case.

@redfast00

Oh, my bad, you did mention this in your commit message; my apologies for not reading it.

@redfast00

I am willing to do this refactor. Is there high-level documentation of the architecture of the avatar2 codebase somewhere? Would the original paper be a good read for this?

@mariusmue

mariusmue commented Jul 15, 2020

Still, as the problem persists, I reopened the issue for now and will look for further solutions.
It's just difficult without the actual testcase - I tested on my machine and in the CI, and in both cases it worked (for now), probably also due to the "hidden sleep".

@redfast00 redfast00 changed the title Race condition when using GDBTarget Race condition when using wait/continue Jul 15, 2020
@redfast00 redfast00 changed the title Race condition when using wait/continue Race condition when using wait/cont on target in combination with hooks Jul 15, 2020
@mariusmue

mariusmue commented Jul 15, 2020

Oh, if you really want to do this refactor, I guess the fastest way to get an overview is scheduling a (screen-shared) call with me, or joining the avatar2 Slack for further synchronization. In any case, this would need some more thought on how to get a clean synchronization model - and may not be feasible within the limited time until your deadline.

I also have an independent commit which would solve your issue for callbacks after breakpoints (by making a breakpoint an event with two state transitions, where the callbacks run during the first one).
However, this does not fix the actual callback race for arbitrary events.
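
Roughly, the two-tier idea looks like this (an illustrative sketch with made-up names, not the actual avatar2 implementation):

    import threading

    # Illustrative sketch only: a breakpoint hit first enters an
    # intermediate state in which the callbacks run; wait() reacts only
    # to the final STOPPED transition, so it can never return while
    # callbacks are still in flight.
    RUNNING, BREAKPOINT, STOPPED = range(3)

    state = RUNNING
    state_changed = threading.Condition()

    def on_breakpoint_hit(callbacks):
        global state
        with state_changed:
            state = BREAKPOINT          # tier 1: invisible to wait()
        for cb in callbacks:
            cb()                        # callbacks finish before STOPPED
        with state_changed:
            state = STOPPED             # tier 2: wait() wakes up here
            state_changed.notify_all()

    def wait():
        with state_changed:
            while state != STOPPED:     # deliberately ignores BREAKPOINT
                state_changed.wait()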

//edit: The avatar2 paper unfortunately does not describe the implementation details in depth.
//edit2: Of course, you can also reach me on IRC, in case you (understandably) don't want to join any Slack workspaces.

@redfast00

Yes, that's the main issue for me: beating the deadline. I'm currently blocked on this issue (all other parts are done), and a large part of my codebase was written against Avatar2 (making a switch to another approach hard), so it makes sense for me to do whatever it takes to get this issue resolved, either by using your independent commit or by refactoring Avatar2.

I only need breakpoint hooks to work: memory read and write hooks would be nice if I polish my work later on, but for now the BreakpointHit event will suffice.

@mariusmue

I will spend some more hours on this today. If nothing succeeds, expect an extra branch with a two-tier-breakpoint-based approach by tomorrow morning.

@mariusmue

In the meantime, for testing, feel free to just increase the timeout on the wait. In my tests, it's pretty easy to see that in many cases the event mechanism seems to succeed:

    @watch('TargetWait')
    def wait(self, state=TargetStates.STOPPED|TargetStates.EXITED):
        while True:
            self.processing_callbacks.wait(1)
            if (self.state & state != 0) and self.processing_callbacks.is_set():
                break
            elif (self.state & state != 0) and not self.processing_callbacks.is_set():
                self.log.warn("We timed out, there may be a race")
                break

@mariusmue

mariusmue commented Jul 15, 2020

I pushed this branch for you: https://github.com/avatartwo/avatar2/tree/54/two-tier-bp
Can you check whether it solves the problem for you? It really solves the problem only for breakpoint callbacks, and not for arbitrary state transitions, by making breakpoints a two-tiered state transition and having wait() ignore the first one.

The more I think about the issue, the more evident it becomes that we need a code refactor and a better state-update model.

As of now, the state update on a target is triggered via the avatar2 fast queue, as state updates can come in at any time, asynchronously. However, the thread processing the callbacks is the normal avatar2 event queue, whereas wait() fetches the status inside the main thread, which may have been updated via the fast queue while the callbacks have not been processed yet, leading to these inconsistencies.

Edit: Updated linked branch to the correct one.
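
Schematically, the race described above looks like this (a simplified standalone illustration with made-up names, not the actual avatar2 queues):

    import queue
    import threading
    import time

    # The fast queue updates the state immediately, while the callback is
    # processed later on a separate event-queue thread, so a wait() that
    # polls only the state can return before the callback has run.
    class Target:
        state = 'RUNNING'

    target = Target()
    event_queue = queue.Queue()

    def fast_queue_handler():           # fast-queue thread
        target.state = 'STOPPED'        # visible to wait() right away
        event_queue.put('BreakpointHit')

    def event_queue_worker():           # event-queue thread
        event_queue.get()
        time.sleep(0.01)                # callback still running...
        print("callback executes; the target may be RUNNING again by now")

    threading.Thread(target=fast_queue_handler).start()
    threading.Thread(target=event_queue_worker).start()

    while target.state != 'STOPPED':    # naive wait(): polls only the state
        pass
    print("wait() returned; the callback may still be pending")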

@redfast00

redfast00 commented Jul 16, 2020

The code on the two-tier-bp branch solves the problem for me, thank you!

mariusmue added a commit that referenced this issue Jan 23, 2021
@mariusmue

This fix has made it into main by now; hence, I'm closing this issue.
