From 4a82a0ec61f66570ed59cf2e03dcd00d14a1fa75 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 15 Mar 2019 17:58:59 +0100 Subject: [PATCH 1/9] Pick common code changes from 94d63e38 94d63e38: Add a mechanism for hinting to the core disassembler loop --- priv/guest_amd64_toIR.c | 9 ++++++++- priv/guest_arm64_toIR.c | 1 + priv/guest_arm_toIR.c | 2 ++ priv/guest_generic_bb_to_IR.c | 28 ++++++++++++++++++++++++++-- priv/guest_generic_bb_to_IR.h | 11 ++++++++--- priv/guest_mips_toIR.c | 1 + priv/guest_ppc_toIR.c | 2 ++ priv/guest_tilegx_toIR.c | 1 + priv/guest_x86_toIR.c | 1 + 9 files changed, 50 insertions(+), 6 deletions(-) diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c index 9aa82fc34..b466b1ba9 100644 --- a/priv/guest_amd64_toIR.c +++ b/priv/guest_amd64_toIR.c @@ -29737,6 +29737,7 @@ Long dis_ESC_0F38__VEX ( if (have66noF2noF3(pfx)) { delta = dis_FMA( vbi, pfx, delta, opc ); *uses_vvvv = True; + dres->hint = Dis_HintVerbose; goto decode_success; } break; @@ -31862,15 +31863,20 @@ Long dis_ESC_0F3A__VEX ( /* else fall though; dis_PCMPxSTRx failed to decode it */ } break; + case 0x5c: case 0x5d: case 0x5e: case 0x5f: case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f: case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f: + /* FIXME: list the instructions decoded here */ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { Long delta0 = delta; delta = dis_FMA4( pfx, delta, opc, uses_vvvv, vbi ); - if (delta > delta0) goto decode_success; + if (delta > delta0) { + dres->hint = Dis_HintVerbose; + goto decode_success; + } /* else fall though; dis_FMA4 failed to decode it */ } break; @@ -31983,6 +31989,7 @@ DisResult disInstr_AMD64_WRK ( dres.len = 0; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; *expect_CAS = False; vassert(guest_RIP_next_assumed == 0); diff --git a/priv/guest_arm64_toIR.c b/priv/guest_arm64_toIR.c index fa9676350..0483f9558 100644 --- a/priv/guest_arm64_toIR.c +++ b/priv/guest_arm64_toIR.c @@ -14315,6 +14315,7 @@ Bool disInstr_ARM64_WRK ( dres->len = 4; dres->continueAt = 0; dres->jk_StopHere = Ijk_INVALID; + dres->hint = Dis_HintNone; /* At least this is simple on ARM64: insns are all 4 bytes long, and 4-aligned. So just fish the whole thing out of memory right now diff --git a/priv/guest_arm_toIR.c b/priv/guest_arm_toIR.c index 735bb110c..84dc428cc 100644 --- a/priv/guest_arm_toIR.c +++ b/priv/guest_arm_toIR.c @@ -16173,6 +16173,7 @@ DisResult disInstr_ARM_WRK ( dres.len = 4; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; /* Set default actions for post-insn handling of writes to r15, if required. */ @@ -19069,6 +19070,7 @@ DisResult disInstr_THUMB_WRK ( dres.len = 2; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; /* Set default actions for post-insn handling of writes to r15, if required. */ diff --git a/priv/guest_generic_bb_to_IR.c b/priv/guest_generic_bb_to_IR.c index b3f45c218..9e51d26ff 100644 --- a/priv/guest_generic_bb_to_IR.c +++ b/priv/guest_generic_bb_to_IR.c @@ -240,6 +240,13 @@ IRSB* bb_to_IR ( vassert((offB_GUEST_IP % 8) == 0); } + /* Although we will try to disassemble up to vex_control.guest_max_insns + insns into the block, the individual insn assemblers may hint to us that a + disassembled instruction is verbose. In that case we will lower the limit + so as to ensure that the JIT doesn't run out of space. See bug 375839 for + the motivating example. 
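   A front end opts in with a single assignment in the relevant decoder
   arm, as the AMD64 FMA and FMA4 cases above illustrate; schematically:

      dres->hint = Dis_HintVerbose;
      goto decode_success;

   The disassembly loop then clamps guest_max_insns_really, as handled
   further down in this function.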
*/ + Int guest_max_insns_really = vex_control.guest_max_insns; + /* Start a new, empty extent. */ vge->n_used = 1; vge->base[0] = guest_IP_bbstart; @@ -287,7 +294,7 @@ IRSB* bb_to_IR ( /* Process instructions. */ while (True) { - vassert(n_instrs < vex_control.guest_max_insns); + vassert(n_instrs < guest_max_insns_really); /* Regardless of what chase_into_ok says, is chasing permissible at all right now? Set resteerOKfn accordingly. */ @@ -386,6 +393,23 @@ IRSB* bb_to_IR ( if (n_cond_resteers_allowed == 0) vassert(dres.whatNext != Dis_ResteerC); + /* If the disassembly function passed us a hint, take note of it. */ + if (LIKELY(dres.hint == Dis_HintNone)) { + /* Do nothing */ + } else { + vassert(dres.hint == Dis_HintVerbose); + /* The current insn is known to be verbose. Lower the max insns limit + if necessary so as to avoid running the JIT out of space in the + event that we've encountered the start of a long sequence of them. + This is expected to be a very rare event. In any case the remaining + limit (30 insns) is still so high that most blocks will terminate + anyway before then. So this is very unlikely to give a perf hit in + practice. See bug 375839 for the motivating example. */ + if (guest_max_insns_really > 30) { + guest_max_insns_really = 30; + } + } + /* Fill in the insn-mark length field. */ vassert(first_stmt_idx >= 0 && first_stmt_idx < irsb->stmts_used); imark = irsb->stmts[first_stmt_idx]; @@ -452,7 +476,7 @@ IRSB* bb_to_IR ( case Dis_Continue: vassert(dres.continueAt == 0); vassert(dres.jk_StopHere == Ijk_INVALID); - if (n_instrs < vex_control.guest_max_insns && + if (n_instrs < guest_max_insns_really && vge->len[vge->n_used-1] < vex_control.guest_max_bytes) { /* keep going */ } else { diff --git a/priv/guest_generic_bb_to_IR.h b/priv/guest_generic_bb_to_IR.h index 78a5a870d..2c90db3d2 100644 --- a/priv/guest_generic_bb_to_IR.h +++ b/priv/guest_generic_bb_to_IR.h @@ -76,10 +76,16 @@ typedef Dis_ResteerC: (speculatively, of course) followed a conditional branch; continue at 'continueAt' */ - enum { Dis_StopHere, Dis_Continue, + enum { Dis_StopHere=0x10, Dis_Continue, Dis_ResteerU, Dis_ResteerC } whatNext; - /* For Dis_StopHere, we need to end the block and create a + /* Any other hints that we should feed back to the disassembler? + Dis_HintNone: no hint + Dis_HintVerbose: this insn potentially generates a lot of code + */ + enum { Dis_HintNone=0x20, Dis_HintVerbose } hint; + + /* For whatNext==Dis_StopHere, we need to end the block and create a transfer to whatever the NIA is. That will have presumably been set by the IR generated for this insn. So we need to know the jump kind to use. Should Ijk_INVALID in other Dis_ @@ -89,7 +95,6 @@ typedef /* For Dis_Resteer, this is the guest address we should continue at. Otherwise ignored (should be zero). 
*/ Addr continueAt; - } DisResult; diff --git a/priv/guest_mips_toIR.c b/priv/guest_mips_toIR.c index 58a1129de..2a08eea17 100644 --- a/priv/guest_mips_toIR.c +++ b/priv/guest_mips_toIR.c @@ -12062,6 +12062,7 @@ static DisResult disInstr_MIPS_WRK ( Bool(*resteerOkFn) (/*opaque */void *, dres.len = 0; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; delay_slot_branch = likely_delay_slot = delay_slot_jump = False; diff --git a/priv/guest_ppc_toIR.c b/priv/guest_ppc_toIR.c index a04b3b675..e638a609d 100644 --- a/priv/guest_ppc_toIR.c +++ b/priv/guest_ppc_toIR.c @@ -27401,6 +27401,7 @@ DisResult disInstr_PPC_WRK ( dres.len = 0; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; /* At least this is simple on PPC32: insns are all 4 bytes long, and 4-aligned. So just fish the whole thing out of memory right now @@ -29111,6 +29112,7 @@ DisResult disInstr_PPC ( IRSB* irsb_IN, dres.whatNext = Dis_StopHere; dres.jk_StopHere = Ijk_NoDecode; dres.continueAt = 0; + dres.hint = Dis_HintNone; return dres; } diff --git a/priv/guest_tilegx_toIR.c b/priv/guest_tilegx_toIR.c index 6bca67a2e..139cc1c11 100644 --- a/priv/guest_tilegx_toIR.c +++ b/priv/guest_tilegx_toIR.c @@ -329,6 +329,7 @@ static DisResult disInstr_TILEGX_WRK ( Bool(*resteerOkFn) (void *, Addr), dres.len = 0; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; /* Verify the code addr is 8-byte aligned. */ vassert((((Addr)code) & 7) == 0); diff --git a/priv/guest_x86_toIR.c b/priv/guest_x86_toIR.c index fe1529bda..d31a2d890 100644 --- a/priv/guest_x86_toIR.c +++ b/priv/guest_x86_toIR.c @@ -8563,6 +8563,7 @@ DisResult disInstr_X86_WRK ( dres.whatNext = Dis_Continue; dres.len = 0; dres.continueAt = 0; + dres.hint = Dis_HintNone; dres.jk_StopHere = Ijk_INVALID; *expect_CAS = False; From d57d2ac504485c25a2e5566592457e25f0ea65cf Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 15 Mar 2019 18:10:51 +0100 Subject: [PATCH 2/9] Port common code changes from efa1e5ef efa1e5ef: VEX register allocator version 3 --- common.mk | 1 + priv/host_amd64_defs.c | 43 +- priv/host_amd64_defs.h | 29 +- priv/host_arm64_defs.c | 52 +- priv/host_arm64_defs.h | 3 +- priv/host_arm_defs.c | 44 +- priv/host_arm_defs.h | 3 +- priv/host_generic_reg_alloc2.c | 179 ++--- priv/host_generic_reg_alloc3.c | 1373 ++++++++++++++++++++++++++++++++ priv/host_generic_regs.c | 49 +- priv/host_generic_regs.h | 108 ++- priv/host_mips_defs.c | 39 +- priv/host_mips_defs.h | 3 +- priv/host_ppc_defs.c | 37 +- priv/host_ppc_defs.h | 3 +- priv/host_x86_defs.c | 36 +- priv/host_x86_defs.h | 4 +- priv/main_main.c | 36 +- priv/main_util.c | 37 +- pub/libvex.h | 5 + 20 files changed, 1803 insertions(+), 281 deletions(-) create mode 100644 priv/host_generic_reg_alloc3.c diff --git a/common.mk b/common.mk index 9277f8f50..5bbeed7d8 100644 --- a/common.mk +++ b/common.mk @@ -78,6 +78,7 @@ NORMAL_OBJS = \ priv/host_generic_simd128.o \ priv/host_generic_simd256.o \ priv/host_generic_reg_alloc2.o \ + priv/host_generic_reg_alloc3.o \ priv/guest_generic_x87.o \ priv/guest_generic_bb_to_IR.o \ priv/guest_x86_helpers.o \ diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c index 9dec78c10..9747b7c6d 100644 --- a/priv/host_amd64_defs.c +++ b/priv/host_amd64_defs.c @@ -63,6 +63,7 @@ const RRegUniverse* getRRegUniverse_AMD64 ( void ) /* Add the registers. The initial segment of this array must be those available for allocation by reg-alloc, and those that follow are not available for allocation. 
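   The allocable_start[] and allocable_end[] bounds recorded below
   delimit each class's segment, letting the v3 allocator walk all
   allocable registers of one class directly; a sketch of the intended
   use (consider() is just a placeholder):

      for (UInt r = univ->allocable_start[HRcInt64];
           r <= univ->allocable_end[HRcInt64]; r++)
         consider(univ->regs[r]);

   find_free_rreg and find_vreg_to_spill in host_generic_reg_alloc3.c
   iterate in exactly this way.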
*/ + ru->allocable_start[HRcInt64] = ru->size; ru->regs[ru->size++] = hregAMD64_RSI(); ru->regs[ru->size++] = hregAMD64_RDI(); ru->regs[ru->size++] = hregAMD64_R8(); @@ -72,6 +73,10 @@ const RRegUniverse* getRRegUniverse_AMD64 ( void ) ru->regs[ru->size++] = hregAMD64_R14(); ru->regs[ru->size++] = hregAMD64_R15(); ru->regs[ru->size++] = hregAMD64_RBX(); + ru->regs[ru->size++] = hregAMD64_R10(); + ru->allocable_end[HRcInt64] = ru->size - 1; + + ru->allocable_start[HRcVec128] = ru->size; ru->regs[ru->size++] = hregAMD64_XMM3(); ru->regs[ru->size++] = hregAMD64_XMM4(); ru->regs[ru->size++] = hregAMD64_XMM5(); @@ -82,8 +87,9 @@ const RRegUniverse* getRRegUniverse_AMD64 ( void ) ru->regs[ru->size++] = hregAMD64_XMM10(); ru->regs[ru->size++] = hregAMD64_XMM11(); ru->regs[ru->size++] = hregAMD64_XMM12(); - ru->regs[ru->size++] = hregAMD64_R10(); + ru->allocable_end[HRcVec128] = ru->size - 1; ru->allocable = ru->size; + /* And other regs, not available to the allocator. */ ru->regs[ru->size++] = hregAMD64_RAX(); ru->regs[ru->size++] = hregAMD64_RCX(); @@ -101,7 +107,7 @@ const RRegUniverse* getRRegUniverse_AMD64 ( void ) } -void ppHRegAMD64 ( HReg reg ) +UInt ppHRegAMD64 ( HReg reg ) { Int r; static const HChar* ireg64_names[16] @@ -109,27 +115,24 @@ void ppHRegAMD64 ( HReg reg ) "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" }; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. */ switch (hregClass(reg)) { case HRcInt64: r = hregEncoding(reg); vassert(r >= 0 && r < 16); - vex_printf("%s", ireg64_names[r]); - return; + return vex_printf("%s", ireg64_names[r]); case HRcVec128: r = hregEncoding(reg); vassert(r >= 0 && r < 16); - vex_printf("%%xmm%d", r); - return; + return vex_printf("%%xmm%d", r); default: vpanic("ppHRegAMD64"); } } -static void ppHRegAMD64_lo32 ( HReg reg ) +static UInt ppHRegAMD64_lo32 ( HReg reg ) { Int r; static const HChar* ireg32_names[16] @@ -137,17 +140,16 @@ static void ppHRegAMD64_lo32 ( HReg reg ) "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" }; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - vex_printf("d"); - return; + UInt written = ppHReg(reg); + written += vex_printf("d"); + return written; } /* But specific for real regs. */ switch (hregClass(reg)) { case HRcInt64: r = hregEncoding(reg); vassert(r >= 0 && r < 16); - vex_printf("%s", ireg32_names[r]); - return; + return vex_printf("%s", ireg32_names[r]); default: vpanic("ppHRegAMD64_lo32: invalid regclass"); } @@ -1995,6 +1997,19 @@ void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, } } +AMD64Instr* genMove_AMD64(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt64: + return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(from), to); + case HRcVec128: + return AMD64Instr_SseReRg(Asse_MOV, from, to); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_AMD64: unimplemented regclass"); + } +} + /* --------- The amd64 assembler (bleh.) 
--------- */ diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h index a1957182e..068520ea2 100644 --- a/priv/host_amd64_defs.h +++ b/priv/host_amd64_defs.h @@ -56,19 +56,18 @@ ST_IN HReg hregAMD64_R13 ( void ) { return mkHReg(False, HRcInt64, 13, 5); } ST_IN HReg hregAMD64_R14 ( void ) { return mkHReg(False, HRcInt64, 14, 6); } ST_IN HReg hregAMD64_R15 ( void ) { return mkHReg(False, HRcInt64, 15, 7); } ST_IN HReg hregAMD64_RBX ( void ) { return mkHReg(False, HRcInt64, 3, 8); } - -ST_IN HReg hregAMD64_XMM3 ( void ) { return mkHReg(False, HRcVec128, 3, 9); } -ST_IN HReg hregAMD64_XMM4 ( void ) { return mkHReg(False, HRcVec128, 4, 10); } -ST_IN HReg hregAMD64_XMM5 ( void ) { return mkHReg(False, HRcVec128, 5, 11); } -ST_IN HReg hregAMD64_XMM6 ( void ) { return mkHReg(False, HRcVec128, 6, 12); } -ST_IN HReg hregAMD64_XMM7 ( void ) { return mkHReg(False, HRcVec128, 7, 13); } -ST_IN HReg hregAMD64_XMM8 ( void ) { return mkHReg(False, HRcVec128, 8, 14); } -ST_IN HReg hregAMD64_XMM9 ( void ) { return mkHReg(False, HRcVec128, 9, 15); } -ST_IN HReg hregAMD64_XMM10 ( void ) { return mkHReg(False, HRcVec128, 10, 16); } -ST_IN HReg hregAMD64_XMM11 ( void ) { return mkHReg(False, HRcVec128, 11, 17); } -ST_IN HReg hregAMD64_XMM12 ( void ) { return mkHReg(False, HRcVec128, 12, 18); } - -ST_IN HReg hregAMD64_R10 ( void ) { return mkHReg(False, HRcInt64, 10, 19); } +ST_IN HReg hregAMD64_R10 ( void ) { return mkHReg(False, HRcInt64, 10, 9); } + +ST_IN HReg hregAMD64_XMM3 ( void ) { return mkHReg(False, HRcVec128, 3, 10); } +ST_IN HReg hregAMD64_XMM4 ( void ) { return mkHReg(False, HRcVec128, 4, 11); } +ST_IN HReg hregAMD64_XMM5 ( void ) { return mkHReg(False, HRcVec128, 5, 12); } +ST_IN HReg hregAMD64_XMM6 ( void ) { return mkHReg(False, HRcVec128, 6, 13); } +ST_IN HReg hregAMD64_XMM7 ( void ) { return mkHReg(False, HRcVec128, 7, 14); } +ST_IN HReg hregAMD64_XMM8 ( void ) { return mkHReg(False, HRcVec128, 8, 15); } +ST_IN HReg hregAMD64_XMM9 ( void ) { return mkHReg(False, HRcVec128, 9, 16); } +ST_IN HReg hregAMD64_XMM10 ( void ) { return mkHReg(False, HRcVec128, 10, 17); } +ST_IN HReg hregAMD64_XMM11 ( void ) { return mkHReg(False, HRcVec128, 11, 18); } +ST_IN HReg hregAMD64_XMM12 ( void ) { return mkHReg(False, HRcVec128, 12, 19); } ST_IN HReg hregAMD64_RAX ( void ) { return mkHReg(False, HRcInt64, 0, 20); } ST_IN HReg hregAMD64_RCX ( void ) { return mkHReg(False, HRcInt64, 1, 21); } @@ -81,7 +80,7 @@ ST_IN HReg hregAMD64_XMM0 ( void ) { return mkHReg(False, HRcVec128, 0, 26); } ST_IN HReg hregAMD64_XMM1 ( void ) { return mkHReg(False, HRcVec128, 1, 27); } #undef ST_IN -extern void ppHRegAMD64 ( HReg ); +extern UInt ppHRegAMD64 ( HReg ); /* --------- Condition codes, AMD encoding. --------- */ @@ -803,7 +802,7 @@ extern void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); extern void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); - +extern AMD64Instr* genMove_AMD64(HReg from, HReg to, Bool); extern const RRegUniverse* getRRegUniverse_AMD64 ( void ); extern HInstrArray* iselSB_AMD64 ( const IRSB*, diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c index cc7c832f3..417f989ff 100644 --- a/priv/host_arm64_defs.c +++ b/priv/host_arm64_defs.c @@ -64,7 +64,7 @@ const RRegUniverse* getRRegUniverse_ARM64 ( void ) /* Add the registers. The initial segment of this array must be those available for allocation by reg-alloc, and those that follow are not available for allocation. 
*/ - + ru->allocable_start[HRcInt64] = ru->size; ru->regs[ru->size++] = hregARM64_X22(); ru->regs[ru->size++] = hregARM64_X23(); ru->regs[ru->size++] = hregARM64_X24(); @@ -81,6 +81,7 @@ const RRegUniverse* getRRegUniverse_ARM64 ( void ) ru->regs[ru->size++] = hregARM64_X5(); ru->regs[ru->size++] = hregARM64_X6(); ru->regs[ru->size++] = hregARM64_X7(); + ru->allocable_end[HRcInt64] = ru->size - 1; // X8 is used as a ProfInc temporary, not available to regalloc. // X9 is a chaining/spill temporary, not available to regalloc. @@ -94,19 +95,23 @@ const RRegUniverse* getRRegUniverse_ARM64 ( void ) // X21 is the guest state pointer, not available to regalloc. // vector regs. Unfortunately not callee-saved. + ru->allocable_start[HRcVec128] = ru->size; ru->regs[ru->size++] = hregARM64_Q16(); ru->regs[ru->size++] = hregARM64_Q17(); ru->regs[ru->size++] = hregARM64_Q18(); ru->regs[ru->size++] = hregARM64_Q19(); ru->regs[ru->size++] = hregARM64_Q20(); + ru->allocable_end[HRcVec128] = ru->size - 1; // F64 regs, all of which are callee-saved + ru->allocable_start[HRcFlt64] = ru->size; ru->regs[ru->size++] = hregARM64_D8(); ru->regs[ru->size++] = hregARM64_D9(); ru->regs[ru->size++] = hregARM64_D10(); ru->regs[ru->size++] = hregARM64_D11(); ru->regs[ru->size++] = hregARM64_D12(); ru->regs[ru->size++] = hregARM64_D13(); + ru->allocable_end[HRcFlt64] = ru->size - 1; ru->allocable = ru->size; /* And other regs, not available to the allocator. */ @@ -142,43 +147,41 @@ const RRegUniverse* getRRegUniverse_ARM64 ( void ) } -void ppHRegARM64 ( HReg reg ) { +UInt ppHRegARM64 ( HReg reg ) { Int r; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. */ switch (hregClass(reg)) { case HRcInt64: r = hregEncoding(reg); vassert(r >= 0 && r < 31); - vex_printf("x%d", r); - return; + return vex_printf("x%d", r); case HRcFlt64: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("d%d", r); - return; + return vex_printf("d%d", r); case HRcVec128: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("q%d", r); - return; + return vex_printf("q%d", r); default: vpanic("ppHRegARM64"); } } -static void ppHRegARM64asSreg ( HReg reg ) { - ppHRegARM64(reg); - vex_printf("(S-reg)"); +static UInt ppHRegARM64asSreg ( HReg reg ) { + UInt written = ppHRegARM64(reg); + written += vex_printf("(S-reg)"); + return written; } -static void ppHRegARM64asHreg ( HReg reg ) { - ppHRegARM64(reg); - vex_printf("(H-reg)"); +static UInt ppHRegARM64asHreg ( HReg reg ) { + UInt written = ppHRegARM64(reg); + written += vex_printf("(H-reg)"); + return written; } @@ -1734,7 +1737,7 @@ void ppARM64Instr ( const ARM64Instr* i ) { ppHRegARM64asSreg(i->ARM64in.VCmpS.argR); return; case ARM64in_VFCSel: { - void (*ppHRegARM64fp)(HReg) + UInt (*ppHRegARM64fp)(HReg) = (i->ARM64in.VFCSel.isD ? ppHRegARM64 : ppHRegARM64asSreg); vex_printf("fcsel "); ppHRegARM64fp(i->ARM64in.VFCSel.dst); @@ -2595,6 +2598,21 @@ void genReload_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, } } +ARM64Instr* genMove_ARM64(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt64: + return ARM64Instr_MovI(to, from); + case HRcFlt64: + return ARM64Instr_VMov(8, to, from); + case HRcVec128: + return ARM64Instr_VMov(16, to, from); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_ARM64: unimplemented regclass"); + } +} + /* Emit an instruction into buf and return the number of bytes used. 
Note that buf is not the insn's final place, and therefore it is diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h index 8f2796081..0a4c248cc 100644 --- a/priv/host_arm64_defs.h +++ b/priv/host_arm64_defs.h @@ -74,7 +74,7 @@ ST_IN HReg hregARM64_X9 ( void ) { return mkHReg(False, HRcInt64, 9, 27); } ST_IN HReg hregARM64_X21 ( void ) { return mkHReg(False, HRcInt64, 21, 28); } #undef ST_IN -extern void ppHRegARM64 ( HReg ); +extern UInt ppHRegARM64 ( HReg ); /* Number of registers used arg passing in function calls */ #define ARM64_N_ARGREGS 8 /* x0 .. x7 */ @@ -982,6 +982,7 @@ extern void genSpill_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); extern void genReload_ARM64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); +extern ARM64Instr* genMove_ARM64(HReg from, HReg to, Bool); extern const RRegUniverse* getRRegUniverse_ARM64 ( void ); diff --git a/priv/host_arm_defs.c b/priv/host_arm_defs.c index 6b51925be..55ecae354 100644 --- a/priv/host_arm_defs.c +++ b/priv/host_arm_defs.c @@ -68,6 +68,7 @@ const RRegUniverse* getRRegUniverse_ARM ( void ) /* Callee saves ones are listed first, since we prefer them if they're available. */ + ru->allocable_start[HRcInt32] = ru->size; ru->regs[ru->size++] = hregARM_R4(); ru->regs[ru->size++] = hregARM_R5(); ru->regs[ru->size++] = hregARM_R6(); @@ -80,24 +81,34 @@ const RRegUniverse* getRRegUniverse_ARM ( void ) ru->regs[ru->size++] = hregARM_R2(); ru->regs[ru->size++] = hregARM_R3(); ru->regs[ru->size++] = hregARM_R9(); + ru->allocable_end[HRcInt32] = ru->size - 1; + /* FP registers. Note: these are all callee-save. Yay! Hence we don't need to mention them as trashed in getHRegUsage for ARMInstr_Call. */ + ru->allocable_start[HRcFlt64] = ru->size; ru->regs[ru->size++] = hregARM_D8(); ru->regs[ru->size++] = hregARM_D9(); ru->regs[ru->size++] = hregARM_D10(); ru->regs[ru->size++] = hregARM_D11(); ru->regs[ru->size++] = hregARM_D12(); + ru->allocable_end[HRcFlt64] = ru->size - 1; + + ru->allocable_start[HRcFlt32] = ru->size; ru->regs[ru->size++] = hregARM_S26(); ru->regs[ru->size++] = hregARM_S27(); ru->regs[ru->size++] = hregARM_S28(); ru->regs[ru->size++] = hregARM_S29(); ru->regs[ru->size++] = hregARM_S30(); + ru->allocable_end[HRcFlt32] = ru->size - 1; + + ru->allocable_start[HRcVec128] = ru->size; ru->regs[ru->size++] = hregARM_Q8(); ru->regs[ru->size++] = hregARM_Q9(); ru->regs[ru->size++] = hregARM_Q10(); ru->regs[ru->size++] = hregARM_Q11(); ru->regs[ru->size++] = hregARM_Q12(); + ru->allocable_end[HRcVec128] = ru->size - 1; ru->allocable = ru->size; /* And other regs, not available to the allocator. */ @@ -140,35 +151,30 @@ const RRegUniverse* getRRegUniverse_ARM ( void ) } -void ppHRegARM ( HReg reg ) { +UInt ppHRegARM ( HReg reg ) { Int r; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. 
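   (Like the other ppHReg printers converted in this patch, the function
   now returns the number of characters written; print_state in the v3
   allocator uses these counts to right-justify its output columns.)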
*/ switch (hregClass(reg)) { case HRcInt32: r = hregEncoding(reg); vassert(r >= 0 && r < 16); - vex_printf("r%d", r); - return; + return vex_printf("r%d", r); case HRcFlt64: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("d%d", r); - return; + return vex_printf("d%d", r); case HRcFlt32: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("s%d", r); - return; + return vex_printf("s%d", r); case HRcVec128: r = hregEncoding(reg); vassert(r >= 0 && r < 16); - vex_printf("q%d", r); - return; + return vex_printf("q%d", r); default: vpanic("ppHRegARM"); } @@ -2772,6 +2778,22 @@ void genReload_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, } } +ARMInstr* genMove_ARM(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt32: + return ARMInstr_Mov(to, ARMRI84_R(from)); + case HRcFlt32: + return ARMInstr_VUnaryS(ARMvfpu_COPY, to, from); + case HRcFlt64: + return ARMInstr_VUnaryD(ARMvfpu_COPY, to, from); + case HRcVec128: + return ARMInstr_NUnary(ARMneon_COPY, to, from, 4, False); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_ARM: unimplemented regclass"); + } +} /* Emit an instruction into buf and return the number of bytes used. Note that buf is not the insn's final place, and therefore it is diff --git a/priv/host_arm_defs.h b/priv/host_arm_defs.h index 388533c95..fe529b85b 100644 --- a/priv/host_arm_defs.h +++ b/priv/host_arm_defs.h @@ -81,7 +81,7 @@ ST_IN HReg hregARM_Q14 ( void ) { return mkHReg(False, HRcVec128, 14, 32); } ST_IN HReg hregARM_Q15 ( void ) { return mkHReg(False, HRcVec128, 15, 33); } #undef ST_IN -extern void ppHRegARM ( HReg ); +extern UInt ppHRegARM ( HReg ); /* Number of registers used arg passing in function calls */ #define ARM_N_ARGREGS 4 /* r0, r1, r2, r3 */ @@ -1073,6 +1073,7 @@ extern void genSpill_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); extern void genReload_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); +extern ARMInstr* genMove_ARM(HReg from, HReg to, Bool); extern const RRegUniverse* getRRegUniverse_ARM ( void ); diff --git a/priv/host_generic_reg_alloc2.c b/priv/host_generic_reg_alloc2.c index e3f889c69..695b5d7e2 100644 --- a/priv/host_generic_reg_alloc2.c +++ b/priv/host_generic_reg_alloc2.c @@ -341,49 +341,6 @@ static inline UInt ULong__minIndex ( ULong w64 ) { } -/* Vectorised memset, copied from Valgrind's m_libcbase.c. */ -static void* local_memset ( void *destV, Int c, SizeT sz ) -{ -# define IS_4_ALIGNED(aaa_p) (0 == (((HWord)(aaa_p)) & ((HWord)0x3))) - - UInt c4; - UChar* d = destV; - UChar uc = c; - - while ((!IS_4_ALIGNED(d)) && sz >= 1) { - d[0] = uc; - d++; - sz--; - } - if (sz == 0) - return destV; - c4 = uc; - c4 |= (c4 << 8); - c4 |= (c4 << 16); - while (sz >= 16) { - ((UInt*)d)[0] = c4; - ((UInt*)d)[1] = c4; - ((UInt*)d)[2] = c4; - ((UInt*)d)[3] = c4; - d += 16; - sz -= 16; - } - while (sz >= 4) { - ((UInt*)d)[0] = c4; - d += 4; - sz -= 4; - } - while (sz >= 1) { - d[0] = c; - d++; - sz--; - } - return destV; - -# undef IS_4_ALIGNED -} - - /* A target-independent register allocator. Requires various functions which it uses to deal abstractly with instructions and registers, since it cannot have any target-specific knowledge. @@ -399,44 +356,13 @@ static void* local_memset ( void *destV, Int c, SizeT sz ) Takes an expandable array of pointers to unallocated insns. Returns an expandable array of pointers to allocated insns. 
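   The target hooks that previous versions took as a long list of
   separate parameters now arrive bundled in a RegAllocControl; the
   fields used below are univ, isMove, getRegUsage, mapRegs, genSpill,
   genReload, directReload, guest_sizeB, ppInstr, ppReg and mode64.
   A caller-side sketch (hook names borrowed from the amd64 backend;
   the initialisation shown is illustrative only):

      const RRegUniverse* u = getRRegUniverse_AMD64();
      RegAllocControl con;
      con.univ   = u;
      con.isMove = isMove_AMD64Instr;
      con.mode64 = True;
      ... fill in the remaining hooks similarly ...
      HInstrArray* rcode = doRegisterAllocation_v2(vcode, &con);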
*/ -HInstrArray* doRegisterAllocation ( +HInstrArray* doRegisterAllocation_v2 ( /* Incoming virtual-registerised code. */ HInstrArray* instrs_in, - /* The real-register universe to use. This contains facts about - real registers, one of which is the set of registers available - for allocation. */ - const RRegUniverse* univ, - - /* Return True iff the given insn is a reg-reg move, in which - case also return the src and dst regs. */ - Bool (*isMove) ( const HInstr*, HReg*, HReg* ), - - /* Get info about register usage in this insn. */ - void (*getRegUsage) ( HRegUsage*, const HInstr*, Bool ), - - /* Apply a reg-reg mapping to an insn. */ - void (*mapRegs) ( HRegRemap*, HInstr*, Bool ), - - /* Return one, or, if we're unlucky, two insn(s) to spill/restore a - real reg to a spill slot byte offset. The two leading HInstr** - args are out parameters, through which the generated insns are - returned. Also (optionally) a 'directReload' function, which - attempts to replace a given instruction by one which reads - directly from a specified spill slot. May be NULL, in which - case the optimisation is not attempted. */ - void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ), - void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ), - HInstr* (*directReload) ( HInstr*, HReg, Short ), - Int guest_sizeB, - - /* For debug printing only. */ - void (*ppInstr) ( const HInstr*, Bool ), - void (*ppReg) ( HReg ), - - /* 32/64bit mode */ - Bool mode64 + /* Register allocator controls to use. */ + const RegAllocControl* con ) { # define N_SPILL64S (LibVEX_N_SPILL_BYTES / 8) @@ -494,7 +420,7 @@ HInstrArray* doRegisterAllocation ( not at each insn processed. */ Bool do_sanity_check; - vassert(0 == (guest_sizeB % LibVEX_GUEST_STATE_ALIGN)); + vassert(0 == (con->guest_sizeB % LibVEX_GUEST_STATE_ALIGN)); vassert(0 == (LibVEX_N_SPILL_BYTES % LibVEX_GUEST_STATE_ALIGN)); vassert(0 == (N_SPILL64S % 2)); @@ -510,7 +436,7 @@ HInstrArray* doRegisterAllocation ( HInstr* _tmp = (_instr); \ if (DEBUG_REGALLOC) { \ vex_printf("** "); \ - (*ppInstr)(_tmp, mode64); \ + con->ppInstr(_tmp, con->mode64); \ vex_printf("\n\n"); \ } \ addHInstr ( instrs_out, _tmp ); \ @@ -521,13 +447,13 @@ HInstrArray* doRegisterAllocation ( Int z, q; \ for (z = 0; z < n_rregs; z++) { \ vex_printf(" rreg_state[%2d] = ", z); \ - (*ppReg)(univ->regs[z]); \ + con->ppReg(con->univ->regs[z]); \ vex_printf(" \t"); \ switch (rreg_state[z].disp) { \ case Free: vex_printf("Free\n"); break; \ case Unavail: vex_printf("Unavail\n"); break; \ case Bound: vex_printf("BoundTo "); \ - (*ppReg)(rreg_state[z].vreg); \ + con->ppReg(rreg_state[z].vreg); \ vex_printf("\n"); break; \ } \ } \ @@ -552,7 +478,7 @@ HInstrArray* doRegisterAllocation ( /* ... and initialise running state. */ /* n_rregs is no more than a short name for n_available_real_regs. */ - n_rregs = univ->allocable; + n_rregs = con->univ->allocable; n_vregs = instrs_in->n_vregs; /* If this is not so, vreg_state entries will overflow. 
*/ @@ -634,13 +560,13 @@ HInstrArray* doRegisterAllocation ( for (Int ii = 0; ii < instrs_in->arr_used; ii++) { - (*getRegUsage)( ®_usage_arr[ii], instrs_in->arr[ii], mode64 ); + con->getRegUsage(®_usage_arr[ii], instrs_in->arr[ii], con->mode64); if (0) { vex_printf("\n%d stage1: ", ii); - (*ppInstr)(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); vex_printf("\n"); - ppHRegUsage(univ, ®_usage_arr[ii]); + ppHRegUsage(con->univ, ®_usage_arr[ii]); } /* ------ start of DEAL WITH VREG LIVE RANGES ------ */ @@ -654,7 +580,7 @@ HInstrArray* doRegisterAllocation ( Int k = hregIndex(vreg); if (k < 0 || k >= n_vregs) { vex_printf("\n"); - (*ppInstr)(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); vex_printf("\n"); vex_printf("vreg %d, n_vregs %d\n", k, n_vregs); vpanic("doRegisterAllocation: out-of-range vreg"); @@ -759,10 +685,10 @@ HInstrArray* doRegisterAllocation ( } else if (!isW && isR) { if (rreg_live_after[j] == INVALID_INSTRNO) { vex_printf("\nOFFENDING RREG = "); - (*ppReg)(univ->regs[j]); + con->ppReg(con->univ->regs[j]); vex_printf("\n"); vex_printf("\nOFFENDING instr = "); - (*ppInstr)(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); vex_printf("\n"); vpanic("doRegisterAllocation: " "first event for rreg is Read"); @@ -772,10 +698,10 @@ HInstrArray* doRegisterAllocation ( vassert(isR && isW); if (rreg_live_after[j] == INVALID_INSTRNO) { vex_printf("\nOFFENDING RREG = "); - (*ppReg)(univ->regs[j]); + con->ppReg(con->univ->regs[j]); vex_printf("\n"); vex_printf("\nOFFENDING instr = "); - (*ppInstr)(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); vex_printf("\n"); vpanic("doRegisterAllocation: " "first event for rreg is Modify"); @@ -789,7 +715,7 @@ HInstrArray* doRegisterAllocation ( ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used); if (0) vex_printf("FLUSH 1 (%d,%d)\n", flush_la, flush_db); - rreg_lrs_la[rreg_lrs_used].rreg = univ->regs[j]; + rreg_lrs_la[rreg_lrs_used].rreg = con->univ->regs[j]; rreg_lrs_la[rreg_lrs_used].live_after = toShort(flush_la); rreg_lrs_la[rreg_lrs_used].dead_before = toShort(flush_db); rreg_lrs_used++; @@ -826,7 +752,7 @@ HInstrArray* doRegisterAllocation ( if (0) vex_printf("FLUSH 2 (%d,%d)\n", rreg_live_after[j], rreg_dead_before[j]); - rreg_lrs_la[rreg_lrs_used].rreg = univ->regs[j]; + rreg_lrs_la[rreg_lrs_used].rreg = con->univ->regs[j]; rreg_lrs_la[rreg_lrs_used].live_after = toShort(rreg_live_after[j]); rreg_lrs_la[rreg_lrs_used].dead_before = toShort(rreg_dead_before[j]); rreg_lrs_used++; @@ -853,7 +779,7 @@ HInstrArray* doRegisterAllocation ( for (Int j = 0; j < n_rregs; j++) { if (!rreg_state[j].has_hlrs) continue; - ppReg(univ->regs[j]); + con->ppReg(con->univ->regs[j]); vex_printf(" hinted\n"); } } @@ -889,14 +815,14 @@ HInstrArray* doRegisterAllocation ( vex_printf("RRegLRs by LA:\n"); for (Int j = 0; j < rreg_lrs_used; j++) { vex_printf(" "); - (*ppReg)(rreg_lrs_la[j].rreg); + con->ppReg(rreg_lrs_la[j].rreg); vex_printf(" la = %d, db = %d\n", rreg_lrs_la[j].live_after, rreg_lrs_la[j].dead_before ); } vex_printf("RRegLRs by DB:\n"); for (Int j = 0; j < rreg_lrs_used; j++) { vex_printf(" "); - (*ppReg)(rreg_lrs_db[j].rreg); + con->ppReg(rreg_lrs_db[j].rreg); vex_printf(" la = %d, db = %d\n", rreg_lrs_db[j].live_after, rreg_lrs_db[j].dead_before ); } @@ -930,7 +856,7 @@ HInstrArray* doRegisterAllocation ( */ /* Int max_ss_no = -1; */ - local_memset(ss_busy_until_before, 0, sizeof(ss_busy_until_before)); + 
vex_bzero(ss_busy_until_before, sizeof(ss_busy_until_before)); for (Int j = 0; j < n_vregs; j++) { @@ -988,7 +914,7 @@ HInstrArray* doRegisterAllocation ( /* This reflects LibVEX's hard-wired knowledge of the baseBlock layout: the guest state, then two equal sized areas following it for two sets of shadow state, and then the spill area. */ - vreg_lrs[j].spill_offset = toShort(guest_sizeB * 3 + ss_no * 8); + vreg_lrs[j].spill_offset = toShort(con->guest_sizeB * 3 + ss_no * 8); /* Independent check that we've made a sane choice of slot */ sanity_check_spill_offset( &vreg_lrs[j] ); @@ -1031,7 +957,7 @@ HInstrArray* doRegisterAllocation ( if (DEBUG_REGALLOC) { vex_printf("\n====----====---- Insn %d ----====----====\n", ii); vex_printf("---- "); - (*ppInstr)(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); vex_printf("\n\nInitial state:\n"); PRINT_STATE; vex_printf("\n"); @@ -1066,7 +992,7 @@ HInstrArray* doRegisterAllocation ( vex_printf("considering la %d .. db %d reg = ", rreg_lrs_la[j].live_after, rreg_lrs_la[j].dead_before); - (*ppReg)(reg); + con->ppReg(reg); vex_printf("\n"); } @@ -1107,7 +1033,7 @@ HInstrArray* doRegisterAllocation ( vassert(rreg_state[j].eq_spill_slot == False); continue; } - vassert(hregClass(univ->regs[j]) + vassert(hregClass(con->univ->regs[j]) == hregClass(rreg_state[j].vreg)); vassert( hregIsVirtual(rreg_state[j].vreg)); } @@ -1147,7 +1073,7 @@ HInstrArray* doRegisterAllocation ( the dst to the src's rreg, and that's all. */ HReg vregS = INVALID_HREG; HReg vregD = INVALID_HREG; - if ( (*isMove)( instrs_in->arr[ii], &vregS, &vregD ) ) { + if ( con->isMove(instrs_in->arr[ii], &vregS, &vregD) ) { if (!hregIsVirtual(vregS)) goto cannot_coalesce; if (!hregIsVirtual(vregD)) goto cannot_coalesce; /* Check that *isMove is not telling us a bunch of lies ... */ @@ -1160,9 +1086,9 @@ HInstrArray* doRegisterAllocation ( if (vreg_lrs[m].live_after != ii) goto cannot_coalesce; if (DEBUG_REGALLOC) { vex_printf("COALESCE "); - (*ppReg)(vregS); + con->ppReg(vregS); vex_printf(" -> "); - (*ppReg)(vregD); + con->ppReg(vregD); vex_printf("\n\n"); } /* Find the state entry for vregS. */ @@ -1211,7 +1137,7 @@ HInstrArray* doRegisterAllocation ( vreg_state[m] = INVALID_RREG_NO; if (DEBUG_REGALLOC) { vex_printf("free up "); - (*ppReg)(univ->regs[j]); + con->ppReg(con->univ->regs[j]); vex_printf("\n"); } } @@ -1252,7 +1178,7 @@ HInstrArray* doRegisterAllocation ( than before it. */ if (DEBUG_REGALLOC) { vex_printf("need to free up rreg: "); - (*ppReg)(rreg_lrs_la[rreg_lrs_la_next].rreg); + con->ppReg(rreg_lrs_la[rreg_lrs_la_next].rreg); vex_printf("\n\n"); } Int k = hregIndex(rreg_lrs_la[rreg_lrs_la_next].rreg); @@ -1271,8 +1197,8 @@ HInstrArray* doRegisterAllocation ( if ((!eq_spill_opt) || !rreg_state[k].eq_spill_slot) { HInstr* spill1 = NULL; HInstr* spill2 = NULL; - (*genSpill)( &spill1, &spill2, univ->regs[k], - vreg_lrs[m].spill_offset, mode64 ); + con->genSpill(&spill1, &spill2, con->univ->regs[k], + vreg_lrs[m].spill_offset, con->mode64); vassert(spill1 || spill2); /* can't both be NULL */ if (spill1) EMIT_INSTR(spill1); @@ -1319,7 +1245,7 @@ HInstrArray* doRegisterAllocation ( that the change is invisible to the standard-case handling that follows. */ - if (directReload && reg_usage_arr[ii].n_vRegs <= 2) { + if (con->directReload != NULL && reg_usage_arr[ii].n_vRegs <= 2) { Bool debug_direct_reload = False; HReg cand = INVALID_HREG; Bool nreads = 0; @@ -1353,19 +1279,20 @@ HInstrArray* doRegisterAllocation ( vassert(! 
sameHReg(reg_usage_arr[ii].vRegs[0], reg_usage_arr[ii].vRegs[1])); - reloaded = directReload ( instrs_in->arr[ii], cand, spilloff ); + reloaded = con->directReload(instrs_in->arr[ii], cand, spilloff); if (debug_direct_reload && !reloaded) { vex_printf("[%3d] ", spilloff); ppHReg(cand); vex_printf(" "); - ppInstr(instrs_in->arr[ii], mode64); + con->ppInstr(instrs_in->arr[ii], con->mode64); } if (reloaded) { /* Update info about the insn, so it looks as if it had been in this form all along. */ instrs_in->arr[ii] = reloaded; - (*getRegUsage)( ®_usage_arr[ii], instrs_in->arr[ii], mode64 ); + con->getRegUsage(®_usage_arr[ii], instrs_in->arr[ii], + con->mode64); if (debug_direct_reload && !reloaded) { vex_printf(" --> "); - ppInstr(reloaded, mode64); + con->ppInstr(reloaded, con->mode64); } } @@ -1384,7 +1311,7 @@ HInstrArray* doRegisterAllocation ( vassert(hregIsVirtual(vreg)); if (0) { - vex_printf("considering "); (*ppReg)(vreg); vex_printf("\n"); + vex_printf("considering "); con->ppReg(vreg); vex_printf("\n"); } /* Now we're trying to find a rreg for "vreg". First of all, @@ -1395,7 +1322,7 @@ HInstrArray* doRegisterAllocation ( Int n = vreg_state[m]; if (IS_VALID_RREGNO(n)) { vassert(rreg_state[n].disp == Bound); - addToHRegRemap(&remap, vreg, univ->regs[n]); + addToHRegRemap(&remap, vreg, con->univ->regs[n]); /* If this rreg is written or modified, mark it as different from any spill slot value. */ if (reg_usage_arr[ii].vMode[j] != HRmRead) @@ -1414,7 +1341,7 @@ HInstrArray* doRegisterAllocation ( Int k; for (k = 0; k < n_rregs; k++) { if (rreg_state[k].disp != Free - || hregClass(univ->regs[k]) != hregClass(vreg)) + || hregClass(con->univ->regs[k]) != hregClass(vreg)) continue; if (rreg_state[k].has_hlrs) { /* Well, at least we can use k_suboptimal if we really @@ -1435,7 +1362,7 @@ HInstrArray* doRegisterAllocation ( Int p = hregIndex(vreg); vassert(IS_VALID_VREGNO(p)); vreg_state[p] = toShort(k); - addToHRegRemap(&remap, vreg, univ->regs[k]); + addToHRegRemap(&remap, vreg, con->univ->regs[k]); /* Generate a reload if needed. This only creates needed reloads because the live range builder for vregs will guarantee that the first event for a vreg is a write. @@ -1446,8 +1373,8 @@ HInstrArray* doRegisterAllocation ( vassert(vreg_lrs[p].reg_class != HRcINVALID); HInstr* reload1 = NULL; HInstr* reload2 = NULL; - (*genReload)( &reload1, &reload2, univ->regs[k], - vreg_lrs[p].spill_offset, mode64 ); + con->genReload(&reload1, &reload2, con->univ->regs[k], + vreg_lrs[p].spill_offset, con->mode64); vassert(reload1 || reload2); /* can't both be NULL */ if (reload1) EMIT_INSTR(reload1); @@ -1481,7 +1408,7 @@ HInstrArray* doRegisterAllocation ( rreg_state[k].is_spill_cand = False; if (rreg_state[k].disp != Bound) continue; - if (hregClass(univ->regs[k]) != hregClass(vreg)) + if (hregClass(con->univ->regs[k]) != hregClass(vreg)) continue; rreg_state[k].is_spill_cand = True; /* Note, the following loop visits only the virtual regs @@ -1516,7 +1443,7 @@ HInstrArray* doRegisterAllocation ( vassert(IS_VALID_RREGNO(spillee)); vassert(rreg_state[spillee].disp == Bound); /* check it's the right class */ - vassert(hregClass(univ->regs[spillee]) == hregClass(vreg)); + vassert(hregClass(con->univ->regs[spillee]) == hregClass(vreg)); /* check we're not ejecting the vreg for which we are trying to free up a register. */ vassert(! 
sameHReg(rreg_state[spillee].vreg, vreg)); @@ -1531,8 +1458,8 @@ HInstrArray* doRegisterAllocation ( if ((!eq_spill_opt) || !rreg_state[spillee].eq_spill_slot) { HInstr* spill1 = NULL; HInstr* spill2 = NULL; - (*genSpill)( &spill1, &spill2, univ->regs[spillee], - vreg_lrs[m].spill_offset, mode64 ); + con->genSpill(&spill1, &spill2, con->univ->regs[spillee], + vreg_lrs[m].spill_offset, con->mode64); vassert(spill1 || spill2); /* can't both be NULL */ if (spill1) EMIT_INSTR(spill1); @@ -1557,8 +1484,8 @@ HInstrArray* doRegisterAllocation ( vassert(vreg_lrs[m].reg_class != HRcINVALID); HInstr* reload1 = NULL; HInstr* reload2 = NULL; - (*genReload)( &reload1, &reload2, univ->regs[spillee], - vreg_lrs[m].spill_offset, mode64 ); + con->genReload(&reload1, &reload2, con->univ->regs[spillee], + vreg_lrs[m].spill_offset, con->mode64); vassert(reload1 || reload2); /* can't both be NULL */ if (reload1) EMIT_INSTR(reload1); @@ -1577,7 +1504,7 @@ HInstrArray* doRegisterAllocation ( /* So after much twisting and turning, we have vreg mapped to rreg_state[spillee].rreg. Note that in the map. */ - addToHRegRemap(&remap, vreg, univ->regs[spillee]); + addToHRegRemap(&remap, vreg, con->univ->regs[spillee]); } /* iterate over virtual registers in this instruction. */ @@ -1593,7 +1520,7 @@ HInstrArray* doRegisterAllocation ( */ /* NOTE, DESTRUCTIVELY MODIFIES instrs_in->arr[ii]. */ - (*mapRegs)( &remap, instrs_in->arr[ii], mode64 ); + con->mapRegs(&remap, instrs_in->arr[ii], con->mode64); EMIT_INSTR( instrs_in->arr[ii] ); if (DEBUG_REGALLOC) { diff --git a/priv/host_generic_reg_alloc3.c b/priv/host_generic_reg_alloc3.c new file mode 100644 index 000000000..e6908ce8f --- /dev/null +++ b/priv/host_generic_reg_alloc3.c @@ -0,0 +1,1373 @@ +/*----------------------------------------------------------------------------*/ +/*--- begin host_generic_reg_alloc3.c ---*/ +/*----------------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation framework. + + Copyright (C) 2017-2017 Ivo Raisr + ivosh@ivosh.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "libvex_basictypes.h" +#include "libvex.h" + +#include "main_util.h" +#include "host_generic_regs.h" + +/* Set to 1 for lots of debugging output. */ +#define DEBUG_REGALLOC 0 + +/* Set to 1 for sanity checking at every instruction. + Set to 0 for sanity checking only every 17th one and the last one. */ +#define SANITY_CHECKS_EVERY_INSTR 0 + + +#define INVALID_INSTRNO (-2) +#define INVALID_INDEX (-2) + +/* Register allocator state is kept in an array of VRegState's. + There is an element for every virtual register (vreg). + Elements are indexed [0 .. n_vregs-1]. + Records information about vreg live range and its state. 
*/ +typedef + struct { + /* Live range, register class and spill offset are computed during the + first register allocator pass and remain unchanged after that. */ + + /* This vreg becomes live with this instruction (inclusive). Contains + either an instruction number or INVALID_INSTRNO. */ + Short live_after; + /* This vreg becomes dead before this instruction (exclusive). Contains + either an instruction number or INVALID_INSTRNO. */ + Short dead_before; + /* What kind of register this is. */ + HRegClass reg_class; + + /* What is its current disposition? */ + enum { Unallocated, /* Neither spilled nor assigned to a real reg. */ + Assigned, /* Assigned to a real register, viz rreg. */ + Spilled /* Spilled to the spill slot. */ + } disp; + + /* If .disp == Assigned, what rreg is it bound to? */ + HReg rreg; + + /* The "home" spill slot. The offset is relative to the beginning of + the guest state. */ + UShort spill_offset; + + /* This vreg (vregS) is coalesced to another vreg + if |coalescedTo| != INVALID_HREG. + Coalescing means that there is a MOV instruction which occurs in the + instruction stream right at vregS' dead_before + and vregD's live_after. */ + HReg coalescedTo; /* Which vreg it is coalesced to. */ + HReg coalescedFirst; /* First vreg in the coalescing chain. */ + + /* If this vregS is coalesced to another vregD, what is the combined + dead_before for vregS+vregD. Used to effectively allocate registers. */ + Short effective_dead_before; + } + VRegState; + +/* The allocator also maintains a redundant array of indexes (rreg_state) from + rreg numbers back to entries in vreg_state. It is redundant because iff + rreg_state[r] == v then hregNumber(vreg_state[v].rreg) == r -- that is, the + two entries point at each other. The purpose of this is to speed up + activities which involve looking for a particular rreg: there is no need to + scan the vreg_state looking for it, just index directly into rreg_state. + The FAQ "does this rreg already have an associated vreg" is the main + beneficiary. + The identity of the real register is not recorded here, because the index + of this structure in |rreg_state| is the index number of the register, and + the register itself can be extracted from the RRegUniverse (univ). */ +typedef + struct { + /* What is its current disposition? */ + enum { Free, /* Not bound to any vreg. */ + Bound, /* Bound to a vreg, viz vreg. */ + Reserved /* Reserved for an instruction. */ + } disp; + + /* If .disp == Bound, what vreg is it bound to? */ + HReg vreg; + + /* If .disp == Bound, has the associated vreg been reloaded from its spill + slot recently and is this rreg still equal to that spill slot? + Avoids unnecessary spilling that vreg later, when this rreg needs + to be reserved. */ + Bool eq_spill_slot; + } + RRegState; + +/* Records information on a real-register live range, associated with + a particular real register. Computed once; does not change. */ +typedef + struct { + /* This rreg becomes live with this instruction (inclusive). Contains + either an instruction number or INVALID_INSTRNO. */ + Short live_after; + /* This rreg becomes dead before this instruction (exclusive). Contains + either an instruction number or INVALID_INSTRNO. */ + Short dead_before; + } + RRegLR; + +/* Live ranges for a single rreg and the current one. + Live ranges are computed during the first register allocator pass and remain + unchanged after that. 
+ The identity of the real register is not recorded here, because the index + of this structure in |rreg_lr_state| is the index number of the register, and + the register itself can be extracted from the RRegUniverse (univ). */ +typedef + struct { + RRegLR* lrs; + UInt lrs_size; + UInt lrs_used; + + /* Live range corresponding to the currently processed instruction. + Points into |lrs| array. */ + RRegLR *lr_current; + UInt lr_current_idx; + } + RRegLRState; + +#define IS_VALID_VREGNO(v) ((v) >= 0 && (v) < n_vregs) +#define IS_VALID_RREGNO(r) ((r) >= 0 && (r) < n_rregs) + +#define FREE_VREG(v) \ + do { \ + (v)->disp = Unallocated; \ + (v)->rreg = INVALID_HREG; \ + } while (0) + +#define FREE_RREG(r) \ + do { \ + (r)->disp = Free; \ + (r)->vreg = INVALID_HREG; \ + (r)->eq_spill_slot = False; \ + } while (0) + + +/* Compute the index of the highest and lowest 1 in a ULong, respectively. + Results are undefined if the argument is zero. Don't pass it zero :) */ +static inline UInt ULong__maxIndex ( ULong w64 ) { + return 63 - __builtin_clzll(w64); +} + +static inline UInt ULong__minIndex ( ULong w64 ) { + return __builtin_ctzll(w64); +} + +static inline void enlarge_rreg_lrs(RRegLRState* rreg_lrs) +{ + vassert(rreg_lrs->lrs_used == rreg_lrs->lrs_size); + + RRegLR* lr2 = LibVEX_Alloc_inline(2 * rreg_lrs->lrs_used * sizeof(RRegLR)); + for (UInt l = 0; l < rreg_lrs->lrs_used; l++) { + lr2[l] = rreg_lrs->lrs[l]; + } + + rreg_lrs->lrs = lr2; + rreg_lrs->lrs_size = 2 * rreg_lrs->lrs_used; +} + +#define PRINT_STATE \ + do { \ + print_state(con, vreg_state, n_vregs, rreg_state, n_rregs, \ + rreg_lr_state, ii); \ + } while (0) + +static inline void print_state( + const RegAllocControl* con, + const VRegState* vreg_state, UInt n_vregs, + const RRegState* rreg_state, UInt n_rregs, + const RRegLRState* rreg_lr_state, + UShort current_ii) +{ +# define RIGHT_JUSTIFY(_total, _written) \ + do { \ + for (Int w = (_total) - (_written); w > 0; w--) { \ + vex_printf(" "); \ + } \ + } while (0) + + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + const VRegState* vreg = &vreg_state[v_idx]; + + if (vreg->live_after == INVALID_INSTRNO) { + continue; /* This is a dead vreg. Never comes into live. 
*/ + } + vex_printf("vreg_state[%3u] ", v_idx); + + UInt written; + switch (vreg->disp) { + case Unallocated: + written = vex_printf("unallocated"); + break; + case Assigned: + written = vex_printf("assigned to "); + written += con->ppReg(vreg->rreg); + break; + case Spilled: + written = vex_printf("spilled at offset %u", vreg->spill_offset); + break; + default: + vassert(0); + } + RIGHT_JUSTIFY(25, written); + + written = vex_printf("lr: [%d, %d) ", + vreg->live_after, vreg->dead_before); + RIGHT_JUSTIFY(15, written); + + written = vex_printf("effective lr: [%d, %d)", + vreg->live_after, vreg->effective_dead_before); + RIGHT_JUSTIFY(25, written); + + if (vreg->live_after > (Short) current_ii) { + vex_printf("[not live yet]\n"); + } else if ((Short) current_ii >= vreg->dead_before) { + if (hregIsInvalid(vreg->coalescedTo)) { + vex_printf("[now dead]\n"); + } else { + vex_printf("[now dead, coalesced to "); + con->ppReg(vreg->coalescedTo); + vex_printf("]\n"); + } + } else { + vex_printf("[live]\n"); + } + } + + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + const RRegState* rreg = &rreg_state[r_idx]; + const RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + vex_printf("rreg_state[%2u] = ", r_idx); + UInt written = con->ppReg(con->univ->regs[r_idx]); + RIGHT_JUSTIFY(10, written); + + switch (rreg->disp) { + case Free: + vex_printf("free\n"); + break; + case Bound: + vex_printf("bound for "); + con->ppReg(rreg->vreg); + if (rreg->eq_spill_slot) { + vex_printf(" (equals to its spill slot)"); + } + vex_printf("\n"); + break; + case Reserved: + vex_printf("reserved - live range [%d, %d)\n", + rreg_lrs->lr_current->live_after, + rreg_lrs->lr_current->dead_before); + break; + } + } + +# undef RIGHT_JUSTIFY +} + +static inline void emit_instr(HInstr* instr, HInstrArray* instrs_out, + const RegAllocControl* con, const HChar* why) +{ + if (DEBUG_REGALLOC) { + vex_printf("** "); + con->ppInstr(instr, con->mode64); + if (why != NULL) { + vex_printf(" (%s)", why); + } + vex_printf("\n\n"); + } + + addHInstr(instrs_out, instr); +} + +/* Updates register allocator state after vreg has been spilled. */ +static inline void mark_vreg_spilled( + UInt v_idx, VRegState* vreg_state, UInt n_vregs, + RRegState* rreg_state, UInt n_rregs) +{ + HReg rreg = vreg_state[v_idx].rreg; + UInt r_idx = hregIndex(rreg); + + vreg_state[v_idx].disp = Spilled; + vreg_state[v_idx].rreg = INVALID_HREG; + FREE_RREG(&rreg_state[r_idx]); +} + +/* Spills a vreg assigned to some rreg. + The vreg is spilled and the rreg is freed. + Returns rreg's index. */ +static inline UInt spill_vreg( + HReg vreg, UInt v_idx, UInt current_ii, VRegState* vreg_state, UInt n_vregs, + RRegState* rreg_state, UInt n_rregs, HInstrArray* instrs_out, + const RegAllocControl* con) +{ + /* Check some invariants first. */ + vassert(IS_VALID_VREGNO((v_idx))); + vassert(vreg_state[v_idx].disp == Assigned); + HReg rreg = vreg_state[v_idx].rreg; + UInt r_idx = hregIndex(rreg); + vassert(IS_VALID_RREGNO(r_idx)); + vassert(hregClass(con->univ->regs[r_idx]) == hregClass(vreg)); + vassert(vreg_state[v_idx].dead_before > (Short) current_ii); + vassert(vreg_state[v_idx].reg_class != HRcINVALID); + + /* Generate spill. 
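   The target's genSpill hook is allowed to return either one or two
   instructions through the two out-parameters below (at least one must
   be non-NULL), for example when spilling a register takes a pair of
   stores on some target.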
*/ + HInstr* spill1 = NULL; + HInstr* spill2 = NULL; + con->genSpill(&spill1, &spill2, rreg, vreg_state[v_idx].spill_offset, + con->mode64); + vassert(spill1 != NULL || spill2 != NULL); /* cannot be both NULL */ + if (spill1 != NULL) { + emit_instr(spill1, instrs_out, con, "spill1"); + } + if (spill2 != NULL) { + emit_instr(spill2, instrs_out, con, "spill2"); + } + + mark_vreg_spilled(v_idx, vreg_state, n_vregs, rreg_state, n_rregs); + return r_idx; +} + +/* Chooses a vreg to be spilled based on various criteria. + The vreg must not be from the instruction being processed, that is, it must + not be listed in reg_usage->vRegs. */ +static inline HReg find_vreg_to_spill( + VRegState* vreg_state, UInt n_vregs, + RRegState* rreg_state, UInt n_rregs, + const HRegUsage* instr_regusage, HRegClass target_hregclass, + const HRegUsage* reg_usage, UInt scan_forward_from, UInt scan_forward_max, + const RegAllocControl* con) +{ + /* Scan forwards a few instructions to find the most distant mentioned + use of a vreg. We can scan in the range of (inclusive): + - reg_usage[scan_forward_from] + - reg_usage[scan_forward_end], where scan_forward_end + = MIN(scan_forward_max, scan_forward_from + FEW_INSTRUCTIONS). */ +# define FEW_INSTRUCTIONS 20 + UInt scan_forward_end + = (scan_forward_max <= scan_forward_from + FEW_INSTRUCTIONS) ? + scan_forward_max : scan_forward_from + FEW_INSTRUCTIONS; +# undef FEW_INSTRUCTIONS + + HReg vreg_found = INVALID_HREG; + UInt distance_so_far = 0; + + for (UInt r_idx = con->univ->allocable_start[target_hregclass]; + r_idx <= con->univ->allocable_end[target_hregclass]; r_idx++) { + if (rreg_state[r_idx].disp == Bound) { + HReg vreg = rreg_state[r_idx].vreg; + if (! HRegUsage__contains(instr_regusage, vreg)) { + UInt ii = scan_forward_from; + for ( ; ii <= scan_forward_end; ii++) { + if (HRegUsage__contains(®_usage[ii], vreg)) { + break; + } + } + + if (ii >= distance_so_far) { + distance_so_far = ii; + vreg_found = vreg; + if (distance_so_far == scan_forward_end) { + break; /* We are at the end. Nothing could be better. */ + } + } + } + } + } + + if (hregIsInvalid(vreg_found)) { + vex_printf("doRegisterAllocation_v3: cannot find a register in class: "); + ppHRegClass(target_hregclass); + vex_printf("\n"); + vpanic("doRegisterAllocation_v3: cannot find a register."); + } + + return vreg_found; +} + +/* Find a free rreg of the correct class. + Tries to find an rreg whose hard live range (if any) starts after the vreg's + live range ends. If that is not possible, then at least whose live range + is as far ahead in the incoming instruction stream as possible. + An ideal rreg candidate is a caller-save register for short-lived vregs + and a callee-save register for long-lived vregs because it won't need to + be spilled around helper calls. */ +static Int find_free_rreg( + const VRegState* vreg_state, UInt n_vregs, + const RRegState* rreg_state, UInt n_rregs, + const RRegLRState* rreg_lr_state, + UInt v_idx, UInt current_ii, HRegClass target_hregclass, + Bool reserve_phase, const RegAllocControl* con) +{ + Int r_idx_found = INVALID_INDEX; + UInt distance_so_far = 0; /* running max for |live_after - current_ii| */ + const VRegState* vreg = &vreg_state[v_idx]; + + /* Assume majority of vregs are short-lived. Start scannig from caller-save + registers first. 
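   This relies on the universe layout: getRRegUniverse_ARM above, for
   example, lists callee-save registers first within a class, so the
   downward scan from allocable_end reaches caller-save registers
   before callee-save ones.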
*/ + for (Int r_idx = (Int) con->univ->allocable_end[target_hregclass]; + r_idx >= (Int) con->univ->allocable_start[target_hregclass]; r_idx--) { + const RRegState* rreg = &rreg_state[r_idx]; + const RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + if (rreg->disp == Free) { + if (rreg_lrs->lrs_used == 0) { + r_idx_found = r_idx; + break; /* There could be nothing better, so break now. */ + } else { + const RRegLR* lr = rreg_lrs->lr_current; + if (lr->live_after > (Short) current_ii) { + /* RReg's hard live range is not live, yet. */ + if (vreg->effective_dead_before <= lr->live_after) { + r_idx_found = r_idx; + break; /* VReg is short-lived; it fits in. */ + } + if ((lr->live_after - (Short) current_ii) > distance_so_far) { + distance_so_far = lr->live_after - (Short) current_ii; + r_idx_found = r_idx; + } + } else if ((Short) current_ii >= lr->dead_before) { + /* Now dead. Effectively as if there is no LR now. */ + r_idx_found = r_idx; + break; /* There could be nothing better, so break now. */ + } else { + /* Going live for this instruction. This could happen only when + rregs are being reserved en mass, for example before + a helper call. */ + vassert(reserve_phase); + } + } + } + } + + return r_idx_found; +} + +/* A target-independent register allocator (v3). Requires various functions + which it uses to deal abstractly with instructions and registers, since it + cannot have any target-specific knowledge. + + Returns a new list of instructions, which, as a result of the behaviour of + mapRegs, will be in-place modifications of the original instructions. + + Requires that the incoming code has been generated using vreg numbers + 0, 1 .. n_vregs-1. Appearance of a vreg outside that range is a checked + run-time error. + + Takes unallocated instructions and returns allocated instructions. +*/ +HInstrArray* doRegisterAllocation_v3( + /* Incoming virtual-registerised code. */ + HInstrArray* instrs_in, + + /* Register allocator controls to use. */ + const RegAllocControl* con +) +{ + vassert((con->guest_sizeB % LibVEX_GUEST_STATE_ALIGN) == 0); + + /* The main register allocator state. */ + UInt n_vregs = instrs_in->n_vregs; + VRegState* vreg_state = NULL; + if (n_vregs > 0) { + vreg_state = LibVEX_Alloc_inline(n_vregs * sizeof(VRegState)); + } + + /* If this is not so, the universe we have is nonsensical. */ + UInt n_rregs = con->univ->allocable; + vassert(n_rregs > 0); + STATIC_ASSERT(N_RREGUNIVERSE_REGS == 64); + + /* Redundant rreg -> vreg state. */ + RRegState* rreg_state = LibVEX_Alloc_inline(n_rregs * sizeof(RRegState)); + + /* Info on rreg live ranges. */ + RRegLRState* rreg_lr_state + = LibVEX_Alloc_inline(n_rregs * sizeof(RRegLRState)); + + /* Info on register usage in the incoming instruction array. Computed once + and remains unchanged, more or less; updated sometimes by the + direct-reload optimisation. */ + HRegUsage* reg_usage + = LibVEX_Alloc_inline(sizeof(HRegUsage) * instrs_in->arr_used); + + /* Mark vreg indexes where coalesce chains start at. */ + UInt* coalesce_heads = LibVEX_Alloc_inline(n_vregs * sizeof(UInt)); + UInt nr_coalesce_heads = 0; + + /* The live range numbers are signed shorts, and so limiting the + number of instructions to 15000 comfortably guards against them + overflowing 32k. */ + vassert(instrs_in->arr_used <= 15000); + + /* The output array of instructions. 
+HInstrArray* doRegisterAllocation_v3( + /* Incoming virtual-registerised code. */ + HInstrArray* instrs_in, + + /* Register allocator controls to use. */ + const RegAllocControl* con +) +{ + vassert((con->guest_sizeB % LibVEX_GUEST_STATE_ALIGN) == 0); + + /* The main register allocator state. */ + UInt n_vregs = instrs_in->n_vregs; + VRegState* vreg_state = NULL; + if (n_vregs > 0) { + vreg_state = LibVEX_Alloc_inline(n_vregs * sizeof(VRegState)); + } + + /* If this is not so, the universe we have is nonsensical. */ + UInt n_rregs = con->univ->allocable; + vassert(n_rregs > 0); + STATIC_ASSERT(N_RREGUNIVERSE_REGS == 64); + + /* Redundant rreg -> vreg state. */ + RRegState* rreg_state = LibVEX_Alloc_inline(n_rregs * sizeof(RRegState)); + + /* Info on rreg live ranges. */ + RRegLRState* rreg_lr_state + = LibVEX_Alloc_inline(n_rregs * sizeof(RRegLRState)); + + /* Info on register usage in the incoming instruction array. Computed once + and remains unchanged, more or less; updated sometimes by the + direct-reload optimisation. */ + HRegUsage* reg_usage + = LibVEX_Alloc_inline(sizeof(HRegUsage) * instrs_in->arr_used); + + /* Mark vreg indexes at which coalesce chains start. */ + UInt* coalesce_heads = LibVEX_Alloc_inline(n_vregs * sizeof(UInt)); + UInt nr_coalesce_heads = 0; + + /* The live range numbers are signed shorts, and so limiting the + number of instructions to 15000 comfortably guards against them + overflowing 32k. */ + vassert(instrs_in->arr_used <= 15000); + + /* The output array of instructions. */ + HInstrArray* instrs_out = newHInstrArray(); + + +# define OFFENDING_VREG(_v_idx, _instr, _mode) \ + do { \ + vex_printf("\n\nOffending vreg = %u\n", (_v_idx)); \ + vex_printf("\nOffending instruction = "); \ + con->ppInstr((_instr), con->mode64); \ + vex_printf("\n"); \ + vpanic("doRegisterAllocation_v3: first event for vreg is "#_mode \ + " (should be Write)"); \ + } while (0) + +# define OFFENDING_RREG(_r_idx, _instr, _mode) \ + do { \ + vex_printf("\n\nOffending rreg = "); \ + con->ppReg(con->univ->regs[(_r_idx)]); \ + vex_printf("\nOffending instruction = "); \ + con->ppInstr((_instr), con->mode64); \ + vex_printf("\n"); \ + vpanic("doRegisterAllocation_v3: first event for rreg is "#_mode \ + " (should be Write)"); \ + } while (0) + + +/* Finds an rreg of the correct class. + If a free rreg is not found, then spills a vreg not used by the current + instruction and frees the corresponding rreg. */ +# define FIND_OR_MAKE_FREE_RREG(_ii, _v_idx, _reg_class, _reserve_phase) \ + ({ \ + Int _r_free_idx = find_free_rreg( \ + vreg_state, n_vregs, rreg_state, n_rregs, rreg_lr_state, \ + (_v_idx), (_ii), (_reg_class), (_reserve_phase), con); \ + if (_r_free_idx == INVALID_INDEX) { \ + HReg vreg_to_spill = find_vreg_to_spill( \ + vreg_state, n_vregs, rreg_state, n_rregs, \ + &reg_usage[(_ii)], (_reg_class), \ + reg_usage, (_ii) + 1, \ + instrs_in->arr_used - 1, con); \ + _r_free_idx = spill_vreg(vreg_to_spill, hregIndex(vreg_to_spill), \ + (_ii), vreg_state, n_vregs, \ + rreg_state, n_rregs, \ + instrs_out, con); \ + } \ + \ + vassert(IS_VALID_RREGNO(_r_free_idx)); \ + \ + _r_free_idx; \ + }) + + + /* --- Stage 0. Initialize the state. --- */ + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + vreg_state[v_idx].live_after = INVALID_INSTRNO; + vreg_state[v_idx].dead_before = INVALID_INSTRNO; + vreg_state[v_idx].reg_class = HRcINVALID; + vreg_state[v_idx].disp = Unallocated; + vreg_state[v_idx].rreg = INVALID_HREG; + vreg_state[v_idx].spill_offset = 0; + vreg_state[v_idx].coalescedTo = INVALID_HREG; + vreg_state[v_idx].coalescedFirst = INVALID_HREG; + vreg_state[v_idx].effective_dead_before = INVALID_INSTRNO; + } + + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + rreg_state[r_idx].disp = Free; + rreg_state[r_idx].vreg = INVALID_HREG; + rreg_state[r_idx].eq_spill_slot = False; + } + + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + rreg_lrs->lrs_size = 4; + rreg_lrs->lrs = LibVEX_Alloc_inline(rreg_lrs->lrs_size + * sizeof(RRegLR)); + rreg_lrs->lrs_used = 0; + rreg_lrs->lr_current = &rreg_lrs->lrs[0]; + rreg_lrs->lr_current_idx = 0; + } + + /* --- Stage 1. Scan the incoming instructions. --- */ + for (UShort ii = 0; ii < instrs_in->arr_used; ii++) { + const HInstr* instr = instrs_in->arr[ii]; + + con->getRegUsage(&reg_usage[ii], instr, con->mode64); + reg_usage[ii].isVregVregMove + = reg_usage[ii].isRegRegMove + && hregIsVirtual(reg_usage[ii].regMoveSrc) + && hregIsVirtual(reg_usage[ii].regMoveDst); + + if (0) { + vex_printf("\n%u stage 1: ", ii); + con->ppInstr(instr, con->mode64); + vex_printf("\n"); + ppHRegUsage(con->univ, &reg_usage[ii]); + } + + /* Process virtual registers mentioned in the instruction. 
*/ + for (UInt j = 0; j < reg_usage[ii].n_vRegs; j++) { + HReg vreg = reg_usage[ii].vRegs[j]; + vassert(hregIsVirtual(vreg)); + + UInt v_idx = hregIndex(vreg); + if (!IS_VALID_VREGNO(v_idx)) { + vex_printf("\n"); + con->ppInstr(instr, con->mode64); + vex_printf("\n"); + vex_printf("vreg %u (n_vregs %u)\n", v_idx, n_vregs); + vpanic("doRegisterAllocation_v3: out-of-range vreg"); + } + + /* Note the register class. */ + if (vreg_state[v_idx].reg_class == HRcINVALID) { + /* First mention of this vreg. */ + vreg_state[v_idx].reg_class = hregClass(vreg); + } else { + /* Seen it before, so check for consistency. */ + vassert(vreg_state[v_idx].reg_class == hregClass(vreg)); + } + + /* Consider live ranges. */ + switch (reg_usage[ii].vMode[j]) { + case HRmRead: + if (vreg_state[v_idx].live_after == INVALID_INSTRNO) { + OFFENDING_VREG(v_idx, instr, "Read"); + } + break; + case HRmWrite: + if (vreg_state[v_idx].live_after == INVALID_INSTRNO) { + vreg_state[v_idx].live_after = toShort(ii); + } + break; + case HRmModify: + if (vreg_state[v_idx].live_after == INVALID_INSTRNO) { + OFFENDING_VREG(v_idx, instr, "Modify"); + } + break; + default: + vassert(0); + } + + vreg_state[v_idx].dead_before = toShort(ii + 1); + vreg_state[v_idx].effective_dead_before + = vreg_state[v_idx].dead_before; + } + + /* Process real registers mentioned in the instruction. */ + const ULong rRead = reg_usage[ii].rRead; + const ULong rWritten = reg_usage[ii].rWritten; + const ULong rMentioned = rRead | rWritten; + + if (rMentioned != 0) { + UInt rReg_minIndex = ULong__minIndex(rMentioned); + UInt rReg_maxIndex = ULong__maxIndex(rMentioned); + /* Don't bother to look at registers which are not available + to the allocator such as the stack or guest state pointers. These + are unavailable to the register allocator and so we never visit + them. We asserted above that n_rregs > 0, so (n_rregs - 1) is + safe. 
*/ + if (rReg_maxIndex >= n_rregs) { + rReg_maxIndex = n_rregs - 1; + } + + for (UInt r_idx = rReg_minIndex; r_idx <= rReg_maxIndex; r_idx++) { + const ULong jMask = 1ULL << r_idx; + + if (LIKELY((rMentioned & jMask) == 0)) { + continue; + } + + RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + const Bool isR = (rRead & jMask) != 0; + const Bool isW = (rWritten & jMask) != 0; + + if (isW && !isR) { + if (rreg_lrs->lrs_used == rreg_lrs->lrs_size) { + enlarge_rreg_lrs(rreg_lrs); + } + + rreg_lrs->lrs[rreg_lrs->lrs_used].live_after = toShort(ii); + rreg_lrs->lrs[rreg_lrs->lrs_used].dead_before = toShort(ii + 1); + rreg_lrs->lrs_used += 1; + } else if (!isW && isR) { + if ((rreg_lrs->lrs_used == 0) + || (rreg_lrs->lrs[rreg_lrs->lrs_used - 1].live_after + == INVALID_INSTRNO)) { + OFFENDING_RREG(r_idx, instr, "Read"); + } + rreg_lrs->lrs[rreg_lrs->lrs_used - 1].dead_before + = toShort(ii + 1); + } else { + vassert(isR && isW); + if ((rreg_lrs->lrs_used == 0) + || (rreg_lrs->lrs[rreg_lrs->lrs_used - 1].live_after + == INVALID_INSTRNO)) { + OFFENDING_RREG(r_idx, instr, "Modify"); + } + rreg_lrs->lrs[rreg_lrs->lrs_used - 1].dead_before + = toShort(ii + 1); + } + } + } + } + + if (DEBUG_REGALLOC) { + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + vex_printf("vreg %3u: [%3d, %3d)\n", + v_idx, vreg_state[v_idx].live_after, + vreg_state[v_idx].dead_before); + } + + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + vex_printf("rreg %2u (", r_idx); + UInt written = con->ppReg(con->univ->regs[r_idx]); + vex_printf("):"); + for (Int t = 15 - written; t > 0; t--) { + vex_printf(" "); + } + + const RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + for (UInt l = 0; l < rreg_lrs->lrs_used; l++) { + vex_printf("[%3d, %3d) ", + rreg_lrs->lrs[l].live_after, rreg_lrs->lrs[l].dead_before); + } + vex_printf("\n"); + } + } + + + /* --- Stage 2. MOV coalescing (preparation). --- */ + /* Optimise register coalescing: + MOV v <-> v coalescing (done here). + MOV v <-> r coalescing (TODO: not yet, not here). */ + /* If doing a reg-reg move between two vregs, and the src's live range ends + here and the dst's live range starts here, coalesce the src vreg + to the dst vreg. */ + Bool coalesce_happened = False; + for (UShort ii = 0; ii < instrs_in->arr_used; ii++) { + if (reg_usage[ii].isVregVregMove) { + HReg vregS = reg_usage[ii].regMoveSrc; + HReg vregD = reg_usage[ii].regMoveDst; + + /* Check that |isVregVregMove| is not telling us a bunch of lies ... */ + vassert(hregClass(vregS) == hregClass(vregD)); + UInt vs_idx = hregIndex(vregS); + UInt vd_idx = hregIndex(vregD); + vassert(IS_VALID_VREGNO(vs_idx)); + vassert(IS_VALID_VREGNO(vd_idx)); + vassert(! sameHReg(vregS, vregD)); + VRegState* vs_st = &vreg_state[vs_idx]; + VRegState* vd_st = &vreg_state[vd_idx]; + + if ((vs_st->dead_before == ii + 1) && (vd_st->live_after == ii)) { + /* Live ranges are adjacent. */ + + vs_st->coalescedTo = vregD; + if (hregIsInvalid(vs_st->coalescedFirst)) { + vd_st->coalescedFirst = vregS; + coalesce_heads[nr_coalesce_heads] = vs_idx; + nr_coalesce_heads += 1; + } else { + vd_st->coalescedFirst = vs_st->coalescedFirst; + } + + vreg_state[hregIndex(vd_st->coalescedFirst)].effective_dead_before + = vd_st->dead_before; + + if (DEBUG_REGALLOC) { + vex_printf("vreg coalescing: "); + con->ppReg(vregS); + vex_printf(" -> "); + con->ppReg(vregD); + vex_printf("\n"); + } + + coalesce_happened = True; + } + } + } + + /* --- Stage 3. Allocate spill slots. --- */ + + /* Each spill slot is 8 bytes long. 
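(In total there are LibVEX_N_SPILL_BYTES / 8 of them, occupying the area that follows the guest state and its two shadow areas.)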
For vregs which take more than 64 bits + to spill (for example classes Flt64 and Vec128), we have to allocate two + consecutive spill slots. For 256 bit registers (class Vec256), we have to + allocate four consecutive spill slots. + + For Vec128-class on PowerPC, the spill slot's actual address must be + 16-byte aligned. Since the spill slot's address is computed as an offset + from the guest state pointer, and since the user of the generated code + must set that pointer to a 32-byte aligned value, we have the residual + obligation here of choosing a 16-byte aligned spill slot offset for + Vec128-class values. Since each spill slot is 8 bytes long, that means for + Vec128-class values we must allocate a spill slot number which is + zero mod 2. + + Similarly, for Vec256 class on amd64, find a spill slot number which is + zero mod 4. This guarantees it will be 32-byte aligned, which isn't + actually necessary on amd64 (we use movUpd etc to spill), but seems like + a good practice. + + Do a rank-based allocation of vregs to spill slot numbers. We put as few + values as possible in spill slots, but nevertheless need to have a spill + slot available for all vregs, just in case. */ + +# define N_SPILL64S (LibVEX_N_SPILL_BYTES / 8) + STATIC_ASSERT((N_SPILL64S % 2) == 0); + STATIC_ASSERT((LibVEX_N_SPILL_BYTES % LibVEX_GUEST_STATE_ALIGN) == 0); + + Short ss_busy_until_before[N_SPILL64S]; + vex_bzero(&ss_busy_until_before, sizeof(ss_busy_until_before)); + + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + /* True iff this vreg is unused. In which case we also expect that the + reg_class field for it has not been set. */ + if (vreg_state[v_idx].live_after == INVALID_INSTRNO) { + vassert(vreg_state[v_idx].reg_class == HRcINVALID); + continue; + } + if (! hregIsInvalid(vreg_state[v_idx].coalescedFirst)) { + /* Coalesced vregs should share the same spill slot with the first vreg + in the coalescing chain. But we don't have that information, yet. */ + continue; + } + + /* The spill slots are 64 bits in size. As per the comment on definition + of HRegClass in host_generic_regs.h, that means, to spill a vreg of + class Flt64 or Vec128, we'll need to find two adjacent spill slots to + use. For Vec256, we'll need to find four adjacent slots to use. Note, + this logic needs to be kept in sync with the size info on the + definition of HRegClass. */ + UInt ss_no; + switch (vreg_state[v_idx].reg_class) { + case HRcFlt64: + case HRcVec128: + /* Find two adjacent free slots which provide up to 128 bits to + spill the vreg. Since we are trying to find an even:odd pair, + move along in steps of 2 (slots). */ + for (ss_no = 0; ss_no < N_SPILL64S - 1; ss_no += 2) + if (ss_busy_until_before[ss_no + 0] <= vreg_state[v_idx].live_after + && ss_busy_until_before[ss_no + 1] <= vreg_state[v_idx].live_after) + break; + if (ss_no >= N_SPILL64S - 1) { + vpanic("N_SPILL64S is too low in VEX. Increase and recompile."); + } + ss_busy_until_before[ss_no + 0] + = vreg_state[v_idx].effective_dead_before; + ss_busy_until_before[ss_no + 1] + = vreg_state[v_idx].effective_dead_before; + break; + default: + /* The ordinary case -- just find a single lowest-numbered spill + slot which is available at the start point of this interval, + and assign the interval to it. */ + for (ss_no = 0; ss_no < N_SPILL64S; ss_no++) { + if (ss_busy_until_before[ss_no] <= vreg_state[v_idx].live_after) + break; + } + if (ss_no == N_SPILL64S) { + vpanic("N_SPILL64S is too low in VEX. 
Increase and recompile."); + } + ss_busy_until_before[ss_no] + = vreg_state[v_idx].effective_dead_before; + break; + } + + /* This reflects VEX's hard-wired knowledge of the guest state layout: + the guest state itself, then two equal sized areas following it for two + sets of shadow state, and then the spill area. */ + vreg_state[v_idx].spill_offset + = toShort(con->guest_sizeB * 3 + ss_no * 8); + + /* Independent check that we've made a sane choice of the slot. */ + switch (vreg_state[v_idx].reg_class) { + case HRcVec128: case HRcFlt64: + vassert((vreg_state[v_idx].spill_offset % 16) == 0); + break; + default: + vassert((vreg_state[v_idx].spill_offset % 8) == 0); + break; + } + }
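+ /* Worked example, with made-up sizes: if guest_sizeB == 1000 and a vreg was given slot ss_no == 3, its spill_offset is 1000*3 + 3*8 == 3024, i.e. 24 bytes into the spill area that follows the guest state and both shadow areas. */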
+ + /* Fill in the spill offsets and effective_dead_before for coalesced vregs. */ + for (UInt i = 0; i < nr_coalesce_heads; i++) { + UInt vs_idx = coalesce_heads[i]; + Short effective_dead_before = vreg_state[vs_idx].effective_dead_before; + UShort spill_offset = vreg_state[vs_idx].spill_offset; + HReg vregD = vreg_state[vs_idx].coalescedTo; + while (! hregIsInvalid(vregD)) { + UInt vd_idx = hregIndex(vregD); + vreg_state[vd_idx].effective_dead_before = effective_dead_before; + vreg_state[vd_idx].spill_offset = spill_offset; + vregD = vreg_state[vd_idx].coalescedTo; + } + } + + if (DEBUG_REGALLOC && coalesce_happened) { + UInt ii = 0; + vex_printf("After vreg<->vreg MOV coalescing:\n"); + PRINT_STATE; + } + + if (0) { + vex_printf("\n\n"); + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + if (vreg_state[v_idx].live_after != INVALID_INSTRNO) { + vex_printf("vreg %3u --> spill offset %u\n", + v_idx, vreg_state[v_idx].spill_offset); + } + } + } + + + /* --- Stage 4. Process instructions. --- */ + for (UShort ii = 0; ii < instrs_in->arr_used; ii++) { + HInstr* instr = instrs_in->arr[ii]; + + if (DEBUG_REGALLOC) { + vex_printf("\n====----====---- Instr %d ----====----====\n", ii); + vex_printf("---- "); + con->ppInstr(instrs_in->arr[ii], con->mode64); + vex_printf("\n\nInitial state:\n"); + PRINT_STATE; + vex_printf("\n"); + } + + /* ------------ Sanity checks ------------ */ + + /* Sanity checks are relatively expensive. So they are done only once + every 17 instructions, and just before the last instruction. */ + Bool do_sanity_check + = toBool( + SANITY_CHECKS_EVERY_INSTR + || ii == instrs_in->arr_used - 1 + || (ii > 0 && (ii % 17) == 0) + ); + + if (do_sanity_check) { + /* Sanity check: the vreg_state and rreg_state mutually-redundant + mappings are consistent. If vreg_state[v].rreg points at some + rreg_state entry then that rreg_state entry should point back at + vreg_state[v]. */ + for (UInt v_idx = 0; v_idx < n_vregs; v_idx++) { + if (vreg_state[v_idx].disp == Assigned) { + vassert(!hregIsVirtual(vreg_state[v_idx].rreg)); + + UInt r_idx = hregIndex(vreg_state[v_idx].rreg); + vassert(IS_VALID_RREGNO(r_idx)); + vassert(rreg_state[r_idx].disp == Bound); + vassert(hregIndex(rreg_state[r_idx].vreg) == v_idx); + + vassert(hregClass(vreg_state[v_idx].rreg) + == hregClass(con->univ->regs[r_idx])); + } + } + + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + if (rreg_state[r_idx].disp == Bound) { + vassert(hregIsVirtual(rreg_state[r_idx].vreg)); + + UInt v_idx = hregIndex(rreg_state[r_idx].vreg); + vassert(IS_VALID_VREGNO(v_idx)); + vassert(vreg_state[v_idx].disp == Assigned); + vassert(hregIndex(vreg_state[v_idx].rreg) == r_idx); + } else { + vassert(rreg_state[r_idx].eq_spill_slot == False); + } + } + + /* Sanity check: if rreg has been marked as Reserved, there must be + a corresponding hard live range for it. */ + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + if (rreg_state[r_idx].disp == Reserved) { + const RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + vassert(rreg_lrs->lrs_used > 0); + vassert(rreg_lrs->lr_current_idx < rreg_lrs->lrs_used); + vassert(rreg_lrs->lr_current->live_after <= (Short) ii); + vassert((Short) ii < rreg_lrs->lr_current->dead_before); + } + } + + /* Sanity check: if vregS has been marked as coalesced to vregD, + then the effective live range of vregS must also cover the live + range of vregD. */ + /* The following sanity check is quite expensive. Some basic blocks + contain very lengthy coalescing chains... */ + if (SANITY_CHECKS_EVERY_INSTR) { + for (UInt vs_idx = 0; vs_idx < n_vregs; vs_idx++) { + const VRegState* vS_st = &vreg_state[vs_idx]; + HReg vregD = vS_st->coalescedTo; + while (! hregIsInvalid(vregD)) { + const VRegState* vD_st = &vreg_state[hregIndex(vregD)]; + vassert(vS_st->live_after <= vD_st->live_after); + vassert(vS_st->effective_dead_before >= vD_st->dead_before); + vregD = vD_st->coalescedTo; + } + } + } + } + + + /* --- MOV coalescing (finishing) --- */ + /* Optimise register coalescing: + MOV v <-> v coalescing (finished here). + MOV v <-> r coalescing (TODO: not yet). */ + if (reg_usage[ii].isVregVregMove) { + HReg vregS = reg_usage[ii].regMoveSrc; + HReg vregD = reg_usage[ii].regMoveDst; + UInt vs_idx = hregIndex(vregS); + UInt vd_idx = hregIndex(vregD); + + if (sameHReg(vreg_state[vs_idx].coalescedTo, vregD)) { + /* Finally do the coalescing. */ + + HReg rreg = vreg_state[vs_idx].rreg; + switch (vreg_state[vs_idx].disp) { + case Assigned: + vreg_state[vd_idx].rreg = rreg; + UInt r_idx = hregIndex(rreg); + vassert(rreg_state[r_idx].disp == Bound); + rreg_state[r_idx].vreg = vregD; + break; + case Spilled: + vassert(hregIsInvalid(vreg_state[vs_idx].rreg)); + break; + default: + vassert(0); + } + + vreg_state[vd_idx].disp = vreg_state[vs_idx].disp; + FREE_VREG(&vreg_state[vs_idx]); + + if (DEBUG_REGALLOC) { + vex_printf("coalesced: "); + con->ppReg(vregS); + vex_printf(" -> "); + con->ppReg(vregD); + vex_printf("\n\n"); + } + + /* In rare cases it can happen that vregD's live range ends here. + Check and, if so, free the vreg and rreg. + This effectively means that either the translated program + contained dead code (but VEX iropt passes are pretty good + at eliminating it) or the VEX backend generated dead code. */ + if (vreg_state[vd_idx].dead_before <= (Short) ii + 1) { + if (vreg_state[vd_idx].disp == Assigned) { + UInt r_idx = hregIndex(rreg); + FREE_RREG(&rreg_state[r_idx]); + } + FREE_VREG(&vreg_state[vd_idx]); + } + + /* Move on to the next instruction. We skip the post-instruction + stuff because all required house-keeping was done here. */ + continue; + } + } + + + /* --- Reserve and free rregs if needed. --- */ + /* If the rreg enters its hard live range and is not free: + 1. If the corresponding vreg is not used by the instruction, spill it. + 2. If the corresponding vreg is used by the instruction, then: + 2a. If there are no free rregs, spill a vreg not used by this + instruction. + 2b. Move the corresponding vreg to a free rreg. This is better than + spilling it and immediately reloading it. + */
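+ /* A typical trigger, by way of illustration: an instruction carrying a helper call is reported by getRegUsage as writing all caller-save rregs, so each of those rregs begins a hard live range here, and any vreg currently bound to one of them has to be moved or spilled out of the way first. */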
+ const ULong rRead = reg_usage[ii].rRead; + const ULong rWritten = reg_usage[ii].rWritten; + const ULong rMentioned = rRead | rWritten; + + if (rMentioned != 0) { + UInt rReg_minIndex = ULong__minIndex(rMentioned); + UInt rReg_maxIndex = ULong__maxIndex(rMentioned); + if (rReg_maxIndex >= n_rregs) { + rReg_maxIndex = n_rregs - 1; + } + + for (UInt r_idx = rReg_minIndex; r_idx <= rReg_maxIndex; r_idx++) { + const ULong jMask = 1ULL << r_idx; + + if (LIKELY((rMentioned & jMask) == 0)) { + continue; + } + + RRegState* rreg = &rreg_state[r_idx]; + const RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + if (LIKELY(rreg_lrs->lrs_used == 0)) { + continue; + } + if (rreg->disp == Reserved) { + continue; + } + + if ((rreg_lrs->lr_current->live_after <= (Short) ii) + && ((Short) ii < rreg_lrs->lr_current->dead_before)) { + + switch (rreg->disp) { + case Bound: { + /* Yes, there is an associated vreg. We need to deal with + it now somehow. */ + HReg vreg = rreg->vreg; + UInt v_idx = hregIndex(vreg); + + if (! HRegUsage__contains(&reg_usage[ii], vreg)) { + if (rreg->eq_spill_slot) { + mark_vreg_spilled(v_idx, vreg_state, n_vregs, + rreg_state, n_rregs); + } else { + /* Spill the vreg. It is not used by this instruction. */ + spill_vreg(vreg, v_idx, ii, vreg_state, n_vregs, + rreg_state, n_rregs, instrs_out, con); + } + } else { + /* Find or make a free rreg to move this vreg to. */ + UInt r_free_idx = FIND_OR_MAKE_FREE_RREG( + ii, v_idx, vreg_state[v_idx].reg_class, True); + + /* Generate "move" between real registers. */ + HInstr* move = con->genMove(con->univ->regs[r_idx], + con->univ->regs[r_free_idx], con->mode64); + vassert(move != NULL); + emit_instr(move, instrs_out, con, "move"); + + /* Update the register allocator state. */ + vassert(vreg_state[v_idx].disp == Assigned); + vreg_state[v_idx].rreg = con->univ->regs[r_free_idx]; + rreg_state[r_free_idx].disp = Bound; + rreg_state[r_free_idx].vreg = vreg; + rreg_state[r_free_idx].eq_spill_slot = rreg->eq_spill_slot; + FREE_RREG(rreg); + } + break; + } + case Free: + break; + default: + vassert(0); + } + + /* Finally claim the rreg as reserved. */ + rreg->disp = Reserved; + + if (DEBUG_REGALLOC) { + vex_printf("rreg has been reserved: "); + con->ppReg(con->univ->regs[r_idx]); + vex_printf("\n\n"); + } + } + } + } + + + /* --- Direct reload optimisation. --- */ + /* If the instruction reads exactly one vreg which is currently spilled, + and this is the last use of that vreg, see if we can convert + the instruction into one that reads directly from the spill slot. + This is clearly only possible for x86 and amd64 targets, since ppc and + arm are load-store architectures. If successful, replace + instrs_in->arr[ii] with this new instruction, and recompute + its reg_usage, so that the change is invisible to the standard-case + handling that follows. */
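+ /* For instance, a hypothetical "addl %vS, %vD" whose spilled source %vS dies here can be rewritten as "addl <spill_offset>(%gsp), %vD", where %gsp stands for the guest state pointer register; this saves both the reload instruction and the rreg that would have held %vS. */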
+ if ((con->directReload != NULL) && (reg_usage[ii].n_vRegs <= 2)) { + Bool debug_direct_reload = False; + UInt nreads = 0; + HReg vreg_found = INVALID_HREG; + Short spill_offset = 0; + + for (UInt j = 0; j < reg_usage[ii].n_vRegs; j++) { + HReg vreg = reg_usage[ii].vRegs[j]; + vassert(hregIsVirtual(vreg)); + + if (reg_usage[ii].vMode[j] == HRmRead) { + nreads++; + UInt v_idx = hregIndex(vreg); + vassert(IS_VALID_VREGNO(v_idx)); + if (vreg_state[v_idx].disp == Spilled) { + /* Is this its last use? */ + vassert(vreg_state[v_idx].dead_before >= (Short) (ii + 1)); + if ((vreg_state[v_idx].dead_before == (Short) (ii + 1)) + && hregIsInvalid(vreg_found)) { + vreg_found = vreg; + spill_offset = vreg_state[v_idx].spill_offset; + } + } + } + } + + if (!hregIsInvalid(vreg_found) && (nreads == 1)) { + if (reg_usage[ii].n_vRegs == 2) { + vassert(! sameHReg(reg_usage[ii].vRegs[0], + reg_usage[ii].vRegs[1])); + } + + HInstr* reloaded = con->directReload(instrs_in->arr[ii], + vreg_found, spill_offset); + if (debug_direct_reload && (reloaded != NULL)) { + vex_printf("[%3d] ", spill_offset); + ppHReg(vreg_found); + vex_printf(": "); + con->ppInstr(instr, con->mode64); + } + if (reloaded != NULL) { + /* Update info about the instruction, so it looks as if it had + been in this form all along. */ + instr = reloaded; + instrs_in->arr[ii] = reloaded; + con->getRegUsage(&reg_usage[ii], instr, con->mode64); + if (debug_direct_reload) { + vex_printf(" --> "); + con->ppInstr(reloaded, con->mode64); + } + } + + if (debug_direct_reload && (reloaded != NULL)) { + vex_printf("\n"); + } + } + } + + + /* The vreg -> rreg map constructed and then applied to each + instruction. */ + HRegRemap remap; + initHRegRemap(&remap); + + /* --- Allocate vregs used by the instruction. --- */ + /* Vregs used by the instruction can be in the following states: + - Unallocated: vreg is entering its live range. Find a free rreg. + - Assigned: we do nothing; rreg has been allocated previously. + - Spilled: Find a free rreg and reload vreg into it. + Naturally, finding a free rreg may involve spilling a vreg not used by + the instruction. */ + for (UInt j = 0; j < reg_usage[ii].n_vRegs; j++) { + HReg vreg = reg_usage[ii].vRegs[j]; + vassert(hregIsVirtual(vreg)); + + if (0) { + vex_printf("considering "); con->ppReg(vreg); vex_printf("\n"); + } + + UInt v_idx = hregIndex(vreg); + vassert(IS_VALID_VREGNO(v_idx)); + HReg rreg = vreg_state[v_idx].rreg; + UInt r_idx; + if (vreg_state[v_idx].disp == Assigned) { + r_idx = hregIndex(rreg); + vassert(rreg_state[r_idx].disp == Bound); + addToHRegRemap(&remap, vreg, rreg); + } else { + vassert(hregIsInvalid(rreg)); + + /* Find or make a free rreg of the correct class. */ + r_idx = FIND_OR_MAKE_FREE_RREG( + ii, v_idx, vreg_state[v_idx].reg_class, False); + rreg = con->univ->regs[r_idx]; + + /* Generate a reload only if the vreg is spilled and is about to be + read or modified. If it is merely written then reloading it first + would be pointless. */ + if ((vreg_state[v_idx].disp == Spilled) + && (reg_usage[ii].vMode[j] != HRmWrite)) { + + HInstr* reload1 = NULL; + HInstr* reload2 = NULL; + con->genReload(&reload1, &reload2, rreg, + vreg_state[v_idx].spill_offset, con->mode64); + vassert(reload1 != NULL || reload2 != NULL); + if (reload1 != NULL) { + emit_instr(reload1, instrs_out, con, "reload1"); + } + if (reload2 != NULL) { + emit_instr(reload2, instrs_out, con, "reload2"); + } + } + + rreg_state[r_idx].disp = Bound; + rreg_state[r_idx].vreg = vreg; + rreg_state[r_idx].eq_spill_slot = True; + vreg_state[v_idx].disp = Assigned; + vreg_state[v_idx].rreg = rreg; + addToHRegRemap(&remap, vreg, rreg); + } + + /* If this vreg is written or modified, mark it so. 
*/ + if (reg_usage[ii].vMode[j] != HRmRead) { + rreg_state[r_idx].eq_spill_slot = False; + } + } + + con->mapRegs(&remap, instr, con->mode64); + emit_instr(instr, instrs_out, con, NULL); + + if (DEBUG_REGALLOC) { + vex_printf("After dealing with current instruction:\n"); + PRINT_STATE; + vex_printf("\n"); + } + + /* ------ Post-instruction actions. ------ */ + /* Free rregs which: + - Have been reserved and whose hard live range ended. + - Have been bound to vregs whose live range ended. */ + for (UInt r_idx = 0; r_idx < n_rregs; r_idx++) { + RRegState* rreg = &rreg_state[r_idx]; + RRegLRState* rreg_lrs = &rreg_lr_state[r_idx]; + switch (rreg->disp) { + case Free: + break; + case Reserved: + if (rreg_lrs->lrs_used > 0) { + /* Consider "dead before" the next instruction. */ + if (rreg_lrs->lr_current->dead_before <= (Short) ii + 1) { + FREE_RREG(&rreg_state[r_idx]); + if (rreg_lrs->lr_current_idx < rreg_lrs->lrs_used - 1) { + rreg_lrs->lr_current_idx += 1; + rreg_lrs->lr_current + = &rreg_lrs->lrs[rreg_lrs->lr_current_idx]; + } + } + } + break; + case Bound: { + UInt v_idx = hregIndex(rreg->vreg); + /* Consider "dead before" the next instruction. */ + if (vreg_state[v_idx].dead_before <= (Short) ii + 1) { + FREE_VREG(&vreg_state[v_idx]); + FREE_RREG(&rreg_state[r_idx]); + } + break; + } + default: + vassert(0); + } + } + } + + return instrs_out; +} + +/*----------------------------------------------------------------------------*/ +/*--- host_generic_reg_alloc3.c ---*/ +/*----------------------------------------------------------------------------*/ diff --git a/priv/host_generic_regs.c b/priv/host_generic_regs.c index 710869b5c..ee2f1b705 100644 --- a/priv/host_generic_regs.c +++ b/priv/host_generic_regs.c @@ -58,11 +58,10 @@ void ppHRegClass ( HRegClass hrc ) } /* Generic printing for registers. */ -void ppHReg ( HReg r ) +UInt ppHReg ( HReg r ) { if (hregIsInvalid(r)) { - vex_printf("HReg_INVALID"); - return; + return vex_printf("HReg_INVALID"); } const Bool isV = hregIsVirtual(r); const HChar* maybe_v = isV ? "v" : ""; @@ -71,12 +70,12 @@ void ppHReg ( HReg r ) always zero for virtual registers, so that's pointless -- hence show the index number instead. 
*/ switch (hregClass(r)) { - case HRcInt32: vex_printf("%%%sr%u", maybe_v, regNN); return; - case HRcInt64: vex_printf("%%%sR%u", maybe_v, regNN); return; - case HRcFlt32: vex_printf("%%%sF%u", maybe_v, regNN); return; - case HRcFlt64: vex_printf("%%%sD%u", maybe_v, regNN); return; - case HRcVec64: vex_printf("%%%sv%u", maybe_v, regNN); return; - case HRcVec128: vex_printf("%%%sV%u", maybe_v, regNN); return; + case HRcInt32: return vex_printf("%%%sr%u", maybe_v, regNN); + case HRcInt64: return vex_printf("%%%sR%u", maybe_v, regNN); + case HRcFlt32: return vex_printf("%%%sF%u", maybe_v, regNN); + case HRcFlt64: return vex_printf("%%%sD%u", maybe_v, regNN); + case HRcVec64: return vex_printf("%%%sv%u", maybe_v, regNN); + case HRcVec128: return vex_printf("%%%sV%u", maybe_v, regNN); default: vpanic("ppHReg"); } } @@ -94,6 +93,11 @@ void RRegUniverse__init ( /*OUT*/RRegUniverse* univ ) for (UInt i = 0; i < N_RREGUNIVERSE_REGS; i++) { univ->regs[i] = INVALID_HREG; } + + for (UInt i = 0; i <= HrcLAST; i++) { + univ->allocable_start[i] = N_RREGUNIVERSE_REGS; + univ->allocable_end[i] = N_RREGUNIVERSE_REGS; + } } void RRegUniverse__check_is_sane ( const RRegUniverse* univ ) @@ -113,6 +117,33 @@ void RRegUniverse__check_is_sane ( const RRegUniverse* univ ) HReg reg = univ->regs[i]; vassert(hregIsInvalid(reg)); } + + /* Determine which register classes are used and whether they form + contiguous ranges. */ + Bool regclass_used[HrcLAST + 1]; + for (UInt i = 0; i <= HrcLAST; i++) { + regclass_used[i] = False; + } + + for (UInt i = 0; i < univ->allocable; i++) { + HReg reg = univ->regs[i]; + HRegClass regclass = hregClass(reg); + if (!regclass_used[regclass]) { + regclass_used[regclass] = True; + } + } + + UInt regs_visited = 0; + for (UInt i = 0; i <= HrcLAST; i++) { + if (regclass_used[i]) { + for (UInt j = univ->allocable_start[i]; + j <= univ->allocable_end[i]; j++) { + vassert(hregClass(univ->regs[j]) == i); + regs_visited += 1; + } + } + } + + vassert(regs_visited == univ->allocable); } diff --git a/priv/host_generic_regs.h b/priv/host_generic_regs.h index 5c4804e7a..d1f8ac69f 100644 --- a/priv/host_generic_regs.h +++ b/priv/host_generic_regs.h @@ -93,7 +93,7 @@ typedef struct { UInt u32; } HReg; available on any specific host. For example on x86, the available classes are: Int32, Flt64, Vec128 only. - IMPORTANT NOTE: host_generic_reg_alloc2.c needs how much space is + IMPORTANT NOTE: host_generic_reg_alloc*.c needs to know how much space is needed to spill each class of register. It allocates the following amount of space: @@ -106,7 +106,7 @@ typedef struct { UInt u32; } HReg; HRcVec128 128 bits If you add another regclass, you must remember to update - host_generic_reg_alloc2.c accordingly. + host_generic_reg_alloc*.c and RRegUniverse accordingly. When adding entries to enum HRegClass, do not use any value > 14 or < 1. */ @@ -118,15 +118,17 @@ typedef HRcFlt32=5, /* 32-bit float */ HRcFlt64=6, /* 64-bit float */ HRcVec64=7, /* 64-bit SIMD */ - HRcVec128=8 /* 128-bit SIMD */ + HRcVec128=8, /* 128-bit SIMD */ + HrcLAST=HRcVec128 } HRegClass; extern void ppHRegClass ( HRegClass ); -/* Print an HReg in a generic (non-target-specific) way. */ -extern void ppHReg ( HReg ); +/* Print an HReg in a generic (non-target-specific) way. + Returns number of HChar's written. */ +extern UInt ppHReg ( HReg ); /* Construct. 
The goal here is that compiler can fold this down to a constant in the case where the four arguments are constants, which @@ -149,7 +151,7 @@ static inline HReg mkHReg ( Bool virtual, HRegClass rc, UInt enc, UInt ix ) static inline HRegClass hregClass ( HReg r ) { HRegClass rc = (HRegClass)((r.u32 >> 27) & 0xF); - vassert(rc >= HRcInt32 && rc <= HRcVec128); + vassert(rc >= HRcInt32 && rc <= HrcLAST); return rc; } @@ -221,6 +223,25 @@ typedef index here, since this is the only place where we map index numbers to actual registers. */ HReg regs[N_RREGUNIVERSE_REGS]; + + /* Ranges for groups of allocable registers. Used to quickly address only + a group of allocable registers belonging to the same register class. + Indexes into |allocable_{start,end}| are HRcClass entries, such as + HRcInt64. Values in |allocable_{start,end}| give a valid range into + |regs| where registers corresponding to the given register class are + found. + + For example, let's say allocable_start[HRcInt64] == 10 and + allocable_end[HRcInt64] == 14. Then regs[10], regs[11], regs[12], + regs[13], and regs[14] give all registers of register class HRcInt64. + + If a register class is not present, then values of the corresponding + |allocable_{start,end}| elements are equal to N_RREGUNIVERSE_REGS. + + Naturally registers in |regs| must form contiguous groups. This is + checked by RRegUniverse__check_is_sane(). */ + UInt allocable_start[HrcLAST + 1]; + UInt allocable_end[HrcLAST + 1]; } RRegUniverse; @@ -305,7 +326,7 @@ extern Bool HRegUsage__contains ( const HRegUsage*, HReg ); /*---------------------------------------------------------*/ /* Note that such maps can only map virtual regs to real regs. - addToHRegRenap will barf if given a pair not of that form. As a + addToHRegRemap will barf if given a pair not of that form. As a result, no valid HRegRemap will bind a real reg to anything, and so if lookupHRegMap is given a real reg, it returns it unchanged. This is precisely the behaviour that the register allocator needs @@ -442,40 +463,49 @@ static inline Bool is_RetLoc_INVALID ( RetLoc rl ) { /*--- Reg alloc: TODO: move somewhere else ---*/ /*---------------------------------------------------------*/ -extern -HInstrArray* doRegisterAllocation ( +/* Control of the VEX register allocator. */ +typedef + struct { + /* The real-register universe to use. This contains facts about real + registers, one of which is the set of registers available for + allocation. */ + const RRegUniverse* univ; + + /* Return True iff the given insn is a reg-reg move, in which case also + return the src and dst regs. */ + Bool (*isMove)(const HInstr*, HReg*, HReg*); + + /* Get info about register usage in this insn. */ + void (*getRegUsage)(HRegUsage*, const HInstr*, Bool); + + /* Apply a reg-reg mapping to an insn. */ + void (*mapRegs)(HRegRemap*, HInstr*, Bool); + + /* Return insn(s) to spill/restore a real register to a spill slot offset. + Also a function to move between registers. + And optionally a function to do direct reloads. */ + void (*genSpill)(HInstr**, HInstr**, HReg, Int, Bool); + void (*genReload)(HInstr**, HInstr**, HReg, Int, Bool); + HInstr* (*genMove)(HReg from, HReg to, Bool); + HInstr* (*directReload)(HInstr*, HReg, Short); + UInt guest_sizeB; + + /* For debug printing only. */ + void (*ppInstr)(const HInstr*, Bool); + UInt (*ppReg)(HReg); + + /* 32/64bit mode */ + Bool mode64; + } + RegAllocControl; - /* Incoming virtual-registerised code. 
*/ +extern HInstrArray* doRegisterAllocation_v2( HInstrArray* instrs_in, - - /* The real-register universe to use. This contains facts about - real registers, one of which is the set of registers available - for allocation. */ - const RRegUniverse* univ, - - /* Return True iff the given insn is a reg-reg move, in which - case also return the src and dst regs. */ - Bool (*isMove) (const HInstr*, HReg*, HReg*), - - /* Get info about register usage in this insn. */ - void (*getRegUsage) (HRegUsage*, const HInstr*, Bool), - - /* Apply a reg-reg mapping to an insn. */ - void (*mapRegs) (HRegRemap*, HInstr*, Bool), - - /* Return insn(s) to spill/restore a real reg to a spill slot - offset. And optionally a function to do direct reloads. */ - void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ), - void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ), - HInstr* (*directReload) ( HInstr*, HReg, Short ), - Int guest_sizeB, - - /* For debug printing only. */ - void (*ppInstr) ( const HInstr*, Bool ), - void (*ppReg) ( HReg ), - - /* 32/64bit mode */ - Bool mode64 + const RegAllocControl* con +); +extern HInstrArray* doRegisterAllocation_v3( + HInstrArray* instrs_in, + const RegAllocControl* con ); diff --git a/priv/host_mips_defs.c b/priv/host_mips_defs.c index ce202d121..0f08216d7 100644 --- a/priv/host_mips_defs.c +++ b/priv/host_mips_defs.c @@ -63,6 +63,7 @@ const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ) /* Add the registers. The initial segment of this array must be those available for allocation by reg-alloc, and those that follow are not available for allocation. */ + ru->allocable_start[(mode64) ? HRcInt64 : HRcInt32] = ru->size; ru->regs[ru->size++] = hregMIPS_GPR16(mode64); ru->regs[ru->size++] = hregMIPS_GPR17(mode64); ru->regs[ru->size++] = hregMIPS_GPR18(mode64); @@ -76,7 +77,10 @@ const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ) ru->regs[ru->size++] = hregMIPS_GPR14(mode64); ru->regs[ru->size++] = hregMIPS_GPR15(mode64); ru->regs[ru->size++] = hregMIPS_GPR24(mode64); + ru->allocable_end[(mode64) ? HRcInt64 : HRcInt32] = ru->size - 1; + /* s7 (=guest_state) */ + ru->allocable_start[(mode64) ? HRcFlt64 : HRcFlt32] = ru->size; ru->regs[ru->size++] = hregMIPS_F16(mode64); ru->regs[ru->size++] = hregMIPS_F18(mode64); ru->regs[ru->size++] = hregMIPS_F20(mode64); @@ -85,8 +89,11 @@ const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ) ru->regs[ru->size++] = hregMIPS_F26(mode64); ru->regs[ru->size++] = hregMIPS_F28(mode64); ru->regs[ru->size++] = hregMIPS_F30(mode64); + ru->allocable_end[(mode64) ? HRcFlt64 : HRcFlt32] = ru->size - 1; + if (!mode64) { /* Fake double floating point */ + ru->allocable_start[HRcFlt64] = ru->size; ru->regs[ru->size++] = hregMIPS_D0(mode64); ru->regs[ru->size++] = hregMIPS_D1(mode64); ru->regs[ru->size++] = hregMIPS_D2(mode64); @@ -95,6 +102,7 @@ const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ) ru->regs[ru->size++] = hregMIPS_D5(mode64); ru->regs[ru->size++] = hregMIPS_D6(mode64); ru->regs[ru->size++] = hregMIPS_D7(mode64); + ru->allocable_end[HRcFlt64] = ru->size - 1; } ru->allocable = ru->size; @@ -126,7 +134,7 @@ const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ) } -void ppHRegMIPS(HReg reg, Bool mode64) +UInt ppHRegMIPS(HReg reg, Bool mode64) { Int r; static const HChar *ireg32_names[35] @@ -151,8 +159,7 @@ void ppHRegMIPS(HReg reg, Bool mode64) /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. 
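The UInt returned is the number of characters printed, so that callers (the register allocator's debug dump, for instance) can pad the output into aligned columns.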
*/ @@ -164,29 +171,23 @@ void ppHRegMIPS(HReg reg, Bool mode64) case HRcInt32: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%s", ireg32_names[r]); - return; + return vex_printf("%s", ireg32_names[r]); case HRcInt64: r = hregEncoding (reg); vassert (r >= 0 && r < 32); - vex_printf ("%s", ireg32_names[r]); - return; + return vex_printf ("%s", ireg32_names[r]); case HRcFlt32: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%s", freg32_names[r]); - return; + return vex_printf("%s", freg32_names[r]); case HRcFlt64: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%s", freg64_names[r]); - return; + return vex_printf("%s", freg64_names[r]); default: vpanic("ppHRegMIPS"); break; } - - return; } @@ -2029,6 +2030,18 @@ void genReload_MIPS( /*OUT*/ HInstr ** i1, /*OUT*/ HInstr ** i2, HReg rreg, } } +MIPSInstr* genMove_MIPS(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt32: + case HRcInt64: + return MIPSInstr_Alu(Malu_OR, to, from, MIPSRH_Reg(from)); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_MIPS: unimplemented regclass"); + } +} + /* --------- The mips assembler --------- */ inline static UInt iregNo(HReg r, Bool mode64) diff --git a/priv/host_mips_defs.h b/priv/host_mips_defs.h index 8714621eb..b0fda59a4 100644 --- a/priv/host_mips_defs.h +++ b/priv/host_mips_defs.h @@ -135,7 +135,7 @@ ST_IN HReg hregMIPS_GPR31 ( Bool mode64 ) { return GPR(mode64, 31, 37, 45); } # define MIPS_N_REGPARMS 8 #endif -extern void ppHRegMIPS ( HReg, Bool ); +extern UInt ppHRegMIPS ( HReg, Bool ); /* --------- Condition codes, Intel encoding. --------- */ @@ -701,6 +701,7 @@ extern void genSpill_MIPS ( /*OUT*/ HInstr ** i1, /*OUT*/ HInstr ** i2, HReg rreg, Int offset, Bool); extern void genReload_MIPS( /*OUT*/ HInstr ** i1, /*OUT*/ HInstr ** i2, HReg rreg, Int offset, Bool); +extern MIPSInstr* genMove_MIPS(HReg from, HReg to, Bool mode64); extern const RRegUniverse* getRRegUniverse_MIPS ( Bool mode64 ); diff --git a/priv/host_ppc_defs.c b/priv/host_ppc_defs.c index dc70f2426..a2d8f9ef5 100644 --- a/priv/host_ppc_defs.c +++ b/priv/host_ppc_defs.c @@ -68,6 +68,7 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) // GPR0 = scratch reg where poss. - some ops interpret as value zero // GPR1 = stack pointer // GPR2 = TOC pointer + ru->allocable_start[(mode64) ? HRcInt64 : HRcInt32] = ru->size; ru->regs[ru->size++] = hregPPC_GPR3(mode64); ru->regs[ru->size++] = hregPPC_GPR4(mode64); ru->regs[ru->size++] = hregPPC_GPR5(mode64); @@ -100,6 +101,7 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) ru->regs[ru->size++] = hregPPC_GPR26(mode64); ru->regs[ru->size++] = hregPPC_GPR27(mode64); ru->regs[ru->size++] = hregPPC_GPR28(mode64); + ru->allocable_end[(mode64) ? HRcInt64 : HRcInt32] = ru->size - 1; // GPR29 is reserved for the dispatcher // GPR30 is reserved as AltiVec spill reg temporary // GPR31 is reserved for the GuestStatePtr @@ -109,6 +111,7 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) the occasional extra spill instead. */ /* For both ppc32-linux and ppc64-linux, f14-f31 are callee save. So use them. 
*/ + ru->allocable_start[HRcFlt64] = ru->size; ru->regs[ru->size++] = hregPPC_FPR14(mode64); ru->regs[ru->size++] = hregPPC_FPR15(mode64); ru->regs[ru->size++] = hregPPC_FPR16(mode64); @@ -117,11 +120,13 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) ru->regs[ru->size++] = hregPPC_FPR19(mode64); ru->regs[ru->size++] = hregPPC_FPR20(mode64); ru->regs[ru->size++] = hregPPC_FPR21(mode64); + ru->allocable_end[HRcFlt64] = ru->size - 1; /* Same deal re Altivec */ /* For both ppc32-linux and ppc64-linux, v20-v31 are callee save. So use them. */ /* NB, vr29 is used as a scratch temporary -- do not allocate */ + ru->allocable_start[HRcVec128] = ru->size; ru->regs[ru->size++] = hregPPC_VR20(mode64); ru->regs[ru->size++] = hregPPC_VR21(mode64); ru->regs[ru->size++] = hregPPC_VR22(mode64); @@ -130,6 +135,7 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) ru->regs[ru->size++] = hregPPC_VR25(mode64); ru->regs[ru->size++] = hregPPC_VR26(mode64); ru->regs[ru->size++] = hregPPC_VR27(mode64); + ru->allocable_end[HRcVec128] = ru->size - 1; ru->allocable = ru->size; /* And other regs, not available to the allocator. */ @@ -146,7 +152,7 @@ const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ) } -void ppHRegPPC ( HReg reg ) +UInt ppHRegPPC ( HReg reg ) { Int r; static const HChar* ireg32_names[32] @@ -160,31 +166,26 @@ void ppHRegPPC ( HReg reg ) "%r28", "%r29", "%r30", "%r31" }; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. */ switch (hregClass(reg)) { case HRcInt64: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%s", ireg32_names[r]); - return; + return vex_printf("%s", ireg32_names[r]); case HRcInt32: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%s", ireg32_names[r]); - return; + return vex_printf("%s", ireg32_names[r]); case HRcFlt64: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%%fr%d", r); - return; + return vex_printf("%%fr%d", r); case HRcVec128: r = hregEncoding(reg); vassert(r >= 0 && r < 32); - vex_printf("%%v%d", r); - return; + return vex_printf("%%v%d", r); default: vpanic("ppHRegPPC"); } @@ -3210,6 +3211,20 @@ void genReload_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, } } +PPCInstr* genMove_PPC(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt32: + case HRcInt64: + return PPCInstr_Alu(Palu_OR, to, from, PPCRH_Reg(from)); + case HRcFlt64: + return PPCInstr_FpUnary(Pfp_MOV, to, from); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_PPC: unimplemented regclass"); + } +} + /* --------- The ppc assembler (bleh.) 
--------- */ diff --git a/priv/host_ppc_defs.h b/priv/host_ppc_defs.h index 2d1097c87..63ac3f6fd 100644 --- a/priv/host_ppc_defs.h +++ b/priv/host_ppc_defs.h @@ -122,7 +122,7 @@ ST_IN HReg hregPPC_VR29 ( Bool mode64 ) { return VR (mode64, 29, 43, 45); } /* Num registers used for function calls */ #define PPC_N_REGPARMS 8 -extern void ppHRegPPC ( HReg ); +extern UInt ppHRegPPC ( HReg ); /* --------- Condition codes --------- */ @@ -1217,6 +1217,7 @@ extern void genSpill_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offsetB, Bool mode64 ); extern void genReload_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offsetB, Bool mode64 ); +extern PPCInstr* genMove_PPC(HReg from, HReg to, Bool mode64); extern const RRegUniverse* getRRegUniverse_PPC ( Bool mode64 ); diff --git a/priv/host_x86_defs.c b/priv/host_x86_defs.c index 6321a3ecb..829d59d74 100644 --- a/priv/host_x86_defs.c +++ b/priv/host_x86_defs.c @@ -63,18 +63,25 @@ const RRegUniverse* getRRegUniverse_X86 ( void ) /* Add the registers. The initial segment of this array must be those available for allocation by reg-alloc, and those that follow are not available for allocation. */ + ru->allocable_start[HRcInt32] = ru->size; ru->regs[ru->size++] = hregX86_EAX(); ru->regs[ru->size++] = hregX86_EBX(); ru->regs[ru->size++] = hregX86_ECX(); ru->regs[ru->size++] = hregX86_EDX(); ru->regs[ru->size++] = hregX86_ESI(); ru->regs[ru->size++] = hregX86_EDI(); + ru->allocable_end[HRcInt32] = ru->size - 1; + + ru->allocable_start[HRcFlt64] = ru->size; ru->regs[ru->size++] = hregX86_FAKE0(); ru->regs[ru->size++] = hregX86_FAKE1(); ru->regs[ru->size++] = hregX86_FAKE2(); ru->regs[ru->size++] = hregX86_FAKE3(); ru->regs[ru->size++] = hregX86_FAKE4(); ru->regs[ru->size++] = hregX86_FAKE5(); + ru->allocable_end[HRcFlt64] = ru->size - 1; + + ru->allocable_start[HRcVec128] = ru->size; ru->regs[ru->size++] = hregX86_XMM0(); ru->regs[ru->size++] = hregX86_XMM1(); ru->regs[ru->size++] = hregX86_XMM2(); @@ -83,7 +90,9 @@ const RRegUniverse* getRRegUniverse_X86 ( void ) ru->regs[ru->size++] = hregX86_XMM5(); ru->regs[ru->size++] = hregX86_XMM6(); ru->regs[ru->size++] = hregX86_XMM7(); + ru->allocable_end[HRcVec128] = ru->size - 1; ru->allocable = ru->size; + /* And other regs, not available to the allocator. */ ru->regs[ru->size++] = hregX86_ESP(); ru->regs[ru->size++] = hregX86_EBP(); @@ -95,33 +104,29 @@ const RRegUniverse* getRRegUniverse_X86 ( void ) } -void ppHRegX86 ( HReg reg ) +UInt ppHRegX86 ( HReg reg ) { Int r; static const HChar* ireg32_names[8] = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" }; /* Be generic for all virtual regs. */ if (hregIsVirtual(reg)) { - ppHReg(reg); - return; + return ppHReg(reg); } /* But specific for real regs. 
*/ switch (hregClass(reg)) { case HRcInt32: r = hregEncoding(reg); vassert(r >= 0 && r < 8); - vex_printf("%s", ireg32_names[r]); - return; + return vex_printf("%s", ireg32_names[r]); case HRcFlt64: r = hregEncoding(reg); vassert(r >= 0 && r < 6); - vex_printf("%%fake%d", r); - return; + return vex_printf("%%fake%d", r); case HRcVec128: r = hregEncoding(reg); vassert(r >= 0 && r < 8); - vex_printf("%%xmm%d", r); - return; + return vex_printf("%%xmm%d", r); default: vpanic("ppHRegX86"); } @@ -1752,6 +1757,19 @@ void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, } } +X86Instr* genMove_X86(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt32: + return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(from), to); + case HRcVec128: + return X86Instr_SseReRg(Xsse_MOV, from, to); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_X86: unimplemented regclass"); + } +} + /* The given instruction reads the specified vreg exactly once, and that vreg is currently located at the given spill offset. If possible, return a variant of the instruction to one which instead diff --git a/priv/host_x86_defs.h b/priv/host_x86_defs.h index 36c147423..3312c6e76 100644 --- a/priv/host_x86_defs.h +++ b/priv/host_x86_defs.h @@ -74,7 +74,7 @@ ST_IN HReg hregX86_ESP ( void ) { return mkHReg(False, HRcInt32, 4, 20); } ST_IN HReg hregX86_EBP ( void ) { return mkHReg(False, HRcInt32, 5, 21); } #undef ST_IN -extern void ppHRegX86 ( HReg ); +extern UInt ppHRegX86 ( HReg ); /* --------- Condition codes, Intel encoding. --------- */ @@ -732,7 +732,7 @@ extern void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); extern void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); - +extern X86Instr* genMove_X86(HReg from, HReg to, Bool); extern X86Instr* directReload_X86 ( X86Instr* i, HReg vreg, Short spill_off ); extern const RRegUniverse* getRRegUniverse_X86 ( void ); diff --git a/priv/main_main.c b/priv/main_main.c index 0c6161033..05c5a0cf5 100644 --- a/priv/main_main.c +++ b/priv/main_main.c @@ -198,6 +198,7 @@ void LibVEX_default_VexControl ( /*OUT*/ VexControl* vcon ) vcon->guest_max_bytes = 5000; vcon->guest_chase_thresh = 10; vcon->guest_chase_cond = False; + vcon->regalloc_version = 3; vcon->strict_block_end = False; vcon->arm_allow_optimizing_lookback = True; vcon->arm64_allow_reordered_writeback = True; @@ -309,6 +310,7 @@ void LibVEX_Update_Control(const VexControl *vcon) vassert(vcon->guest_chase_thresh < vcon->guest_max_insns); vassert(vcon->guest_chase_cond == True || vcon->guest_chase_cond == False); + vassert(vcon->regalloc_version == 2 || vcon->regalloc_version == 3); vassert(vcon->strict_block_end == True || vcon->strict_block_end == False); vassert(vcon->arm_allow_optimizing_lookback == True @@ -754,9 +756,10 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, void (*mapRegs) ( HRegRemap*, HInstr*, Bool ); void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ); void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ); + HInstr* (*genMove) ( HReg, HReg, Bool ); HInstr* (*directReload) ( HInstr*, HReg, Short ); void (*ppInstr) ( const HInstr*, Bool ); - void (*ppReg) ( HReg ); + UInt (*ppReg) ( HReg ); HInstrArray* (*iselSB) ( const IRSB*, VexArch, const VexArchInfo*, const VexAbiInfo*, Int, Int, Bool, Bool, Addr ); @@ -783,6 +786,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = NULL; genSpill = NULL; genReload = NULL; + genMove = NULL; directReload = NULL; ppInstr = NULL; ppReg = NULL; @@ -909,6 
+913,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) X86FN(mapRegs_X86Instr); genSpill = CAST_AS(genSpill) X86FN(genSpill_X86); genReload = CAST_AS(genReload) X86FN(genReload_X86); + genMove = CAST_AS(genMove) X86FN(genMove_X86); directReload = CAST_AS(directReload) X86FN(directReload_X86); ppInstr = CAST_AS(ppInstr) X86FN(ppX86Instr); ppReg = CAST_AS(ppReg) X86FN(ppHRegX86); @@ -926,6 +931,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) AMD64FN(mapRegs_AMD64Instr); genSpill = CAST_AS(genSpill) AMD64FN(genSpill_AMD64); genReload = CAST_AS(genReload) AMD64FN(genReload_AMD64); + genMove = CAST_AS(genMove) AMD64FN(genMove_AMD64); ppInstr = CAST_AS(ppInstr) AMD64FN(ppAMD64Instr); ppReg = CAST_AS(ppReg) AMD64FN(ppHRegAMD64); iselSB = AMD64FN(iselSB_AMD64); @@ -942,6 +948,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) PPC32FN(mapRegs_PPCInstr); genSpill = CAST_AS(genSpill) PPC32FN(genSpill_PPC); genReload = CAST_AS(genReload) PPC32FN(genReload_PPC); + genMove = CAST_AS(genMove) PPC32FN(genMove_PPC); ppInstr = CAST_AS(ppInstr) PPC32FN(ppPPCInstr); ppReg = CAST_AS(ppReg) PPC32FN(ppHRegPPC); iselSB = PPC32FN(iselSB_PPC); @@ -958,6 +965,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) PPC64FN(mapRegs_PPCInstr); genSpill = CAST_AS(genSpill) PPC64FN(genSpill_PPC); genReload = CAST_AS(genReload) PPC64FN(genReload_PPC); + genMove = CAST_AS(genMove) PPC64FN(genMove_PPC); ppInstr = CAST_AS(ppInstr) PPC64FN(ppPPCInstr); ppReg = CAST_AS(ppReg) PPC64FN(ppHRegPPC); iselSB = PPC64FN(iselSB_PPC); @@ -975,6 +983,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) S390FN(mapRegs_S390Instr); genSpill = CAST_AS(genSpill) S390FN(genSpill_S390); genReload = CAST_AS(genReload) S390FN(genReload_S390); + genMove = CAST_AS(genMove) S390FN(genMove_S390); // fixs390: consider implementing directReload_S390 ppInstr = CAST_AS(ppInstr) S390FN(ppS390Instr); ppReg = CAST_AS(ppReg) S390FN(ppHRegS390); @@ -992,6 +1001,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) ARMFN(mapRegs_ARMInstr); genSpill = CAST_AS(genSpill) ARMFN(genSpill_ARM); genReload = CAST_AS(genReload) ARMFN(genReload_ARM); + genMove = CAST_AS(genMove) ARMFN(genMove_ARM); ppInstr = CAST_AS(ppInstr) ARMFN(ppARMInstr); ppReg = CAST_AS(ppReg) ARMFN(ppHRegARM); iselSB = ARMFN(iselSB_ARM); @@ -1008,6 +1018,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) ARM64FN(mapRegs_ARM64Instr); genSpill = CAST_AS(genSpill) ARM64FN(genSpill_ARM64); genReload = CAST_AS(genReload) ARM64FN(genReload_ARM64); + genMove = CAST_AS(genMove) ARM64FN(genMove_ARM64); ppInstr = CAST_AS(ppInstr) ARM64FN(ppARM64Instr); ppReg = CAST_AS(ppReg) ARM64FN(ppHRegARM64); iselSB = ARM64FN(iselSB_ARM64); @@ -1024,6 +1035,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) MIPS32FN(mapRegs_MIPSInstr); genSpill = CAST_AS(genSpill) MIPS32FN(genSpill_MIPS); genReload = CAST_AS(genReload) MIPS32FN(genReload_MIPS); + genMove = CAST_AS(genMove) MIPS32FN(genMove_MIPS); ppInstr = CAST_AS(ppInstr) MIPS32FN(ppMIPSInstr); ppReg = CAST_AS(ppReg) MIPS32FN(ppHRegMIPS); iselSB = MIPS32FN(iselSB_MIPS); @@ -1041,6 +1053,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, mapRegs = CAST_AS(mapRegs) MIPS64FN(mapRegs_MIPSInstr); genSpill = CAST_AS(genSpill) MIPS64FN(genSpill_MIPS); genReload = CAST_AS(genReload) MIPS64FN(genReload_MIPS); + genMove = CAST_AS(genMove) MIPS64FN(genMove_MIPS); ppInstr = 
CAST_AS(ppInstr) MIPS64FN(ppMIPSInstr); ppReg = CAST_AS(ppReg) MIPS64FN(ppHRegMIPS); iselSB = MIPS64FN(iselSB_MIPS); @@ -1133,11 +1146,22 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, } /* Register allocate. */ - rcode = doRegisterAllocation ( vcode, rRegUniv, - isMove, getRegUsage, mapRegs, - genSpill, genReload, directReload, - guest_sizeB, - ppInstr, ppReg, mode64 ); + RegAllocControl con = { + .univ = rRegUniv, .isMove = isMove, .getRegUsage = getRegUsage, + .mapRegs = mapRegs, .genSpill = genSpill, .genReload = genReload, + .genMove = genMove, .directReload = directReload, + .guest_sizeB = guest_sizeB, .ppInstr = ppInstr, .ppReg = ppReg, + .mode64 = mode64}; + switch (vex_control.regalloc_version) { + case 2: + rcode = doRegisterAllocation_v2(vcode, &con); + break; + case 3: + rcode = doRegisterAllocation_v3(vcode, &con); + break; + default: + vassert(0); + } vexAllocSanityCheck(); diff --git a/priv/main_util.c b/priv/main_util.c index 028d353ae..4f2b7091c 100644 --- a/priv/main_util.c +++ b/priv/main_util.c @@ -259,13 +259,40 @@ Bool vex_streq ( const HChar* s1, const HChar* s2 ) } } +/* Vectorised memset, copied from Valgrind's m_libcbase.c. */ void vex_bzero ( void* sV, SizeT n ) { - SizeT i; - UChar* s = (UChar*)sV; - /* No laughing, please. Just don't call this too often. Thank you - for your attention. */ - for (i = 0; i < n; i++) s[i] = 0; +# define IS_4_ALIGNED(aaa_p) (0 == (((HWord)(aaa_p)) & ((HWord)0x3))) + + UChar* d = sV; + + while ((!IS_4_ALIGNED(d)) && n >= 1) { + d[0] = 0; + d++; + n--; + } + if (n == 0) + return; + while (n >= 16) { + ((UInt*)d)[0] = 0; + ((UInt*)d)[1] = 0; + ((UInt*)d)[2] = 0; + ((UInt*)d)[3] = 0; + d += 16; + n -= 16; + } + while (n >= 4) { + ((UInt*)d)[0] = 0; + d += 4; + n -= 4; + } + while (n >= 1) { + d[0] = 0; + d++; + n--; + } + return; +# undef IS_4_ALIGNED } diff --git a/pub/libvex.h b/pub/libvex.h index fae6c5bfd..604b429df 100644 --- a/pub/libvex.h +++ b/pub/libvex.h @@ -492,6 +492,11 @@ typedef /* EXPERIMENTAL: chase across conditional branches? Not all front ends honour this. Default: NO. */ Bool guest_chase_cond; + /* Register allocator version. Allowed values are: + - '2': previous, good and slow implementation. + - '3': current, faster implementation; perhaps producing slightly worse + spilling decisions. */ + UInt regalloc_version; /* Should the arm-thumb lifter be allowed to look before the current instruction pointer in order to check if there are no IT instructions so that it can optimize the IR? 
Default: YES */ From 66be59d3ef9cd8722bed8bcd349514d7c47a1113 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 15 Mar 2019 18:14:16 +0100 Subject: [PATCH 3/9] Port common code changes from 83cabd32 83cabd32: Refactor tracking of MOV coalescing --- priv/host_amd64_defs.c | 55 +++++++++------------------ priv/host_amd64_defs.h | 1 - priv/host_arm64_defs.c | 29 +++------------ priv/host_arm64_defs.h | 1 - priv/host_arm_defs.c | 68 ++++++++++++---------------------- priv/host_arm_defs.h | 1 - priv/host_generic_reg_alloc2.c | 16 ++++---- priv/host_generic_regs.c | 3 ++ priv/host_generic_regs.h | 21 +++++++---- priv/host_mips_defs.c | 31 +++++----------- priv/host_mips_defs.h | 1 - priv/host_ppc_defs.c | 46 ++++++++--------------- priv/host_ppc_defs.h | 1 - priv/host_x86_defs.c | 53 +++++++++----------------- priv/host_x86_defs.h | 1 - priv/main_main.c | 39 +++++++------------ 16 files changed, 128 insertions(+), 239 deletions(-) diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c index 9747b7c6d..f45592655 100644 --- a/priv/host_amd64_defs.c +++ b/priv/host_amd64_defs.c @@ -1406,6 +1406,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src); if (i->Ain.Alu64R.op == Aalu_MOV) { addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst); + + if (i->Ain.Alu64R.src->tag == Armi_Reg) { + u->isRegRegMove = True; + u->regMoveSrc = i->Ain.Alu64R.src->Armi.Reg.reg; + u->regMoveDst = i->Ain.Alu64R.dst; + } return; } if (i->Ain.Alu64R.op == Aalu_CMP) { @@ -1670,6 +1676,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV ? HRmWrite : HRmModify, i->Ain.SseReRg.dst); + + if (i->Ain.SseReRg.op == Asse_MOV) { + u->isRegRegMove = True; + u->regMoveSrc = i->Ain.SseReRg.src; + u->regMoveDst = i->Ain.SseReRg.dst; + } } return; case Ain_SseCMov: @@ -1696,6 +1708,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) //uu addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV //uu ? HRmWrite : HRmModify, //uu i->Ain.AvxReRg.dst); + //uu + //uu if (i->Ain.AvxReRg.op == Asse_MOV) { + //uu u->isRegRegMove = True; + //uu u->regMoveSrc = i->Ain.AvxReRg.src; + //uu u->regMoveDst = i->Ain.AvxReRg.dst; + //uu } //uu } //uu return; case Ain_EvCheck: @@ -1912,43 +1930,6 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) } } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst ) -{ - switch (i->tag) { - case Ain_Alu64R: - /* Moves between integer regs */ - if (i->Ain.Alu64R.op != Aalu_MOV) - return False; - if (i->Ain.Alu64R.src->tag != Armi_Reg) - return False; - *src = i->Ain.Alu64R.src->Armi.Reg.reg; - *dst = i->Ain.Alu64R.dst; - return True; - case Ain_SseReRg: - /* Moves between SSE regs */ - if (i->Ain.SseReRg.op != Asse_MOV) - return False; - *src = i->Ain.SseReRg.src; - *dst = i->Ain.SseReRg.dst; - return True; - //uu case Ain_AvxReRg: - //uu /* Moves between AVX regs */ - //uu if (i->Ain.AvxReRg.op != Asse_MOV) - //uu return False; - //uu *src = i->Ain.AvxReRg.src; - //uu *dst = i->Ain.AvxReRg.dst; - //uu return True; - default: - return False; - } - /*NOTREACHED*/ -} - - /* Generate amd64 spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. 
*/ diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h index 068520ea2..1b04f4489 100644 --- a/priv/host_amd64_defs.h +++ b/priv/host_amd64_defs.h @@ -787,7 +787,6 @@ extern void ppAMD64Instr ( const AMD64Instr*, Bool ); of the underlying instruction set. */ extern void getRegUsage_AMD64Instr ( HRegUsage*, const AMD64Instr*, Bool ); extern void mapRegs_AMD64Instr ( HRegRemap*, AMD64Instr*, Bool ); -extern Bool isMove_AMD64Instr ( const AMD64Instr*, HReg*, HReg* ); extern Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const AMD64Instr* i, diff --git a/priv/host_arm64_defs.c b/priv/host_arm64_defs.c index 417f989ff..facd56126 100644 --- a/priv/host_arm64_defs.c +++ b/priv/host_arm64_defs.c @@ -1947,6 +1947,9 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 ) case ARM64in_MovI: addHRegUse(u, HRmWrite, i->ARM64in.MovI.dst); addHRegUse(u, HRmRead, i->ARM64in.MovI.src); + u->isRegRegMove = True; + u->regMoveSrc = i->ARM64in.MovI.src; + u->regMoveDst = i->ARM64in.MovI.dst; return; case ARM64in_Imm64: addHRegUse(u, HRmWrite, i->ARM64in.Imm64.dst); @@ -2219,6 +2222,9 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 ) case ARM64in_VMov: addHRegUse(u, HRmWrite, i->ARM64in.VMov.dst); addHRegUse(u, HRmRead, i->ARM64in.VMov.src); + u->isRegRegMove = True; + u->regMoveSrc = i->ARM64in.VMov.src; + u->regMoveDst = i->ARM64in.VMov.dst; return; case ARM64in_EvCheck: /* We expect both amodes only to mention x21, so this is in @@ -2489,29 +2495,6 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) } } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_ARM64Instr ( const ARM64Instr* i, HReg* src, HReg* dst ) -{ - switch (i->tag) { - case ARM64in_MovI: - *src = i->ARM64in.MovI.src; - *dst = i->ARM64in.MovI.dst; - return True; - case ARM64in_VMov: - *src = i->ARM64in.VMov.src; - *dst = i->ARM64in.VMov.dst; - return True; - default: - break; - } - - return False; -} - - /* Generate arm spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. */ diff --git a/priv/host_arm64_defs.h b/priv/host_arm64_defs.h index 0a4c248cc..b1a134822 100644 --- a/priv/host_arm64_defs.h +++ b/priv/host_arm64_defs.h @@ -968,7 +968,6 @@ extern void ppARM64Instr ( const ARM64Instr* ); of the underlying instruction set. 
*/ extern void getRegUsage_ARM64Instr ( HRegUsage*, const ARM64Instr*, Bool ); extern void mapRegs_ARM64Instr ( HRegRemap*, ARM64Instr*, Bool ); -extern Bool isMove_ARM64Instr ( const ARM64Instr*, HReg*, HReg* ); extern Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const ARM64Instr* i, Bool mode64, diff --git a/priv/host_arm_defs.c b/priv/host_arm_defs.c index 55ecae354..e38ac9743 100644 --- a/priv/host_arm_defs.c +++ b/priv/host_arm_defs.c @@ -2108,6 +2108,12 @@ void getRegUsage_ARMInstr ( HRegUsage* u, const ARMInstr* i, Bool mode64 ) case ARMin_Mov: addHRegUse(u, HRmWrite, i->ARMin.Mov.dst); addRegUsage_ARMRI84(u, i->ARMin.Mov.src); + + if (i->ARMin.Mov.src->tag == ARMri84_R) { + u->isRegRegMove = True; + u->regMoveSrc = i->ARMin.Mov.src->ARMri84.R.reg; + u->regMoveDst = i->ARMin.Mov.dst; + } return; case ARMin_Imm32: addHRegUse(u, HRmWrite, i->ARMin.Imm32.dst); @@ -2256,10 +2262,22 @@ void getRegUsage_ARMInstr ( HRegUsage* u, const ARMInstr* i, Bool mode64 ) case ARMin_VUnaryD: addHRegUse(u, HRmWrite, i->ARMin.VUnaryD.dst); addHRegUse(u, HRmRead, i->ARMin.VUnaryD.src); + + if (i->ARMin.VUnaryD.op == ARMvfpu_COPY) { + u->isRegRegMove = True; + u->regMoveSrc = i->ARMin.VUnaryD.src; + u->regMoveDst = i->ARMin.VUnaryD.dst; + } return; case ARMin_VUnaryS: addHRegUse(u, HRmWrite, i->ARMin.VUnaryS.dst); addHRegUse(u, HRmRead, i->ARMin.VUnaryS.src); + + if (i->ARMin.VUnaryS.op == ARMvfpu_COPY) { + u->isRegRegMove = True; + u->regMoveSrc = i->ARMin.VUnaryS.src; + u->regMoveDst = i->ARMin.VUnaryS.dst; + } return; case ARMin_VCmpD: addHRegUse(u, HRmRead, i->ARMin.VCmpD.argL); @@ -2350,6 +2368,12 @@ void getRegUsage_ARMInstr ( HRegUsage* u, const ARMInstr* i, Bool mode64 ) case ARMin_NUnary: addHRegUse(u, HRmWrite, i->ARMin.NUnary.dst); addHRegUse(u, HRmRead, i->ARMin.NUnary.src); + + if (i->ARMin.NUnary.op == ARMneon_COPY) { + u->isRegRegMove = True; + u->regMoveSrc = i->ARMin.NUnary.src; + u->regMoveDst = i->ARMin.NUnary.dst; + } return; case ARMin_NUnaryS: addHRegUse(u, HRmWrite, i->ARMin.NUnaryS.dst->reg); @@ -2620,50 +2644,6 @@ void mapRegs_ARMInstr ( HRegRemap* m, ARMInstr* i, Bool mode64 ) } } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_ARMInstr ( const ARMInstr* i, HReg* src, HReg* dst ) -{ - /* Moves between integer regs */ - switch (i->tag) { - case ARMin_Mov: - if (i->ARMin.Mov.src->tag == ARMri84_R) { - *src = i->ARMin.Mov.src->ARMri84.R.reg; - *dst = i->ARMin.Mov.dst; - return True; - } - break; - case ARMin_VUnaryD: - if (i->ARMin.VUnaryD.op == ARMvfpu_COPY) { - *src = i->ARMin.VUnaryD.src; - *dst = i->ARMin.VUnaryD.dst; - return True; - } - break; - case ARMin_VUnaryS: - if (i->ARMin.VUnaryS.op == ARMvfpu_COPY) { - *src = i->ARMin.VUnaryS.src; - *dst = i->ARMin.VUnaryS.dst; - return True; - } - break; - case ARMin_NUnary: - if (i->ARMin.NUnary.op == ARMneon_COPY) { - *src = i->ARMin.NUnary.src; - *dst = i->ARMin.NUnary.dst; - return True; - } - break; - default: - break; - } - - return False; -} - - /* Generate arm spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. */ diff --git a/priv/host_arm_defs.h b/priv/host_arm_defs.h index fe529b85b..42da4aa28 100644 --- a/priv/host_arm_defs.h +++ b/priv/host_arm_defs.h @@ -1059,7 +1059,6 @@ extern void ppARMInstr ( const ARMInstr* ); of the underlying instruction set. 
*/ extern void getRegUsage_ARMInstr ( HRegUsage*, const ARMInstr*, Bool ); extern void mapRegs_ARMInstr ( HRegRemap*, ARMInstr*, Bool ); -extern Bool isMove_ARMInstr ( const ARMInstr*, HReg*, HReg* ); extern Int emit_ARMInstr ( /*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const ARMInstr* i, Bool mode64, diff --git a/priv/host_generic_reg_alloc2.c b/priv/host_generic_reg_alloc2.c index 695b5d7e2..bc8829505 100644 --- a/priv/host_generic_reg_alloc2.c +++ b/priv/host_generic_reg_alloc2.c @@ -45,8 +45,6 @@ /* TODO 27 Oct 04: - Better consistency checking from what isMove tells us. - We can possibly do V-V coalescing even when the src is spilled, providing we can arrange for the dst to have the same spill slot. @@ -561,6 +559,10 @@ HInstrArray* doRegisterAllocation_v2 ( for (Int ii = 0; ii < instrs_in->arr_used; ii++) { con->getRegUsage(®_usage_arr[ii], instrs_in->arr[ii], con->mode64); + reg_usage_arr[ii].isVregVregMove + = reg_usage_arr[ii].isRegRegMove + && hregIsVirtual(reg_usage_arr[ii].regMoveSrc) + && hregIsVirtual(reg_usage_arr[ii].regMoveDst); if (0) { vex_printf("\n%d stage1: ", ii); @@ -1071,12 +1073,10 @@ HInstrArray* doRegisterAllocation_v2 ( /* If doing a reg-reg move between two vregs, and the src's live range ends here and the dst's live range starts here, bind the dst to the src's rreg, and that's all. */ - HReg vregS = INVALID_HREG; - HReg vregD = INVALID_HREG; - if ( con->isMove(instrs_in->arr[ii], &vregS, &vregD) ) { - if (!hregIsVirtual(vregS)) goto cannot_coalesce; - if (!hregIsVirtual(vregD)) goto cannot_coalesce; - /* Check that *isMove is not telling us a bunch of lies ... */ + if (reg_usage_arr[ii].isVregVregMove) { + HReg vregS = reg_usage_arr[ii].regMoveSrc; + HReg vregD = reg_usage_arr[ii].regMoveDst; + /* Check that |isVregVregMove| is not telling us a bunch of lies ... */ vassert(hregClass(vregS) == hregClass(vregD)); Int k = hregIndex(vregS); Int m = hregIndex(vregD); diff --git a/priv/host_generic_regs.c b/priv/host_generic_regs.c index ee2f1b705..efdec9342 100644 --- a/priv/host_generic_regs.c +++ b/priv/host_generic_regs.c @@ -184,6 +184,9 @@ void ppHRegUsage ( const RRegUniverse* univ, HRegUsage* tab ) ppHReg(tab->vRegs[i]); vex_printf("\n"); } + if (tab->isRegRegMove) { + vex_printf(" (is a reg-reg move)\n"); + } vex_printf("}\n"); } diff --git a/priv/host_generic_regs.h b/priv/host_generic_regs.h index d1f8ac69f..9436e8313 100644 --- a/priv/host_generic_regs.h +++ b/priv/host_generic_regs.h @@ -300,6 +300,16 @@ typedef HReg vRegs[N_HREGUSAGE_VREGS]; HRegMode vMode[N_HREGUSAGE_VREGS]; UInt n_vRegs; + + /* Hint to the register allocator: this instruction is actually a move + between two registers: regMoveSrc -> regMoveDst. */ + Bool isRegRegMove; + HReg regMoveSrc; + HReg regMoveDst; + + /* Used internally by the register allocator. The reg-reg move is + actually a vreg-vreg move. */ + Bool isVregVregMove; } HRegUsage; @@ -307,9 +317,10 @@ extern void ppHRegUsage ( const RRegUniverse*, HRegUsage* ); static inline void initHRegUsage ( HRegUsage* tab ) { - tab->rRead = 0; - tab->rWritten = 0; - tab->n_vRegs = 0; + tab->rRead = 0; + tab->rWritten = 0; + tab->n_vRegs = 0; + tab->isRegRegMove = False; } /* Add a register to a usage table. Combine incoming read uses with @@ -471,10 +482,6 @@ typedef allocation. */ const RRegUniverse* univ; - /* Return True iff the given insn is a reg-reg move, in which case also - return the src and dst regs. */ - Bool (*isMove)(const HInstr*, HReg*, HReg*); - /* Get info about register usage in this insn. 
*/ void (*getRegUsage)(HRegUsage*, const HInstr*, Bool); diff --git a/priv/host_mips_defs.c b/priv/host_mips_defs.c index 0f08216d7..8597126ec 100644 --- a/priv/host_mips_defs.c +++ b/priv/host_mips_defs.c @@ -1578,6 +1578,15 @@ void getRegUsage_MIPSInstr(HRegUsage * u, const MIPSInstr * i, Bool mode64) addHRegUse(u, HRmRead, i->Min.Alu.srcL); addRegUsage_MIPSRH(u, i->Min.Alu.srcR); addHRegUse(u, HRmWrite, i->Min.Alu.dst); + + /* or Rd,Rs,Rs == mr Rd,Rs */ + if ((i->Min.Alu.op == Malu_OR) + && (i->Min.Alu.srcR->tag == Mrh_Reg) + && sameHReg(i->Min.Alu.srcR->Mrh.Reg.reg, i->Min.Alu.srcL)) { + u->isRegRegMove = True; + u->regMoveSrc = i->Min.Alu.srcL; + u->regMoveDst = i->Min.Alu.dst; + } return; case Min_Shft: addHRegUse(u, HRmRead, i->Min.Shft.srcL); @@ -1942,28 +1951,6 @@ void mapRegs_MIPSInstr(HRegRemap * m, MIPSInstr * i, Bool mode64) } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_MIPSInstr(const MIPSInstr * i, HReg * src, HReg * dst) -{ - /* Moves between integer regs */ - if (i->tag == Min_Alu) { - /* or Rd,Rs,Rs == mr Rd,Rs */ - if (i->Min.Alu.op != Malu_OR) - return False; - if (i->Min.Alu.srcR->tag != Mrh_Reg) - return False; - if (!sameHReg(i->Min.Alu.srcR->Mrh.Reg.reg, i->Min.Alu.srcL)) - return False; - *src = i->Min.Alu.srcL; - *dst = i->Min.Alu.dst; - return True; - } - return False; -} - /* Generate mips spill/reload instructions under the direction of the register allocator. */ void genSpill_MIPS( /*OUT*/ HInstr ** i1, /*OUT*/ HInstr ** i2, HReg rreg, diff --git a/priv/host_mips_defs.h b/priv/host_mips_defs.h index b0fda59a4..baec7e5a7 100644 --- a/priv/host_mips_defs.h +++ b/priv/host_mips_defs.h @@ -687,7 +687,6 @@ extern void ppMIPSInstr(const MIPSInstr *, Bool mode64); of the underlying instruction set. 
*/ extern void getRegUsage_MIPSInstr (HRegUsage *, const MIPSInstr *, Bool); extern void mapRegs_MIPSInstr (HRegRemap *, MIPSInstr *, Bool mode64); -extern Bool isMove_MIPSInstr (const MIPSInstr *, HReg *, HReg *); extern Int emit_MIPSInstr (/*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const MIPSInstr* i, Bool mode64, diff --git a/priv/host_ppc_defs.c b/priv/host_ppc_defs.c index a2d8f9ef5..41cec1791 100644 --- a/priv/host_ppc_defs.c +++ b/priv/host_ppc_defs.c @@ -2360,6 +2360,15 @@ void getRegUsage_PPCInstr ( HRegUsage* u, const PPCInstr* i, Bool mode64 ) addHRegUse(u, HRmRead, i->Pin.Alu.srcL); addRegUsage_PPCRH(u, i->Pin.Alu.srcR); addHRegUse(u, HRmWrite, i->Pin.Alu.dst); + + // or Rd,Rs,Rs == mr Rd,Rs + if ((i->Pin.Alu.op == Palu_OR) + && (i->Pin.Alu.srcR->tag == Prh_Reg) + && sameHReg(i->Pin.Alu.srcR->Prh.Reg.reg, i->Pin.Alu.srcL)) { + u->isRegRegMove = True; + u->regMoveSrc = i->Pin.Alu.srcL; + u->regMoveDst = i->Pin.Alu.dst; + } return; case Pin_Shft: addHRegUse(u, HRmRead, i->Pin.Shft.srcL); @@ -2487,6 +2496,12 @@ void getRegUsage_PPCInstr ( HRegUsage* u, const PPCInstr* i, Bool mode64 ) case Pin_FpUnary: addHRegUse(u, HRmWrite, i->Pin.FpUnary.dst); addHRegUse(u, HRmRead, i->Pin.FpUnary.src); + + if (i->Pin.FpUnary.op == Pfp_MOV) { + u->isRegRegMove = True; + u->regMoveSrc = i->Pin.FpUnary.src; + u->regMoveDst = i->Pin.FpUnary.dst; + } return; case Pin_FpBinary: addHRegUse(u, HRmWrite, i->Pin.FpBinary.dst); @@ -3117,37 +3132,6 @@ void mapRegs_PPCInstr ( HRegRemap* m, PPCInstr* i, Bool mode64 ) } } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_PPCInstr ( const PPCInstr* i, HReg* src, HReg* dst ) -{ - /* Moves between integer regs */ - if (i->tag == Pin_Alu) { - // or Rd,Rs,Rs == mr Rd,Rs - if (i->Pin.Alu.op != Palu_OR) - return False; - if (i->Pin.Alu.srcR->tag != Prh_Reg) - return False; - if (! sameHReg(i->Pin.Alu.srcR->Prh.Reg.reg, i->Pin.Alu.srcL)) - return False; - *src = i->Pin.Alu.srcL; - *dst = i->Pin.Alu.dst; - return True; - } - /* Moves between FP regs */ - if (i->tag == Pin_FpUnary) { - if (i->Pin.FpUnary.op != Pfp_MOV) - return False; - *src = i->Pin.FpUnary.src; - *dst = i->Pin.FpUnary.dst; - return True; - } - return False; -} - - /* Generate ppc spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. */ diff --git a/priv/host_ppc_defs.h b/priv/host_ppc_defs.h index 63ac3f6fd..5d5b09067 100644 --- a/priv/host_ppc_defs.h +++ b/priv/host_ppc_defs.h @@ -1203,7 +1203,6 @@ extern void ppPPCInstr(const PPCInstr*, Bool mode64); of the underlying instruction set. 
*/ extern void getRegUsage_PPCInstr ( HRegUsage*, const PPCInstr*, Bool mode64 ); extern void mapRegs_PPCInstr ( HRegRemap*, PPCInstr* , Bool mode64); -extern Bool isMove_PPCInstr ( const PPCInstr*, HReg*, HReg* ); extern Int emit_PPCInstr ( /*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const PPCInstr* i, Bool mode64, diff --git a/priv/host_x86_defs.c b/priv/host_x86_defs.c index 829d59d74..fd9dea399 100644 --- a/priv/host_x86_defs.c +++ b/priv/host_x86_defs.c @@ -1234,6 +1234,12 @@ void getRegUsage_X86Instr (HRegUsage* u, const X86Instr* i, Bool mode64) addRegUsage_X86RMI(u, i->Xin.Alu32R.src); if (i->Xin.Alu32R.op == Xalu_MOV) { addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst); + + if (i->Xin.Alu32R.src->tag == Xrmi_Reg) { + u->isRegRegMove = True; + u->regMoveSrc = i->Xin.Alu32R.src->Xrmi.Reg.reg; + u->regMoveDst = i->Xin.Alu32R.dst; + } return; } if (i->Xin.Alu32R.op == Xalu_CMP) { @@ -1374,6 +1380,12 @@ void getRegUsage_X86Instr (HRegUsage* u, const X86Instr* i, Bool mode64) case Xin_FpUnary: addHRegUse(u, HRmRead, i->Xin.FpUnary.src); addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst); + + if (i->Xin.FpUnary.op == Xfp_MOV) { + u->isRegRegMove = True; + u->regMoveSrc = i->Xin.FpUnary.src; + u->regMoveDst = i->Xin.FpUnary.dst; + } return; case Xin_FpBinary: addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL); @@ -1469,6 +1481,12 @@ void getRegUsage_X86Instr (HRegUsage* u, const X86Instr* i, Bool mode64) addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV ? HRmWrite : HRmModify, i->Xin.SseReRg.dst); + + if (i->Xin.SseReRg.op == Xsse_MOV) { + u->isRegRegMove = True; + u->regMoveSrc = i->Xin.SseReRg.src; + u->regMoveDst = i->Xin.SseReRg.dst; + } } return; case Xin_SseCMov: @@ -1668,41 +1686,6 @@ void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 ) } } -/* Figure out if i represents a reg-reg move, and if so assign the - source and destination to *src and *dst. If in doubt say No. Used - by the register allocator to do move coalescing. -*/ -Bool isMove_X86Instr ( const X86Instr* i, HReg* src, HReg* dst ) -{ - /* Moves between integer regs */ - if (i->tag == Xin_Alu32R) { - if (i->Xin.Alu32R.op != Xalu_MOV) - return False; - if (i->Xin.Alu32R.src->tag != Xrmi_Reg) - return False; - *src = i->Xin.Alu32R.src->Xrmi.Reg.reg; - *dst = i->Xin.Alu32R.dst; - return True; - } - /* Moves between FP regs */ - if (i->tag == Xin_FpUnary) { - if (i->Xin.FpUnary.op != Xfp_MOV) - return False; - *src = i->Xin.FpUnary.src; - *dst = i->Xin.FpUnary.dst; - return True; - } - if (i->tag == Xin_SseReRg) { - if (i->Xin.SseReRg.op != Xsse_MOV) - return False; - *src = i->Xin.SseReRg.src; - *dst = i->Xin.SseReRg.dst; - return True; - } - return False; -} - - /* Generate x86 spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. */ diff --git a/priv/host_x86_defs.h b/priv/host_x86_defs.h index 3312c6e76..9c0c0eeec 100644 --- a/priv/host_x86_defs.h +++ b/priv/host_x86_defs.h @@ -718,7 +718,6 @@ extern void ppX86Instr ( const X86Instr*, Bool ); of the underlying instruction set. 
*/ extern void getRegUsage_X86Instr ( HRegUsage*, const X86Instr*, Bool ); extern void mapRegs_X86Instr ( HRegRemap*, X86Instr*, Bool ); -extern Bool isMove_X86Instr ( const X86Instr*, HReg*, HReg* ); extern Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, UChar* buf, Int nbuf, const X86Instr* i, Bool mode64, diff --git a/priv/main_main.c b/priv/main_main.c index 05c5a0cf5..5229d4276 100644 --- a/priv/main_main.c +++ b/priv/main_main.c @@ -751,7 +751,6 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, /* This the bundle of functions we need to do the back-end stuff (insn selection, reg-alloc, assembly) whilst being insulated from the target instruction set. */ - Bool (*isMove) ( const HInstr*, HReg*, HReg* ); void (*getRegUsage) ( HRegUsage*, const HInstr*, Bool ); void (*mapRegs) ( HRegRemap*, HInstr*, Bool ); void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ); @@ -781,7 +780,6 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, HInstrArray* vcode; HInstrArray* rcode; - isMove = NULL; getRegUsage = NULL; mapRegs = NULL; genSpill = NULL; @@ -907,8 +905,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchX86: mode64 = False; rRegUniv = X86FN(getRRegUniverse_X86()); - isMove = CAST_AS(isMove) X86FN(isMove_X86Instr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) X86FN(getRegUsage_X86Instr); mapRegs = CAST_AS(mapRegs) X86FN(mapRegs_X86Instr); genSpill = CAST_AS(genSpill) X86FN(genSpill_X86); @@ -925,8 +922,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchAMD64: mode64 = True; rRegUniv = AMD64FN(getRRegUniverse_AMD64()); - isMove = CAST_AS(isMove) AMD64FN(isMove_AMD64Instr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) AMD64FN(getRegUsage_AMD64Instr); mapRegs = CAST_AS(mapRegs) AMD64FN(mapRegs_AMD64Instr); genSpill = CAST_AS(genSpill) AMD64FN(genSpill_AMD64); @@ -942,8 +938,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchPPC32: mode64 = False; rRegUniv = PPC32FN(getRRegUniverse_PPC(mode64)); - isMove = CAST_AS(isMove) PPC32FN(isMove_PPCInstr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) PPC32FN(getRegUsage_PPCInstr); mapRegs = CAST_AS(mapRegs) PPC32FN(mapRegs_PPCInstr); genSpill = CAST_AS(genSpill) PPC32FN(genSpill_PPC); @@ -959,8 +954,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchPPC64: mode64 = True; rRegUniv = PPC64FN(getRRegUniverse_PPC(mode64)); - isMove = CAST_AS(isMove) PPC64FN(isMove_PPCInstr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) PPC64FN(getRegUsage_PPCInstr); mapRegs = CAST_AS(mapRegs) PPC64FN(mapRegs_PPCInstr); genSpill = CAST_AS(genSpill) PPC64FN(genSpill_PPC); @@ -977,8 +971,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchS390X: mode64 = True; rRegUniv = S390FN(getRRegUniverse_S390()); - isMove = CAST_AS(isMove) S390FN(isMove_S390Instr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) S390FN(getRegUsage_S390Instr); mapRegs = CAST_AS(mapRegs) S390FN(mapRegs_S390Instr); genSpill = CAST_AS(genSpill) S390FN(genSpill_S390); @@ -995,8 +988,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchARM: mode64 = False; rRegUniv = ARMFN(getRRegUniverse_ARM()); - isMove = CAST_AS(isMove) ARMFN(isMove_ARMInstr); - getRegUsage + getRegUsage = CAST_AS(getRegUsage) ARMFN(getRegUsage_ARMInstr); mapRegs = CAST_AS(mapRegs) ARMFN(mapRegs_ARMInstr); genSpill = CAST_AS(genSpill) ARMFN(genSpill_ARM); @@ -1012,8 +1004,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta, case VexArchARM64: mode64 = True; rRegUniv = ARM64FN(getRRegUniverse_ARM64()); - isMove = CAST_AS(isMove) ARM64FN(isMove_ARM64Instr); - 
getRegUsage
+         getRegUsage
             = CAST_AS(getRegUsage) ARM64FN(getRegUsage_ARM64Instr);
          mapRegs      = CAST_AS(mapRegs) ARM64FN(mapRegs_ARM64Instr);
          genSpill     = CAST_AS(genSpill) ARM64FN(genSpill_ARM64);
@@ -1029,8 +1020,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta,
       case VexArchMIPS32:
          mode64       = False;
          rRegUniv     = MIPS32FN(getRRegUniverse_MIPS(mode64));
-         isMove       = CAST_AS(isMove) MIPS32FN(isMove_MIPSInstr);
-         getRegUsage
+         getRegUsage
             = CAST_AS(getRegUsage) MIPS32FN(getRegUsage_MIPSInstr);
          mapRegs      = CAST_AS(mapRegs) MIPS32FN(mapRegs_MIPSInstr);
          genSpill     = CAST_AS(genSpill) MIPS32FN(genSpill_MIPS);
@@ -1047,8 +1037,7 @@ void LibVEX_Codegen ( VexTranslateArgs *vta,
       case VexArchMIPS64:
          mode64       = True;
          rRegUniv     = MIPS64FN(getRRegUniverse_MIPS(mode64));
-         isMove       = CAST_AS(isMove) MIPS64FN(isMove_MIPSInstr);
-         getRegUsage
+         getRegUsage
             = CAST_AS(getRegUsage) MIPS64FN(getRegUsage_MIPSInstr);
          mapRegs      = CAST_AS(mapRegs) MIPS64FN(mapRegs_MIPSInstr);
          genSpill     = CAST_AS(genSpill) MIPS64FN(genSpill_MIPS);
@@ -1065,7 +1054,6 @@ void LibVEX_Codegen ( VexTranslateArgs *vta,
       case VexArchTILEGX:
          mode64      = True;
          rRegUniv    = TILEGXFN(getRRegUniverse_TILEGX());
-         isMove      = CAST_AS(isMove) TILEGXFN(isMove_TILEGXInstr);
          getRegUsage =
            CAST_AS(getRegUsage) TILEGXFN(getRegUsage_TILEGXInstr);
          mapRegs     = CAST_AS(mapRegs) TILEGXFN(mapRegs_TILEGXInstr);
@@ -1147,11 +1135,10 @@ void LibVEX_Codegen ( VexTranslateArgs *vta,
 
    /* Register allocate. */
    RegAllocControl con = {
-      .univ = rRegUniv, .isMove = isMove, .getRegUsage = getRegUsage,
-      .mapRegs = mapRegs, .genSpill = genSpill, .genReload = genReload,
-      .genMove = genMove, .directReload = directReload,
-      .guest_sizeB = guest_sizeB, .ppInstr = ppInstr, .ppReg = ppReg,
-      .mode64 = mode64};
+      .univ = rRegUniv, .getRegUsage = getRegUsage, .mapRegs = mapRegs,
+      .genSpill = genSpill, .genReload = genReload, .genMove = genMove,
+      .directReload = directReload, .guest_sizeB = guest_sizeB,
+      .ppInstr = ppInstr, .ppReg = ppReg, .mode64 = mode64};
    switch (vex_control.regalloc_version) {
       case 2:
          rcode = doRegisterAllocation_v2(vcode, &con);
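
The shape of the contract that replaces the deleted isMove_<ARCH>Instr callbacks is easiest to see in isolation: each backend's getRegUsage_<ARCH>Instr now reports a reg-reg move as a by-product of the usage computation, and the allocator derives the vreg-vreg flag itself during its first pass (see the host_generic_reg_alloc2.c hunk above), so it no longer re-decodes the instruction through a second callback. Here is a sketch against the real HRegUsage API from host_generic_regs.h; the ToyMovInstr type is invented for illustration:

   #include "host_generic_regs.h"

   /* Invented one-instruction "backend", showing only the new hint.
      Real backends do this inside their getRegUsage_<ARCH>Instr. */
   typedef struct { HReg src; HReg dst; } ToyMovInstr;

   static void getRegUsage_ToyMovInstr ( HRegUsage* u, const ToyMovInstr* i )
   {
      initHRegUsage(u);                /* also clears u->isRegRegMove */
      addHRegUse(u, HRmRead,  i->src);
      addHRegUse(u, HRmWrite, i->dst);
      /* What used to be a separate isMove_<ARCH>Instr query is now a
         by-product of the usage computation: */
      u->isRegRegMove = True;
      u->regMoveSrc   = i->src;
      u->regMoveDst   = i->dst;
   }

From 4731397593745ad1ef61fa685a88e77e7d66474b Mon Sep 17 00:00:00 2001
From: mephi42 
Date: Fri, 15 Mar 2019 17:48:51 +0100
Subject: [PATCH 4/9] Pick common code changes from f1a49eeb

f1a49eeb: s390x: z13 vector "support" instructions not implemented
---
 priv/ir_defs.c      | 17 +++++++++++++++++
 priv/main_main.c    |  3 +++
 pub/libvex.h        |  4 +++-
 pub/libvex_emnote.h |  3 +++
 pub/libvex_ir.h     | 10 +++++++++-
 5 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 4449e6276..ebc864600 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -1118,6 +1118,11 @@ void ppIROp ( IROp op )
       case Iop_GetElem32x4: vex_printf("GetElem32x4"); return;
       case Iop_GetElem64x2: vex_printf("GetElem64x2"); return;
 
+      case Iop_SetElem8x16: vex_printf("SetElem8x16"); return;
+      case Iop_SetElem16x8: vex_printf("SetElem16x8"); return;
+      case Iop_SetElem32x4: vex_printf("SetElem32x4"); return;
+      case Iop_SetElem64x2: vex_printf("SetElem64x2"); return;
+
       case Iop_GetElem8x8: vex_printf("GetElem8x8"); return;
       case Iop_GetElem16x4: vex_printf("GetElem16x4"); return;
       case Iop_GetElem32x2: vex_printf("GetElem32x2"); return;
@@ -1130,6 +1135,7 @@ void ppIROp ( IROp op )
 
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
       case Iop_Perm32x4: vex_printf("Perm32x4"); return;
+      case Iop_Perm8x16x2: vex_printf("Perm8x16x2"); return;
       case Iop_Reverse8sIn16_x8: vex_printf("Reverse8sIn16_x8"); return;
       case Iop_Reverse8sIn32_x4: vex_printf("Reverse8sIn32_x4"); return;
       case 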
Iop_Reverse16sIn32_x4: vex_printf("Reverse16sIn32_x4"); return; @@ -3133,6 +3139,9 @@ void typeOfPrimop ( IROp op, case Iop_MulI128by10ECarry: BINARY(Ity_V128,Ity_V128, Ity_V128); + case Iop_Perm8x16x2: + TERNARY(Ity_V128, Ity_V128, Ity_V128, Ity_V128); + case Iop_PolynomialMull8x8: case Iop_Mull8Ux8: case Iop_Mull8Sx8: case Iop_Mull16Ux4: case Iop_Mull16Sx4: @@ -3228,6 +3237,14 @@ void typeOfPrimop ( IROp op, BINARY(Ity_V128, Ity_I8, Ity_I32); case Iop_GetElem64x2: BINARY(Ity_V128, Ity_I8, Ity_I64); + case Iop_SetElem8x16: + TERNARY(Ity_V128, Ity_I8, Ity_I8, Ity_V128); + case Iop_SetElem16x8: + TERNARY(Ity_V128, Ity_I8, Ity_I16, Ity_V128); + case Iop_SetElem32x4: + TERNARY(Ity_V128, Ity_I8, Ity_I32, Ity_V128); + case Iop_SetElem64x2: + TERNARY(Ity_V128, Ity_I8, Ity_I64, Ity_V128); case Iop_GetElem8x8: BINARY(Ity_I64, Ity_I8, Ity_I8); case Iop_GetElem16x4: diff --git a/priv/main_main.c b/priv/main_main.c index 5229d4276..0e394e104 100644 --- a/priv/main_main.c +++ b/priv/main_main.c @@ -1496,6 +1496,9 @@ const HChar* LibVEX_EmNote_string ( VexEmNote ew ) case EmFail_S390X_invalid_PFPO_function: return "The function code in GPR 0 for the PFPO instruction" " is invalid"; + case EmFail_S390X_vx: + return "Encountered an instruction that requires the vector facility.\n" + " That facility is not available on this host"; default: vpanic("LibVEX_EmNote_string: unknown warning"); } diff --git a/pub/libvex.h b/pub/libvex.h index 604b429df..fed8867bf 100644 --- a/pub/libvex.h +++ b/pub/libvex.h @@ -159,6 +159,7 @@ typedef #define VEX_HWCAPS_S390X_FPEXT (1<<15) /* Floating point extension facility */ #define VEX_HWCAPS_S390X_LSC (1<<16) /* Conditional load/store facility */ #define VEX_HWCAPS_S390X_PFPO (1<<17) /* Perform floating point ops facility */ +#define VEX_HWCAPS_S390X_VX (1<<18) /* Vector facility */ /* Special value representing all available s390x hwcaps */ #define VEX_HWCAPS_S390X_ALL (VEX_HWCAPS_S390X_LDISP | \ @@ -172,7 +173,8 @@ typedef VEX_HWCAPS_S390X_LSC | \ VEX_HWCAPS_S390X_ETF3 | \ VEX_HWCAPS_S390X_ETF2 | \ - VEX_HWCAPS_S390X_PFPO) + VEX_HWCAPS_S390X_PFPO | \ + VEX_HWCAPS_S390X_VX) #define VEX_HWCAPS_S390X(x) ((x) & ~VEX_S390X_MODEL_MASK) #define VEX_S390X_MODEL(x) ((x) & VEX_S390X_MODEL_MASK) diff --git a/pub/libvex_emnote.h b/pub/libvex_emnote.h index 943513062..61a58edc3 100644 --- a/pub/libvex_emnote.h +++ b/pub/libvex_emnote.h @@ -120,6 +120,9 @@ typedef instruction is invalid */ EmFail_S390X_invalid_PFPO_function, + /* some insn needs vector facility which is not available on this host */ + EmFail_S390X_vx, + EmNote_NUMBER } VexEmNote; diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h index bfda265b8..ebef9fba8 100644 --- a/pub/libvex_ir.h +++ b/pub/libvex_ir.h @@ -1763,9 +1763,11 @@ typedef Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8, Iop_CatEvenLanes32x4, /* GET elements of VECTOR - GET is binop (V128, I8) -> I */ + GET is binop (V128, I8) -> I + SET is triop (V128, I8, I) -> V128 */ /* Note: the arm back-end handles only constant second argument. */ Iop_GetElem8x16, Iop_GetElem16x8, Iop_GetElem32x4, Iop_GetElem64x2, + Iop_SetElem8x16, Iop_SetElem16x8, Iop_SetElem32x4, Iop_SetElem64x2, /* DUPLICATING -- copy value to all lanes */ Iop_Dup8x16, Iop_Dup16x8, Iop_Dup32x4, @@ -1794,6 +1796,12 @@ typedef Iop_Perm8x16, Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 
3 */ + /* same, but Triop (argL consists of two 128-bit parts) */ + /* correct range for argR values is 0..31 */ + /* (V128, V128, V128) -> V128 */ + /* (ArgL_first, ArgL_second, ArgR) -> result */ + Iop_Perm8x16x2, + /* MISC CONVERSION -- get high bits of each byte lane, a la x86/amd64 pmovmskb */ Iop_GetMSBs8x16, /* V128 -> I16 */ From 7e18b5a08be3040648b2511ed2b141d4778ac723 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 15 Mar 2019 17:49:55 +0100 Subject: [PATCH 5/9] Pick common code changes from d44563c4 d44563c4: s390x: new non-vector z13 instructions not implemented --- pub/libvex.h | 5 ++++- pub/libvex_emnote.h | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pub/libvex.h b/pub/libvex.h index fed8867bf..223da74bc 100644 --- a/pub/libvex.h +++ b/pub/libvex.h @@ -160,6 +160,8 @@ typedef #define VEX_HWCAPS_S390X_LSC (1<<16) /* Conditional load/store facility */ #define VEX_HWCAPS_S390X_PFPO (1<<17) /* Perform floating point ops facility */ #define VEX_HWCAPS_S390X_VX (1<<18) /* Vector facility */ +#define VEX_HWCAPS_S390X_MSA5 (1<<19) /* message security assistance facility */ + /* Special value representing all available s390x hwcaps */ #define VEX_HWCAPS_S390X_ALL (VEX_HWCAPS_S390X_LDISP | \ @@ -174,7 +176,8 @@ typedef VEX_HWCAPS_S390X_ETF3 | \ VEX_HWCAPS_S390X_ETF2 | \ VEX_HWCAPS_S390X_PFPO | \ - VEX_HWCAPS_S390X_VX) + VEX_HWCAPS_S390X_VX | \ + VEX_HWCAPS_S390X_MSA5) #define VEX_HWCAPS_S390X(x) ((x) & ~VEX_S390X_MODEL_MASK) #define VEX_S390X_MODEL(x) ((x) & VEX_S390X_MODEL_MASK) diff --git a/pub/libvex_emnote.h b/pub/libvex_emnote.h index 61a58edc3..bb0c064d9 100644 --- a/pub/libvex_emnote.h +++ b/pub/libvex_emnote.h @@ -123,6 +123,9 @@ typedef /* some insn needs vector facility which is not available on this host */ EmFail_S390X_vx, + /* ppno insn is not supported on this host */ + EmFail_S390X_ppno, + EmNote_NUMBER } VexEmNote; From fa3229117f3f6d2cd4ec854b19ad22fa0e4bed67 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 15 Mar 2019 17:41:13 +0100 Subject: [PATCH 6/9] Pick common code changes from 20976f43 20976f43: s390x: Implement conditional trap instructions --- pub/libvex_ir.h | 1 + pub/libvex_trc_values.h | 1 + 2 files changed, 2 insertions(+) diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h index ebef9fba8..5d79270f2 100644 --- a/pub/libvex_ir.h +++ b/pub/libvex_ir.h @@ -2341,6 +2341,7 @@ typedef Ijk_SigTRAP, /* current instruction synths SIGTRAP */ Ijk_SigSEGV, /* current instruction synths SIGSEGV */ Ijk_SigBUS, /* current instruction synths SIGBUS */ + Ijk_SigFPE, /* current instruction synths generic SIGFPE */ Ijk_SigFPE_IntDiv, /* current instruction synths SIGFPE - IntDiv */ Ijk_SigFPE_IntOvf, /* current instruction synths SIGFPE - IntOvf */ /* Unfortunately, various guest-dependent syscall kinds. 
They

diff --git a/pub/libvex_trc_values.h b/pub/libvex_trc_values.h
index 38ff2e125..cafe1cec5 100644
--- a/pub/libvex_trc_values.h
+++ b/pub/libvex_trc_values.h
@@ -58,6 +58,7 @@
 #define VEX_TRC_JMP_SIGSEGV 87 /* deliver segv (SIGSEGV) before
                                   continuing */
 #define VEX_TRC_JMP_SIGBUS 93 /* deliver SIGBUS before continuing */
+#define VEX_TRC_JMP_SIGFPE 105 /* deliver SIGFPE before continuing */
 
 #define VEX_TRC_JMP_SIGFPE_INTDIV 97 /* deliver SIGFPE (integer divide
                                         by zero) before continuing */
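
The VEX_TRC_JMP_SIGFPE value above is the run-time counterpart of the Ijk_SigFPE jump kind added in the previous hunk. In a front end, raising the generic form would follow the same pattern as the existing IntDiv/IntOvf variants; a sketch, in which put_IA, mkaddr_expr and guest_IA_next_instr stand in for the s390x front end's usual helpers:

   /* Sketch: terminate the superblock so the run-time delivers a
      generic SIGFPE before the next guest instruction executes. */
   put_IA(mkaddr_expr(guest_IA_next_instr));
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_SigFPE;

From 4fee547adbb2e474579547e2780c55715ad8ee28 Mon Sep 17 00:00:00 2001
From: mephi42 
Date: Fri, 15 Mar 2019 17:45:03 +0100
Subject: [PATCH 7/9] Pick common code changes from 1cc1d564

1cc1d564: s390x: Vector integer and string instruction support
---
 priv/ir_defs.c     | 27 +++++++++++++++++++--------
 pub/libvex_ir.h    | 17 +++++++++--------
 useful/test_main.c | 21 +++++++++++++++++++++
 3 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index ebc864600..03a62b7a5 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -797,6 +797,7 @@ void ppIROp ( IROp op )
       case Iop_CmpNEZ16x8: vex_printf("CmpNEZ16x8"); return;
       case Iop_CmpNEZ32x4: vex_printf("CmpNEZ32x4"); return;
       case Iop_CmpNEZ64x2: vex_printf("CmpNEZ64x2"); return;
+      case Iop_CmpNEZ128x1: vex_printf("CmpNEZ128x1"); return;
 
       case Iop_Abs8x16: vex_printf("Abs8x16"); return;
       case Iop_Abs16x8: vex_printf("Abs16x8"); return;
@@ -807,6 +808,7 @@ void ppIROp ( IROp op )
       case Iop_Add16x8: vex_printf("Add16x8"); return;
       case Iop_Add32x4: vex_printf("Add32x4"); return;
       case Iop_Add64x2: vex_printf("Add64x2"); return;
+      case Iop_Add128x1: vex_printf("Add128x1"); return;
       case Iop_QAdd8Ux16: vex_printf("QAdd8Ux16"); return;
       case Iop_QAdd16Ux8: vex_printf("QAdd16Ux8"); return;
       case Iop_QAdd32Ux4: vex_printf("QAdd32Ux4"); return;
@@ -831,6 +833,7 @@ void ppIROp ( IROp op )
       case Iop_PwAddL8Ux16: vex_printf("PwAddL8Ux16"); return;
       case Iop_PwAddL16Ux8: vex_printf("PwAddL16Ux8"); return;
       case Iop_PwAddL32Ux4: vex_printf("PwAddL32Ux4"); return;
+      case Iop_PwAddL64Ux2: vex_printf("PwAddL64Ux2"); return;
       case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return;
       case Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return;
       case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return;
@@ -839,6 +842,7 @@ void ppIROp ( IROp op )
       case Iop_Sub16x8: vex_printf("Sub16x8"); return;
       case Iop_Sub32x4: vex_printf("Sub32x4"); return;
       case Iop_Sub64x2: vex_printf("Sub64x2"); return;
+      case Iop_Sub128x1: vex_printf("Sub128x1"); return;
       case Iop_QSub8Ux16: vex_printf("QSub8Ux16"); return;
       case Iop_QSub16Ux8: vex_printf("QSub16Ux8"); return;
       case Iop_QSub32Ux4: vex_printf("QSub32Ux4"); return;
@@ -859,8 +863,10 @@ void ppIROp ( IROp op )
       case Iop_Mull32Sx2: vex_printf("Mull32Sx2"); return;
       case Iop_PolynomialMul8x16: vex_printf("PolynomialMul8x16"); return;
       case Iop_PolynomialMull8x8: vex_printf("PolynomialMull8x8"); return;
+      case Iop_MulHi8Ux16: vex_printf("MulHi8Ux16"); return;
       case Iop_MulHi16Ux8: vex_printf("MulHi16Ux8"); return;
       case Iop_MulHi32Ux4: vex_printf("MulHi32Ux4"); return;
+      case Iop_MulHi8Sx16: vex_printf("MulHi8Sx16"); return;
       case Iop_MulHi16Sx8: vex_printf("MulHi16Sx8"); return;
       case Iop_MulHi32Sx4: vex_printf("MulHi32Sx4"); return;
       case Iop_QDMulHi16Sx8: vex_printf("QDMulHi16Sx8"); return;
@@ -887,9 +893,11 @@ void ppIROp ( IROp op )
       case Iop_Avg8Ux16: vex_printf("Avg8Ux16"); return;
       case Iop_Avg16Ux8: vex_printf("Avg16Ux8"); return;
       case Iop_Avg32Ux4: vex_printf("Avg32Ux4"); return;
+      case Iop_Avg64Ux2: vex_printf("Avg64Ux2"); return;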
case Iop_Avg8Sx16: vex_printf("Avg8Sx16"); return; case Iop_Avg16Sx8: vex_printf("Avg16Sx8"); return; case Iop_Avg32Sx4: vex_printf("Avg32Sx4"); return; + case Iop_Avg64Sx2: vex_printf("Avg64Sx2"); return; case Iop_Max8Sx16: vex_printf("Max8Sx16"); return; case Iop_Max16Sx8: vex_printf("Max16Sx8"); return; @@ -937,6 +945,7 @@ void ppIROp ( IROp op ) case Iop_ShlV128: vex_printf("ShlV128"); return; case Iop_ShrV128: vex_printf("ShrV128"); return; + case Iop_SarV128: vex_printf("SarV128"); return; case Iop_ShlN8x16: vex_printf("ShlN8x16"); return; case Iop_ShlN16x8: vex_printf("ShlN16x8"); return; @@ -1592,6 +1601,7 @@ void ppIRJumpKind ( IRJumpKind kind ) case Ijk_SigTRAP: vex_printf("SigTRAP"); break; case Ijk_SigSEGV: vex_printf("SigSEGV"); break; case Ijk_SigBUS: vex_printf("SigBUS"); break; + case Ijk_SigFPE: vex_printf("SigFPE"); break; case Ijk_SigFPE_IntDiv: vex_printf("SigFPE_IntDiv"); break; case Ijk_SigFPE_IntOvf: vex_printf("SigFPE_IntOvf"); break; case Ijk_Sys_syscall: vex_printf("Sys_syscall"); break; @@ -3053,7 +3063,7 @@ void typeOfPrimop ( IROp op, case Iop_Sub64F0x2: case Iop_AndV128: case Iop_OrV128: case Iop_XorV128: case Iop_Add8x16: case Iop_Add16x8: - case Iop_Add32x4: case Iop_Add64x2: + case Iop_Add32x4: case Iop_Add64x2: case Iop_Add128x1: case Iop_QAdd8Ux16: case Iop_QAdd16Ux8: case Iop_QAdd32Ux4: case Iop_QAdd64Ux2: case Iop_QAdd8Sx16: case Iop_QAdd16Sx8: @@ -3064,7 +3074,7 @@ void typeOfPrimop ( IROp op, case Iop_QAddExtSUsatUU32x4: case Iop_QAddExtSUsatUU64x2: case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4: case Iop_Sub8x16: case Iop_Sub16x8: - case Iop_Sub32x4: case Iop_Sub64x2: + case Iop_Sub32x4: case Iop_Sub64x2: case Iop_Sub128x1: case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4: case Iop_QSub64Ux2: case Iop_QSub8Sx16: case Iop_QSub16Sx8: @@ -3073,14 +3083,14 @@ void typeOfPrimop ( IROp op, case Iop_PolynomialMul8x16: case Iop_PolynomialMulAdd8x16: case Iop_PolynomialMulAdd16x8: case Iop_PolynomialMulAdd32x4: case Iop_PolynomialMulAdd64x2: - case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: - case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: + case Iop_MulHi8Ux16: case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: + case Iop_MulHi8Sx16: case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: case Iop_QDMulHi16Sx8: case Iop_QDMulHi32Sx4: case Iop_QRDMulHi16Sx8: case Iop_QRDMulHi32Sx4: case Iop_MullEven8Ux16: case Iop_MullEven16Ux8: case Iop_MullEven32Ux4: case Iop_MullEven8Sx16: case Iop_MullEven16Sx8: case Iop_MullEven32Sx4: - case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4: - case Iop_Avg8Sx16: case Iop_Avg16Sx8: case Iop_Avg32Sx4: + case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4: case Iop_Avg64Ux2: + case Iop_Avg8Sx16: case Iop_Avg16Sx8: case Iop_Avg32Sx4: case Iop_Avg64Sx2: case Iop_Max8Sx16: case Iop_Max16Sx8: case Iop_Max32Sx4: case Iop_Max64Sx2: case Iop_Max8Ux16: case Iop_Max16Ux8: case Iop_Max32Ux4: @@ -3156,11 +3166,12 @@ void typeOfPrimop ( IROp op, case Iop_Sqrt32F0x4: case Iop_Sqrt64F0x2: case Iop_CmpNEZ8x16: case Iop_CmpNEZ16x8: - case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2: + case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2: case Iop_CmpNEZ128x1: case Iop_Cnt8x16: case Iop_Clz8x16: case Iop_Clz16x8: case Iop_Clz32x4: case Iop_Clz64x2: case Iop_Cls8x16: case Iop_Cls16x8: case Iop_Cls32x4: case Iop_PwAddL8Ux16: case Iop_PwAddL16Ux8: case Iop_PwAddL32Ux4: + case Iop_PwAddL64Ux2: case Iop_PwAddL8Sx16: case Iop_PwAddL16Sx8: case Iop_PwAddL32Sx4: case Iop_Reverse8sIn64_x2: case Iop_Reverse16sIn64_x2: case Iop_Reverse32sIn64_x2: @@ -3182,7 +3193,7 @@ void 
typeOfPrimop ( IROp op, case Iop_BCD128toI128S: UNARY(Ity_V128, Ity_V128); - case Iop_ShlV128: case Iop_ShrV128: + case Iop_ShlV128: case Iop_ShrV128: case Iop_SarV128: case Iop_ShlN8x16: case Iop_ShlN16x8: case Iop_ShlN32x4: case Iop_ShlN64x2: case Iop_ShrN8x16: case Iop_ShrN16x8: diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h index 5d79270f2..e70b8f4ae 100644 --- a/pub/libvex_ir.h +++ b/pub/libvex_ir.h @@ -1448,13 +1448,14 @@ typedef Iop_AndV128, Iop_OrV128, Iop_XorV128, /* VECTOR SHIFT (shift amt :: Ity_I8) */ - Iop_ShlV128, Iop_ShrV128, + Iop_ShlV128, Iop_ShrV128, Iop_SarV128, /* MISC (vector integer cmp != 0) */ Iop_CmpNEZ8x16, Iop_CmpNEZ16x8, Iop_CmpNEZ32x4, Iop_CmpNEZ64x2, + Iop_CmpNEZ128x1, /* ADDITION (normal / U->U sat / S->S sat) */ - Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2, + Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2, Iop_Add128x1, Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2, Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2, @@ -1469,14 +1470,14 @@ typedef Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2, /* SUBTRACTION (normal / unsigned sat / signed sat) */ - Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2, + Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2, Iop_Sub128x1, Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2, Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2, /* MULTIPLICATION (normal / high half of signed/unsigned) */ Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, - Iop_MulHi16Ux8, Iop_MulHi32Ux4, - Iop_MulHi16Sx8, Iop_MulHi32Sx4, + Iop_MulHi8Ux16, Iop_MulHi16Ux8, Iop_MulHi32Ux4, + Iop_MulHi8Sx16, Iop_MulHi16Sx8, Iop_MulHi32Sx4, /* (widening signed/unsigned of even lanes, with lowest lane=zero) */ Iop_MullEven8Ux16, Iop_MullEven16Ux8, Iop_MullEven32Ux4, Iop_MullEven8Sx16, Iop_MullEven16Sx8, Iop_MullEven32Sx4, @@ -1553,7 +1554,7 @@ typedef Example: Iop_PwAddL16Ux4( [a,b,c,d] ) = [a+b,c+d] where a+b and c+d are unsigned 32-bit values. */ - Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, + Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, Iop_PwAddL64Ux2, Iop_PwAddL8Sx16, Iop_PwAddL16Sx8, Iop_PwAddL32Sx4, /* Other unary pairwise ops */ @@ -1567,8 +1568,8 @@ typedef Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2, /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */ - Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, - Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, + Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, Iop_Avg64Ux2, + Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, Iop_Avg64Sx2, /* MIN/MAX */ Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2, diff --git a/useful/test_main.c b/useful/test_main.c index 5db6ec6b7..4a3ada73b 100644 --- a/useful/test_main.c +++ b/useful/test_main.c @@ -1416,6 +1416,10 @@ static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at ) return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ64x2, at)); } +static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at ) +{ + return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ128x1, at)); +} /* Here's a simple scheme capable of handling ops derived from SSE1 code and while only generating ops that can be efficiently @@ -1631,6 +1635,14 @@ IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) return at; } +static +IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 ) +{ + IRAtom* at; + at = mkUifUV128(mce, vatom1, vatom2); + at = mkPCast128x1(mce, at); + return at; +} /*------------------------------------------------------------*/ /*--- Generate shadow values from all kinds of IRExprs. 
---*/
@@ -1674,6 +1686,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_QSub8Ux16:
       case Iop_QSub8Sx16:
       case Iop_Sub8x16:
+      case Iop_MulHi8Sx16:
+      case Iop_MulHi8Ux16:
       case Iop_Min8Ux16:
       case Iop_Max8Ux16:
       case Iop_CmpGT8Sx16:
@@ -1713,11 +1727,18 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Sub64x2:
       case Iop_QSub64Ux2:
       case Iop_QSub64Sx2:
+      case Iop_Avg64Ux2:
+      case Iop_Avg64Sx2:
       case Iop_Add64x2:
       case Iop_QAdd64Ux2:
       case Iop_QAdd64Sx2:
          return binary64Ix2(mce, vatom1, vatom2);
 
+      case Iop_Add128x1:
+      case Iop_Sub128x1:
+      case Iop_CmpNEZ128x1:
+         return binary128Ix1(mce, vatom1, vatom2);
+
       case Iop_QNarrowBin32Sto16Sx8:
       case Iop_QNarrowBin16Sto8Sx16:
       case Iop_QNarrowBin16Sto8Ux16:
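
The binary128Ix1 helper added above follows the file's usual pessimising-cast recipe: UifU (a bitwise OR) merges the two shadow operands, then the 128x1 PCast smears any undefined bit across the whole 128-bit lane. A standalone C model of that data flow, independent of VEX IR (the names here are invented):

   #include <stdint.h>

   typedef struct { uint64_t w64[2]; } ShadowV128;   /* 1-bits = undefined */

   static ShadowV128 shadow_binary128Ix1 ( ShadowV128 a, ShadowV128 b )
   {
      ShadowV128 r;
      uint64_t u0 = a.w64[0] | b.w64[0];   /* mkUifUV128: union of undefinedness */
      uint64_t u1 = a.w64[1] | b.w64[1];
      uint64_t any = u0 | u1;              /* is any bit undefined? */
      /* mkPCast128x1 / Iop_CmpNEZ128x1: all-ones if any input bit is set */
      r.w64[0] = r.w64[1] = any ? ~0ULL : 0ULL;
      return r;
   }

For Add128x1 this whole-lane pessimism is also the honest answer: a single undefined input bit can propagate to every output bit through the carry chain.

From 84adf266bcd76007b7a512f250875668bcbbb8e2 Mon Sep 17 00:00:00 2001
From: mephi42 
Date: Fri, 15 Mar 2019 17:37:26 +0100
Subject: [PATCH 8/9] s390x: update to upstream revision 7e9113cb7

- Update genoffsets
---
 auxprogs/genoffsets.c     |   48 +-
 priv/guest_s390_defs.h    |   83 +-
 priv/guest_s390_helpers.c |  368 +-
 priv/guest_s390_toIR.c    | 8044 ++++++++++++++++++++++++++++++-------
 priv/host_s390_defs.c     | 1769 +++++++-
 priv/host_s390_defs.h     |  143 +-
 priv/host_s390_isel.c     | 1288 +++++-
 priv/s390_defs.h          |   24 +-
 priv/s390_disasm.c        |   82 +-
 priv/s390_disasm.h        |   18 +-
 pub/libvex_guest_s390x.h  |  132 +-
 pub/libvex_s390x_common.h |    7 +-
 12 files changed, 10149 insertions(+), 1857 deletions(-)

diff --git a/auxprogs/genoffsets.c b/auxprogs/genoffsets.c
index 795715842..7e28f8c6e 100644
--- a/auxprogs/genoffsets.c
+++ b/auxprogs/genoffsets.c
@@ -635,22 +635,38 @@ int main(int argc, char **argv)
    GENOFFSET(S390X,s390x,a13);
    GENOFFSET(S390X,s390x,a14);
    GENOFFSET(S390X,s390x,a15);
-   GENOFFSET(S390X,s390x,f0);
-   GENOFFSET(S390X,s390x,f1);
-   GENOFFSET(S390X,s390x,f2);
-   GENOFFSET(S390X,s390x,f3);
-   GENOFFSET(S390X,s390x,f4);
-   GENOFFSET(S390X,s390x,f5);
-   GENOFFSET(S390X,s390x,f6);
-   GENOFFSET(S390X,s390x,f7);
-   GENOFFSET(S390X,s390x,f8);
-   GENOFFSET(S390X,s390x,f9);
-   GENOFFSET(S390X,s390x,f10);
-   GENOFFSET(S390X,s390x,f11);
-   GENOFFSET(S390X,s390x,f12);
-   GENOFFSET(S390X,s390x,f13);
-   GENOFFSET(S390X,s390x,f14);
-   GENOFFSET(S390X,s390x,f15);
+   GENOFFSET(S390X,s390x,v0);
+   GENOFFSET(S390X,s390x,v1);
+   GENOFFSET(S390X,s390x,v2);
+   GENOFFSET(S390X,s390x,v3);
+   GENOFFSET(S390X,s390x,v4);
+   GENOFFSET(S390X,s390x,v5);
+   GENOFFSET(S390X,s390x,v6);
+   GENOFFSET(S390X,s390x,v7);
+   GENOFFSET(S390X,s390x,v8);
+   GENOFFSET(S390X,s390x,v9);
+   GENOFFSET(S390X,s390x,v10);
+   GENOFFSET(S390X,s390x,v11);
+   GENOFFSET(S390X,s390x,v12);
+   GENOFFSET(S390X,s390x,v13);
+   GENOFFSET(S390X,s390x,v14);
+   GENOFFSET(S390X,s390x,v15);
+   GENOFFSET(S390X,s390x,v16);
+   GENOFFSET(S390X,s390x,v17);
+   GENOFFSET(S390X,s390x,v18);
+   GENOFFSET(S390X,s390x,v19);
+   GENOFFSET(S390X,s390x,v20);
+   GENOFFSET(S390X,s390x,v21);
+   GENOFFSET(S390X,s390x,v22);
+   GENOFFSET(S390X,s390x,v23);
+   GENOFFSET(S390X,s390x,v24);
+   GENOFFSET(S390X,s390x,v25);
+   GENOFFSET(S390X,s390x,v26);
+   GENOFFSET(S390X,s390x,v27);
+   GENOFFSET(S390X,s390x,v28);
+   GENOFFSET(S390X,s390x,v29);
+   GENOFFSET(S390X,s390x,v30);
+   GENOFFSET(S390X,s390x,v31);
    GENOFFSET(S390X,s390x,r0);
    GENOFFSET(S390X,s390x,r1);
    GENOFFSET(S390X,s390x,r2);
diff --git a/priv/guest_s390_defs.h b/priv/guest_s390_defs.h
index da6166840..d72cc9f6d 100644
--- a/priv/guest_s390_defs.h
+++ b/priv/guest_s390_defs.h
@@ -8,7 +8,7 @@
    This file is part of Valgrind, a dynamic binary instrumentation
    framework.
 
-   Copyright IBM Corp. 2010-2015
+   Copyright IBM Corp. 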
2010-2017
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
@@ -80,7 +80,8 @@ ULong s390x_dirtyhelper_STCKF(ULong *addr);
 ULong s390x_dirtyhelper_STCKE(ULong *addr);
 ULong s390x_dirtyhelper_STFLE(VexGuestS390XState *guest_state, ULong *addr);
 void  s390x_dirtyhelper_CUxy(UChar *addr, ULong data, ULong num_bytes);
-
+ULong s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state,
+                               ULong details);
 ULong s390_do_cu12_cu14_helper1(UInt byte1, UInt etf3_and_m3_is_1);
 ULong s390_do_cu12_helper2(UInt byte1, UInt byte2, UInt byte3,
                            UInt byte4, ULong stuff);
@@ -94,7 +95,9 @@ UInt s390_do_cvb(ULong decimal);
 ULong s390_do_cvd(ULong binary);
 ULong s390_do_ecag(ULong op2addr);
 UInt s390_do_pfpo(UInt gpr0);
-
+void s390x_dirtyhelper_PPNO_query(VexGuestS390XState *guest_state, ULong r1, ULong r2);
+ULong s390x_dirtyhelper_PPNO_sha512(VexGuestS390XState *guest_state, ULong r1, ULong r2);
+void s390x_dirtyhelper_PPNO_sha512_load_param_block( void );
 /* The various ways to compute the condition code. */
 enum {
    S390_CC_OP_BITWISE = 0,
@@ -254,6 +257,80 @@ UInt s390_calculate_cond(ULong mask, ULong op, ULong dep1, ULong dep2,
 /* Last target instruction for the EX helper */
 extern ULong last_execute_target;
 
+/*------------------------------------------------------------*/
+/*--- Vector helpers.                                      ---*/
+/*------------------------------------------------------------*/
+
+/* Vector operations passed to the s390x_dirtyhelper_vec_op(...) helper.
+   Please don't change the ordering of elements; append new items just
+   before S390_VEC_OP_LAST. */
+enum {
+   S390_VEC_OP_INVALID = 0,
+   S390_VEC_OP_VPKS = 1,
+   S390_VEC_OP_VPKLS = 2,
+   S390_VEC_OP_VFAE = 3,
+   S390_VEC_OP_VFEE = 4,
+   S390_VEC_OP_VFENE = 5,
+   S390_VEC_OP_VISTR = 6,
+   S390_VEC_OP_VSTRC = 7,
+   S390_VEC_OP_VCEQ = 8,
+   S390_VEC_OP_VTM = 9,
+   S390_VEC_OP_VGFM = 10,
+   S390_VEC_OP_VGFMA = 11,
+   S390_VEC_OP_VMAH = 12,
+   S390_VEC_OP_VMALH = 13,
+   S390_VEC_OP_VCH = 14,
+   S390_VEC_OP_VCHL = 15,
+   S390_VEC_OP_VFCE = 16,
+   S390_VEC_OP_VFCH = 17,
+   S390_VEC_OP_VFCHE = 18,
+   S390_VEC_OP_VFTCI = 19,
+   S390_VEC_OP_LAST = 20 // supposed to be the last element in enum
+} s390x_vec_op_t;
+
+/* Arguments of s390x_dirtyhelper_vec_op(...) which are packed into one
+   ULong variable.
+ */
+typedef union {
+   struct {
+      unsigned int op : 8;        // should be an element of s390x_vec_op_t
+      unsigned int v1 : 5;        // result of operation
+      unsigned int v2 : 5;        // argument one of operation
+      unsigned int v3 : 5;        // argument two of operation or
+                                  // zero for unary operations
+      unsigned int v4 : 5;        // argument three of operation or
+                                  // zero for unary and binary operations
+      unsigned int m4 : 4;        // field m4 of insn or zero if it's missing
+      unsigned int m5 : 4;        // field m5 of insn or zero if it's missing
+      unsigned int m6 : 4;        // field m6 of insn or zero if it's missing
+      unsigned int i3 : 12;       // field i3 of insn or zero if it's missing
+      unsigned int read_only: 1;  // don't write result to Guest State
+      unsigned int reserved : 11; // reserved for future
+   };
+   ULong serialized;
+} s390x_vec_op_details_t;
+
+STATIC_ASSERT(sizeof(s390x_vec_op_details_t) == sizeof(ULong));
+
+/* Macro definitions for opcodes that are not generally available.
+
+   The values to be encoded in those fields must be integer values in
+   hexadecimal notation without a leading 0x.
+   E.g. 
VRX_VXBD(e7, 1, 0, 3, 0000, 0, 06) is equal to "vl %%v1, 0(%%r3)\n\t" +*/ +#define VRX_VXBD(op1, v1, x2, b2, d2, rxb, op2) \ + ".short 0x" #op1 #v1 #x2 "\n\t .int 0x" #b2 #d2 "0" #rxb #op2 "\n\t" +#define VRR_VVVMM(op1, v1, v2, v3, m5, m4, rxb, op2) \ + ".short 0x" #op1 #v1 #v2 "\n\t .int 0x" #v3 "0" #m5 "0" #m4 #rxb #op2 "\n\t" + +#define VL(v1, x2, b2, d2, rxb) VRX_VXBD(e7, v1, x2, b2, d2, rxb, 06) +#define VST(v1, x2, b2, d2, rxb) VRX_VXBD(e7, v1, x2, b2, d2, rxb, 0e) +#define VPKS(v1, v2, v3, m4, m5, rxb) VRR_VVVMM(e7, v1, v2, v3, m5, m4, rxb, 97) +#define VPKLS(v1, v2, v3, m4, m5, rxb) VRR_VVVMM(e7, v1, v2, v3, m5, m4, rxb, 95) + + /*---------------------------------------------------------------*/ /*--- end guest_s390_defs.h ---*/ /*---------------------------------------------------------------*/ diff --git a/priv/guest_s390_helpers.c b/priv/guest_s390_helpers.c index f484f8ed0..5877743c9 100644 --- a/priv/guest_s390_helpers.c +++ b/priv/guest_s390_helpers.c @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 + Copyright IBM Corp. 2010-2017 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -68,26 +68,48 @@ LibVEX_GuestS390X_initialise(VexGuestS390XState *state) state->guest_a15 = 0; /*------------------------------------------------------------*/ -/*--- Initialise fpr registers ---*/ +/*--- Initialise vr registers ---*/ /*------------------------------------------------------------*/ - state->guest_f0 = 0; - state->guest_f1 = 0; - state->guest_f2 = 0; - state->guest_f3 = 0; - state->guest_f4 = 0; - state->guest_f5 = 0; - state->guest_f6 = 0; - state->guest_f7 = 0; - state->guest_f8 = 0; - state->guest_f9 = 0; - state->guest_f10 = 0; - state->guest_f11 = 0; - state->guest_f12 = 0; - state->guest_f13 = 0; - state->guest_f14 = 0; - state->guest_f15 = 0; - +#define VRZERO(vr) \ + do { \ + vr.w64[0] = vr.w64[1] = 0ULL; \ + } while(0); + + VRZERO(state->guest_v0) + VRZERO(state->guest_v1) + VRZERO(state->guest_v2) + VRZERO(state->guest_v3) + VRZERO(state->guest_v4) + VRZERO(state->guest_v5) + VRZERO(state->guest_v6) + VRZERO(state->guest_v7) + VRZERO(state->guest_v8) + VRZERO(state->guest_v9) + VRZERO(state->guest_v10) + VRZERO(state->guest_v11) + VRZERO(state->guest_v12) + VRZERO(state->guest_v13) + VRZERO(state->guest_v14) + VRZERO(state->guest_v15) + VRZERO(state->guest_v16) + VRZERO(state->guest_v17) + VRZERO(state->guest_v18) + VRZERO(state->guest_v19) + VRZERO(state->guest_v20) + VRZERO(state->guest_v21) + VRZERO(state->guest_v22) + VRZERO(state->guest_v23) + VRZERO(state->guest_v24) + VRZERO(state->guest_v25) + VRZERO(state->guest_v26) + VRZERO(state->guest_v27) + VRZERO(state->guest_v28) + VRZERO(state->guest_v29) + VRZERO(state->guest_v30) + VRZERO(state->guest_v31) + +#undef VRZERO /*------------------------------------------------------------*/ /*--- Initialise gpr registers ---*/ /*------------------------------------------------------------*/ @@ -343,6 +365,7 @@ s390x_dirtyhelper_STFLE(VexGuestS390XState *guest_state, ULong *addr) s390_set_facility_bit(addr, S390_FAC_GIE, 1); s390_set_facility_bit(addr, S390_FAC_EXEXT, 1); s390_set_facility_bit(addr, S390_FAC_HIGHW, 1); + s390_set_facility_bit(addr, S390_FAC_LSC2, 1); s390_set_facility_bit(addr, S390_FAC_HFPMAS, 0); s390_set_facility_bit(addr, S390_FAC_HFPUNX, 0); @@ -1818,6 +1841,13 @@ isC64(const IRExpr *expr) return expr->tag == Iex_Const && expr->Iex.Const.con->tag == 
Ico_U64;
 }
 
+static inline Bool
+isC64_exactly(const IRExpr *expr, ULong n)
+{
+   return expr->tag == Iex_Const && expr->Iex.Const.con->tag == Ico_U64
+          && expr->Iex.Const.con->Ico.U64 == n;
+}
+
 
 /* The returned expression is NULL if no specialization was found. In that
    case the helper function will be called. Otherwise, the expression has
@@ -1888,6 +1918,14 @@ guest_s390x_spechelper(const HChar *function_name, IRExpr **args,
          return unop(Iop_1Uto32, binop(Iop_CmpNE64, cc_dep1, cc_dep2));
       }
       if (cond == 4 || cond == 4 + 1) {
+         if (isC64_exactly(cc_dep2, 0)) {
+            /* dep1 <signed 0
+               --> m.s.bit of dep1 == 1 */
+            return unop(Iop_64to32,
+                        binop(Iop_And64,
+                              binop(Iop_Shr64, cc_dep1, mkU8(63)),
+                              mkU64(1)));
+         }
          return unop(Iop_1Uto32, binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
       }
       if (cond == 8 + 4 || cond == 8 + 4 + 1) {
@@ -1895,9 +1933,25 @@ guest_s390x_spechelper(const HChar *function_name, IRExpr **args,
       }
       /* cc_dep1 > cc_dep2  ---->  cc_dep2 < cc_dep1 */
       if (cond == 2 || cond == 2 + 1) {
+         /* If we ever need the counterpart of the bug387712 fix just
+            below, then here is the place.  We'll need to give an
+            alternative expression for the case "cc_dep2 <signed 0". */
          return unop(Iop_1Uto32, binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
       }
+      if (cond == 8 + 2 || cond == 8 + 2 + 1) {
+         if (isC64_exactly(cc_dep2, 0)) {
+            /* dep1 >=signed 0
+               --> m.s.bit of dep1 == 0 */
+            /* See bug 387712.  This is an old trick from gcc to extract
+               the most significant bit of a word. */
+            return unop(Iop_64to32,
+                        binop(Iop_Xor64,
+                              binop(Iop_Shr64, cc_dep1, mkU8(63)),
+                              mkU64(1)));
+         }
+         return unop(Iop_1Uto32, binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
+      }
       if (cond == 8 + 4 + 2 || cond == 8 + 4 + 2 + 1) {
@@ -2411,6 +2465,282 @@ guest_s390x_spechelper(const HChar *function_name, IRExpr **args,
    return NULL;
 }
 
+/*------------------------------------------------------------*/
+/*--- Dirty helper for vector instructions                 ---*/
+/*------------------------------------------------------------*/
+
+#if defined(VGA_s390x)
+ULong
+s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state,
+                         const ULong serialized)
+{
+   UInt psw;
+   s390x_vec_op_details_t details;
+   const s390x_vec_op_details_t* d = (const s390x_vec_op_details_t*) &details;
+
+   details.serialized = serialized;
+
+   vassert(d->op > S390_VEC_OP_INVALID && d->op < S390_VEC_OP_LAST);
+   static const UChar opcodes[][2] = {
+      {0x00, 0x00}, /* invalid */
+      {0xe7, 0x97}, /* VPKS */
+      {0xe7, 0x95}, /* VPKLS */
+      {0xe7, 0x82}, /* VFAE */
+      {0xe7, 0x80}, /* VFEE */
+      {0xe7, 0x81}, /* VFENE */
+      {0xe7, 0x5c}, /* VISTR */
+      {0xe7, 0x8a}, /* VSTRC */
+      {0xe7, 0xf8}, /* VCEQ */
+      {0xe7, 0xd8}, /* VTM */
+      {0xe7, 0xb4}, /* VGFM */
+      {0xe7, 0xbc}, /* VGFMA */
+      {0xe7, 0xab}, /* VMAH */
+      {0xe7, 0xa9}, /* VMALH */
+      {0xe7, 0xfb}, /* VCH */
+      {0xe7, 0xf9}, /* VCHL */
+      {0xe7, 0xe8}, /* VFCE */
+      {0xe7, 0xeb}, /* VFCH */
+      {0xe7, 0xea}, /* VFCHE */
+      {0xe7, 0x4a}  /* VFTCI */
+   };
+
+   union {
+      struct {
+         unsigned int op1 : 8;
+         unsigned int v1  : 4;
+         unsigned int v2  : 4;
+         unsigned int v3  : 4;
+         unsigned int     : 4;
+         unsigned int m5  : 4;
+         unsigned int     : 4;
+         unsigned int m4  : 4;
+         unsigned int rxb : 4;
+         unsigned int op2 : 8;
+      } VRR;
+      struct {
+         unsigned int op1 : 8;
+         unsigned int v1  : 4;
+         unsigned int v2  : 4;
+         unsigned int v3  : 4;
+         unsigned int m5  : 4;
+         unsigned int m6  : 4;
+         unsigned int     : 4;
+         unsigned int v4  : 4;
+         unsigned int rxb : 4;
+         unsigned int op2 : 8;
+      } VRRd;
+      struct {
+         UInt op1 : 8;
+         UInt v1  : 4;
+         UInt v2  : 4;
+         UInt v3  : 4;
+         UInt     : 4;
+         UInt m6  : 4;
+         UInt m5  : 4;
+         UInt m4  : 4;
+         UInt rxb : 4;
+         UInt op2 : 8;
+      } VRRc;
+      struct {
+         UInt op1 : 8;
+         UInt v1  : 4;
+         UInt v2  : 4;
+         UInt i3  : 12;
+         UInt m5  : 4;
+         UInt m4  : 4;
+         UInt rxb : 4;
+         UInt op2 : 
8; + } VRIe; + UChar bytes[6]; + } the_insn; + + the_insn.VRR.op1 = opcodes[d->op][0]; + the_insn.bytes[1] = the_insn.bytes[2] + = the_insn.bytes[3] = the_insn.bytes[4] = 0; + the_insn.VRR.op2 = opcodes[d->op][1]; + + switch(d->op) { + case S390_VEC_OP_VISTR: + the_insn.VRR.v1 = 1; + the_insn.VRR.v2 = 2; + the_insn.VRR.rxb = 0b1100; + the_insn.VRR.m4 = d->m4; + the_insn.VRR.m5 = d->m5; + break; + + case S390_VEC_OP_VTM: + the_insn.VRR.v1 = 2; + the_insn.VRR.v2 = 3; + the_insn.VRR.rxb = 0b1100; + break; + + case S390_VEC_OP_VPKS: + case S390_VEC_OP_VPKLS: + case S390_VEC_OP_VFAE: + case S390_VEC_OP_VFEE: + case S390_VEC_OP_VFENE: + case S390_VEC_OP_VCEQ: + case S390_VEC_OP_VGFM: + case S390_VEC_OP_VCH: + case S390_VEC_OP_VCHL: + the_insn.VRR.v1 = 1; + the_insn.VRR.v2 = 2; + the_insn.VRR.v3 = 3; + the_insn.VRR.rxb = 0b1110; + the_insn.VRR.m4 = d->m4; + the_insn.VRR.m5 = d->m5; + break; + + case S390_VEC_OP_VSTRC: + case S390_VEC_OP_VGFMA: + case S390_VEC_OP_VMAH: + case S390_VEC_OP_VMALH: + the_insn.VRRd.v1 = 1; + the_insn.VRRd.v2 = 2; + the_insn.VRRd.v3 = 3; + the_insn.VRRd.v4 = 4; + the_insn.VRRd.rxb = 0b1111; + the_insn.VRRd.m5 = d->m4; + the_insn.VRRd.m6 = d->m5; + break; + + case S390_VEC_OP_VFCE: + case S390_VEC_OP_VFCH: + case S390_VEC_OP_VFCHE: + the_insn.VRRc.v1 = 1; + the_insn.VRRc.v2 = 2; + the_insn.VRRc.v3 = 3; + the_insn.VRRc.rxb = 0b1110; + the_insn.VRRc.m4 = d->m4; + the_insn.VRRc.m5 = d->m5; + the_insn.VRRc.m6 = d->m6; + break; + + case S390_VEC_OP_VFTCI: + the_insn.VRIe.v1 = 1; + the_insn.VRIe.v2 = 2; + the_insn.VRIe.rxb = 0b1100; + the_insn.VRIe.i3 = d->i3; + the_insn.VRIe.m4 = d->m4; + the_insn.VRIe.m5 = d->m5; + break; + + default: + vex_printf("operation = %d\n", d->op); + vpanic("s390x_dirtyhelper_vec_op: unknown operation"); + } + + const V128* guest_v = &(guest_state->guest_v0); + __asm__ volatile ( + "lgr %%r10, %[arg1]\n" + VL(2, 0, a, 000, 8) + "lgr %%r10, %[arg2]\n" + VL(3, 0, a, 000, 8) + "lgr %%r10, %[arg3]\n" + VL(4, 0, a, 000, 8) + "ex %[zero], %[insn]\n" + + "cijne %[read_only], 0, return_cc\n" + "lgr %%r10, %[res]\n" + VST(1, 0, a, 000, 8) + + "return_cc: " + "ipm %[psw]\n\t" + : [psw] "=d" (psw) + + : [res] "r" (&guest_v[d->v1]), + [arg1] "r" (&guest_v[d->v2]), + [arg2] "r" (&guest_v[d->v3]), + [arg3] "r" (&guest_v[d->v4]), + + [zero] "r" (0ULL), + [insn] "m" (the_insn), + [read_only] "r" (d->read_only) + + : "cc", "r10", "v16", "v17", "v18", "v19" + ); + + return psw >> 28; /* cc */ +} + +#else + +ULong +s390x_dirtyhelper_vec_op(VexGuestS390XState *guest_state, + const ULong serialized) +{ return 0; } + +#endif + +/*-----------------------------------------------------------------*/ +/*--- Dirty helper for Perform Pseudorandom number instruction ---*/ +/*-----------------------------------------------------------------*/ + +/* Dummy helper that is needed to indicate load of parameter block. + We have to use it because dirty helper cannot have two memory side + effects. + */ +void s390x_dirtyhelper_PPNO_sha512_load_param_block( void ) +{ +} + +#if defined(VGA_s390x) + +/* IMPORTANT! + We return here bit mask where only supported functions are set to one. + If you implement new functions don't forget the supported array. 
+ */ +void +s390x_dirtyhelper_PPNO_query(VexGuestS390XState *guest_state, ULong r1, ULong r2) +{ + ULong supported[2] = {0x9000000000000000ULL, 0x0000000000000000ULL}; + ULong *result = (ULong*) guest_state->guest_r1; + + result[0] = supported[0]; + result[1] = supported[1]; +} + +ULong +s390x_dirtyhelper_PPNO_sha512(VexGuestS390XState *guest_state, ULong r1, ULong r2) +{ + ULong* op1 = (ULong*) (((ULong)(&guest_state->guest_r0)) + r1 * sizeof(ULong)); + ULong* op2 = (ULong*) (((ULong)(&guest_state->guest_r0)) + r2 * sizeof(ULong)); + + register ULong reg0 asm("0") = guest_state->guest_r0; + register ULong reg1 asm("1") = guest_state->guest_r1; + register ULong reg2 asm("2") = op1[0]; + register ULong reg3 asm("3") = op1[1]; + register ULong reg4 asm("4") = op2[0]; + register ULong reg5 asm("5") = op2[1]; + + ULong cc = 0; + asm volatile(".insn rre, 0xb93c0000, %%r2, %%r4\n" + "ipm %[cc]\n" + "srl %[cc], 28\n" + : "+d"(reg0), "+d"(reg1), + "+d"(reg2), "+d"(reg3), + "+d"(reg4), "+d"(reg5), + [cc] "=d"(cc) + : + : "cc", "memory"); + + return cc; +} + +#else + +void +s390x_dirtyhelper_PPNO_query(VexGuestS390XState *guest_state, ULong r1, ULong r2) +{ +} + +ULong +s390x_dirtyhelper_PPNO_sha512(VexGuestS390XState *guest_state, ULong r1, ULong r2) +{ + return 0; +} + +#endif /* VGA_s390x */ /*---------------------------------------------------------------*/ /*--- end guest_s390_helpers.c ---*/ /*---------------------------------------------------------------*/ diff --git a/priv/guest_s390_toIR.c b/priv/guest_s390_toIR.c index 85f42a2b0..8599e5e2e 100644 --- a/priv/guest_s390_toIR.c +++ b/priv/guest_s390_toIR.c @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2016 + Copyright IBM Corp. 2010-2017 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -48,7 +48,7 @@ /*------------------------------------------------------------*/ /*--- Forward declarations ---*/ /*------------------------------------------------------------*/ -static UInt s390_decode_and_irgen(const UChar *, UInt, DisResult *, VexEndness); +static UInt s390_decode_and_irgen(const UChar *, UInt, DisResult *); static void s390_irgen_xonc(IROp, IRTemp, IRTemp, IRTemp); static void s390_irgen_CLC_EX(IRTemp, IRTemp, IRTemp); @@ -86,6 +86,7 @@ typedef enum { S390_DECODE_UNKNOWN_INSN, S390_DECODE_UNIMPLEMENTED_INSN, S390_DECODE_UNKNOWN_SPECIAL_INSN, + S390_DECODE_SPECIFICATION_EXCEPTION, S390_DECODE_ERROR } s390_decode_t; @@ -94,227 +95,165 @@ typedef enum { /*--- Instruction formats. 
---*/ /*------------------------------------------------------------*/ -#define E_op(insn) ((insn) & 0xffff) -#define I_op(insn) (((insn) >> 8) & 0xff) #define I_i(insn) ((insn) & 0xff) -#define RR_op(insn) (((insn) >> 8) & 0xff) #define RR_r1(insn) (((insn) >> 4) & 0xf) #define RR_r2(insn) ((insn) & 0xf) -#define RI_op1(insn) (((insn) >> 24) & 0xff) #define RI_r1(insn) (((insn) >> 20) & 0xf) -#define RI_op2(insn) (((insn) >> 16) & 0xf) #define RI_i2(insn) ((insn) & 0xffff) -#define RRE_op(insn) (((insn) >> 16) & 0xffff) #define RRE_r1(insn) (((insn) >> 4) & 0xf) #define RRE_r2(insn) ((insn) & 0xf) -#define RRF_op(insn) (((insn) >> 16) & 0xffff) #define RRF_r1(insn) (((insn) >> 12) & 0xf) #define RRF_r3(insn) (((insn) >> 4) & 0xf) #define RRF_r2(insn) ((insn) & 0xf) -#define RRF2_op(insn) (((insn) >> 16) & 0xffff) #define RRF2_m3(insn) (((insn) >> 12) & 0xf) #define RRF2_m4(insn) (((insn) >> 8) & 0xf) #define RRF2_r1(insn) (((insn) >> 4) & 0xf) #define RRF2_r2(insn) ((insn) & 0xf) -#define RRF3_op(insn) (((insn) >> 16) & 0xffff) #define RRF3_r3(insn) (((insn) >> 12) & 0xf) #define RRF3_r1(insn) (((insn) >> 4) & 0xf) #define RRF3_r2(insn) ((insn) & 0xf) -#define RRR_op(insn) (((insn) >> 16) & 0xffff) -#define RRR_r3(insn) (((insn) >> 12) & 0xf) -#define RRR_r1(insn) (((insn) >> 4) & 0xf) -#define RRR_r2(insn) ((insn) & 0xf) -#define RRF4_op(insn) (((insn) >> 16) & 0xffff) #define RRF4_r3(insn) (((insn) >> 12) & 0xf) #define RRF4_m4(insn) (((insn) >> 8) & 0xf) #define RRF4_r1(insn) (((insn) >> 4) & 0xf) #define RRF4_r2(insn) ((insn) & 0xf) -#define RRF5_op(insn) (((insn) >> 16) & 0xffff) #define RRF5_m4(insn) (((insn) >> 8) & 0xf) #define RRF5_r1(insn) (((insn) >> 4) & 0xf) #define RRF5_r2(insn) ((insn) & 0xf) -#define RS_op(insn) (((insn) >> 24) & 0xff) #define RS_r1(insn) (((insn) >> 20) & 0xf) #define RS_r3(insn) (((insn) >> 16) & 0xf) #define RS_b2(insn) (((insn) >> 12) & 0xf) #define RS_d2(insn) ((insn) & 0xfff) -#define RSI_op(insn) (((insn) >> 24) & 0xff) #define RSI_r1(insn) (((insn) >> 20) & 0xf) #define RSI_r3(insn) (((insn) >> 16) & 0xf) #define RSI_i2(insn) ((insn) & 0xffff) -#define RX_op(insn) (((insn) >> 24) & 0xff) #define RX_r1(insn) (((insn) >> 20) & 0xf) #define RX_x2(insn) (((insn) >> 16) & 0xf) #define RX_b2(insn) (((insn) >> 12) & 0xf) #define RX_d2(insn) ((insn) & 0xfff) -#define S_op(insn) (((insn) >> 16) & 0xffff) #define S_b2(insn) (((insn) >> 12) & 0xf) #define S_d2(insn) ((insn) & 0xfff) -#define SI_op(insn) (((insn) >> 24) & 0xff) #define SI_i2(insn) (((insn) >> 16) & 0xff) #define SI_b1(insn) (((insn) >> 12) & 0xf) #define SI_d1(insn) ((insn) & 0xfff) -#define RIE_op1(insn) (((insn) >> 56) & 0xff) #define RIE_r1(insn) (((insn) >> 52) & 0xf) #define RIE_r3(insn) (((insn) >> 48) & 0xf) #define RIE_i2(insn) (((insn) >> 32) & 0xffff) -#define RIE_op2(insn) (((insn) >> 16) & 0xff) -#define RIE_RRUUU_op1(insn) (((insn) >> 56) & 0xff) #define RIE_RRUUU_r1(insn) (((insn) >> 52) & 0xf) #define RIE_RRUUU_r2(insn) (((insn) >> 48) & 0xf) #define RIE_RRUUU_i3(insn) (((insn) >> 40) & 0xff) #define RIE_RRUUU_i4(insn) (((insn) >> 32) & 0xff) #define RIE_RRUUU_i5(insn) (((insn) >> 24) & 0xff) -#define RIE_RRUUU_op2(insn) (((insn) >> 16) & 0xff) -#define RIEv1_op1(insn) (((insn) >> 56) & 0xff) #define RIEv1_r1(insn) (((insn) >> 52) & 0xf) #define RIEv1_i2(insn) (((insn) >> 32) & 0xffff) #define RIEv1_m3(insn) (((insn) >> 28) & 0xf) -#define RIEv1_op2(insn) (((insn) >> 16) & 0xff) -#define RIE_RRPU_op1(insn) (((insn) >> 56) & 0xff) #define RIE_RRPU_r1(insn) (((insn) >> 52) & 0xf) 
#define RIE_RRPU_r2(insn) (((insn) >> 48) & 0xf) #define RIE_RRPU_i4(insn) (((insn) >> 32) & 0xffff) #define RIE_RRPU_m3(insn) (((insn) >> 28) & 0xf) -#define RIE_RRPU_op2(insn) (((insn) >> 16) & 0xff) -#define RIEv3_op1(insn) (((insn) >> 56) & 0xff) #define RIEv3_r1(insn) (((insn) >> 52) & 0xf) #define RIEv3_m3(insn) (((insn) >> 48) & 0xf) #define RIEv3_i4(insn) (((insn) >> 32) & 0xffff) #define RIEv3_i2(insn) (((insn) >> 24) & 0xff) -#define RIEv3_op2(insn) (((insn) >> 16) & 0xff) -#define RIL_op1(insn) (((insn) >> 56) & 0xff) #define RIL_r1(insn) (((insn) >> 52) & 0xf) -#define RIL_op2(insn) (((insn) >> 48) & 0xf) #define RIL_i2(insn) (((insn) >> 16) & 0xffffffff) -#define RIS_op1(insn) (((insn) >> 56) & 0xff) #define RIS_r1(insn) (((insn) >> 52) & 0xf) #define RIS_m3(insn) (((insn) >> 48) & 0xf) #define RIS_b4(insn) (((insn) >> 44) & 0xf) #define RIS_d4(insn) (((insn) >> 32) & 0xfff) #define RIS_i2(insn) (((insn) >> 24) & 0xff) -#define RIS_op2(insn) (((insn) >> 16) & 0xff) -#define RRS_op1(insn) (((insn) >> 56) & 0xff) #define RRS_r1(insn) (((insn) >> 52) & 0xf) #define RRS_r2(insn) (((insn) >> 48) & 0xf) #define RRS_b4(insn) (((insn) >> 44) & 0xf) #define RRS_d4(insn) (((insn) >> 32) & 0xfff) #define RRS_m3(insn) (((insn) >> 28) & 0xf) -#define RRS_op2(insn) (((insn) >> 16) & 0xff) -#define RSL_op1(insn) (((insn) >> 56) & 0xff) -#define RSL_l1(insn) (((insn) >> 52) & 0xf) -#define RSL_b1(insn) (((insn) >> 44) & 0xf) -#define RSL_d1(insn) (((insn) >> 32) & 0xfff) -#define RSL_op2(insn) (((insn) >> 16) & 0xff) -#define RSY_op1(insn) (((insn) >> 56) & 0xff) #define RSY_r1(insn) (((insn) >> 52) & 0xf) #define RSY_r3(insn) (((insn) >> 48) & 0xf) #define RSY_b2(insn) (((insn) >> 44) & 0xf) #define RSY_dl2(insn) (((insn) >> 32) & 0xfff) #define RSY_dh2(insn) (((insn) >> 24) & 0xff) -#define RSY_op2(insn) (((insn) >> 16) & 0xff) -#define RXE_op1(insn) (((insn) >> 56) & 0xff) #define RXE_r1(insn) (((insn) >> 52) & 0xf) #define RXE_x2(insn) (((insn) >> 48) & 0xf) #define RXE_b2(insn) (((insn) >> 44) & 0xf) #define RXE_d2(insn) (((insn) >> 32) & 0xfff) #define RXE_m3(insn) (((insn) >> 28) & 0xf) -#define RXE_op2(insn) (((insn) >> 16) & 0xff) -#define RXF_op1(insn) (((insn) >> 56) & 0xff) #define RXF_r3(insn) (((insn) >> 52) & 0xf) #define RXF_x2(insn) (((insn) >> 48) & 0xf) #define RXF_b2(insn) (((insn) >> 44) & 0xf) #define RXF_d2(insn) (((insn) >> 32) & 0xfff) #define RXF_r1(insn) (((insn) >> 28) & 0xf) -#define RXF_op2(insn) (((insn) >> 16) & 0xff) -#define RXY_op1(insn) (((insn) >> 56) & 0xff) #define RXY_r1(insn) (((insn) >> 52) & 0xf) #define RXY_x2(insn) (((insn) >> 48) & 0xf) #define RXY_b2(insn) (((insn) >> 44) & 0xf) #define RXY_dl2(insn) (((insn) >> 32) & 0xfff) #define RXY_dh2(insn) (((insn) >> 24) & 0xff) -#define RXY_op2(insn) (((insn) >> 16) & 0xff) -#define SIY_op1(insn) (((insn) >> 56) & 0xff) #define SIY_i2(insn) (((insn) >> 48) & 0xff) #define SIY_b1(insn) (((insn) >> 44) & 0xf) #define SIY_dl1(insn) (((insn) >> 32) & 0xfff) #define SIY_dh1(insn) (((insn) >> 24) & 0xff) -#define SIY_op2(insn) (((insn) >> 16) & 0xff) -#define SS_op(insn) (((insn) >> 56) & 0xff) #define SS_l(insn) (((insn) >> 48) & 0xff) #define SS_b1(insn) (((insn) >> 44) & 0xf) #define SS_d1(insn) (((insn) >> 32) & 0xfff) #define SS_b2(insn) (((insn) >> 28) & 0xf) #define SS_d2(insn) (((insn) >> 16) & 0xfff) -#define SS_LLRDRD_op(insn) (((insn) >> 56) & 0xff) -#define SS_LLRDRD_l1(insn) (((insn) >> 52) & 0xf) -#define SS_LLRDRD_l2(insn) (((insn) >> 48) & 0xf) -#define SS_LLRDRD_b1(insn) (((insn) >> 44) & 0xf) 
-#define SS_LLRDRD_d1(insn) (((insn) >> 32) & 0xfff) -#define SS_LLRDRD_b2(insn) (((insn) >> 28) & 0xf) -#define SS_LLRDRD_d2(insn) (((insn) >> 16) & 0xfff) -#define SS_RRRDRD2_op(insn) (((insn) >> 56) & 0xff) -#define SS_RRRDRD2_r1(insn) (((insn) >> 52) & 0xf) -#define SS_RRRDRD2_r3(insn) (((insn) >> 48) & 0xf) -#define SS_RRRDRD2_b2(insn) (((insn) >> 44) & 0xf) -#define SS_RRRDRD2_d2(insn) (((insn) >> 32) & 0xfff) -#define SS_RRRDRD2_b4(insn) (((insn) >> 28) & 0xf) -#define SS_RRRDRD2_d4(insn) (((insn) >> 16) & 0xfff) -#define SSE_op(insn) (((insn) >> 48) & 0xffff) -#define SSE_b1(insn) (((insn) >> 44) & 0xf) -#define SSE_d1(insn) (((insn) >> 32) & 0xfff) -#define SSE_b2(insn) (((insn) >> 28) & 0xf) -#define SSE_d2(insn) (((insn) >> 16) & 0xfff) -#define SSF_op1(insn) (((insn) >> 56) & 0xff) -#define SSF_r3(insn) (((insn) >> 52) & 0xf) -#define SSF_op2(insn) (((insn) >> 48) & 0xf) -#define SSF_b1(insn) (((insn) >> 44) & 0xf) -#define SSF_d1(insn) (((insn) >> 32) & 0xfff) -#define SSF_b2(insn) (((insn) >> 28) & 0xf) -#define SSF_d2(insn) (((insn) >> 16) & 0xfff) -#define SIL_op(insn) (((insn) >> 48) & 0xffff) #define SIL_b1(insn) (((insn) >> 44) & 0xf) #define SIL_d1(insn) (((insn) >> 32) & 0xfff) #define SIL_i2(insn) (((insn) >> 16) & 0xffff) -#define VRX_op1(insn) (((insn) >> 56) & 0xff) #define VRX_v1(insn) (((insn) >> 52) & 0xf) #define VRX_x2(insn) (((insn) >> 48) & 0xf) #define VRX_b2(insn) (((insn) >> 44) & 0xf) #define VRX_d2(insn) (((insn) >> 32) & 0xfff) #define VRX_m3(insn) (((insn) >> 28) & 0xf) #define VRX_rxb(insn) (((insn) >> 24) & 0xf) -#define VRX_op2(insn) (((insn) >> 16) & 0xff) -#define VRR_op1(insn) (((insn) >> 56) & 0xff) #define VRR_v1(insn) (((insn) >> 52) & 0xf) #define VRR_v2(insn) (((insn) >> 48) & 0xf) #define VRR_r3(insn) (((insn) >> 44) & 0xf) #define VRR_m5(insn) (((insn) >> 36) & 0xf) #define VRR_m4(insn) (((insn) >> 28) & 0xf) #define VRR_rxb(insn) (((insn) >> 24) & 0xf) -#define VRR_op2(insn) (((insn) >> 16) & 0xff) -#define VRI_op1(insn) (((insn) >> 56) & 0xff) +#define VRRa_v1(insn) (((insn) >> 52) & 0xf) +#define VRRa_v2(insn) (((insn) >> 48) & 0xf) +#define VRRa_v3(insn) (((insn) >> 44) & 0xf) +#define VRRa_m5(insn) (((insn) >> 36) & 0xf) +#define VRRa_m4(insn) (((insn) >> 32) & 0xf) +#define VRRa_m3(insn) (((insn) >> 28) & 0xf) +#define VRRa_rxb(insn) (((insn) >> 24) & 0xf) +#define VRRd_v1(insn) (((insn) >> 52) & 0xf) +#define VRRd_v2(insn) (((insn) >> 48) & 0xf) +#define VRRd_v3(insn) (((insn) >> 44) & 0xf) +#define VRRd_m5(insn) (((insn) >> 40) & 0xf) +#define VRRd_m6(insn) (((insn) >> 36) & 0xf) +#define VRRd_v4(insn) (((insn) >> 28) & 0xf) +#define VRRd_rxb(insn) (((insn) >> 24) & 0xf) +#define VRRe_v1(insn) (((insn) >> 52) & 0xf) +#define VRRe_v2(insn) (((insn) >> 48) & 0xf) +#define VRRe_v3(insn) (((insn) >> 44) & 0xf) +#define VRRe_m6(insn) (((insn) >> 40) & 0xf) +#define VRRe_m5(insn) (((insn) >> 32) & 0xf) +#define VRRe_v4(insn) (((insn) >> 28) & 0xf) +#define VRRe_rxb(insn) (((insn) >> 24) & 0xf) #define VRI_v1(insn) (((insn) >> 52) & 0xf) #define VRI_v3(insn) (((insn) >> 48) & 0xf) #define VRI_i2(insn) (((insn) >> 32) & 0xffff) #define VRI_m3(insn) (((insn) >> 28) & 0xf) #define VRI_rxb(insn) (((insn) >> 24) & 0xf) -#define VRI_op2(insn) (((insn) >> 16) & 0xff) -#define VRS_op1(insn) (((insn) >> 56) & 0xff) +#define VRId_v1(insn) (((insn) >> 52) & 0xf) +#define VRId_v2(insn) (((insn) >> 48) & 0xf) +#define VRId_v3(insn) (((insn) >> 44) & 0xf) +#define VRId_i4(insn) (((insn) >> 32) & 0xff) +#define VRId_m5(insn) (((insn) >> 28) & 0xf) 
+#define VRId_rxb(insn)  (((insn) >> 24) & 0xf)
+#define VRIe_v1(insn)   (((insn) >> 52) & 0xf)
+#define VRIe_v2(insn)   (((insn) >> 48) & 0xf)
+#define VRIe_i3(insn)   (((insn) >> 36) & 0xfff)
+#define VRIe_m5(insn)   (((insn) >> 32) & 0xf)
+#define VRIe_m4(insn)   (((insn) >> 28) & 0xf)
+#define VRIe_rxb(insn)  (((insn) >> 24) & 0xf)
 #define VRS_v1(insn)   (((insn) >> 52) & 0xf)
 #define VRS_v3(insn)   (((insn) >> 48) & 0xf)
 #define VRS_b2(insn)   (((insn) >> 44) & 0xf)
 #define VRS_d2(insn)   (((insn) >> 32) & 0xfff)
 #define VRS_m4(insn)   (((insn) >> 28) & 0xf)
 #define VRS_rxb(insn)  (((insn) >> 24) & 0xf)
-#define VRS_op2(insn)  (((insn) >> 16) & 0xff)
-#define VRV_op1(insn)  (((insn) >> 56) & 0xff)
-#define VRV_v1(insn)   (((insn) >> 52) & 0xf)
-#define VRV_v2(insn)   (((insn) >> 48) & 0xf)
-#define VRV_b2(insn)   (((insn) >> 44) & 0xf)
-#define VRV_d2(insn)   (((insn) >> 32) & 0xfff)
-#define VRV_m3(insn)   (((insn) >> 28) & 0xf)
-#define VRV_rxb(insn)  (((insn) >> 24) & 0xf)
-#define VRV_op2(insn)  (((insn) >> 16) & 0xff)
 
 
 /*------------------------------------------------------------*/
@@ -648,6 +587,26 @@ yield_if(IRExpr *condition)
                     S390X_GUEST_OFFSET(guest_IA)));
 }
 
+/* Convenience macro to yield a specification exception if the given condition
+   is not met.  Used to pass this type of decoding error up through the call
+   chain. */
+#define s390_insn_assert(mnm, cond)             \
+   do {                                         \
+      if (!(cond)) {                            \
+         dis_res->whatNext = Dis_StopHere;      \
+         dis_res->jk_StopHere = Ijk_NoDecode;   \
+         return (mnm);                          \
+      }                                         \
+   } while (0)
+
+/* Convenience function to check for a specification exception. */
+static Bool
+is_specification_exception(void)
+{
+   return (dis_res->whatNext == Dis_StopHere &&
+           dis_res->jk_StopHere == Ijk_NoDecode);
+}
+
 static __inline__ IRExpr *get_fpr_dw0(UInt);
 static __inline__ void put_fpr_dw0(UInt, IRExpr *);
 static __inline__ IRExpr *get_dpr_dw0(UInt);
@@ -975,12 +934,18 @@ s390_cc_thunk_put1d128Z(UInt opc, IRTemp d1, IRTemp nd)
    s390_cc_thunk_fill(op, hi, lox, ndep);
 }
 
+static void
+s390_cc_set(IRTemp cc)
+{
+   vassert(typeOfIRTemp(irsb->tyenv, cc) == Ity_I64);
+
+   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0));
+}
 
 static void
-s390_cc_set(UInt val)
+s390_cc_set_val(UInt val)
 {
-   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET),
-                      mkU64(val), mkU64(0), mkU64(0));
+   s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkU64(val), mkU64(0), mkU64(0));
 }
 
 /* Build IR to calculate the condition code from flags thunk.
@@ -1146,27 +1111,29 @@ get_ar_w0(UInt archreg)
 /*--- fpr registers                                          ---*/
 /*------------------------------------------------------------*/
 
-/* Return the guest state offset of a fpr register. */
+/* Return the guest state offset of a fpr register.
+   FPRs are mapped to the first doubleword of VRs.
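+   (That is, FPR n occupies bytes 0..7 of VR n, matching the architected
+   overlap between the floating-point and vector register files.)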
+*/ static UInt fpr_offset(UInt archreg) { static const UInt offset[16] = { - S390X_GUEST_OFFSET(guest_f0), - S390X_GUEST_OFFSET(guest_f1), - S390X_GUEST_OFFSET(guest_f2), - S390X_GUEST_OFFSET(guest_f3), - S390X_GUEST_OFFSET(guest_f4), - S390X_GUEST_OFFSET(guest_f5), - S390X_GUEST_OFFSET(guest_f6), - S390X_GUEST_OFFSET(guest_f7), - S390X_GUEST_OFFSET(guest_f8), - S390X_GUEST_OFFSET(guest_f9), - S390X_GUEST_OFFSET(guest_f10), - S390X_GUEST_OFFSET(guest_f11), - S390X_GUEST_OFFSET(guest_f12), - S390X_GUEST_OFFSET(guest_f13), - S390X_GUEST_OFFSET(guest_f14), - S390X_GUEST_OFFSET(guest_f15), + S390X_GUEST_OFFSET(guest_v0), + S390X_GUEST_OFFSET(guest_v1), + S390X_GUEST_OFFSET(guest_v2), + S390X_GUEST_OFFSET(guest_v3), + S390X_GUEST_OFFSET(guest_v4), + S390X_GUEST_OFFSET(guest_v5), + S390X_GUEST_OFFSET(guest_v6), + S390X_GUEST_OFFSET(guest_v7), + S390X_GUEST_OFFSET(guest_v8), + S390X_GUEST_OFFSET(guest_v9), + S390X_GUEST_OFFSET(guest_v10), + S390X_GUEST_OFFSET(guest_v11), + S390X_GUEST_OFFSET(guest_v12), + S390X_GUEST_OFFSET(guest_v13), + S390X_GUEST_OFFSET(guest_v14), + S390X_GUEST_OFFSET(guest_v15), }; vassert(archreg < 16); @@ -1293,6 +1260,13 @@ gpr_w0_offset(UInt archreg) return gpr_offset(archreg) + 0; } +/* Read an integer of given type from a gpr. */ +static __inline__ IRExpr * +get_gpr_int(UInt archreg, IRType ty) +{ + return IRExpr_Get(gpr_offset(archreg) + 8 - sizeofIRType(ty), ty); +} + /* Write word #0 of a gpr to the guest state. */ static __inline__ void put_gpr_w0(UInt archreg, IRExpr *expr) @@ -1738,6 +1712,655 @@ get_fpc_w0(void) } +/*------------------------------------------------------------*/ +/*--- vr registers ---*/ +/*------------------------------------------------------------*/ + +/* Return the guest state offset of a vr register. */ +static UInt +vr_offset(const UInt archreg) +{ + static const UInt offset[32] = { + S390X_GUEST_OFFSET(guest_v0), + S390X_GUEST_OFFSET(guest_v1), + S390X_GUEST_OFFSET(guest_v2), + S390X_GUEST_OFFSET(guest_v3), + S390X_GUEST_OFFSET(guest_v4), + S390X_GUEST_OFFSET(guest_v5), + S390X_GUEST_OFFSET(guest_v6), + S390X_GUEST_OFFSET(guest_v7), + S390X_GUEST_OFFSET(guest_v8), + S390X_GUEST_OFFSET(guest_v9), + S390X_GUEST_OFFSET(guest_v10), + S390X_GUEST_OFFSET(guest_v11), + S390X_GUEST_OFFSET(guest_v12), + S390X_GUEST_OFFSET(guest_v13), + S390X_GUEST_OFFSET(guest_v14), + S390X_GUEST_OFFSET(guest_v15), + S390X_GUEST_OFFSET(guest_v16), + S390X_GUEST_OFFSET(guest_v17), + S390X_GUEST_OFFSET(guest_v18), + S390X_GUEST_OFFSET(guest_v19), + S390X_GUEST_OFFSET(guest_v20), + S390X_GUEST_OFFSET(guest_v21), + S390X_GUEST_OFFSET(guest_v22), + S390X_GUEST_OFFSET(guest_v23), + S390X_GUEST_OFFSET(guest_v24), + S390X_GUEST_OFFSET(guest_v25), + S390X_GUEST_OFFSET(guest_v26), + S390X_GUEST_OFFSET(guest_v27), + S390X_GUEST_OFFSET(guest_v28), + S390X_GUEST_OFFSET(guest_v29), + S390X_GUEST_OFFSET(guest_v30), + S390X_GUEST_OFFSET(guest_v31), + }; + + vassert(archreg < 32); + + return offset[archreg]; +} + +/* Return the guest state offset of quadword of a vr register. */ +static UInt +vr_qw_offset(const UInt archreg) +{ + return vr_offset(archreg) + 0; +} + +/* Write quadword of a vr to the guest state. */ +static void +put_vr_qw(const UInt archreg, IRExpr *expr) +{ + vassert(typeOfIRExpr(irsb->tyenv, expr) == Ity_V128); + + stmt(IRStmt_Put(vr_qw_offset(archreg), expr)); +} + +/* Read quadword of a vr register. 
*/
+static IRExpr *
+get_vr_qw(const UInt archreg)
+{
+   return IRExpr_Get(vr_qw_offset(archreg), Ity_V128);
+}
+
+/* Return the guest state offset of double word #0 of a vr register. */
+static UInt
+vr_dw0_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 0;
+}
+
+/* Read doubleword #0 of a vr register. */
+static IRExpr *
+get_vr_dw0(UInt archreg)
+{
+   return IRExpr_Get(vr_dw0_offset(archreg), Ity_I64);
+}
+
+/* Write double word #0 of a vr to the guest state. */
+static void
+put_vr_dw0(UInt archreg, IRExpr *expr)
+{
+   vassert(typeOfIRExpr(irsb->tyenv, expr) == Ity_I64);
+
+   stmt(IRStmt_Put(vr_dw0_offset(archreg), expr));
+}
+
+/* Return the guest state offset of double word #1 of a vr register. */
+static UInt
+vr_dw1_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 8;
+}
+
+/* Read doubleword #1 of a vr register. */
+static IRExpr *
+get_vr_dw1(UInt archreg)
+{
+   return IRExpr_Get(vr_dw1_offset(archreg), Ity_I64);
+}
+
+/* Write double word #1 of a vr to the guest state. */
+static void
+put_vr_dw1(UInt archreg, IRExpr *expr)
+{
+   vassert(typeOfIRExpr(irsb->tyenv, expr) == Ity_I64);
+
+   stmt(IRStmt_Put(vr_dw1_offset(archreg), expr));
+}
+
+/* Return the guest state offset of word #1 of a vr register. */
+static UInt
+vr_w1_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 4;
+}
+
+/* Return the guest state offset of word #3 of a vr register. */
+static UInt
+vr_w3_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 12;
+}
+
+/* Read word #0 of a vr register. */
+static IRExpr *
+get_vr_w0(UInt archreg)
+{
+   return IRExpr_Get(vr_dw0_offset(archreg), Ity_I32);
+}
+
+/* Read word #1 of a vr register. */
+static IRExpr *
+get_vr_w1(UInt archreg)
+{
+   return IRExpr_Get(vr_w1_offset(archreg), Ity_I32);
+}
+
+/* Read word #2 of a vr register. */
+static IRExpr *
+get_vr_w2(UInt archreg)
+{
+   return IRExpr_Get(vr_dw1_offset(archreg), Ity_I32);
+}
+
+/* Read word #3 of a vr register. */
+static IRExpr *
+get_vr_w3(UInt archreg)
+{
+   return IRExpr_Get(vr_w3_offset(archreg), Ity_I32);
+}
+
+/* Return the guest state offset of halfword #3 of a vr register. */
+static UInt
+vr_hw3_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 6;
+}
+
+/* Read halfword #3 of a vr register. */
+static IRExpr *
+get_vr_hw3(UInt archreg)
+{
+   return IRExpr_Get(vr_hw3_offset(archreg), Ity_I16);
+}
+
+/* Return the guest state offset of halfword #7 of a vr register. */
+static UInt
+vr_hw7_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 14;
+}
+
+/* Read halfword #7 of a vr register. */
+static IRExpr *
+get_vr_hw7(UInt archreg)
+{
+   return IRExpr_Get(vr_hw7_offset(archreg), Ity_I16);
+}
+
+/* Return the guest state offset of byte #7 of a vr register. */
+static UInt
+vr_b7_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 7;
+}
+
+/* Read byte #7 of a vr register. */
+static IRExpr *
+get_vr_b7(UInt archreg)
+{
+   return IRExpr_Get(vr_b7_offset(archreg), Ity_I8);
+}
+
+/* Return the guest state offset of byte #15 of a vr register. */
+static UInt
+vr_b15_offset(UInt archreg)
+{
+   return vr_offset(archreg) + 15;
+}
+
+/* Read byte #15 of a vr register.
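+   (I.e. the rightmost, least significant byte: s390x is big-endian.)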
*/ +static IRExpr * +get_vr_b15(UInt archreg) +{ + return IRExpr_Get(vr_b15_offset(archreg), Ity_I8); +} + +/* Determine IRType by instruction's m3 field */ +static IRType +s390_vr_get_type(const UChar m) +{ + static const IRType results[] = {Ity_I8, Ity_I16, Ity_I32, Ity_I64, Ity_V128}; + if (m > 4) { + vex_printf("s390_vr_get_type: m=%x\n", m); + vpanic("s390_vr_get_type: reserved m value"); + } + + return results[m]; +} + +/* Determine if Condition Code Set (CS) flag is set in m field */ +#define s390_vr_is_cs_set(m) (((m) & 0x1) != 0) + +/* Determine if Zero Search (ZS) flag is set in m field */ +#define s390_vr_is_zs_set(m) (((m) & 0b0010) != 0) + +/* Check if the "Single-Element-Control" bit is set. + Used in vector FP instructions. + */ +#define s390_vr_is_single_element_control_set(m) (((m) & 0x8) != 0) + +/* Generates arg1 < arg2 (or arg1 <= arg2 if allow_equal == True) expression. + Arguments must have V128 type and are treated as unsigned 128-bit numbers. +*/ +static IRExpr* +s390_V128_compareLT128x1(IRExpr* arg1, IRExpr* arg2, Bool allow_equal) +{ + /* If high halves are equal + then we compare lower ones + otherwise we compare high halves. + */ + IRExpr* result; + result = mkite(binop(Iop_CmpEQ64, + unop(Iop_V128HIto64, arg1), + unop(Iop_V128HIto64, arg2) + ), + unop(Iop_1Uto64, + binop(allow_equal ? Iop_CmpLE64U : Iop_CmpLT64U, + unop(Iop_V128to64, arg1), + unop(Iop_V128to64, arg2) + ) + ), + unop(Iop_1Uto64, + binop(Iop_CmpLT64U, + unop(Iop_V128HIto64, arg1), + unop(Iop_V128HIto64, arg2) + ) + ) + ); + + return result; +} + +/* Generates arg1 == 0 expression. + Argument must have V128 type and is treated as unsigned 128-bit number. +*/ +static IRExpr* +s390_V128_isZero(IRExpr* arg) +{ + IRExpr* high_or_low = binop(Iop_Or64, + unop(Iop_V128to64, arg), + unop(Iop_V128HIto64, arg) + ); + + return unop(Iop_1Uto64, binop(Iop_CmpEQ64, high_or_low, mkU64(0ULL))); +} + +/* Generate the two's complement for arg. + Arg should be V128. 
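+   E.g., for Ity_I32 each of the four 32-bit lanes becomes ~lane + 1, so
+   the complement of a lane holding 1 is 0xffffffff.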
+*/ +static IRExpr* +s390_V128_get_complement(IRExpr* arg, IRType type) +{ + IRExpr* notArg = unop(Iop_NotV128, arg); + IRExpr* ones; + IRExpr* result; + switch(type) { + case Ity_I8: + ones = unop(Iop_Dup8x16, mkU8(0x01)); + result = binop(Iop_Add8x16, notArg, ones); + break; + case Ity_I16: + ones = unop(Iop_Dup16x8, mkU16(0x0001)); + result = binop(Iop_Add16x8, notArg, ones); + break; + case Ity_I32: + ones = unop(Iop_Dup32x4, mkU32(0x00000001)); + result = binop(Iop_Add32x4, notArg, ones); + break; + case Ity_I64: + ones = binop(Iop_64HLtoV128, mkU64(0x1ULL), mkU64(0x1ULL)); + result = binop(Iop_Add64x2, notArg, ones); + break; + case Ity_V128: + ones = binop(Iop_64HLtoV128, mkU64(0x0ULL), mkU64(0x1ULL)); + result = binop(Iop_Add128x1, notArg, ones); + break; + default: + vpanic("s390_V128_get_complement: unknown type"); + } + + return result; +} + +/* # Elements are treated as 128-bit unsigned integers + For i = 0; i < elemCount; i++ do: + sum = arg1[i] + arg2[i] + result[i] = carry_out_bit(sum) + end + return result + */ +static IRExpr* +s390_V128_calculate_carry_out(IRExpr* arg1, IRExpr* arg2, IRType type, + Bool allow_equal) +{ + IRTemp sum = newTemp(Ity_V128); + IRExpr* mask; + IRExpr* comparison; + IRExpr* result; + switch(type){ + case Ity_I8: + assign(sum, binop(Iop_Add8x16, arg1, arg2)); + mask = unop(Iop_Dup8x16, mkU8(0x1)); + comparison = binop(Iop_CmpGT8Ux16, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ8x16, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I16: + assign(sum, binop(Iop_Add16x8, arg1, arg2)); + mask = unop(Iop_Dup16x8, mkU16(0x1)); + comparison = binop(Iop_CmpGT16Ux8, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ16x8, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I32: + assign(sum, binop(Iop_Add32x4, arg1, arg2)); + mask = unop(Iop_Dup32x4, mkU32(0x1)); + comparison = binop(Iop_CmpGT32Ux4, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ32x4, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_I64: + assign(sum, binop(Iop_Add64x2, arg1, arg2)); + mask = binop(Iop_64HLtoV128, mkU64(0x1), mkU64(0x1)); + comparison = binop(Iop_CmpGT64Ux2, arg1, mkexpr(sum)); + if(allow_equal) { + comparison = binop(Iop_OrV128, binop(Iop_CmpEQ64x2, arg1, mkexpr(sum)), + comparison); + } + result = binop(Iop_AndV128, comparison, mask); + break; + case Ity_V128: + assign(sum, binop(Iop_Add128x1, arg1, arg2)); + comparison = s390_V128_compareLT128x1(mkexpr(sum), arg1, allow_equal); + result = binop(Iop_64HLtoV128, mkU64(0x0), comparison); + break; + default: + ppIRType(type); + vpanic("s390_V128_calculate_carry_out: unknown type"); + } + + return result; +} + +/* # elemCount = 1 for now (elements are 128-bit unsigned integers) + For i = 0; i < elemCount; i++ do: + sum = arg1[i] + arg2[i] + arg3[i] & 0x1 + result[i] = carry_out_bit(sum) + end + return result + */ +static IRExpr* +s390_V128_calculate_carry_out_with_carry(IRExpr* arg1, IRExpr* arg2, IRExpr* arg3) +{ + IRTemp sum = newTemp(Ity_V128); + assign(sum, binop(Iop_Add128x1, arg1, arg2)); + + IRTemp overflow_before = newTemp(Ity_I64); + assign(overflow_before, s390_V128_compareLT128x1(mkexpr(sum), arg1, False)); + + IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0), mkU64(1)); + IRTemp carry_in = newTemp(Ity_V128); + assign(carry_in, 
binop(Iop_AndV128, arg3, mask)); + + IRExpr* carry_is_not_zero = unop(Iop_1Uto64, + binop(Iop_CmpNE64, + unop(Iop_V128to64, mkexpr(carry_in)), + mkU64(0ULL) + ) + ); + + IRTemp sum_plus_carry = newTemp(Ity_V128); + assign(sum_plus_carry, binop(Iop_Add128x1, mkexpr(sum), mkexpr(carry_in))); + + IRExpr* overflow_after = binop(Iop_And64, + carry_is_not_zero, + s390_V128_isZero(mkexpr(sum_plus_carry)) + ); + + IRExpr* result = binop(Iop_Or64, mkexpr(overflow_before), overflow_after); + result = binop(Iop_64HLtoV128, mkU64(0Ull), result); + return result; +} + +/* Performs "arg1 + arg2 + carry_out_bit(arg1 + arg2)". + Arguments and result are Ity_I32. +*/ +static IRTemp +s390_checksum_add(IRExpr* arg1, IRExpr* arg2) +{ + IRTemp sum = newTemp(Ity_I32); + IRTemp res = newTemp(Ity_I32); + + assign(sum, binop(Iop_Add32, arg1, arg2)); + assign(res, + mkite(binop(Iop_CmpLT32U, mkexpr(sum), arg1), + binop(Iop_Add32, mkexpr(sum), mkU32(1)), + mkexpr(sum)) + ); + + return res; +} + +/* Return the guest state offset of element with type's size and given index + of a vr register. +*/ +static UInt +s390_vr_offset_by_index(UInt archreg,IRType type, UChar index) +{ + switch (type) { + case Ity_I8: + if(index > 15) { + goto invalidIndex; + } + return vr_offset(archreg) + sizeof(UChar) * index; + + case Ity_I16: + if(index > 7) { + goto invalidIndex; + } + return vr_offset(archreg) + sizeof(UShort) * index; + + case Ity_I32: + case Ity_F32: + if(index > 3) { + goto invalidIndex; + } + return vr_offset(archreg) + sizeof(UInt) * index; + + case Ity_I64: + case Ity_F64: + if(index > 1) { + goto invalidIndex; + } + return vr_offset(archreg) + sizeof(ULong) * index; + case Ity_V128: + if(index == 0) { + return vr_qw_offset(archreg); + } else { + goto invalidIndex; + } + default: + vpanic("s390_vr_offset_by_index: unknown type"); + } + + invalidIndex: + vex_printf("s390_vr_offset_by_index: index = %d ; type = ", index); + ppIRType(type); + vpanic("s390_vr_offset_by_index: invalid index for given type"); +} + +/* Write type sized element to indexed part of vr to the guest state. */ +static void +put_vr(UInt archreg, IRType type, UChar index, IRExpr *expr) +{ + UInt offset = s390_vr_offset_by_index(archreg, type, index); + vassert(typeOfIRExpr(irsb->tyenv, expr) == type); + + stmt(IRStmt_Put(offset, expr)); +} + +/* Read type sized part specified by index of a vr register. */ +static IRExpr * +get_vr(UInt archreg, IRType type, UChar index) +{ + UInt offset = s390_vr_offset_by_index(archreg, type, index); + return IRExpr_Get(offset, type); +} + +/* Calculates vr index according to instruction's rxb field + and position of vr in instruction. 
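+   Each rxb bit, MSB first, supplies the fifth (top) bit of the
+   corresponding register field; e.g. v = 0x2 with rxb = 0b1000 and
+   argNumber = 1 selects register 18 (0b10010).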
+ Index of first argument must be 1 (not zero) */ +static UChar +s390_vr_getVRindex(UChar v,UChar argNumber, UChar rxb) +{ + vassert(argNumber > 0 && argNumber <= 4); + vassert(rxb < 16); + return v | (((rxb) << argNumber) & 0b00010000); +} + +static void +s390_vr_fill(UChar v1, IRExpr *o2) +{ + IRType o2type = typeOfIRExpr(irsb->tyenv, o2); + switch (o2type) { + case Ity_I8: + put_vr_qw(v1, unop(Iop_Dup8x16, o2)); + break; + case Ity_I16: + put_vr_qw(v1, unop(Iop_Dup16x8, o2)); + break; + case Ity_I32: + put_vr_qw(v1, unop(Iop_Dup32x4, o2)); + break; + case Ity_I64: + put_vr_qw(v1, binop(Iop_64HLtoV128, o2, o2)); + break; + default: + ppIRType(o2type); + vpanic("s390_vr_fill: invalid IRType"); + } +} + +/* Returns Ity_I32 number of bytes till block boundary specified by m */ +static IRExpr* +s390_getCountToBlockBoundary(IRTemp op2addr, UChar m) +{ + IRTemp boundary = newTemp(Ity_I32); + IRTemp sixteen = newTemp(Ity_I32); + IRTemp divisionResult = newTemp(Ity_I64); + IRTemp mod_result = newTemp(Ity_I32); + IRTemp output = newTemp(Ity_I32); + + switch (m) { + case 0: assign(boundary, mkU32(64)); break; + case 1: assign(boundary, mkU32(128)); break; + case 2: assign(boundary, mkU32(256)); break; + case 3: assign(boundary, mkU32(512)); break; + case 4: assign(boundary, mkU32(1024)); break; + case 5: assign(boundary, mkU32(2048)); break; + case 6: assign(boundary, mkU32(4096)); break; + default: + vex_printf("m = %d\n", m); + vpanic("s390_getCountToBlockBoundary: invalid m"); + } + assign(sixteen, mkU32(16)); + assign(divisionResult, + binop(Iop_DivModU64to32, mkexpr(op2addr), mkexpr(boundary))); + assign(mod_result, + binop(Iop_Sub32,mkexpr(boundary), + unop(Iop_64HIto32, mkexpr(divisionResult)))); + + assign(output, + mkite(binop(Iop_CmpLE32U, mkexpr(sixteen), mkexpr(mod_result)), + mkexpr(sixteen), + mkexpr(mod_result) + )); + + return mkexpr(output); +} + +/* Load bytes into v1. + maxIndex specifies max index to load and must be Ity_I32. + If maxIndex >= 15, all 16 bytes are loaded. + All bytes after maxIndex are zeroed. */ +static void s390_vr_loadWithLength(UChar v1, IRTemp addr, IRExpr *maxIndex) +{ + IRTemp maxIdx = newTemp(Ity_I32); + IRTemp cappedMax = newTemp(Ity_I64); + IRTemp offset = newTemp(Ity_I64); + IRTemp zeroed = newTemp(Ity_I64); + IRTemp back = newTemp(Ity_I64); + + /* Implement the insn with a single 16-byte load, to allow memcheck's + "partial-loads-OK" heuristic to apply. Ensure that a page boundary is + crossed if and only if the real insn would have crossed it as well. + Thus, if the bytes to load are fully contained in an aligned 16-byte + chunk, load the whole 16-byte aligned chunk, and otherwise load 16 bytes + from the unaligned address. Then shift the loaded data left-aligned + into the target vector register. */ + + assign(maxIdx, maxIndex); + assign(cappedMax, mkite(binop(Iop_CmpLT32U, mkexpr(maxIdx), mkU32(15)), + unop(Iop_32Uto64, mkexpr(maxIdx)), mkU64(15))); + /* 'offset': addr's offset from last 16-byte aligned address + 'zeroed': number of bytes to be zeroed in the target vector + 'back': how much to subtract from addr before loading 16 bytes */ + assign(offset, binop(Iop_And64, mkexpr(addr), mkU64(15))); + assign(zeroed, binop(Iop_Sub64, mkU64(15), mkexpr(cappedMax))); + assign(back, mkite(binop(Iop_CmpLE64U, mkexpr(offset), mkexpr(zeroed)), + mkexpr(offset), mkU64(0))); + + /* How much to shift the loaded 16-byte vector to the right, and then to + the left. Since both 'zeroed' and 'back' range from 0 to 15, the shift + amounts range from 0 to 120. 
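+      Worked example with illustrative values: addr = 0x1005 and
+      maxIndex = 3 give offset = 5, zeroed = 12 and back = 5; the 16 bytes
+      at the aligned address 0x1000 are loaded, shifted right by
+      (12 - 5) * 8 = 56 bits, then left by 12 * 8 = 96 bits, leaving the
+      four requested bytes left-aligned in V1 and the rest zeroed.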
*/ + IRExpr *shrAmount = binop(Iop_Shl64, + binop(Iop_Sub64, mkexpr(zeroed), mkexpr(back)), + mkU8(3)); + IRExpr *shlAmount = binop(Iop_Shl64, mkexpr(zeroed), mkU8(3)); + + put_vr_qw(v1, binop(Iop_ShlV128, + binop(Iop_ShrV128, + load(Ity_V128, + binop(Iop_Sub64, mkexpr(addr), mkexpr(back))), + unop(Iop_64to8, shrAmount)), + unop(Iop_64to8, shlAmount))); +} + +/* Bitwise vCond ? v1 : v2 + All args are V128. + */ +static IRExpr* +s390_V128_bitwiseITE(IRExpr* vCond, IRExpr* v1, IRExpr* v2) +{ + IRTemp vc = newTemp(Ity_V128); + assign(vc, vCond); + /* result = (v1 & vCond) | (v2 & ~vCond) */ + return binop(Iop_OrV128, + binop(Iop_AndV128, v1, mkexpr(vc)), + binop(Iop_AndV128, v2, unop(Iop_NotV128, mkexpr(vc)))); +} + /*------------------------------------------------------------*/ /*--- Rounding modes ---*/ /*------------------------------------------------------------*/ @@ -1797,8 +2420,8 @@ encode_bfp_rounding_mode(UChar mode) case S390_BFP_ROUND_PER_FPC: rm = get_bfp_rounding_mode_from_fpc(); break; - case S390_BFP_ROUND_NEAREST_AWAY: /* not supported */ - case S390_BFP_ROUND_PREPARE_SHORT: /* not supported */ + case S390_BFP_ROUND_NEAREST_AWAY: rm = mkU32(Irrm_NEAREST_TIE_AWAY_0); break; + case S390_BFP_ROUND_PREPARE_SHORT: rm = mkU32(Irrm_PREPARE_SHORTER); break; case S390_BFP_ROUND_NEAREST_EVEN: rm = mkU32(Irrm_NEAREST); break; case S390_BFP_ROUND_ZERO: rm = mkU32(Irrm_ZERO); break; case S390_BFP_ROUND_POSINF: rm = mkU32(Irrm_PosINF); break; @@ -2042,6 +2665,16 @@ s390_format_RIE_RRUUU(const HChar *(*irgen)(UChar r1, UChar r2, UChar i3, i5); } +static void +s390_format_RIEv1(const HChar *(*irgen)(UChar r1, UShort i2, UChar m3), + UChar r1, UShort i2, UChar m3) +{ + const HChar *mnm = irgen(r1, i2, m3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, GPR, UINT, UINT), mnm, r1, i2, m3); +} + static void s390_format_RIE_RRPU(const HChar *(*irgen)(UChar r1, UChar r2, UShort i4, UChar m3), @@ -2078,6 +2711,18 @@ s390_format_RIE_RUPI(const HChar *(*irgen)(UChar r1, UChar m3, UShort i4, (Int)(Char)i2, m3, (Int)(Short)i4); } +static void +s390_format_RIE_RUPIX(const HChar *(*irgen)(UChar r1, UChar m3, UShort i4, + UChar i2), + UChar r1, UChar m3, UShort i4, UChar i2, Int xmnm_kind) +{ + const HChar *mnm = irgen(r1, m3, i4, i2); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(XMNM, GPR, INT, CABM, PCREL), xmnm_kind, mnm, m3, r1, + (Int)(Char)i2, m3, (Int)(Short)i4); +} + static void s390_format_RIL(const HChar *(*irgen)(UChar r1, UInt i2), UChar r1, UInt i2) @@ -2623,8 +3268,8 @@ s390_format_RXE_FRRD(const HChar *(*irgen)(UChar r1, IRTemp op2addr), } static void -s390_format_RXF_FRRDF(const HChar *(*irgen)(UChar, IRTemp, UChar), - UChar r3, UChar x2, UChar b2, UShort d2, UChar r1) +s390_format_RXE_RRRDR(const HChar *(*irgen)(UChar r1, IRTemp op2addr, UChar m3), + UChar r1, UChar x2, UChar b2, UShort d2, UChar m3) { const HChar *mnm; IRTemp op2addr = newTemp(Ity_I64); @@ -2633,10 +3278,27 @@ s390_format_RXF_FRRDF(const HChar *(*irgen)(UChar, IRTemp, UChar), b2 != 0 ? get_gpr_dw0(b2) : mkU64(0)), x2 != 0 ? 
get_gpr_dw0(x2) : mkU64(0))); - mnm = irgen(r3, op2addr, r1); + mnm = irgen(r1, op2addr, m3); if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) - s390_disasm(ENC4(MNM, FPR, FPR, UDXB), mnm, r1, r3, d2, x2, b2); + s390_disasm(ENC3(MNM, GPR, UDXB), mnm, r1, d2, x2, b2); +} + +static void +s390_format_RXF_FRRDF(const HChar *(*irgen)(UChar, IRTemp, UChar), + UChar r3, UChar x2, UChar b2, UShort d2, UChar r1) +{ + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); + + assign(op2addr, binop(Iop_Add64, binop(Iop_Add64, mkU64(d2), + b2 != 0 ? get_gpr_dw0(b2) : mkU64(0)), x2 != 0 ? get_gpr_dw0(x2) : + mkU64(0))); + + mnm = irgen(r3, op2addr, r1); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, FPR, FPR, UDXB), mnm, r1, r3, d2, x2, b2); } static void @@ -2815,375 +3477,618 @@ s390_format_SIL_RDU(const HChar *(*irgen)(UShort i2, IRTemp op1addr), s390_disasm(ENC3(MNM, UDXB, UINT), mnm, d1, 0, b1, i2); } +static void +s390_format_VRX_VRRD(const HChar *(*irgen)(UChar v1, IRTemp op2addr), + UChar v1, UChar x2, UChar b2, UShort d2, UChar rxb) +{ + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } -/*------------------------------------------------------------*/ -/*--- Build IR for opcodes ---*/ -/*------------------------------------------------------------*/ - -static const HChar * -s390_irgen_AR(UChar r1, UChar r2) -{ - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + assign(op2addr, binop(Iop_Add64, binop(Iop_Add64, mkU64(d2), + b2 != 0 ? get_gpr_dw0(b2) : mkU64(0)), x2 != 0 ? get_gpr_dw0(x2) : + mkU64(0))); - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); - put_gpr_w1(r1, mkexpr(result)); + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, op2addr); - return "ar"; + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC3(MNM, VR, UDXB), mnm, v1, d2, x2, b2); } -static const HChar * -s390_irgen_AGR(UChar r1, UChar r2) + +static void +s390_format_VRX_VRRDM(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar m3), + UChar v1, UChar x2, UChar b2, UShort d2, UChar m3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); - put_gpr_dw0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agr"; + assign(op2addr, binop(Iop_Add64, binop(Iop_Add64, mkU64(d2), + b2 != 0 ? get_gpr_dw0(b2) : mkU64(0)), x2 != 0 ? 
get_gpr_dw0(x2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, op2addr, m3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC3(MNM, VR, UDXB), mnm, v1, d2, x2, b2); } -static const HChar * -s390_irgen_AGFR(UChar r1, UChar r2) + +static void +s390_format_VRR_VV(const HChar *(*irgen)(UChar v1, UChar v2), + UChar v1, UChar v2, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Sto64, get_gpr_w1(r2))); - assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); - put_gpr_dw0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agfr"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC3(MNM, VR, VR), mnm, v1, v2); } -static const HChar * -s390_irgen_ARK(UChar r3, UChar r1, UChar r2) + +static void +s390_format_VRR_VVV(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3), + UChar v1, UChar v2, UChar v3, UChar rxb) { - IRTemp op2 = newTemp(Ity_I32); - IRTemp op3 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op2, get_gpr_w1(r2)); - assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ark"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, VR, VR), mnm, v1, v2, v3); } -static const HChar * -s390_irgen_AGRK(UChar r3, UChar r1, UChar r2) + +static void +s390_format_VRR_VVVM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, UChar m4), + UChar v1, UChar v2, UChar v3, UChar m4, UChar rxb) { - IRTemp op2 = newTemp(Ity_I64); - IRTemp op3 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op2, get_gpr_dw0(r2)); - assign(op3, get_gpr_dw0(r3)); - assign(result, binop(Iop_Add64, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op2, op3); - put_gpr_dw0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agrk"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), mnm, v1, v2, v3, m4); } -static const HChar * -s390_irgen_A(UChar r1, IRTemp op2addr) + +static void +s390_format_VRR_VVVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); - put_gpr_w1(r1, mkexpr(result)); + if (! 
s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "a"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, m4, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), mnm, v1, v2, v3, m4, m5); } -static const HChar * -s390_irgen_AY(UChar r1, IRTemp op2addr) + +static void +s390_format_VRR_VVVV(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, UChar v4), + UChar v1, UChar v2, UChar v3, UChar v4, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ay"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + v4 = s390_vr_getVRindex(v4, 4, rxb); + mnm = irgen(v1, v2, v3, v4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, VR, VR), mnm, v1, v2, v3, v4); } -static const HChar * -s390_irgen_AG(UChar r1, IRTemp op2addr) + +static void +s390_format_VRR_VRR(const HChar *(*irgen)(UChar v1, UChar r2, UChar r3), + UChar v1, UChar r2, UChar r3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op1, get_gpr_dw0(r1)); - assign(op2, load(Ity_I64, mkexpr(op2addr))); - assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); - put_gpr_dw0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ag"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, r2, r3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, GPR, GPR), mnm, v1, r2, r3); } -static const HChar * -s390_irgen_AGF(UChar r1, IRTemp op2addr) + +static void +s390_format_VRR_VVM(const HChar *(*irgen)(UChar v1, UChar v2, UChar m3), + UChar v1, UChar v2, UChar m3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkexpr(op2addr)))); - assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); - put_gpr_dw0(r1, mkexpr(result)); + if (! 
s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agf"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, m3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, VR, UINT), mnm, v1, v2, m3); } -static const HChar * -s390_irgen_AFI(UChar r1, UInt i2) + +static void +s390_format_VRI_VIM(const HChar *(*irgen)(UChar v1, UShort i2, UChar m3), + UChar v1, UShort i2, UChar m3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op1, get_gpr_w1(r1)); - op2 = (Int)i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "afi"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, i2, m3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, UINT, UINT), mnm, v1, i2, m3); } -static const HChar * -s390_irgen_AGFI(UChar r1, UInt i2) + +static void +s390_format_VRI_VVIM(const HChar *(*irgen)(UChar v1, UChar v3, UShort i2, UChar m4), + UChar v1, UChar v3, UShort i2, UChar m4, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Int)i2; - assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, - mkU64((ULong)op2))); - put_gpr_dw0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agfi"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v3 = s390_vr_getVRindex(v3, 2, rxb); + mnm = irgen(v1, v3, i2, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), mnm, v1, v3, i2, m4); } -static const HChar * -s390_irgen_AHIK(UChar r1, UChar r3, UShort i2) +static void +s390_format_VRI_VVIMM(const HChar *(*irgen)(UChar v1, UChar v2, UShort i3, + UChar m4, UChar m5), + UChar v1, UChar v2, UShort i3, UChar m4, UChar m5, + UChar rxb) { - Int op2; - IRTemp op3 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - op2 = (Int)(Short)i2; - assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_Add32, mkU32((UInt)op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, mktemp(Ity_I32, mkU32((UInt) - op2)), op3); - put_gpr_w1(r1, mkexpr(result)); + if (!s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ahik"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, i3, m4, m5); + + if (vex_traceflags & VEX_TRACE_FE) + s390_disasm(ENC6(MNM, VR, VR, UINT, UINT, UINT), mnm, v1, v2, i3, m4, m5); } -static const HChar * -s390_irgen_AGHIK(UChar r1, UChar r3, UShort i2) +static void +s390_format_VRS_RRDVM(const HChar *(*irgen)(UChar r1, IRTemp op2addr, UChar v3, + UChar m4), UChar r1, UChar b2, UShort d2, UChar v3, + UChar m4, UChar rxb) { - Long op2; - IRTemp op3 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - op2 = (Long)(Short)i2; - assign(op3, get_gpr_dw0(r3)); - assign(result, binop(Iop_Add64, mkU64((ULong)op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, mktemp(Ity_I64, mkU64((ULong) - op2)), op3); - put_gpr_dw0(r1, 
mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "aghik"; + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? get_gpr_dw0(b2) : + mkU64(0))); + + v3 = s390_vr_getVRindex(v3, 2, rxb); + mnm = irgen(r1, op2addr, v3, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, GPR, UDXB, VR, UINT), mnm, r1, d2, 0, b2, v3, m4); } -static const HChar * -s390_irgen_ASI(UChar i2, IRTemp op1addr) + +static void +s390_format_VRS_VRDVM(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar v3, + UChar m4), UChar v1, UChar b2, UShort d2, UChar v3, + UChar m4, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, load(Ity_I32, mkexpr(op1addr))); - op2 = (Int)(Char)i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); - store(mkexpr(op1addr), mkexpr(result)); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "asi"; + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? get_gpr_dw0(b2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v3 = s390_vr_getVRindex(v3, 2, rxb); + mnm = irgen(v1, op2addr, v3, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), mnm, v1, d2, 0, b2, v3, m4); } -static const HChar * -s390_irgen_AGSI(UChar i2, IRTemp op1addr) + +static void +s390_format_VRS_VRDV(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar v3), + UChar v1, UChar b2, UShort d2, UChar v3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, load(Ity_I64, mkexpr(op1addr))); - op2 = (Long)(Char)i2; - assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); - store(mkexpr(op1addr), mkexpr(result)); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, - mkU64((ULong)op2))); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "agsi"; + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? get_gpr_dw0(b2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v3 = s390_vr_getVRindex(v3, 2, rxb); + mnm = irgen(v1, op2addr, v3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, UDXB, VR), mnm, v1, d2, 0, b2, v3); } -static const HChar * -s390_irgen_AH(UChar r1, IRTemp op2addr) + +static void +s390_format_VRS_VRRDM(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar r3, + UChar m4), + UChar v1, UChar b2, UShort d2, UChar r3, UChar m4, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); - assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ah"; + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? 
get_gpr_dw0(b2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, op2addr, r3, m4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, GPR, UDXB, UINT), mnm, v1, r3, d2, 0, b2, m4); } -static const HChar * -s390_irgen_AHY(UChar r1, IRTemp op2addr) + +static void +s390_format_VRS_VRRD(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar r3), + UChar v1, UChar b2, UShort d2, UChar r3, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); - assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ahy"; + assign(op2addr, binop(Iop_Add64, mkU64(d2), b2 != 0 ? get_gpr_dw0(b2) : + mkU64(0))); + + v1 = s390_vr_getVRindex(v1, 1, rxb); + mnm = irgen(v1, op2addr, r3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, GPR, UDXB), mnm, v1, r3, d2, 0, b2); } -static const HChar * -s390_irgen_AHI(UChar r1, UShort i2) + +static void +s390_format_VRV_VVRDMT(const HChar *(*irgen)(UChar v1, IRTemp op2addr, UChar m3), + UChar v1, UChar v2, UChar b2, UShort d2, UChar m3, UChar rxb, + IRType type) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; + IRTemp op2addr = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - op2 = (Int)(Short)i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); - put_gpr_w1(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ahi"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + + vassert(type == Ity_I32 || type == Ity_I64); + IRExpr *x2; + if(type == Ity_I32) { + x2 = unop(Iop_32Uto64, get_vr(v2, type, m3)); + } else { + x2 = get_vr(v2, type, m3); + } + + assign(op2addr, binop(Iop_Add64, binop(Iop_Add64, mkU64(d2), + b2 != 0 ? get_gpr_dw0(b2) : mkU64(0)), x2)); + + mnm = irgen(v1, op2addr, m3); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC4(MNM, VR, UDVB, UINT), mnm, v1, d2, v2, b2, m3); } -static const HChar * -s390_irgen_AGHI(UChar r1, UShort i2) + +static void +s390_format_VRR_VVVVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar v4, UChar m5, UChar m6), + UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar m6, UChar rxb) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; - IRTemp result = newTemp(Ity_I64); + const HChar *mnm; - assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Short)i2; - assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, - mkU64((ULong)op2))); - put_gpr_dw0(r1, mkexpr(result)); + if (! 
s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "aghi"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + v4 = s390_vr_getVRindex(v4, 4, rxb); + mnm = irgen(v1, v2, v3, v4, m5, m6); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC7(MNM, VR, VR, VR, VR, UINT, UINT), + mnm, v1, v2, v3, v4, m5, m6); } -static const HChar * -s390_irgen_AHHHR(UChar r3, UChar r1, UChar r2) + +static void +s390_format_VRR_VVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar m3, + UChar m5), + UChar v1, UChar v2, UChar m3, UChar m5, UChar rxb) { - IRTemp op2 = newTemp(Ity_I32); - IRTemp op3 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op2, get_gpr_w0(r2)); - assign(op3, get_gpr_w0(r3)); - assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); - put_gpr_w0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ahhhr"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, m3, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), mnm, v1, v2, m3, m5); } -static const HChar * -s390_irgen_AHHLR(UChar r3, UChar r1, UChar r2) + +static void +s390_format_VRId_VVVIM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar i4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar i4, UChar m5, + UChar rxb) { - IRTemp op2 = newTemp(Ity_I32); - IRTemp op3 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op2, get_gpr_w0(r2)); - assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); - put_gpr_w0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "ahhlr"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, i4, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), mnm, v1, v2, v3, i4, m5); } -static const HChar * -s390_irgen_AIH(UChar r1, UInt i2) + +static void +s390_format_VRId_VVVI(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar i4), + UChar v1, UChar v2, UChar v3, UChar i4, UChar rxb) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp result = newTemp(Ity_I32); + const HChar *mnm; - assign(op1, get_gpr_w0(r1)); - op2 = (Int)i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); - put_gpr_w0(r1, mkexpr(result)); + if (! s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } - return "aih"; + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, i4); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), mnm, v1, v2, v3, i4); +} + + +static void +s390_format_VRRd_VVVVM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar v4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar rxb) +{ + const HChar *mnm; + + if (! 
s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + v4 = s390_vr_getVRindex(v4, 4, rxb); + mnm = irgen(v1, v2, v3, v4, m5); + + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC6(MNM, VR, VR, VR, VR, UINT), mnm, v1, v2, v3, v4, m5); } + +static void +s390_format_VRRa_VVMMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar m3, + UChar m4, UChar m5), + UChar v1, UChar v2, UChar m3, UChar m4, UChar m5, + UChar rxb) +{ + const HChar *mnm; + + if (!s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, m3, m4, m5); + + if (vex_traceflags & VEX_TRACE_FE) + s390_disasm(ENC6(MNM, VR, VR, UINT, UINT, UINT), mnm, v1, v2, m3, m4, m5); +} + +static void +s390_format_VRRa_VVVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar m4, UChar m5), + UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, + UChar rxb) +{ + const HChar *mnm; + + if (!s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, m4, m5); + + if (vex_traceflags & VEX_TRACE_FE) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), mnm, v1, v2, v3, m4, m5); +} + +static void +s390_format_VRRa_VVMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar m3, + UChar m4), + UChar v1, UChar v2, UChar m3, UChar m4, UChar rxb) +{ + const HChar *mnm; + + if (!s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + mnm = irgen(v1, v2, m3, m4); + + if (vex_traceflags & VEX_TRACE_FE) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), mnm, v1, v2, m3, m4); +} + +static void +s390_format_VRRa_VVVMMM(const HChar *(*irgen)(UChar v1, UChar v2, UChar v3, + UChar m4, UChar m5, UChar m6), + UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, + UChar m6, UChar rxb) +{ + const HChar *mnm; + + if (!s390_host_has_vx) { + emulation_failure(EmFail_S390X_vx); + return; + } + + v1 = s390_vr_getVRindex(v1, 1, rxb); + v2 = s390_vr_getVRindex(v2, 2, rxb); + v3 = s390_vr_getVRindex(v3, 3, rxb); + mnm = irgen(v1, v2, v3, m4, m5, m6); + + if (vex_traceflags & VEX_TRACE_FE) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), + mnm, v1, v2, v3, m4, m5, m6); +} + +/*------------------------------------------------------------*/ +/*--- Build IR for opcodes ---*/ +/*------------------------------------------------------------*/ + static const HChar * -s390_irgen_ALR(UChar r1, UChar r2) +s390_irgen_AR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); @@ -3192,14 +4097,14 @@ s390_irgen_ALR(UChar r1, UChar r2) assign(op1, get_gpr_w1(r1)); assign(op2, get_gpr_w1(r2)); assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "alr"; + return "ar"; } static const HChar * -s390_irgen_ALGR(UChar r1, UChar r2) +s390_irgen_AGR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); @@ -3208,30 +4113,30 @@ s390_irgen_ALGR(UChar r1, UChar r2) assign(op1, get_gpr_dw0(r1)); assign(op2, get_gpr_dw0(r2)); assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - 
s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "algr"; + return "agr"; } static const HChar * -s390_irgen_ALGFR(UChar r1, UChar r2) +s390_irgen_AGFR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); IRTemp result = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Uto64, get_gpr_w1(r2))); + assign(op2, unop(Iop_32Sto64, get_gpr_w1(r2))); assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "algfr"; + return "agfr"; } static const HChar * -s390_irgen_ALRK(UChar r3, UChar r1, UChar r2) +s390_irgen_ARK(UChar r3, UChar r1, UChar r2) { IRTemp op2 = newTemp(Ity_I32); IRTemp op3 = newTemp(Ity_I32); @@ -3240,14 +4145,14 @@ s390_irgen_ALRK(UChar r3, UChar r1, UChar r2) assign(op2, get_gpr_w1(r2)); assign(op3, get_gpr_w1(r3)); assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); put_gpr_w1(r1, mkexpr(result)); - return "alrk"; + return "ark"; } static const HChar * -s390_irgen_ALGRK(UChar r3, UChar r1, UChar r2) +s390_irgen_AGRK(UChar r3, UChar r1, UChar r2) { IRTemp op2 = newTemp(Ity_I64); IRTemp op3 = newTemp(Ity_I64); @@ -3256,14 +4161,14 @@ s390_irgen_ALGRK(UChar r3, UChar r1, UChar r2) assign(op2, get_gpr_dw0(r2)); assign(op3, get_gpr_dw0(r3)); assign(result, binop(Iop_Add64, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op2, op3); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op2, op3); put_gpr_dw0(r1, mkexpr(result)); - return "algrk"; + return "agrk"; } static const HChar * -s390_irgen_AL(UChar r1, IRTemp op2addr) +s390_irgen_A(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); @@ -3272,14 +4177,14 @@ s390_irgen_AL(UChar r1, IRTemp op2addr) assign(op1, get_gpr_w1(r1)); assign(op2, load(Ity_I32, mkexpr(op2addr))); assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "al"; + return "a"; } static const HChar * -s390_irgen_ALY(UChar r1, IRTemp op2addr) +s390_irgen_AY(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); @@ -3288,14 +4193,14 @@ s390_irgen_ALY(UChar r1, IRTemp op2addr) assign(op1, get_gpr_w1(r1)); assign(op2, load(Ity_I32, mkexpr(op2addr))); assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "aly"; + return "ay"; } static const HChar * -s390_irgen_ALG(UChar r1, IRTemp op2addr) +s390_irgen_AG(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); @@ -3304,306 +4209,295 @@ s390_irgen_ALG(UChar r1, IRTemp op2addr) assign(op1, get_gpr_dw0(r1)); assign(op2, load(Ity_I64, mkexpr(op2addr))); assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "alg"; + return "ag"; } static const HChar * 
-s390_irgen_ALGF(UChar r1, IRTemp op2addr) +s390_irgen_AGF(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); IRTemp result = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr)))); + assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkexpr(op2addr)))); assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "algf"; + return "agf"; } static const HChar * -s390_irgen_ALFI(UChar r1, UInt i2) +s390_irgen_AFI(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + Int op2; IRTemp result = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - op2 = i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32(op2))); + op2 = (Int)i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); put_gpr_w1(r1, mkexpr(result)); - return "alfi"; + return "afi"; } static const HChar * -s390_irgen_ALGFI(UChar r1, UInt i2) +s390_irgen_AGFI(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I64); - ULong op2; + Long op2; IRTemp result = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - op2 = (ULong)i2; - assign(result, binop(Iop_Add64, mkexpr(op1), mkU64(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, mktemp(Ity_I64, - mkU64(op2))); + op2 = (Long)(Int)i2; + assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, + mkU64((ULong)op2))); put_gpr_dw0(r1, mkexpr(result)); - return "algfi"; + return "agfi"; } static const HChar * -s390_irgen_ALHHHR(UChar r3, UChar r1, UChar r2) +s390_irgen_AHIK(UChar r1, UChar r3, UShort i2) { - IRTemp op2 = newTemp(Ity_I32); + Int op2; IRTemp op3 = newTemp(Ity_I32); IRTemp result = newTemp(Ity_I32); - assign(op2, get_gpr_w0(r2)); - assign(op3, get_gpr_w0(r3)); - assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); - put_gpr_w0(r1, mkexpr(result)); + op2 = (Int)(Short)i2; + assign(op3, get_gpr_w1(r3)); + assign(result, binop(Iop_Add32, mkU32((UInt)op2), mkexpr(op3))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, mktemp(Ity_I32, mkU32((UInt) + op2)), op3); + put_gpr_w1(r1, mkexpr(result)); - return "alhhhr"; + return "ahik"; } static const HChar * -s390_irgen_ALHHLR(UChar r3, UChar r1, UChar r2) +s390_irgen_AGHIK(UChar r1, UChar r3, UShort i2) { - IRTemp op2 = newTemp(Ity_I32); - IRTemp op3 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + Long op2; + IRTemp op3 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - assign(op2, get_gpr_w0(r2)); - assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); - put_gpr_w0(r1, mkexpr(result)); + op2 = (Long)(Short)i2; + assign(op3, get_gpr_dw0(r3)); + assign(result, binop(Iop_Add64, mkU64((ULong)op2), mkexpr(op3))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, mktemp(Ity_I64, mkU64((ULong) + op2)), op3); + put_gpr_dw0(r1, mkexpr(result)); - return "alhhlr"; + return "aghik"; } static const HChar * -s390_irgen_ALCR(UChar r1, UChar r2) +s390_irgen_ASI(UChar i2, IRTemp op1addr) { IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = 
newTemp(Ity_I32); + Int op2; IRTemp result = newTemp(Ity_I32); - IRTemp carry_in = newTemp(Ity_I32); - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - assign(carry_in, binop(Iop_Shr32, s390_call_calculate_cc(), mkU8(1))); - assign(result, binop(Iop_Add32, binop(Iop_Add32, mkexpr(op1), mkexpr(op2)), - mkexpr(carry_in))); - s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_32, op1, op2, carry_in); - put_gpr_w1(r1, mkexpr(result)); + assign(op1, load(Ity_I32, mkexpr(op1addr))); + op2 = (Int)(Char)i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); + store(mkexpr(op1addr), mkexpr(result)); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); - return "alcr"; + return "asi"; } static const HChar * -s390_irgen_ALCGR(UChar r1, UChar r2) +s390_irgen_AGSI(UChar i2, IRTemp op1addr) { IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + Long op2; IRTemp result = newTemp(Ity_I64); - IRTemp carry_in = newTemp(Ity_I64); - assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - assign(carry_in, unop(Iop_32Uto64, binop(Iop_Shr32, s390_call_calculate_cc(), - mkU8(1)))); - assign(result, binop(Iop_Add64, binop(Iop_Add64, mkexpr(op1), mkexpr(op2)), - mkexpr(carry_in))); - s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_64, op1, op2, carry_in); - put_gpr_dw0(r1, mkexpr(result)); + assign(op1, load(Ity_I64, mkexpr(op1addr))); + op2 = (Long)(Char)i2; + assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); + store(mkexpr(op1addr), mkexpr(result)); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, + mkU64((ULong)op2))); - return "alcgr"; + return "agsi"; } static const HChar * -s390_irgen_ALC(UChar r1, IRTemp op2addr) +s390_irgen_AH(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); IRTemp result = newTemp(Ity_I32); - IRTemp carry_in = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - assign(carry_in, binop(Iop_Shr32, s390_call_calculate_cc(), mkU8(1))); - assign(result, binop(Iop_Add32, binop(Iop_Add32, mkexpr(op1), mkexpr(op2)), - mkexpr(carry_in))); - s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_32, op1, op2, carry_in); + assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "alc"; + return "ah"; } static const HChar * -s390_irgen_ALCG(UChar r1, IRTemp op2addr) +s390_irgen_AHY(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); - IRTemp carry_in = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - assign(op2, load(Ity_I64, mkexpr(op2addr))); - assign(carry_in, unop(Iop_32Uto64, binop(Iop_Shr32, s390_call_calculate_cc(), - mkU8(1)))); - assign(result, binop(Iop_Add64, binop(Iop_Add64, mkexpr(op1), mkexpr(op2)), - mkexpr(carry_in))); - s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_64, op1, op2, carry_in); - put_gpr_dw0(r1, mkexpr(result)); + assign(op1, get_gpr_w1(r1)); + assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, op2); + put_gpr_w1(r1, mkexpr(result)); - return "alcg"; + return "ahy"; } static const HChar * -s390_irgen_ALSI(UChar 
i2, IRTemp op1addr) +s390_irgen_AHI(UChar r1, UShort i2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + Int op2; IRTemp result = newTemp(Ity_I32); - assign(op1, load(Ity_I32, mkexpr(op1addr))); - op2 = (UInt)(Int)(Char)i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32(op2))); - store(mkexpr(op1addr), mkexpr(result)); + assign(op1, get_gpr_w1(r1)); + op2 = (Int)(Short)i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); + put_gpr_w1(r1, mkexpr(result)); - return "alsi"; + return "ahi"; } static const HChar * -s390_irgen_ALGSI(UChar i2, IRTemp op1addr) +s390_irgen_AGHI(UChar r1, UShort i2) { IRTemp op1 = newTemp(Ity_I64); - ULong op2; + Long op2; IRTemp result = newTemp(Ity_I64); - assign(op1, load(Ity_I64, mkexpr(op1addr))); - op2 = (ULong)(Long)(Char)i2; - assign(result, binop(Iop_Add64, mkexpr(op1), mkU64(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, mktemp(Ity_I64, - mkU64(op2))); - store(mkexpr(op1addr), mkexpr(result)); + assign(op1, get_gpr_dw0(r1)); + op2 = (Long)(Short)i2; + assign(result, binop(Iop_Add64, mkexpr(op1), mkU64((ULong)op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, mktemp(Ity_I64, + mkU64((ULong)op2))); + put_gpr_dw0(r1, mkexpr(result)); - return "algsi"; + return "aghi"; } static const HChar * -s390_irgen_ALHSIK(UChar r1, UChar r3, UShort i2) +s390_irgen_AHHHR(UChar r3, UChar r1, UChar r2) { - UInt op2; + IRTemp op2 = newTemp(Ity_I32); IRTemp op3 = newTemp(Ity_I32); IRTemp result = newTemp(Ity_I32); - op2 = (UInt)(Int)(Short)i2; - assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_Add32, mkU32(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, mktemp(Ity_I32, mkU32(op2)), - op3); - put_gpr_w1(r1, mkexpr(result)); + assign(op2, get_gpr_w0(r2)); + assign(op3, get_gpr_w0(r3)); + assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); + put_gpr_w0(r1, mkexpr(result)); - return "alhsik"; + return "ahhhr"; } static const HChar * -s390_irgen_ALGHSIK(UChar r1, UChar r3, UShort i2) +s390_irgen_AHHLR(UChar r3, UChar r1, UChar r2) { - ULong op2; - IRTemp op3 = newTemp(Ity_I64); - IRTemp result = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I32); + IRTemp op3 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - op2 = (ULong)(Long)(Short)i2; - assign(op3, get_gpr_dw0(r3)); - assign(result, binop(Iop_Add64, mkU64(op2), mkexpr(op3))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, mktemp(Ity_I64, mkU64(op2)), - op3); - put_gpr_dw0(r1, mkexpr(result)); + assign(op2, get_gpr_w0(r2)); + assign(op3, get_gpr_w1(r3)); + assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op2, op3); + put_gpr_w0(r1, mkexpr(result)); - return "alghsik"; + return "ahhlr"; } static const HChar * -s390_irgen_ALSIH(UChar r1, UInt i2) +s390_irgen_AIH(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + Int op2; IRTemp result = newTemp(Ity_I32); assign(op1, get_gpr_w0(r1)); - op2 = i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, - mkU32(op2))); + op2 = (Int)i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32((UInt)op2))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); put_gpr_w0(r1, 
mkexpr(result)); - return "alsih"; + return "aih"; } static const HChar * -s390_irgen_ALSIHN(UChar r1, UInt i2) +s390_irgen_ALR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + IRTemp op2 = newTemp(Ity_I32); IRTemp result = newTemp(Ity_I32); - assign(op1, get_gpr_w0(r1)); - op2 = i2; - assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); - put_gpr_w0(r1, mkexpr(result)); + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); + put_gpr_w1(r1, mkexpr(result)); - return "alsihn"; + return "alr"; } static const HChar * -s390_irgen_NR(UChar r1, UChar r2) +s390_irgen_ALGR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp result = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_w1(r1, mkexpr(result)); + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + put_gpr_dw0(r1, mkexpr(result)); - return "nr"; + return "algr"; } static const HChar * -s390_irgen_NGR(UChar r1, UChar r2) +s390_irgen_ALGFR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); IRTemp result = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - assign(result, binop(Iop_And64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(op2, unop(Iop_32Uto64, get_gpr_w1(r2))); + assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "ngr"; + return "algfr"; } static const HChar * -s390_irgen_NRK(UChar r3, UChar r1, UChar r2) +s390_irgen_ALRK(UChar r3, UChar r1, UChar r2) { IRTemp op2 = newTemp(Ity_I32); IRTemp op3 = newTemp(Ity_I32); @@ -3611,15 +4505,15 @@ s390_irgen_NRK(UChar r3, UChar r1, UChar r2) assign(op2, get_gpr_w1(r2)); assign(op3, get_gpr_w1(r3)); - assign(result, binop(Iop_And32, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); put_gpr_w1(r1, mkexpr(result)); - return "nrk"; + return "alrk"; } static const HChar * -s390_irgen_NGRK(UChar r3, UChar r1, UChar r2) +s390_irgen_ALGRK(UChar r3, UChar r1, UChar r2) { IRTemp op2 = newTemp(Ity_I64); IRTemp op3 = newTemp(Ity_I64); @@ -3627,15 +4521,15 @@ s390_irgen_NGRK(UChar r3, UChar r1, UChar r2) assign(op2, get_gpr_dw0(r2)); assign(op3, get_gpr_dw0(r3)); - assign(result, binop(Iop_And64, mkexpr(op2), mkexpr(op3))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(result, binop(Iop_Add64, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op2, op3); put_gpr_dw0(r1, mkexpr(result)); - return "ngrk"; + return "algrk"; } static const HChar * -s390_irgen_N(UChar r1, IRTemp op2addr) +s390_irgen_AL(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); @@ -3643,15 +4537,15 @@ s390_irgen_N(UChar r1, IRTemp op2addr) assign(op1, get_gpr_w1(r1)); assign(op2, load(Ity_I32, mkexpr(op2addr))); - 
assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "n"; + return "al"; } static const HChar * -s390_irgen_NY(UChar r1, IRTemp op2addr) +s390_irgen_ALY(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); @@ -3659,15 +4553,15 @@ s390_irgen_NY(UChar r1, IRTemp op2addr) assign(op1, get_gpr_w1(r1)); assign(op2, load(Ity_I32, mkexpr(op2addr))); - assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(result, binop(Iop_Add32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, op2); put_gpr_w1(r1, mkexpr(result)); - return "ny"; + return "aly"; } static const HChar * -s390_irgen_NG(UChar r1, IRTemp op2addr) +s390_irgen_ALG(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); @@ -3675,1189 +4569,1561 @@ s390_irgen_NG(UChar r1, IRTemp op2addr) assign(op1, get_gpr_dw0(r1)); assign(op2, load(Ity_I64, mkexpr(op2addr))); - assign(result, binop(Iop_And64, mkexpr(op1), mkexpr(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); put_gpr_dw0(r1, mkexpr(result)); - return "ng"; + return "alg"; } static const HChar * -s390_irgen_NI(UChar i2, IRTemp op1addr) +s390_irgen_ALGF(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I8); - UChar op2; - IRTemp result = newTemp(Ity_I8); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - assign(op1, load(Ity_I8, mkexpr(op1addr))); - op2 = i2; - assign(result, binop(Iop_And8, mkexpr(op1), mkU8(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - store(mkexpr(op1addr), mkexpr(result)); + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr)))); + assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, op2); + put_gpr_dw0(r1, mkexpr(result)); - return "ni"; + return "algf"; } static const HChar * -s390_irgen_NIY(UChar i2, IRTemp op1addr) +s390_irgen_ALFI(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I8); - UChar op2; - IRTemp result = newTemp(Ity_I8); + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - assign(op1, load(Ity_I8, mkexpr(op1addr))); + assign(op1, get_gpr_w1(r1)); op2 = i2; - assign(result, binop(Iop_And8, mkexpr(op1), mkU8(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - store(mkexpr(op1addr), mkexpr(result)); + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32(op2))); + put_gpr_w1(r1, mkexpr(result)); - return "niy"; + return "alfi"; } static const HChar * -s390_irgen_NIHF(UChar r1, UInt i2) +s390_irgen_ALGFI(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I32); - UInt op2; - IRTemp result = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + ULong op2; + IRTemp result = newTemp(Ity_I64); - assign(op1, get_gpr_w0(r1)); - op2 = i2; - assign(result, binop(Iop_And32, mkexpr(op1), mkU32(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_w0(r1, mkexpr(result)); + assign(op1, get_gpr_dw0(r1)); + op2 = (ULong)i2; + assign(result, binop(Iop_Add64, 
mkexpr(op1), mkU64(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, mktemp(Ity_I64, + mkU64(op2))); + put_gpr_dw0(r1, mkexpr(result)); - return "nihf"; + return "algfi"; } static const HChar * -s390_irgen_NIHH(UChar r1, UShort i2) +s390_irgen_ALHHHR(UChar r3, UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I16); - UShort op2; - IRTemp result = newTemp(Ity_I16); + IRTemp op2 = newTemp(Ity_I32); + IRTemp op3 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - assign(op1, get_gpr_hw0(r1)); - op2 = i2; - assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_hw0(r1, mkexpr(result)); + assign(op2, get_gpr_w0(r2)); + assign(op3, get_gpr_w0(r3)); + assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); + put_gpr_w0(r1, mkexpr(result)); - return "nihh"; + return "alhhhr"; } static const HChar * -s390_irgen_NIHL(UChar r1, UShort i2) +s390_irgen_ALHHLR(UChar r3, UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I16); - UShort op2; - IRTemp result = newTemp(Ity_I16); + IRTemp op2 = newTemp(Ity_I32); + IRTemp op3 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - assign(op1, get_gpr_hw1(r1)); - op2 = i2; - assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_hw1(r1, mkexpr(result)); + assign(op2, get_gpr_w0(r2)); + assign(op3, get_gpr_w1(r3)); + assign(result, binop(Iop_Add32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op2, op3); + put_gpr_w0(r1, mkexpr(result)); - return "nihl"; + return "alhhlr"; } static const HChar * -s390_irgen_NILF(UChar r1, UInt i2) +s390_irgen_ALCR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + IRTemp op2 = newTemp(Ity_I32); IRTemp result = newTemp(Ity_I32); + IRTemp carry_in = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - op2 = i2; - assign(result, binop(Iop_And32, mkexpr(op1), mkU32(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + assign(op2, get_gpr_w1(r2)); + assign(carry_in, binop(Iop_Shr32, s390_call_calculate_cc(), mkU8(1))); + assign(result, binop(Iop_Add32, binop(Iop_Add32, mkexpr(op1), mkexpr(op2)), + mkexpr(carry_in))); + s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_32, op1, op2, carry_in); put_gpr_w1(r1, mkexpr(result)); - return "nilf"; + return "alcr"; } static const HChar * -s390_irgen_NILH(UChar r1, UShort i2) +s390_irgen_ALCGR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I16); - UShort op2; - IRTemp result = newTemp(Ity_I16); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); + IRTemp carry_in = newTemp(Ity_I64); - assign(op1, get_gpr_hw2(r1)); - op2 = i2; - assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_hw2(r1, mkexpr(result)); + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(carry_in, unop(Iop_32Uto64, binop(Iop_Shr32, s390_call_calculate_cc(), + mkU8(1)))); + assign(result, binop(Iop_Add64, binop(Iop_Add64, mkexpr(op1), mkexpr(op2)), + mkexpr(carry_in))); + s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_64, op1, op2, carry_in); + put_gpr_dw0(r1, mkexpr(result)); - return "nilh"; + return "alcgr"; } static const HChar * -s390_irgen_NILL(UChar r1, UShort i2) +s390_irgen_ALC(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I16); - UShort op2; - IRTemp result = newTemp(Ity_I16); + IRTemp op1 = newTemp(Ity_I32); + IRTemp 
op2 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); + IRTemp carry_in = newTemp(Ity_I32); - assign(op1, get_gpr_hw3(r1)); - op2 = i2; - assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); - s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); - put_gpr_hw3(r1, mkexpr(result)); + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + assign(carry_in, binop(Iop_Shr32, s390_call_calculate_cc(), mkU8(1))); + assign(result, binop(Iop_Add32, binop(Iop_Add32, mkexpr(op1), mkexpr(op2)), + mkexpr(carry_in))); + s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_32, op1, op2, carry_in); + put_gpr_w1(r1, mkexpr(result)); - return "nill"; + return "alc"; } static const HChar * -s390_irgen_BASR(UChar r1, UChar r2) +s390_irgen_ALCG(UChar r1, IRTemp op2addr) { - IRTemp target = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); + IRTemp carry_in = newTemp(Ity_I64); - if (r2 == 0) { - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); - } else { - if (r1 != r2) { - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); - call_function(get_gpr_dw0(r2)); - } else { - assign(target, get_gpr_dw0(r2)); - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); - call_function(mkexpr(target)); - } - } + assign(op1, get_gpr_dw0(r1)); + assign(op2, load(Ity_I64, mkexpr(op2addr))); + assign(carry_in, unop(Iop_32Uto64, binop(Iop_Shr32, s390_call_calculate_cc(), + mkU8(1)))); + assign(result, binop(Iop_Add64, binop(Iop_Add64, mkexpr(op1), mkexpr(op2)), + mkexpr(carry_in))); + s390_cc_thunk_putZZZ(S390_CC_OP_UNSIGNED_ADDC_64, op1, op2, carry_in); + put_gpr_dw0(r1, mkexpr(result)); - return "basr"; + return "alcg"; } static const HChar * -s390_irgen_BAS(UChar r1, IRTemp op2addr) +s390_irgen_ALSI(UChar i2, IRTemp op1addr) { - IRTemp target = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 4ULL)); - assign(target, mkexpr(op2addr)); - call_function(mkexpr(target)); + assign(op1, load(Ity_I32, mkexpr(op1addr))); + op2 = (UInt)(Int)(Char)i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32(op2))); + store(mkexpr(op1addr), mkexpr(result)); - return "bas"; + return "alsi"; } static const HChar * -s390_irgen_BCR(UChar r1, UChar r2) +s390_irgen_ALGSI(UChar i2, IRTemp op1addr) { - IRTemp cond = newTemp(Ity_I32); - - if (r2 == 0 && (r1 >= 14)) { /* serialization */ - stmt(IRStmt_MBE(Imbe_Fence)); - } + IRTemp op1 = newTemp(Ity_I64); + ULong op2; + IRTemp result = newTemp(Ity_I64); - if ((r2 == 0) || (r1 == 0)) { - } else { - if (r1 == 15) { - return_from_function(get_gpr_dw0(r2)); - } else { - assign(cond, s390_call_calculate_cond(r1)); - if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - get_gpr_dw0(r2)); - } - } - if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) - s390_disasm(ENC2(XMNM, GPR), S390_XMNM_BCR, r1, r2); + assign(op1, load(Ity_I64, mkexpr(op1addr))); + op2 = (ULong)(Long)(Char)i2; + assign(result, binop(Iop_Add64, mkexpr(op1), mkU64(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, op1, mktemp(Ity_I64, + mkU64(op2))); + store(mkexpr(op1addr), mkexpr(result)); - return "bcr"; + return "algsi"; } static const HChar * -s390_irgen_BC(UChar r1, UChar x2, UChar b2, UShort d2, IRTemp op2addr) +s390_irgen_ALHSIK(UChar r1, UChar r3, UShort i2) { - IRTemp cond = newTemp(Ity_I32); + UInt op2; + IRTemp op3 = newTemp(Ity_I32); + 
IRTemp result = newTemp(Ity_I32); - if (r1 == 0) { - } else { - if (r1 == 15) { - always_goto(mkexpr(op2addr)); - } else { - assign(cond, s390_call_calculate_cond(r1)); - if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - mkexpr(op2addr)); - } - } - if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) - s390_disasm(ENC2(XMNM, UDXB), S390_XMNM_BC, r1, d2, x2, b2); + op2 = (UInt)(Int)(Short)i2; + assign(op3, get_gpr_w1(r3)); + assign(result, binop(Iop_Add32, mkU32(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, mktemp(Ity_I32, mkU32(op2)), + op3); + put_gpr_w1(r1, mkexpr(result)); - return "bc"; + return "alhsik"; } static const HChar * -s390_irgen_BCTR(UChar r1, UChar r2) +s390_irgen_ALGHSIK(UChar r1, UChar r3, UShort i2) { - put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); - if (r2 != 0) { - if_condition_goto_computed(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), - get_gpr_dw0(r2)); - } + ULong op2; + IRTemp op3 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - return "bctr"; + op2 = (ULong)(Long)(Short)i2; + assign(op3, get_gpr_dw0(r3)); + assign(result, binop(Iop_Add64, mkU64(op2), mkexpr(op3))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_64, mktemp(Ity_I64, mkU64(op2)), + op3); + put_gpr_dw0(r1, mkexpr(result)); + + return "alghsik"; } static const HChar * -s390_irgen_BCTGR(UChar r1, UChar r2) +s390_irgen_ALSIH(UChar r1, UInt i2) { - put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); - if (r2 != 0) { - if_condition_goto_computed(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), - get_gpr_dw0(r2)); - } + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - return "bctgr"; + assign(op1, get_gpr_w0(r1)); + op2 = i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_ADD_32, op1, mktemp(Ity_I32, + mkU32(op2))); + put_gpr_w0(r1, mkexpr(result)); + + return "alsih"; } static const HChar * -s390_irgen_BCT(UChar r1, IRTemp op2addr) +s390_irgen_ALSIHN(UChar r1, UInt i2) { - put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); - if_condition_goto_computed(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), - mkexpr(op2addr)); + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - return "bct"; + assign(op1, get_gpr_w0(r1)); + op2 = i2; + assign(result, binop(Iop_Add32, mkexpr(op1), mkU32(op2))); + put_gpr_w0(r1, mkexpr(result)); + + return "alsihn"; } static const HChar * -s390_irgen_BCTG(UChar r1, IRTemp op2addr) +s390_irgen_NR(UChar r1, UChar r2) { - put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); - if_condition_goto_computed(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), - mkexpr(op2addr)); + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - return "bctg"; + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w1(r1, mkexpr(result)); + + return "nr"; } static const HChar * -s390_irgen_BXH(UChar r1, UChar r3, IRTemp op2addr) +s390_irgen_NGR(UChar r1, UChar r2) { - IRTemp value = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - assign(value, get_gpr_w1(r3 | 1)); - put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); - if_condition_goto_computed(binop(Iop_CmpLT32S, mkexpr(value), - get_gpr_w1(r1)), mkexpr(op2addr)); + assign(op1, get_gpr_dw0(r1)); 
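+   /* Descriptive comment (editor's addition, not in the upstream patch):
+      fetch the second 64-bit operand, AND it with the first, latch the
+      result into the S390_CC_OP_BITWISE cc thunk, then write it back. */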
+ assign(op2, get_gpr_dw0(r2)); + assign(result, binop(Iop_And64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_dw0(r1, mkexpr(result)); - return "bxh"; + return "ngr"; } static const HChar * -s390_irgen_BXHG(UChar r1, UChar r3, IRTemp op2addr) +s390_irgen_NRK(UChar r3, UChar r1, UChar r2) { - IRTemp value = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I32); + IRTemp op3 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - assign(value, get_gpr_dw0(r3 | 1)); - put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); - if_condition_goto_computed(binop(Iop_CmpLT64S, mkexpr(value), - get_gpr_dw0(r1)), mkexpr(op2addr)); + assign(op2, get_gpr_w1(r2)); + assign(op3, get_gpr_w1(r3)); + assign(result, binop(Iop_And32, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w1(r1, mkexpr(result)); - return "bxhg"; + return "nrk"; } static const HChar * -s390_irgen_BXLE(UChar r1, UChar r3, IRTemp op2addr) +s390_irgen_NGRK(UChar r3, UChar r1, UChar r2) { - IRTemp value = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I64); + IRTemp op3 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - assign(value, get_gpr_w1(r3 | 1)); - put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); - if_condition_goto_computed(binop(Iop_CmpLE32S, get_gpr_w1(r1), - mkexpr(value)), mkexpr(op2addr)); + assign(op2, get_gpr_dw0(r2)); + assign(op3, get_gpr_dw0(r3)); + assign(result, binop(Iop_And64, mkexpr(op2), mkexpr(op3))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_dw0(r1, mkexpr(result)); - return "bxle"; + return "ngrk"; } static const HChar * -s390_irgen_BXLEG(UChar r1, UChar r3, IRTemp op2addr) +s390_irgen_N(UChar r1, IRTemp op2addr) { - IRTemp value = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - assign(value, get_gpr_dw0(r3 | 1)); - put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); - if_condition_goto_computed(binop(Iop_CmpLE64S, get_gpr_dw0(r1), - mkexpr(value)), mkexpr(op2addr)); + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w1(r1, mkexpr(result)); - return "bxleg"; + return "n"; } static const HChar * -s390_irgen_BRAS(UChar r1, UShort i2) +s390_irgen_NY(UChar r1, IRTemp op2addr) { - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 4ULL)); - call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + IRTemp result = newTemp(Ity_I32); - return "bras"; + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + assign(result, binop(Iop_And32, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w1(r1, mkexpr(result)); + + return "ny"; } static const HChar * -s390_irgen_BRASL(UChar r1, UInt i2) +s390_irgen_NG(UChar r1, IRTemp op2addr) { - put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 6ULL)); - call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp result = newTemp(Ity_I64); - return "brasl"; + assign(op1, get_gpr_dw0(r1)); + assign(op2, load(Ity_I64, mkexpr(op2addr))); + assign(result, binop(Iop_And64, mkexpr(op1), mkexpr(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_dw0(r1, mkexpr(result)); + + return 
"ng"; } static const HChar * -s390_irgen_BRC(UChar r1, UShort i2) +s390_irgen_NI(UChar i2, IRTemp op1addr) { - IRTemp cond = newTemp(Ity_I32); - - if (r1 == 0) { - } else { - if (r1 == 15) { - always_goto_and_chase( - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - } else { - assign(cond, s390_call_calculate_cond(r1)); - if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + IRTemp op1 = newTemp(Ity_I8); + UChar op2; + IRTemp result = newTemp(Ity_I8); - } - } - if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) - s390_disasm(ENC2(XMNM, PCREL), S390_XMNM_BRC, r1, (Int)(Short)i2); + assign(op1, load(Ity_I8, mkexpr(op1addr))); + op2 = i2; + assign(result, binop(Iop_And8, mkexpr(op1), mkU8(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + store(mkexpr(op1addr), mkexpr(result)); - return "brc"; + return "ni"; } static const HChar * -s390_irgen_BRCL(UChar r1, UInt i2) +s390_irgen_NIY(UChar i2, IRTemp op1addr) { - IRTemp cond = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I8); + UChar op2; + IRTemp result = newTemp(Ity_I8); - if (r1 == 0) { - } else { - if (r1 == 15) { - always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); - } else { - assign(cond, s390_call_calculate_cond(r1)); - if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); - } - } - if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) - s390_disasm(ENC2(XMNM, PCREL), S390_XMNM_BRCL, r1, i2); + assign(op1, load(Ity_I8, mkexpr(op1addr))); + op2 = i2; + assign(result, binop(Iop_And8, mkexpr(op1), mkU8(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + store(mkexpr(op1addr), mkexpr(result)); - return "brcl"; + return "niy"; } static const HChar * -s390_irgen_BRCT(UChar r1, UShort i2) +s390_irgen_NIHF(UChar r1, UInt i2) { - put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); - if_condition_goto(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - return "brct"; + assign(op1, get_gpr_w0(r1)); + op2 = i2; + assign(result, binop(Iop_And32, mkexpr(op1), mkU32(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w0(r1, mkexpr(result)); + + return "nihf"; } static const HChar * -s390_irgen_BRCTH(UChar r1, UInt i2) +s390_irgen_NIHH(UChar r1, UShort i2) { - put_gpr_w0(r1, binop(Iop_Sub32, get_gpr_w0(r1), mkU32(1))); - if_condition_goto(binop(Iop_CmpNE32, get_gpr_w0(r1), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + IRTemp op1 = newTemp(Ity_I16); + UShort op2; + IRTemp result = newTemp(Ity_I16); - return "brcth"; + assign(op1, get_gpr_hw0(r1)); + op2 = i2; + assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_hw0(r1, mkexpr(result)); + + return "nihh"; } static const HChar * -s390_irgen_BRCTG(UChar r1, UShort i2) +s390_irgen_NIHL(UChar r1, UShort i2) { - put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); - if_condition_goto(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + IRTemp op1 = newTemp(Ity_I16); + UShort op2; + IRTemp result = newTemp(Ity_I16); - return "brctg"; + assign(op1, get_gpr_hw1(r1)); + op2 = i2; + assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_hw1(r1, mkexpr(result)); + + return "nihl"; } static 
const HChar * -s390_irgen_BRXH(UChar r1, UChar r3, UShort i2) +s390_irgen_NILF(UChar r1, UInt i2) { - IRTemp value = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + IRTemp result = newTemp(Ity_I32); - assign(value, get_gpr_w1(r3 | 1)); - put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); - if_condition_goto(binop(Iop_CmpLT32S, mkexpr(value), get_gpr_w1(r1)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + assign(op1, get_gpr_w1(r1)); + op2 = i2; + assign(result, binop(Iop_And32, mkexpr(op1), mkU32(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_w1(r1, mkexpr(result)); - return "brxh"; + return "nilf"; } static const HChar * -s390_irgen_BRXHG(UChar r1, UChar r3, UShort i2) +s390_irgen_NILH(UChar r1, UShort i2) { - IRTemp value = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I16); + UShort op2; + IRTemp result = newTemp(Ity_I16); - assign(value, get_gpr_dw0(r3 | 1)); - put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); - if_condition_goto(binop(Iop_CmpLT64S, mkexpr(value), get_gpr_dw0(r1)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + assign(op1, get_gpr_hw2(r1)); + op2 = i2; + assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_hw2(r1, mkexpr(result)); - return "brxhg"; + return "nilh"; } static const HChar * -s390_irgen_BRXLE(UChar r1, UChar r3, UShort i2) +s390_irgen_NILL(UChar r1, UShort i2) { - IRTemp value = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I16); + UShort op2; + IRTemp result = newTemp(Ity_I16); - assign(value, get_gpr_w1(r3 | 1)); - put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); - if_condition_goto(binop(Iop_CmpLE32S, get_gpr_w1(r1), mkexpr(value)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + assign(op1, get_gpr_hw3(r1)); + op2 = i2; + assign(result, binop(Iop_And16, mkexpr(op1), mkU16(op2))); + s390_cc_thunk_putZ(S390_CC_OP_BITWISE, result); + put_gpr_hw3(r1, mkexpr(result)); - return "brxle"; + return "nill"; } static const HChar * -s390_irgen_BRXLG(UChar r1, UChar r3, UShort i2) +s390_irgen_BASR(UChar r1, UChar r2) { - IRTemp value = newTemp(Ity_I64); + IRTemp target = newTemp(Ity_I64); - assign(value, get_gpr_dw0(r3 | 1)); - put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); - if_condition_goto(binop(Iop_CmpLE64S, get_gpr_dw0(r1), mkexpr(value)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + if (r2 == 0) { + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); + } else { + if (r1 != r2) { + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); + call_function(get_gpr_dw0(r2)); + } else { + assign(target, get_gpr_dw0(r2)); + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 2ULL)); + call_function(mkexpr(target)); + } + } - return "brxlg"; + return "basr"; } static const HChar * -s390_irgen_CR(UChar r1, UChar r2) +s390_irgen_BAS(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp target = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 4ULL)); + assign(target, mkexpr(op2addr)); + call_function(mkexpr(target)); - return "cr"; + return "bas"; } static const HChar * -s390_irgen_CGR(UChar r1, UChar r2) +s390_irgen_BCR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - 
assign(op2, get_gpr_dw0(r2)); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + if (r2 == 0 && (r1 >= 14)) { /* serialization */ + stmt(IRStmt_MBE(Imbe_Fence)); + } - return "cgr"; + if ((r2 == 0) || (r1 == 0)) { + } else { + if (r1 == 15) { + return_from_function(get_gpr_dw0(r2)); + } else { + assign(cond, s390_call_calculate_cond(r1)); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + get_gpr_dw0(r2)); + } + } + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC2(XMNM, GPR), S390_XMNM_BCR, r1, r2); + + return "bcr"; } static const HChar * -s390_irgen_CGFR(UChar r1, UChar r2) +s390_irgen_BC(UChar r1, UChar x2, UChar b2, UShort d2, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Sto64, get_gpr_w1(r2))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + if (r1 == 0) { + } else { + if (r1 == 15) { + always_goto(mkexpr(op2addr)); + } else { + assign(cond, s390_call_calculate_cond(r1)); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + mkexpr(op2addr)); + } + } + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC2(XMNM, UDXB), S390_XMNM_BC, r1, d2, x2, b2); - return "cgfr"; + return "bc"; } static const HChar * -s390_irgen_C(UChar r1, IRTemp op2addr) +s390_irgen_BCTR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); + if (r2 != 0) { + if_condition_goto_computed(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), + get_gpr_dw0(r2)); + } - return "c"; + return "bctr"; } static const HChar * -s390_irgen_CY(UChar r1, IRTemp op2addr) +s390_irgen_BCTGR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); + if (r2 != 0) { + if_condition_goto_computed(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), + get_gpr_dw0(r2)); + } - return "cy"; + return "bctgr"; } static const HChar * -s390_irgen_CG(UChar r1, IRTemp op2addr) +s390_irgen_BCT(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - - assign(op1, get_gpr_dw0(r1)); - assign(op2, load(Ity_I64, mkexpr(op2addr))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); + if_condition_goto_computed(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), + mkexpr(op2addr)); - return "cg"; + return "bct"; } static const HChar * -s390_irgen_CGF(UChar r1, IRTemp op2addr) +s390_irgen_BCTG(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkexpr(op2addr)))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); + if_condition_goto_computed(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), + mkexpr(op2addr)); - return "cgf"; + return "bctg"; } static const HChar * -s390_irgen_CFI(UChar r1, UInt i2) +s390_irgen_BXH(UChar r1, UChar r3, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I32); - Int 
op2; + IRTemp value = newTemp(Ity_I32); - assign(op1, get_gpr_w1(r1)); - op2 = (Int)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto_computed(binop(Iop_CmpLT32S, mkexpr(value), + get_gpr_w1(r1)), mkexpr(op2addr)); - return "cfi"; + return "bxh"; } static const HChar * -s390_irgen_CGFI(UChar r1, UInt i2) +s390_irgen_BXHG(UChar r1, UChar r3, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; + IRTemp value = newTemp(Ity_I64); - assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Int)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I64, - mkU64((ULong)op2))); + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto_computed(binop(Iop_CmpLT64S, mkexpr(value), + get_gpr_dw0(r1)), mkexpr(op2addr)); - return "cgfi"; + return "bxhg"; } static const HChar * -s390_irgen_CRL(UChar r1, UInt i2) +s390_irgen_BXLE(UChar r1, UChar r3, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp value = newTemp(Ity_I32); - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) - i2 << 1)))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto_computed(binop(Iop_CmpLE32S, get_gpr_w1(r1), + mkexpr(value)), mkexpr(op2addr)); - return "crl"; + return "bxle"; } static const HChar * -s390_irgen_CGRL(UChar r1, UInt i2) +s390_irgen_BXLEG(UChar r1, UChar r3, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + IRTemp value = newTemp(Ity_I64); - assign(op1, get_gpr_dw0(r1)); - assign(op2, load(Ity_I64, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) - i2 << 1)))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto_computed(binop(Iop_CmpLE64S, get_gpr_dw0(r1), + mkexpr(value)), mkexpr(op2addr)); - return "cgrl"; + return "bxleg"; } static const HChar * -s390_irgen_CGFRL(UChar r1, UInt i2) +s390_irgen_BRAS(UChar r1, UShort i2) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkU64(guest_IA_curr_instr + - ((ULong)(Long)(Int)i2 << 1))))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 4ULL)); + call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - return "cgfrl"; + return "bras"; } static const HChar * -s390_irgen_CRB(UChar r1, UChar r2, UChar m3, IRTemp op4addr) +s390_irgen_BRASL(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); - IRTemp cond = newTemp(Ity_I32); - - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto(mkexpr(op4addr)); - } else { - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, - op1, op2)); - if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), - mkU32(0)), mkexpr(op4addr)); - } - } + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 6ULL)); + call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); - return "crb"; + return "brasl"; } 
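+/* Note on PC-relative targets (editor's addition, not in the upstream
+   patch): in the relative-branch irgens above and below, the immediate
+   i2 counts signed halfwords, not bytes.  Hence the sign extension
+   followed by a 1-bit left shift in
+      guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)
+   For example, i2 == 0xfffe sign-extends to -2 halfwords and yields a
+   target 4 bytes before the current instruction. */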
static const HChar * -s390_irgen_CGRB(UChar r1, UChar r2, UChar m3, IRTemp op4addr) +s390_irgen_BRC(UChar r1, UShort i2) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); IRTemp cond = newTemp(Ity_I32); - if (m3 == 0) { + if (r1 == 0) { } else { - if (m3 == 14) { - always_goto(mkexpr(op4addr)); + if (r1 == 15) { + always_goto_and_chase( + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); } else { - assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, - op1, op2)); - if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), - mkU32(0)), mkexpr(op4addr)); + assign(cond, s390_call_calculate_cond(r1)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + } } + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC2(XMNM, PCREL), S390_XMNM_BRC, r1, (Int)(Short)i2); - return "cgrb"; + return "brc"; } static const HChar * -s390_irgen_CRJ(UChar r1, UChar r2, UShort i4, UChar m3) +s390_irgen_BRCL(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); IRTemp cond = newTemp(Ity_I32); - if (m3 == 0) { + if (r1 == 0) { } else { - if (m3 == 14) { - always_goto_and_chase( - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + if (r1 == 15) { + always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); } else { - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, - op1, op2)); + assign(cond, s390_call_calculate_cond(r1)); if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - + guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); } } + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) + s390_disasm(ENC2(XMNM, PCREL), S390_XMNM_BRCL, r1, i2); - return "crj"; + return "brcl"; } static const HChar * -s390_irgen_CGRJ(UChar r1, UChar r2, UShort i4, UChar m3) +s390_irgen_BRCT(UChar r1, UShort i2) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); - IRTemp cond = newTemp(Ity_I32); + put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); + if_condition_goto(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto_and_chase( - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - } else { - assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, - op1, op2)); - if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + return "brct"; +} - } - } +static const HChar * +s390_irgen_BRCTH(UChar r1, UInt i2) +{ + put_gpr_w0(r1, binop(Iop_Sub32, get_gpr_w0(r1), mkU32(1))); + if_condition_goto(binop(Iop_CmpNE32, get_gpr_w0(r1), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - return "cgrj"; + return "brcth"; } static const HChar * -s390_irgen_CIB(UChar r1, UChar m3, UChar i2, IRTemp op4addr) +s390_irgen_BRCTG(UChar r1, UShort i2) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp cond = newTemp(Ity_I32); - - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto(mkexpr(op4addr)); - } else { - assign(op1, get_gpr_w1(r1)); - op2 = (Int)(Char)i2; - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, - mktemp(Ity_I32, mkU32((UInt)op2)))); - 
if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - mkexpr(op4addr)); - } - } + put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); + if_condition_goto(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - return "cib"; + return "brctg"; } static const HChar * -s390_irgen_CGIB(UChar r1, UChar m3, UChar i2, IRTemp op4addr) +s390_irgen_BRXH(UChar r1, UChar r3, UShort i2) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; - IRTemp cond = newTemp(Ity_I32); + IRTemp value = newTemp(Ity_I32); - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto(mkexpr(op4addr)); - } else { - assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Char)i2; - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, - mktemp(Ity_I64, mkU64((ULong)op2)))); - if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - mkexpr(op4addr)); - } - } + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto(binop(Iop_CmpLT32S, mkexpr(value), get_gpr_w1(r1)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - return "cgib"; + return "brxh"; } static const HChar * -s390_irgen_CIJ(UChar r1, UChar m3, UShort i4, UChar i2) +s390_irgen_BRXHG(UChar r1, UChar r3, UShort i2) { - IRTemp op1 = newTemp(Ity_I32); - Int op2; - IRTemp cond = newTemp(Ity_I32); - - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - } else { - assign(op1, get_gpr_w1(r1)); - op2 = (Int)(Char)i2; - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, - mktemp(Ity_I32, mkU32((UInt)op2)))); - if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + IRTemp value = newTemp(Ity_I64); - } - } + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto(binop(Iop_CmpLT64S, mkexpr(value), get_gpr_dw0(r1)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - return "cij"; + return "brxhg"; } static const HChar * -s390_irgen_CGIJ(UChar r1, UChar m3, UShort i4, UChar i2) +s390_irgen_BRXLE(UChar r1, UChar r3, UShort i2) { - IRTemp op1 = newTemp(Ity_I64); - Long op2; - IRTemp cond = newTemp(Ity_I32); + IRTemp value = newTemp(Ity_I32); - if (m3 == 0) { - } else { - if (m3 == 14) { - always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - } else { - assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Char)i2; - assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, - mktemp(Ity_I64, mkU64((ULong)op2)))); - if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), - guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto(binop(Iop_CmpLE32S, get_gpr_w1(r1), mkexpr(value)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); - } - } + return "brxle"; +} - return "cgij"; +static const HChar * +s390_irgen_BRXLG(UChar r1, UChar r3, UShort i2) +{ + IRTemp value = newTemp(Ity_I64); + + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto(binop(Iop_CmpLE64S, get_gpr_dw0(r1), mkexpr(value)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); + + return "brxlg"; } static const HChar * -s390_irgen_CH(UChar r1, IRTemp op2addr) +s390_irgen_CR(UChar 
r1, UChar r2) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + assign(op2, get_gpr_w1(r2)); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "ch"; + return "cr"; } static const HChar * -s390_irgen_CHY(UChar r1, IRTemp op2addr) +s390_irgen_CGR(UChar r1, UChar r2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); - assign(op1, get_gpr_w1(r1)); - assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "chy"; + return "cgr"; } static const HChar * -s390_irgen_CGH(UChar r1, IRTemp op2addr) +s390_irgen_CGFR(UChar r1, UChar r2) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkexpr(op2addr)))); + assign(op2, unop(Iop_32Sto64, get_gpr_w1(r2))); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "cgh"; + return "cgfr"; } static const HChar * -s390_irgen_CHI(UChar r1, UShort i2) +s390_irgen_C(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); - Int op2; + IRTemp op2 = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - op2 = (Int)(Short)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "chi"; + return "c"; } static const HChar * -s390_irgen_CGHI(UChar r1, UShort i2) +s390_irgen_CY(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "cy"; +} + +static const HChar * +s390_irgen_CG(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I64); - Long op2; + IRTemp op2 = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - op2 = (Long)(Short)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I64, - mkU64((ULong)op2))); + assign(op2, load(Ity_I64, mkexpr(op2addr))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "cghi"; + return "cg"; } static const HChar * -s390_irgen_CHHSI(UShort i2, IRTemp op1addr) +s390_irgen_CGF(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I16); - Short op2; + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); - assign(op1, load(Ity_I16, mkexpr(op1addr))); - op2 = (Short)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I16, - mkU16((UShort)op2))); + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkexpr(op2addr)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "chhsi"; + return "cgf"; } static const HChar * -s390_irgen_CHSI(UShort i2, IRTemp op1addr) +s390_irgen_CFI(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I32); Int op2; - assign(op1, load(Ity_I32, mkexpr(op1addr))); - op2 = (Int)(Short)i2; + assign(op1, get_gpr_w1(r1)); + op2 = (Int)i2; s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, mkU32((UInt)op2))); - return "chsi"; + return "cfi"; } static const HChar * -s390_irgen_CGHSI(UShort i2, IRTemp op1addr) +s390_irgen_CGFI(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I64); Long op2; - assign(op1, load(Ity_I64, mkexpr(op1addr))); 
- op2 = (Long)(Short)i2; + assign(op1, get_gpr_dw0(r1)); + op2 = (Long)(Int)i2; s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I64, mkU64((ULong)op2))); - return "cghsi"; + return "cgfi"; } static const HChar * -s390_irgen_CHRL(UChar r1, UInt i2) +s390_irgen_CRL(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkU64(guest_IA_curr_instr + - ((ULong)(Long)(Int)i2 << 1))))); + assign(op2, load(Ity_I32, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) + i2 << 1)))); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "chrl"; + return "crl"; } static const HChar * -s390_irgen_CGHRL(UChar r1, UInt i2) +s390_irgen_CGRL(UChar r1, UInt i2) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkU64(guest_IA_curr_instr + - ((ULong)(Long)(Int)i2 << 1))))); + assign(op2, load(Ity_I64, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) + i2 << 1)))); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "cghrl"; + return "cgrl"; } static const HChar * -s390_irgen_CHHR(UChar r1, UChar r2) +s390_irgen_CGFRL(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); - assign(op1, get_gpr_w0(r1)); - assign(op2, get_gpr_w0(r2)); + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkU64(guest_IA_curr_instr + + ((ULong)(Long)(Int)i2 << 1))))); s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "chhr"; + return "cgfrl"; } static const HChar * -s390_irgen_CHLR(UChar r1, UChar r2) +s390_irgen_CRB(UChar r1, UChar r2, UChar m3, IRTemp op4addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_w0(r1)); - assign(op2, get_gpr_w1(r2)); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto(mkexpr(op4addr)); + } else { + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), + mkU32(0)), mkexpr(op4addr)); + } + } - return "chlr"; + return "crb"; } static const HChar * -s390_irgen_CHF(UChar r1, IRTemp op2addr) +s390_irgen_CGRB(UChar r1, UChar r2, UChar m3, IRTemp op4addr) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_w0(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto(mkexpr(op4addr)); + } else { + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), + mkU32(0)), mkexpr(op4addr)); + } + } - return "chf"; + return "cgrb"; } static const HChar * -s390_irgen_CIH(UChar r1, UInt i2) +s390_irgen_CRJ(UChar r1, UChar r2, UShort i4, UChar m3) { IRTemp op1 = newTemp(Ity_I32); - Int op2; - - assign(op1, get_gpr_w0(r1)); - op2 = (Int)i2; - s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, - mkU32((UInt)op2))); + IRTemp op2 = newTemp(Ity_I32); + IRTemp 
cond = newTemp(Ity_I32); - return "cih"; + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto_and_chase( + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + } else { + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + + } + } + + return "crj"; } static const HChar * -s390_irgen_CLR(UChar r1, UChar r2) +s390_irgen_CGRJ(UChar r1, UChar r2, UShort i4, UChar m3) { - IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_w1(r1)); - assign(op2, get_gpr_w1(r2)); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto_and_chase( + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + } else { + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - return "clr"; + } + } + + return "cgrj"; } static const HChar * -s390_irgen_CLGR(UChar r1, UChar r2) +s390_irgen_CIB(UChar r1, UChar m3, UChar i2, IRTemp op4addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I32); + Int op2; + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - assign(op2, get_gpr_dw0(r2)); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto(mkexpr(op4addr)); + } else { + assign(op1, get_gpr_w1(r1)); + op2 = (Int)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I32, mkU32((UInt)op2)))); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + mkexpr(op4addr)); + } + } - return "clgr"; + return "cib"; } static const HChar * -s390_irgen_CLGFR(UChar r1, UChar r2) +s390_irgen_CGIB(UChar r1, UChar m3, UChar i2, IRTemp op4addr) { IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + Long op2; + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Uto64, get_gpr_w1(r2))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto(mkexpr(op4addr)); + } else { + assign(op1, get_gpr_dw0(r1)); + op2 = (Long)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I64, mkU64((ULong)op2)))); + if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + mkexpr(op4addr)); + } + } - return "clgfr"; + return "cgib"; } static const HChar * -s390_irgen_CL(UChar r1, IRTemp op2addr) +s390_irgen_CIJ(UChar r1, UChar m3, UShort i4, UChar i2) { IRTemp op1 = newTemp(Ity_I32); - IRTemp op2 = newTemp(Ity_I32); + Int op2; + IRTemp cond = newTemp(Ity_I32); - assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + } else { + assign(op1, get_gpr_w1(r1)); + op2 = (Int)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I32, 
mkU32((UInt)op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); - return "cl"; + } + } + + return "cij"; } static const HChar * -s390_irgen_CLY(UChar r1, IRTemp op2addr) +s390_irgen_CGIJ(UChar r1, UChar m3, UShort i4, UChar i2) +{ + IRTemp op1 = newTemp(Ity_I64); + Long op2; + IRTemp cond = newTemp(Ity_I32); + + if (m3 == 0) { + } else { + if (m3 == 14) { + always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + } else { + assign(op1, get_gpr_dw0(r1)); + op2 = (Long)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I64, mkU64((ULong)op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), + guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); + + } + } + + return "cgij"; +} + +static const HChar * +s390_irgen_CH(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I32); IRTemp op2 = newTemp(Ity_I32); assign(op1, get_gpr_w1(r1)); - assign(op2, load(Ity_I32, mkexpr(op2addr))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "cly"; + return "ch"; } static const HChar * -s390_irgen_CLG(UChar r1, IRTemp op2addr) +s390_irgen_CHY(UChar r1, IRTemp op2addr) { - IRTemp op1 = newTemp(Ity_I64); - IRTemp op2 = newTemp(Ity_I64); + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); - assign(op1, get_gpr_dw0(r1)); - assign(op2, load(Ity_I64, mkexpr(op2addr))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + assign(op1, get_gpr_w1(r1)); + assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkexpr(op2addr)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "clg"; + return "chy"; } static const HChar * -s390_irgen_CLGF(UChar r1, IRTemp op2addr) +s390_irgen_CGH(UChar r1, IRTemp op2addr) { IRTemp op1 = newTemp(Ity_I64); IRTemp op2 = newTemp(Ity_I64); assign(op1, get_gpr_dw0(r1)); - assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr)))); - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkexpr(op2addr)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); - return "clgf"; + return "cgh"; } static const HChar * -s390_irgen_CLFI(UChar r1, UInt i2) +s390_irgen_CHI(UChar r1, UShort i2) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + Int op2; assign(op1, get_gpr_w1(r1)); - op2 = i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I32, - mkU32(op2))); + op2 = (Int)(Short)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); - return "clfi"; + return "chi"; } static const HChar * -s390_irgen_CLGFI(UChar r1, UInt i2) +s390_irgen_CGHI(UChar r1, UShort i2) { IRTemp op1 = newTemp(Ity_I64); - ULong op2; + Long op2; assign(op1, get_gpr_dw0(r1)); - op2 = (ULong)i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I64, - mkU64(op2))); - - return "clgfi"; -} - -static const HChar * -s390_irgen_CLI(UChar i2, IRTemp op1addr) -{ - IRTemp op1 = newTemp(Ity_I8); - UChar op2; - - assign(op1, load(Ity_I8, mkexpr(op1addr))); - op2 = i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I8, - mkU8(op2))); + op2 = (Long)(Short)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I64, + mkU64((ULong)op2))); - return "cli"; + return "cghi"; } static const HChar * -s390_irgen_CLIY(UChar i2, IRTemp op1addr) 
+s390_irgen_CHHSI(UShort i2, IRTemp op1addr) { - IRTemp op1 = newTemp(Ity_I8); - UChar op2; + IRTemp op1 = newTemp(Ity_I16); + Short op2; - assign(op1, load(Ity_I8, mkexpr(op1addr))); - op2 = i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I8, - mkU8(op2))); + assign(op1, load(Ity_I16, mkexpr(op1addr))); + op2 = (Short)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I16, + mkU16((UShort)op2))); - return "cliy"; + return "chhsi"; } static const HChar * -s390_irgen_CLFHSI(UShort i2, IRTemp op1addr) +s390_irgen_CHSI(UShort i2, IRTemp op1addr) { IRTemp op1 = newTemp(Ity_I32); - UInt op2; + Int op2; assign(op1, load(Ity_I32, mkexpr(op1addr))); - op2 = (UInt)i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I32, - mkU32(op2))); + op2 = (Int)(Short)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); - return "clfhsi"; + return "chsi"; } static const HChar * -s390_irgen_CLGHSI(UShort i2, IRTemp op1addr) +s390_irgen_CGHSI(UShort i2, IRTemp op1addr) { IRTemp op1 = newTemp(Ity_I64); - ULong op2; + Long op2; assign(op1, load(Ity_I64, mkexpr(op1addr))); - op2 = (ULong)i2; - s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I64, - mkU64(op2))); + op2 = (Long)(Short)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I64, + mkU64((ULong)op2))); - return "clghsi"; + return "cghsi"; } static const HChar * -s390_irgen_CLHHSI(UShort i2, IRTemp op1addr) +s390_irgen_CHRL(UChar r1, UInt i2) { - IRTemp op1 = newTemp(Ity_I16); - UShort op2; + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); + assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkU64(guest_IA_curr_instr + + ((ULong)(Long)(Int)i2 << 1))))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "chrl"; +} + +static const HChar * +s390_irgen_CGHRL(UChar r1, UInt i2) +{ + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkU64(guest_IA_curr_instr + + ((ULong)(Long)(Int)i2 << 1))))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "cghrl"; +} + +static const HChar * +s390_irgen_CHHR(UChar r1, UChar r2) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w0(r1)); + assign(op2, get_gpr_w0(r2)); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "chhr"; +} + +static const HChar * +s390_irgen_CHLR(UChar r1, UChar r2) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w0(r1)); + assign(op2, get_gpr_w1(r2)); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "chlr"; +} + +static const HChar * +s390_irgen_CHF(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w0(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "chf"; +} + +static const HChar * +s390_irgen_CIH(UChar r1, UInt i2) +{ + IRTemp op1 = newTemp(Ity_I32); + Int op2; + + assign(op1, get_gpr_w0(r1)); + op2 = (Int)i2; + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, mktemp(Ity_I32, + mkU32((UInt)op2))); + + return "cih"; +} + +static const HChar * +s390_irgen_CLR(UChar r1, UChar r2) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + 
s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clr"; +} + +static const HChar * +s390_irgen_CLGR(UChar r1, UChar r2) +{ + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clgr"; +} + +static const HChar * +s390_irgen_CLGFR(UChar r1, UChar r2) +{ + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_32Uto64, get_gpr_w1(r2))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clgfr"; +} + +static const HChar * +s390_irgen_CL(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "cl"; +} + +static const HChar * +s390_irgen_CLY(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I32); + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); + assign(op2, load(Ity_I32, mkexpr(op2addr))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "cly"; +} + +static const HChar * +s390_irgen_CLG(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); + assign(op2, load(Ity_I64, mkexpr(op2addr))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clg"; +} + +static const HChar * +s390_irgen_CLGF(UChar r1, IRTemp op2addr) +{ + IRTemp op1 = newTemp(Ity_I64); + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); + assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr)))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clgf"; +} + +static const HChar * +s390_irgen_CLFI(UChar r1, UInt i2) +{ + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + + assign(op1, get_gpr_w1(r1)); + op2 = i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I32, + mkU32(op2))); + + return "clfi"; +} + +static const HChar * +s390_irgen_CLGFI(UChar r1, UInt i2) +{ + IRTemp op1 = newTemp(Ity_I64); + ULong op2; + + assign(op1, get_gpr_dw0(r1)); + op2 = (ULong)i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I64, + mkU64(op2))); + + return "clgfi"; +} + +static const HChar * +s390_irgen_CLI(UChar i2, IRTemp op1addr) +{ + IRTemp op1 = newTemp(Ity_I8); + UChar op2; + + assign(op1, load(Ity_I8, mkexpr(op1addr))); + op2 = i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I8, + mkU8(op2))); + + return "cli"; +} + +static const HChar * +s390_irgen_CLIY(UChar i2, IRTemp op1addr) +{ + IRTemp op1 = newTemp(Ity_I8); + UChar op2; + + assign(op1, load(Ity_I8, mkexpr(op1addr))); + op2 = i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I8, + mkU8(op2))); + + return "cliy"; +} + +static const HChar * +s390_irgen_CLFHSI(UShort i2, IRTemp op1addr) +{ + IRTemp op1 = newTemp(Ity_I32); + UInt op2; + + assign(op1, load(Ity_I32, mkexpr(op1addr))); + op2 = (UInt)i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I32, + mkU32(op2))); + + return "clfhsi"; +} + +static const HChar * +s390_irgen_CLGHSI(UShort i2, IRTemp op1addr) +{ + IRTemp op1 = newTemp(Ity_I64); + ULong op2; + + assign(op1, load(Ity_I64, mkexpr(op1addr))); + op2 = (ULong)i2; + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, mktemp(Ity_I64, + mkU64(op2))); + + 
return "clghsi"; +} + +static const HChar * +s390_irgen_CLHHSI(UShort i2, IRTemp op1addr) +{ + IRTemp op1 = newTemp(Ity_I16); + UShort op2; assign(op1, load(Ity_I16, mkexpr(op1addr))); op2 = i2; @@ -4985,6 +6251,214 @@ s390_irgen_CLGRB(UChar r1, UChar r2, UChar m3, IRTemp op4addr) return "clgrb"; } +/* Raise the appropriate signal for a compare-and-trap-instruction data + exception if the condition is true. */ +static void +s390_trap_on_condition(IRExpr *cond) +{ + stmt(IRStmt_Exit(cond, Ijk_SigFPE, IRConst_U64(guest_IA_next_instr), + S390X_GUEST_OFFSET(guest_IA))); +} + +/* Handle the various flavors of compare (logical) and trap. */ +static void +s390_irgen_CxRT(UChar m3, UChar r1, UChar r2, IRType type, UInt opc) +{ + IRExpr *cond; + + if (m3 == 0) { + /* Trap never (NOP) */ + return; + } else if (m3 == 14) { + /* Trap always */ + cond = IRExpr_Const(IRConst_U1 (True)); + } else { + IRTemp op1 = newTemp(type); + IRTemp op2 = newTemp(type); + + assign(op1, get_gpr_int(r1, type)); + assign(op2, get_gpr_int(r2, type)); + cond = binop(Iop_CmpNE32, + s390_call_calculate_icc(m3, opc, op1, op2), mkU32(0)); + } + s390_trap_on_condition(cond); +} + +static const HChar * +s390_irgen_CGRT(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_CxRT(m3, r1, r2, Ity_I64, S390_CC_OP_SIGNED_COMPARE); + return "cgrt"; +} + +static const HChar * +s390_irgen_CRT(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_CxRT(m3, r1, r2, Ity_I32, S390_CC_OP_SIGNED_COMPARE); + return "crt"; +} + +static const HChar * +s390_irgen_CLGRT(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_CxRT(m3, r1, r2, Ity_I64, S390_CC_OP_UNSIGNED_COMPARE); + return "clgrt"; +} + +static const HChar * +s390_irgen_CLRT(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_CxRT(m3, r1, r2, Ity_I32, S390_CC_OP_UNSIGNED_COMPARE); + return "clrt"; +} + +/* Handle the various flavors of compare (logical) immediate and trap. */ +static void +s390_irgen_CxIT(UChar m3, UChar r1, UShort i2, IRType type, UInt opc) +{ + IRExpr *cond; + + if (m3 == 0) { + /* Trap never (NOP) */ + return; + } else if (m3 == 14) { + /* Trap always */ + cond = IRExpr_Const(IRConst_U1 (True)); + } else { + IRTemp op1 = newTemp(type); + IRTemp op2 = newTemp(type); + + assign(op1, get_gpr_int(r1, type)); + if (opc == S390_CC_OP_SIGNED_COMPARE) { + assign(op2, type == Ity_I64 ? + mkU64((ULong)(Short)i2) : mkU32((UInt)(Short)i2)); + } else { + assign(op2, type == Ity_I64 ? + mkU64((ULong)i2) : mkU32((UInt)i2)); + } + cond = binop(Iop_CmpNE32, + s390_call_calculate_icc(m3, opc, op1, op2), mkU32(0)); + } + s390_trap_on_condition(cond); +} + +static const HChar * +s390_irgen_CGIT(UChar r1, UShort i2, UChar m3) +{ + s390_irgen_CxIT(m3, r1, i2, Ity_I64, S390_CC_OP_SIGNED_COMPARE); + return "cgit"; +} + +static const HChar * +s390_irgen_CIT(UChar r1, UShort i2, UChar m3) +{ + s390_irgen_CxIT(m3, r1, i2, Ity_I32, S390_CC_OP_SIGNED_COMPARE); + return "cit"; +} + +static const HChar * +s390_irgen_CLGIT(UChar r1, UShort i2, UChar m3) +{ + s390_irgen_CxIT(m3, r1, i2, Ity_I64, S390_CC_OP_UNSIGNED_COMPARE); + return "clgit"; +} + +static const HChar * +s390_irgen_CLFIT(UChar r1, UShort i2, UChar m3) +{ + s390_irgen_CxIT(m3, r1, i2, Ity_I32, S390_CC_OP_UNSIGNED_COMPARE); + return "clfit"; +} + +/* Handle the variants of compare logical and trap with memory operand. 
*/ +static void +s390_irgen_CLxT(UChar r1, UChar m3, IRTemp op2addr, IRType type, UInt opc) +{ + IRExpr *cond; + + if (m3 == 0) { + /* Trap never (NOP) */ + return; + } else if (m3 == 14) { + /* Trap always */ + cond = IRExpr_Const(IRConst_U1 (True)); + } else { + IRTemp op1 = newTemp(type); + IRTemp op2 = newTemp(type); + + assign(op1, get_gpr_int(r1, type)); + assign(op2, load(type, mkexpr(op2addr))); + cond = binop(Iop_CmpNE32, + s390_call_calculate_icc(m3, opc, op1, op2), mkU32(0)); + } + s390_trap_on_condition(cond); +} + +static const HChar * +s390_irgen_CLT(UChar r1, UChar m3, IRTemp op2addr) +{ + s390_irgen_CLxT(r1, m3, op2addr, Ity_I32, S390_CC_OP_UNSIGNED_COMPARE); + return "clt"; +} + +static const HChar * +s390_irgen_CLGT(UChar r1, UChar m3, IRTemp op2addr) +{ + s390_irgen_CLxT(r1, m3, op2addr, Ity_I64, S390_CC_OP_UNSIGNED_COMPARE); + return "clgt"; +} + +static const HChar * +s390_irgen_LAT(UChar r1, IRTemp op2addr) +{ + IRTemp val = newTemp(Ity_I32); + assign(val, load(Ity_I32, mkexpr(op2addr))); + put_gpr_w1(r1, mkexpr(val)); + s390_trap_on_condition(binop(Iop_CmpEQ32, mkexpr(val), mkU32(0))); + return "lat"; +} + +static const HChar * +s390_irgen_LGAT(UChar r1, IRTemp op2addr) +{ + IRTemp val = newTemp(Ity_I64); + assign(val, load(Ity_I64, mkexpr(op2addr))); + put_gpr_dw0(r1, mkexpr(val)); + s390_trap_on_condition(binop(Iop_CmpEQ64, mkexpr(val), mkU64(0))); + return "lgat"; +} + +static const HChar * +s390_irgen_LFHAT(UChar r1, IRTemp op2addr) +{ + IRTemp val = newTemp(Ity_I32); + assign(val, load(Ity_I32, mkexpr(op2addr))); + put_gpr_w0(r1, mkexpr(val)); + s390_trap_on_condition(binop(Iop_CmpEQ32, mkexpr(val), mkU32(0))); + return "lfhat"; +} + +static const HChar * +s390_irgen_LLGFAT(UChar r1, IRTemp op2addr) +{ + IRTemp val = newTemp(Ity_I64); + assign(val, unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr)))); + put_gpr_dw0(r1, mkexpr(val)); + s390_trap_on_condition(binop(Iop_CmpEQ64, mkexpr(val), mkU64(0))); + return "llgfat"; +} + +static const HChar * +s390_irgen_LLGTAT(UChar r1, IRTemp op2addr) +{ + IRTemp val = newTemp(Ity_I64); + assign(val, binop(Iop_And64, mkU64(0x7fffffff), + unop(Iop_32Uto64, load(Ity_I32, mkexpr(op2addr))))); + put_gpr_dw0(r1, mkexpr(val)); + s390_trap_on_condition(binop(Iop_CmpEQ64, mkexpr(val), mkU64(0))); + return "llgtat"; +} + static const HChar * s390_irgen_CLRJ(UChar r1, UChar r2, UShort i4, UChar m3) { @@ -11030,7 +12504,7 @@ s390_irgen_CLCL(UChar r1, UChar r2) assign(pad, get_gpr_b4(r2 + 1)); /* len1 == 0 and len2 == 0? Exit */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ32, binop(Iop_Or32, mkexpr(len1), mkexpr(len2)), mkU32(0))); @@ -11106,7 +12580,7 @@ s390_irgen_CLCLE(UChar r1, UChar r3, IRTemp pad2) assign(len3, get_gpr_dw0(r3 + 1)); /* len1 == 0 and len3 == 0? Exit */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64,binop(Iop_Or64, mkexpr(len1), mkexpr(len3)), mkU64(0))); @@ -11271,6 +12745,7 @@ s390_irgen_EX_SS(UChar r, IRTemp addr2, IRTemp cond; IRDirty *d; IRTemp torun; + unsigned long ovl; IRTemp start1 = newTemp(Ity_I64); IRTemp start2 = newTemp(Ity_I64); @@ -11293,19 +12768,20 @@ s390_irgen_EX_SS(UChar r, IRTemp addr2, stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_CMLEN), mkU64(4))); restart_if(mkexpr(cond)); - assign(start1, binop(Iop_Add64, mkU64(SS_d1(last_execute_target)), - SS_b1(last_execute_target) != 0 ? get_gpr_dw0(SS_b1(last_execute_target)) : mkU64(0))); - assign(start2, binop(Iop_Add64, mkU64(SS_d2(last_execute_target)), - SS_b2(last_execute_target) != 0 ? 
get_gpr_dw0(SS_b2(last_execute_target)) : mkU64(0))); + ovl = last_execute_target; + assign(start1, binop(Iop_Add64, mkU64(SS_d1(ovl)), + SS_b1(ovl) != 0 ? get_gpr_dw0(SS_b1(ovl)) : mkU64(0))); + assign(start2, binop(Iop_Add64, mkU64(SS_d2(ovl)), + SS_b2(ovl) != 0 ? get_gpr_dw0(SS_b2(ovl)) : mkU64(0))); assign(len, unop(lensize == 64 ? Iop_8Uto64 : Iop_8Uto32, binop(Iop_Or8, - r != 0 ? get_gpr_b7(r): mkU8(0), mkU8(SS_l(last_execute_target))))); + r != 0 ? get_gpr_b7(r): mkU8(0), mkU8(SS_l(ovl))))); irgen(len, start1, start2); last_execute_target = 0; } static const HChar * -s390_irgen_EX(UChar r1, IRTemp addr2, VexEndness host_endness) +s390_irgen_EX(UChar r1, IRTemp addr2) { switch(last_execute_target & 0xff00000000000000ULL) { case 0: @@ -11404,7 +12880,7 @@ s390_irgen_EX(UChar r1, IRTemp addr2, VexEndness host_endness) /* Now comes the actual translation */ bytes = (UChar *) &last_execute_target; s390_decode_and_irgen(bytes, ((((bytes[0] >> 6) + 1) >> 1) + 1) << 1, - dis_res, host_endness); + dis_res); if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) vex_printf(" which was executed by\n"); /* dont make useless translations in the next execute */ @@ -11414,58 +12890,29 @@ s390_irgen_EX(UChar r1, IRTemp addr2, VexEndness host_endness) return "ex"; } -static const HChar * -s390_irgen_EX_BE(UChar r1, IRTemp addr2) -{ - return s390_irgen_EX(r1, addr2, VexEndnessBE); -} - -static const HChar * -s390_irgen_EX_LE(UChar r1, IRTemp addr2) -{ - return s390_irgen_EX(r1, addr2, VexEndnessLE); -} - static const UChar *exrl_bytes; static const HChar * -s390_irgen_EXRL(UChar r1, UInt offset, VexEndness host_endness) +s390_irgen_EXRL(UChar r1, UInt offset) { const UChar *exrl_target; IRTemp addr = newTemp(Ity_I64); + Addr64 bytes_addr = guest_IA_curr_instr + offset * 2UL; /* we might save one round trip because we know the target */ if (!last_execute_target) { exrl_target = exrl_bytes + offset * 2UL; - if (host_endness == VexEndnessBE) - last_execute_target = *(ULong *)exrl_target; - else { - ((UChar *)&last_execute_target)[0] = exrl_target[7]; - ((UChar *)&last_execute_target)[1] = exrl_target[6]; - ((UChar *)&last_execute_target)[2] = exrl_target[5]; - ((UChar *)&last_execute_target)[3] = exrl_target[4]; - ((UChar *)&last_execute_target)[4] = exrl_target[3]; - ((UChar *)&last_execute_target)[5] = exrl_target[2]; - ((UChar *)&last_execute_target)[6] = exrl_target[1]; - ((UChar *)&last_execute_target)[7] = exrl_target[0]; - } + last_execute_target = ((ULong)exrl_target[0] << 56) | + ((ULong)exrl_target[1] << 48) | + ((ULong)exrl_target[2] << 40) | + ((ULong)exrl_target[3] << 32) | + ((ULong)exrl_target[4] << 24) | + ((ULong)exrl_target[5] << 16); } - assign(addr, mkU64(guest_IA_curr_instr + offset * 2UL)); - s390_irgen_EX(r1, addr, host_endness); + assign(addr, mkU64(bytes_addr)); + s390_irgen_EX(r1, addr); return "exrl"; } -static const HChar * -s390_irgen_EXRL_BE(UChar r1, UInt offset) -{ - return s390_irgen_EXRL(r1, offset, VexEndnessBE); -} - -static const HChar * -s390_irgen_EXRL_LE(UChar r1, UInt offset) -{ - return s390_irgen_EXRL(r1, offset, VexEndnessLE); -} - static const HChar * s390_irgen_IPM(UChar r1) { @@ -11493,7 +12940,7 @@ s390_irgen_SRST(UChar r1, UChar r2) put_counter_dw0(mkU64(0)); // start = next? 
CC=2 and out r1 and r2 unchanged - s390_cc_set(2); + s390_cc_set_val(2); put_gpr_dw0(r2, binop(Iop_Sub64, mkexpr(address), mkexpr(counter))); next_insn_if(binop(Iop_CmpEQ64, mkexpr(address), mkexpr(next))); @@ -11501,7 +12948,7 @@ s390_irgen_SRST(UChar r1, UChar r2) assign(delim, get_gpr_b7(0)); // byte = delim? CC=1, R1=address - s390_cc_set(1); + s390_cc_set_val(1); put_gpr_dw0(r1, mkexpr(address)); next_insn_if(binop(Iop_CmpEQ8, mkexpr(delim), mkexpr(byte))); @@ -11534,7 +12981,7 @@ s390_irgen_CLST(UChar r1, UChar r2) assign(byte2, load(Ity_I8, mkexpr(address2))); // end in both? all equal, reset r1 and r2 to start values - s390_cc_set(0); + s390_cc_set_val(0); put_gpr_dw0(r1, binop(Iop_Sub64, mkexpr(address1), mkexpr(counter))); put_gpr_dw0(r2, binop(Iop_Sub64, mkexpr(address2), mkexpr(counter))); next_insn_if(binop(Iop_CmpEQ8, mkU8(0), @@ -11546,20 +12993,20 @@ s390_irgen_CLST(UChar r1, UChar r2) put_gpr_dw0(r2, mkexpr(address2)); // End found in string1 - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(end), mkexpr(byte1))); // End found in string2 - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ8, mkexpr(end), mkexpr(byte2))); // string1 < string2 - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT32U, unop(Iop_8Uto32, mkexpr(byte1)), unop(Iop_8Uto32, mkexpr(byte2)))); // string2 < string1 - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpLT32U, unop(Iop_8Uto32, mkexpr(byte2)), unop(Iop_8Uto32, mkexpr(byte1)))); @@ -11885,7 +13332,7 @@ s390_irgen_MVCL(UChar r1, UChar r2) /* Check for destructive overlap: addr1 > addr2 && addr2 + len1 > addr1 && (addr2 + len2) > addr1 */ - s390_cc_set(3); + s390_cc_set_val(3); IRTemp cond1 = newTemp(Ity_I32); assign(cond1, unop(Iop_1Uto32, binop(Iop_CmpLT64U, mkexpr(addr2), mkexpr(addr1)))); @@ -12015,7 +13462,7 @@ s390_irgen_MVST(UChar r1, UChar r2) iterate_if(binop(Iop_CmpNE8, mkexpr(end), mkexpr(byte))); // and always set cc=1 at the end + update r1 - s390_cc_set(1); + s390_cc_set_val(1); put_gpr_dw0(r1, binop(Iop_Add64, mkexpr(addr1), mkexpr(counter))); put_counter_dw0(mkU64(0)); @@ -13370,8 +14817,7 @@ s390_irgen_STCK(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 8; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stck"; } @@ -13390,8 +14836,7 @@ s390_irgen_STCKF(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 8; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); } return "stckf"; } @@ -13409,8 +14854,7 @@ s390_irgen_STCKE(IRTemp op2addr) d->mAddr = mkexpr(op2addr); d->mSize = 16; stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), - mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stcke"; } @@ -13444,7 +14888,7 @@ s390_irgen_STFLE(IRTemp op2addr) stmt(IRStmt_Dirty(d)); - s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0)); + s390_cc_set(cc); return "stfle"; } @@ -13467,7 +14911,7 @@ s390_irgen_CKSM(UChar r1,UChar r2) assign(len, get_gpr_dw0(r2+1)); /* Condition code is always zero. */ - s390_cc_set(0); + s390_cc_set_val(0); /* If length is zero, there is no need to calculate the checksum */ next_insn_if(binop(Iop_CmpEQ64, mkexpr(len), mkU64(0))); @@ -13534,7 +14978,7 @@ s390_irgen_TROO(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? 
We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -13546,7 +14990,7 @@ s390_irgen_TROO(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I8, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -13581,7 +15025,7 @@ s390_irgen_TRTO(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -13594,7 +15038,7 @@ s390_irgen_TRTO(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I8, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -13629,7 +15073,7 @@ s390_irgen_TROT(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -13641,7 +15085,7 @@ s390_irgen_TROT(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I16, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ16, mkexpr(op1), mkexpr(test_byte))); } store(get_gpr_dw0(r1), mkexpr(op1)); @@ -13676,7 +15120,7 @@ s390_irgen_TRTT(UChar m3, UChar r1, UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string, index translation table and @@ -13688,7 +15132,7 @@ s390_irgen_TRTT(UChar m3, UChar r1, UChar r2) assign(op1, load(Ity_I16, mkexpr(result))); if (! s390_host_has_etf2 || (m3 & 0x1) == 0) { - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ16, mkexpr(op1), mkexpr(test_byte))); } @@ -13733,13 +15177,13 @@ s390_irgen_TRE(UChar r1,UChar r2) IRTemp result = newTemp(Ity_I64); /* End of source string? We're done; proceed to next insn */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpEQ64, mkexpr(src_len), mkU64(0))); /* Load character from source string and compare with test byte */ assign(op, load(Ity_I8, mkexpr(src_addr))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpEQ8, mkexpr(op), mkexpr(test_byte))); assign(result, binop(Iop_Add64, unop(Iop_8Uto64, mkexpr(op)), @@ -13786,7 +15230,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) /* We're processing the 2nd operand 2 bytes at a time. Therefore, if there are less than 2 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(2))); /* There are at least two bytes there. Read them. 
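   (CU21 converts UTF-16 to UTF-8, so one code unit of the 2nd operand is
   two bytes; the low-surrogate handling for 4-byte characters follows
   further down.)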
*/ @@ -13832,7 +15276,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) IRExpr *invalid_low_surrogate = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_low_surrogate, mkU64(1))); } @@ -13841,7 +15285,7 @@ s390_irgen_CU21(UChar m3, UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -13913,7 +15357,7 @@ s390_irgen_CU24(UChar m3, UChar r1, UChar r2) /* We're processing the 2nd operand 2 bytes at a time. Therefore, if there are less than 2 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(2))); /* There are at least two bytes there. Read them. */ @@ -13960,12 +15404,12 @@ s390_irgen_CU24(UChar m3, UChar r1, UChar r2) IRExpr *invalid_low_surrogate = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_low_surrogate, mkU64(1))); } /* Now test whether the 1st operand is exhausted */ - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkU64(4))); /* Extract the bytes to be stored at addr1 */ @@ -14020,7 +15464,7 @@ s390_irgen_CU42(UChar r1, UChar r2) /* We're processing the 2nd operand 4 bytes at a time. Therefore, if there are less than 4 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(4))); /* Read the 2nd operand. */ @@ -14035,7 +15479,7 @@ s390_irgen_CU42(UChar r1, UChar r2) cc=2 outranks cc=1 (1st operand exhausted) */ IRExpr *invalid_character = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_character, mkU64(1))); /* Now test whether the 1st operand is exhausted */ @@ -14043,7 +15487,7 @@ s390_irgen_CU42(UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -14114,7 +15558,7 @@ s390_irgen_CU41(UChar r1, UChar r2) /* We're processing the 2nd operand 4 bytes at a time. Therefore, if there are less than 4 bytes left, then the 2nd operand is exhausted and we're done here. cc = 0 */ - s390_cc_set(0); + s390_cc_set_val(0); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(4))); /* Read the 2nd operand. 
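   (One UTF-32 character, i.e. four bytes; CU41 converts UTF-32 to UTF-8.)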
*/ @@ -14129,7 +15573,7 @@ s390_irgen_CU41(UChar r1, UChar r2) cc=2 outranks cc=1 (1st operand exhausted) */ IRExpr *invalid_character = binop(Iop_And64, mkexpr(retval), mkU64(0xff)); - s390_cc_set(2); + s390_cc_set_val(2); next_insn_if(binop(Iop_CmpEQ64, invalid_character, mkU64(1))); /* Now test whether the 1st operand is exhausted */ @@ -14137,7 +15581,7 @@ s390_irgen_CU41(UChar r1, UChar r2) assign(num_bytes, binop(Iop_And64, binop(Iop_Shr64, mkexpr(retval), mkU8(8)), mkU64(0xff))); - s390_cc_set(1); + s390_cc_set_val(1); next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); /* Extract the bytes to be stored at addr1 */ @@ -14186,209 +15630,3127 @@ s390_call_cu12_cu14_helper1(IRExpr *byte1, IRExpr *etf3_and_m3_is_1) /* Nothing is excluded from definedness checking. */ call->Iex.CCall.cee->mcx_mask = 0; - return call; + return call; +} + +static IRExpr * +s390_call_cu12_helper2(IRExpr *byte1, IRExpr *byte2, IRExpr *byte3, + IRExpr *byte4, IRExpr *stuff) +{ + IRExpr **args, *call; + args = mkIRExprVec_5(byte1, byte2, byte3, byte4, stuff); + call = mkIRExprCCall(Ity_I64, 0 /*regparm*/, + "s390_do_cu12_helper2", &s390_do_cu12_helper2, args); + + /* Nothing is excluded from definedness checking. */ + call->Iex.CCall.cee->mcx_mask = 0; + + return call; +} + +static IRExpr * +s390_call_cu14_helper2(IRExpr *byte1, IRExpr *byte2, IRExpr *byte3, + IRExpr *byte4, IRExpr *stuff) +{ + IRExpr **args, *call; + args = mkIRExprVec_5(byte1, byte2, byte3, byte4, stuff); + call = mkIRExprCCall(Ity_I64, 0 /*regparm*/, + "s390_do_cu14_helper2", &s390_do_cu14_helper2, args); + + /* Nothing is excluded from definedness checking. */ + call->Iex.CCall.cee->mcx_mask = 0; + + return call; +} + +static void +s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) +{ + IRTemp addr1 = newTemp(Ity_I64); + IRTemp addr2 = newTemp(Ity_I64); + IRTemp len1 = newTemp(Ity_I64); + IRTemp len2 = newTemp(Ity_I64); + + assign(addr1, get_gpr_dw0(r1)); + assign(addr2, get_gpr_dw0(r2)); + assign(len1, get_gpr_dw0(r1 + 1)); + assign(len2, get_gpr_dw0(r2 + 1)); + + UInt extended_checking = s390_host_has_etf3 && (m3 & 0x1) == 1; + + /* We're processing the 2nd operand 1 byte at a time. Therefore, if + there is less than 1 byte left, then the 2nd operand is exhausted + and we're done here. cc = 0 */ + s390_cc_set_val(0); + next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(1))); + + /* There is at least one byte there. Read it. */ + IRTemp byte1 = newTemp(Ity_I64); + assign(byte1, unop(Iop_8Uto64, load(Ity_I8, mkexpr(addr2)))); + + /* Call the helper to get number of bytes and invalid byte indicator */ + IRTemp retval1 = newTemp(Ity_I64); + assign(retval1, s390_call_cu12_cu14_helper1(mkexpr(byte1), + mkU64(extended_checking))); + + /* Check for invalid 1st byte */ + IRExpr *is_invalid = unop(Iop_64to1, mkexpr(retval1)); + s390_cc_set_val(2); + next_insn_if(is_invalid); + + /* How many bytes do we have to read? 
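+      Judging by its use below, helper1 returns the length of the UTF-8
+      sequence (1 to 4 bytes) in bits 8 and up of its return value; bit 0,
+      tested above, flags an invalid first byte.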
*/ + IRTemp num_src_bytes = newTemp(Ity_I64); + assign(num_src_bytes, binop(Iop_Shr64, mkexpr(retval1), mkU8(8))); + + /* Now test whether the 2nd operand is exhausted */ + s390_cc_set_val(0); + next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkexpr(num_src_bytes))); + + /* Read the remaining bytes */ + IRExpr *cond, *addr, *byte2, *byte3, *byte4; + + cond = binop(Iop_CmpLE64U, mkU64(2), mkexpr(num_src_bytes)); + addr = binop(Iop_Add64, mkexpr(addr2), mkU64(1)); + byte2 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); + cond = binop(Iop_CmpLE64U, mkU64(3), mkexpr(num_src_bytes)); + addr = binop(Iop_Add64, mkexpr(addr2), mkU64(2)); + byte3 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); + cond = binop(Iop_CmpLE64U, mkU64(4), mkexpr(num_src_bytes)); + addr = binop(Iop_Add64, mkexpr(addr2), mkU64(3)); + byte4 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); + + /* Call the helper to get the converted value and invalid byte indicator. + We can pass at most 5 arguments; therefore some encoding is needed + here */ + IRExpr *stuff = binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(num_src_bytes), mkU8(1)), + mkU64(extended_checking)); + IRTemp retval2 = newTemp(Ity_I64); + + if (is_cu12) { + assign(retval2, s390_call_cu12_helper2(mkexpr(byte1), byte2, byte3, + byte4, stuff)); + } else { + assign(retval2, s390_call_cu14_helper2(mkexpr(byte1), byte2, byte3, + byte4, stuff)); + } + + /* Check for invalid character */ + s390_cc_set_val(2); + is_invalid = unop(Iop_64to1, mkexpr(retval2)); + next_insn_if(is_invalid); + + /* Now test whether the 1st operand is exhausted */ + IRTemp num_bytes = newTemp(Ity_I64); + assign(num_bytes, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(retval2), mkU8(8)), + mkU64(0xff))); + s390_cc_set_val(1); + next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); + + /* Extract the bytes to be stored at addr1 */ + IRTemp data = newTemp(Ity_I64); + assign(data, binop(Iop_Shr64, mkexpr(retval2), mkU8(16))); + + if (is_cu12) { + /* To store the bytes construct 2 dirty helper calls. The helper calls + are guarded (num_bytes == 2 and num_bytes == 4, respectively) such + that only one of them will be called at runtime. 
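+         A UTF-8 character yields either one UTF-16 code unit (2 bytes) or
+         a surrogate pair (4 bytes), which is presumably why num_bytes == 3
+         cannot occur and the loop below skips it; guarding per size also
+         gives each call an exact mSize annotation.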
*/ + + Int i; + for (i = 2; i <= 4; ++i) { + IRDirty *d; + + if (i == 3) continue; // skip this one + + d = unsafeIRDirty_0_N(0 /* regparms */, "s390x_dirtyhelper_CUxy", + &s390x_dirtyhelper_CUxy, + mkIRExprVec_3(mkexpr(addr1), mkexpr(data), + mkexpr(num_bytes))); + d->guard = binop(Iop_CmpEQ64, mkexpr(num_bytes), mkU64(i)); + d->mFx = Ifx_Write; + d->mAddr = mkexpr(addr1); + d->mSize = i; + stmt(IRStmt_Dirty(d)); + } + } else { + // cu14 + store(mkexpr(addr1), unop(Iop_64to32, mkexpr(data))); + } + + /* Update source address and length */ + put_gpr_dw0(r2, binop(Iop_Add64, mkexpr(addr2), mkexpr(num_src_bytes))); + put_gpr_dw0(r2 + 1, binop(Iop_Sub64, mkexpr(len2), mkexpr(num_src_bytes))); + + /* Update destination address and length */ + put_gpr_dw0(r1, binop(Iop_Add64, mkexpr(addr1), mkexpr(num_bytes))); + put_gpr_dw0(r1 + 1, binop(Iop_Sub64, mkexpr(len1), mkexpr(num_bytes))); + + iterate(); +} + +static const HChar * +s390_irgen_CU12(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_cu12_cu14(m3, r1, r2, /* is_cu12 = */ 1); + + return "cu12"; +} + +static const HChar * +s390_irgen_CU14(UChar m3, UChar r1, UChar r2) +{ + s390_irgen_cu12_cu14(m3, r1, r2, /* is_cu12 = */ 0); + + return "cu14"; +} + +static IRExpr * +s390_call_ecag(IRExpr *op2addr) +{ + IRExpr **args, *call; + + args = mkIRExprVec_1(op2addr); + call = mkIRExprCCall(Ity_I64, 0 /*regparm*/, + "s390_do_ecag", &s390_do_ecag, args); + + /* Nothing is excluded from definedness checking. */ + call->Iex.CCall.cee->mcx_mask = 0; + + return call; +} + +static const HChar * +s390_irgen_ECAG(UChar r1, UChar r3 __attribute__((unused)), IRTemp op2addr) +{ + if (! s390_host_has_gie) { + emulation_failure(EmFail_S390X_ecag); + } else { + put_gpr_dw0(r1, s390_call_ecag(mkexpr(op2addr))); + } + + return "ecag"; +} + +static const HChar * +s390_irgen_VL(UChar v1, IRTemp op2addr) +{ + put_vr_qw(v1, load(Ity_V128, mkexpr(op2addr))); + + return "vl"; +} + +static const HChar * +s390_irgen_VLR(UChar v1, UChar v2) +{ + put_vr_qw(v1, get_vr_qw(v2)); + + return "vlr"; +} + +static const HChar * +s390_irgen_VST(UChar v1, IRTemp op2addr) +{ + store(mkexpr(op2addr), get_vr_qw(v1)); + + return "vst"; +} + +static const HChar * +s390_irgen_VLREP(UChar v1, IRTemp op2addr, UChar m3) +{ + IRType o2type = s390_vr_get_type(m3); + IRExpr* o2 = load(o2type, mkexpr(op2addr)); + s390_vr_fill(v1, o2); + return "vlrep"; +} + +static const HChar * +s390_irgen_VLEB(UChar v1, IRTemp op2addr, UChar m3) +{ + IRExpr* o2 = load(Ity_I8, mkexpr(op2addr)); + put_vr(v1, Ity_I8, m3, o2); + + return "vleb"; +} + +static const HChar * +s390_irgen_VLEH(UChar v1, IRTemp op2addr, UChar m3) +{ + IRExpr* o2 = load(Ity_I16, mkexpr(op2addr)); + put_vr(v1, Ity_I16, m3, o2); + + return "vleh"; +} + +static const HChar * +s390_irgen_VLEF(UChar v1, IRTemp op2addr, UChar m3) +{ + IRExpr* o2 = load(Ity_I32, mkexpr(op2addr)); + put_vr(v1, Ity_I32, m3, o2); + + return "vlef"; +} + +static const HChar * +s390_irgen_VLEG(UChar v1, IRTemp op2addr, UChar m3) +{ + IRExpr* o2 = load(Ity_I64, mkexpr(op2addr)); + put_vr(v1, Ity_I64, m3, o2); + + return "vleg"; +} + +static const HChar * +s390_irgen_VLEIB(UChar v1, UShort i2, UChar m3) +{ + IRExpr* o2 = unop(Iop_16to8, mkU16(i2)); + put_vr(v1, Ity_I8, m3, o2); + + return "vleib"; +} + +static const HChar * +s390_irgen_VLEIH(UChar v1, UShort i2, UChar m3) +{ + IRExpr* o2 = mkU16(i2); + put_vr(v1, Ity_I16, m3, o2); + + return "vleih"; +} + +static const HChar * +s390_irgen_VLEIF(UChar v1, UShort i2, UChar m3) +{ + IRExpr* o2 = unop(Iop_16Sto32, mkU16(i2)); + 
put_vr(v1, Ity_I32, m3, o2); + + return "vleif"; +} + +static const HChar * +s390_irgen_VLEIG(UChar v1, UShort i2, UChar m3) +{ + IRExpr* o2 = unop(Iop_16Sto64, mkU16(i2)); + put_vr(v1, Ity_I64, m3, o2); + + return "vleig"; +} + +static const HChar * +s390_irgen_VLGV(UChar r1, IRTemp op2addr, UChar v3, UChar m4) +{ + IRType o2type = s390_vr_get_type(m4); + IRExpr* index = unop(Iop_64to8, binop(Iop_And64, mkexpr(op2addr), mkU64(0xf))); + IRExpr* o2; + IRExpr* result; + switch (o2type) { + case Ity_I8: + o2 = binop(Iop_GetElem8x16, get_vr_qw(v3), index); + result = unop(Iop_8Uto64, o2); + break; + case Ity_I16: + o2 = binop(Iop_GetElem16x8, get_vr_qw(v3), index); + result = unop(Iop_16Uto64, o2); + break; + case Ity_I32: + o2 = binop(Iop_GetElem32x4, get_vr_qw(v3), index); + result = unop(Iop_32Uto64, o2); + break; + case Ity_I64: + result = binop(Iop_GetElem64x2, get_vr_qw(v3), index); + break; + default: + ppIRType(o2type); + vpanic("s390_irgen_VLGV: unknown o2type"); + } + + put_gpr_dw0(r1, result); + return "vlgv"; +} + +static const HChar * +s390_irgen_VGBM(UChar v1, UShort i2, UChar m3 __attribute__((unused))) +{ + put_vr_qw(v1, IRExpr_Const(IRConst_V128(i2))); + + return "vgbm"; +} + +static const HChar * +s390_irgen_VGM(UChar v1, UShort i2, UChar m3) +{ + UChar from = (i2 & 0xff00) >> 8; + UChar to = (i2 & 0x00ff); + ULong value = 0UL; + IRType type = s390_vr_get_type(m3); + vassert(from <= to); + + UChar maxIndex = 0; + switch (type) { + case Ity_I8: + maxIndex = 7; + break; + case Ity_I16: + maxIndex = 15; + break; + case Ity_I32: + maxIndex = 31; + break; + case Ity_I64: + maxIndex = 63; + break; + default: + vpanic("s390_irgen_VGM: unknown type"); + } + + for(UChar index = from; index <= to; index++) { + value |= (1ULL << (maxIndex - index)); + } + + IRExpr *fillValue; + switch (type) { + case Ity_I8: + fillValue = mkU8(value); + break; + case Ity_I16: + fillValue = mkU16(value); + break; + case Ity_I32: + fillValue = mkU32(value); + break; + case Ity_I64: + fillValue = mkU64(value); + break; + default: + vpanic("s390_irgen_VGM: unknown type"); + } + + s390_vr_fill(v1, fillValue); + return "vgm"; +} + +static const HChar * +s390_irgen_VLLEZ(UChar v1, IRTemp op2addr, UChar m3) +{ + IRType type = s390_vr_get_type(m3); + IRExpr* op2 = load(type, mkexpr(op2addr)); + IRExpr* op2as64bit; + switch (type) { + case Ity_I8: + op2as64bit = unop(Iop_8Uto64, op2); + break; + case Ity_I16: + op2as64bit = unop(Iop_16Uto64, op2); + break; + case Ity_I32: + op2as64bit = unop(Iop_32Uto64, op2); + break; + case Ity_I64: + op2as64bit = op2; + break; + default: + vpanic("s390_irgen_VLLEZ: unknown type"); + } + + put_vr_dw0(v1, op2as64bit); + put_vr_dw1(v1, mkU64(0)); + return "vllez"; +} + +static const HChar * +s390_irgen_VGEF(UChar v1, IRTemp op2addr, UChar m3) +{ + put_vr(v1, Ity_I32, m3, load(Ity_I32, mkexpr(op2addr))); + return "vgef"; +} + +static const HChar * +s390_irgen_VGEG(UChar v1, IRTemp op2addr, UChar m3) +{ + put_vr(v1, Ity_I64, m3, load(Ity_I64, mkexpr(op2addr))); + return "vgeg"; +} + +static const HChar * +s390_irgen_VLM(UChar v1, IRTemp op2addr, UChar v3) +{ + IRExpr* current = mkexpr(op2addr); + vassert(v3 >= v1); + vassert(v3 - v1 <= 16); + + for(UChar vr = v1; vr <= v3; vr++) { + IRExpr* next = binop(Iop_Add64, current, mkU64(16)); + put_vr_qw(vr, load(Ity_V128, current)); + current = next; + } + + return "vlm"; +} + +static const HChar * +s390_irgen_VLVGP(UChar v1, UChar r2, UChar r3) +{ + put_vr_qw(v1, binop(Iop_64HLtoV128, get_gpr_dw0(r2), get_gpr_dw0(r3))); + + return 
"vlvgp"; +} + +static const HChar * +s390_irgen_VLVG(UChar v1, IRTemp op2addr, UChar r3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* index = unop(Iop_64to8, mkexpr(op2addr)); + IRExpr* vr = get_vr_qw(v1); + IRExpr* operand; + switch (type) { + case Ity_I8: + operand = unop(Iop_64to8, get_gpr_dw0(r3)); + put_vr_qw(v1, triop(Iop_SetElem8x16, vr, index, operand)); + break; + case Ity_I16: + operand = unop(Iop_64to16, get_gpr_dw0(r3)); + put_vr_qw(v1, triop(Iop_SetElem16x8, vr, index, operand)); + break; + case Ity_I32: + operand = unop(Iop_64to32, get_gpr_dw0(r3)); + put_vr_qw(v1, triop(Iop_SetElem32x4, vr, index, operand)); + break; + case Ity_I64: + operand = get_gpr_dw0(r3); + put_vr_qw(v1, triop(Iop_SetElem64x2, vr, index, operand)); + break; + default: + vpanic("s390_irgen_VLVG: unknown type"); + } + + return "vlvg"; +} + +static const HChar * +s390_irgen_VMRH(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8, + Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmrh"; +} + +static const HChar * +s390_irgen_VMRL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8, + Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmrl"; +} + +static const HChar * +s390_irgen_VPK(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_NarrowBin16to8x16, Iop_NarrowBin32to16x8, + Iop_NarrowBin64to32x4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); + return "vpk"; +} + +static const HChar * +s390_irgen_VPERM(UChar v1, UChar v2, UChar v3, UChar v4) +{ + put_vr_qw(v1, triop(Iop_Perm8x16x2, + get_vr_qw(v2), get_vr_qw(v3), get_vr_qw(v4))); + + return "vperm"; +} + +static const HChar * +s390_irgen_VSCEF(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I32, m3)); + return "vscef"; +} + +static const HChar * +s390_irgen_VSCEG(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I64, m3)); + return "vsceg"; +} + +static const HChar * +s390_irgen_VPDI(UChar v1, UChar v2, UChar v3, UChar m4) +{ + /* Theese bits are reserved by specification */ + vassert((m4 & 2) == 0); + vassert((m4 & 8) == 0); + + if((m4 & 4) != 0) + put_vr_dw0(v1, get_vr_dw1(v2)); + else + put_vr_dw0(v1, get_vr_dw0(v2)); + + if((m4 & 1) != 0) + put_vr_dw1(v1, get_vr_dw1(v3)); + else + put_vr_dw1(v1, get_vr_dw0(v3)); + + return "vpdi"; +} + +static const HChar * +s390_irgen_VSEG(UChar v1, UChar v2, UChar m3) +{ + IRType type = s390_vr_get_type(m3); + switch(type) { + case Ity_I8: + put_vr_dw0(v1, unop(Iop_8Sto64, get_vr_b7(v2))); + put_vr_dw1(v1, unop(Iop_8Sto64, get_vr_b15(v2))); + break; + case Ity_I16: + put_vr_dw0(v1, unop(Iop_16Sto64, get_vr_hw3(v2))); + put_vr_dw1(v1, unop(Iop_16Sto64, get_vr_hw7(v2))); + break; + case Ity_I32: + put_vr_dw0(v1, unop(Iop_32Sto64, get_vr_w1(v2))); + put_vr_dw1(v1, unop(Iop_32Sto64, get_vr_w3(v2))); + break; + default: + ppIRType(type); + vpanic("s390_irgen_VSEG: unknown type"); + } + + return "vseg"; +} + +static const HChar * +s390_irgen_VSTEB(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I8, m3)); + + return "vsteb"; +} + +static const 
HChar * +s390_irgen_VSTEH(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I16, m3)); + + return "vsteh"; +} + +static const HChar * +s390_irgen_VSTEF(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I32, m3)); + + return "vstef"; +} + +static const HChar * +s390_irgen_VSTEG(UChar v1, IRTemp op2addr, UChar m3) +{ + store(mkexpr(op2addr), get_vr(v1, Ity_I64, m3)); + + return "vsteg"; +} + +static const HChar * +s390_irgen_VSTM(UChar v1, IRTemp op2addr, UChar v3) +{ + IRExpr* current = mkexpr(op2addr); + vassert(v3 >= v1); + vassert(v3 - v1 <= 16); + + for(UChar vr = v1; vr <= v3; vr++) { + IRExpr* next = binop(Iop_Add64, current, mkU64(16)); + store(current, get_vr_qw(vr)); + current = next; + } + + return "vstm"; +} + +static const HChar * +s390_irgen_VUPH(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Widen8Sto16x8, Iop_Widen16Sto32x4, Iop_Widen32Sto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw0(v2))); + + return "vuph"; +} + +static const HChar * +s390_irgen_VUPLH(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Widen8Uto16x8, Iop_Widen16Uto32x4, Iop_Widen32Uto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw0(v2))); + return "vuplh"; +} + +static const HChar * +s390_irgen_VUPL(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Widen8Sto16x8, Iop_Widen16Sto32x4, Iop_Widen32Sto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw1(v2))); + + return "vupl"; +} + +static const HChar * +s390_irgen_VUPLL(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Widen8Uto16x8, Iop_Widen16Uto32x4, Iop_Widen32Uto64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_dw1(v2))); + + return "vupll"; +} + +static const HChar * +s390_irgen_VREP(UChar v1, UChar v3, UShort i2, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* arg = get_vr(v3, type, i2); + s390_vr_fill(v1, arg); + + return "vrep"; +} + +static const HChar * +s390_irgen_VREPI(UChar v1, UShort i2, UChar m3) +{ + IRType type = s390_vr_get_type(m3); + IRExpr *value; + switch (type) { + case Ity_I8: + value = mkU8((UChar)i2); + break; + case Ity_I16: + value = mkU16(i2); + break; + case Ity_I32: + value = unop(Iop_16Sto32, mkU16(i2)); + break; + case Ity_I64: + value = unop(Iop_16Sto64, mkU16(i2)); + break; + default: + ppIRType(type); + vpanic("s390_irgen_VREPI: unknown type"); + } + s390_vr_fill(v1, value); + + return "vrepi"; +} + +static const HChar * +s390_irgen_VPKS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_QNarrowBin16Sto8Sx16, Iop_QNarrowBin32Sto16Sx8, + Iop_QNarrowBin64Sto32Sx4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VPKS; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * 
sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vpks"; +} + +static const HChar * +s390_irgen_VPKLS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_QNarrowBin16Uto8Ux16, Iop_QNarrowBin32Uto16Ux8, + Iop_QNarrowBin64Uto32Ux4 }; + Char index = m4 - 1; + vassert((index >= 0) && (index < sizeof(ops) / sizeof(ops[0]))); + put_vr_qw(v1, binop(ops[index], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VPKLS; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vpkls"; +} + +static const HChar * +s390_irgen_VSEL(UChar v1, UChar v2, UChar v3, UChar v4) +{ + IRExpr* vIfTrue = get_vr_qw(v2); + IRExpr* vIfFalse = get_vr_qw(v3); + IRExpr* vCond = get_vr_qw(v4); + + put_vr_qw(v1, s390_V128_bitwiseITE(vCond, vIfTrue, vIfFalse)); + return "vsel"; +} + +static const HChar * +s390_irgen_VLBB(UChar v1, IRTemp addr, UChar m3) +{ + IRExpr* maxIndex = binop(Iop_Sub32, + s390_getCountToBlockBoundary(addr, m3), + mkU32(1)); + + s390_vr_loadWithLength(v1, addr, maxIndex); + + return "vlbb"; +} + +static const HChar * +s390_irgen_VLL(UChar v1, IRTemp addr, UChar r3) +{ + s390_vr_loadWithLength(v1, addr, get_gpr_w1(r3)); + + return "vll"; +} + +static const HChar * +s390_irgen_VSTL(UChar v1, IRTemp addr, UChar r3) +{ + IRTemp counter = newTemp(Ity_I64); + IRTemp maxIndexToStore = newTemp(Ity_I64); + IRTemp gpr3 = newTemp(Ity_I64); + + assign(gpr3, unop(Iop_32Uto64, get_gpr_w1(r3))); + assign(maxIndexToStore, mkite(binop(Iop_CmpLE64U, + mkexpr(gpr3), + mkU64(16) + ), + mkexpr(gpr3), + mkU64(16) + ) + ); + + assign(counter, get_counter_dw0()); + + store(binop(Iop_Add64, mkexpr(addr), mkexpr(counter)), + binop(Iop_GetElem8x16, get_vr_qw(v1), unop(Iop_64to8, mkexpr(counter)))); + + /* Check for end of field */ + put_counter_dw0(binop(Iop_Add64, mkexpr(counter), mkU64(1))); + iterate_if(binop(Iop_CmpNE64, mkexpr(counter), mkexpr(maxIndexToStore))); + put_counter_dw0(mkU64(0)); + + return "vstl"; +} + +static const HChar * +s390_irgen_VX(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, binop(Iop_XorV128, get_vr_qw(v2), get_vr_qw(v3))); + + return "vx"; +} + +static const HChar * +s390_irgen_VN(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, binop(Iop_AndV128, get_vr_qw(v2), get_vr_qw(v3))); + + return "vn"; +} + +static const HChar * 
+s390_irgen_VO(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, binop(Iop_OrV128, get_vr_qw(v2), get_vr_qw(v3))); + + return "vo"; +} + +static const HChar * +s390_irgen_VNO(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, unop(Iop_NotV128, + binop(Iop_OrV128, get_vr_qw(v2), get_vr_qw(v3)))); + + return "vno"; +} + +static const HChar * +s390_irgen_LZRF(UChar r1, IRTemp op2addr) +{ + IRTemp op2 = newTemp(Ity_I32); + + assign(op2, binop(Iop_And32, load(Ity_I32, mkexpr(op2addr)), mkU32(0xffffff00))); + put_gpr_w1(r1, mkexpr(op2)); + + return "lzrf"; +} + +static const HChar * +s390_irgen_LZRG(UChar r1, IRTemp op2addr) +{ + IRTemp op2 = newTemp(Ity_I64); + + assign(op2, binop(Iop_And64, load(Ity_I64, mkexpr(op2addr)), mkU64(0xffffffffffffff00UL))); + put_gpr_dw0(r1, mkexpr(op2)); + + return "lzrg"; +} + +static const HChar * +s390_irgen_LLZRGF(UChar r1, IRTemp op2addr) +{ + IRTemp op2 = newTemp(Ity_I32); + + assign(op2, binop(Iop_And32, load(Ity_I32, mkexpr(op2addr)), mkU32(0xffffff00))); + put_gpr_w1(r1, mkexpr(op2)); + put_gpr_w0(r1, mkU32(0)); + + return "llzrgf"; +} + +static const HChar * +s390_irgen_LOCFH(UChar r1, IRTemp op2addr) +{ + /* condition is checked in format handler */ + put_gpr_w0(r1, load(Ity_I32, mkexpr(op2addr))); + + return "locfh"; +} + +static const HChar * +s390_irgen_LOCFHR(UChar m3, UChar r1, UChar r2) +{ + next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); + put_gpr_w0(r1, get_gpr_w0(r2)); + + return "locfhr"; +} + +static const HChar * +s390_irgen_LOCHHI(UChar r1, UChar m3, UShort i2, UChar unused) +{ + next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); + put_gpr_w0(r1, mkU32((UInt)(Int)(Short)i2)); + + return "lochhi"; +} + +static const HChar * +s390_irgen_LOCHI(UChar r1, UChar m3, UShort i2, UChar unused) +{ + next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); + put_gpr_w1(r1, mkU32((UInt)(Int)(Short)i2)); + + return "lochi"; +} + +static const HChar * +s390_irgen_LOCGHI(UChar r1, UChar m3, UShort i2, UChar unused) +{ + next_insn_if(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0))); + put_gpr_dw0(r1, mkU64((ULong)(Long)(Short)i2)); + + return "locghi"; +} + +static const HChar * +s390_irgen_STOCFH(UChar r1, IRTemp op2addr) +{ + /* condition is checked in format handler */ + store(mkexpr(op2addr), get_gpr_w1(r1)); + + return "stocfh"; +} + +static const HChar * +s390_irgen_LCBB(UChar r1, IRTemp op2addr, UChar m3) +{ + IRTemp op2 = newTemp(Ity_I32); + assign(op2, s390_getCountToBlockBoundary(op2addr, m3)); + put_gpr_w1(r1, mkexpr(op2)); + + IRExpr* cc = mkite(binop(Iop_CmpEQ32, mkexpr(op2), mkU32(16)), mkU64(0), mkU64(3)); + s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), cc, mkU64(0), mkU64(0)); + + return "lcbb"; +} + +/* Regarding the use of + // Dummy helper which is used to signal VEX library that memory was loaded + sha512_loadparam + = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block", + &s390x_dirtyhelper_PPNO_sha512_load_param_block, + mkIRExprVec_0()); + + in the following function (s390_irgen_PPNO). This is a workaround to get + around the fact that IRDirty annotations cannot indicate two memory side + effects, which are unfortunately necessary here. It will possibly lead to + losing undefinedness (undefinedness in some inputs might not be propagated + to the outputs as it should, in Memcheck). The correct fix would be to + extend IRDirty to represent two memory side effects, but that's quite a bit + of work. + + Here's a summary of what this insn does.
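+
+   (Note: the function code is the low byte of general register 0 --
+   compare the And64(..., 0xff) mask in the IR below -- and any code
+   other than the three cases shown ends in a specification exception.)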
+ + // getReg(RegisterNumber n) returns the value of GPR number 'n' + + // reg1 and reg2 are even + void ppno(RegisterNumber reg1, RegisterNumber reg2) { + + switch(getReg(0)) { + case 0x0: + // Query mode, ignore reg1 and reg2 + // Write 16 bytes at getReg(1) + break; + + case 0x3: + // SHA-512 generate mode, ignore reg2 + + // Read 240 bytes at getReg(1) + // Write getReg(reg1 + 1) bytes at getReg(reg1) + // Write some of 240 bytes starting at getReg(1) + break; + + case 0x83: + // SHA-512 seed mode, ignore reg1 + + // Read some of 240 bytes starting at getReg(1) + // Read getReg(reg2 + 1) bytes at getReg(reg2) + // Write 240 bytes at getReg(1) + break; + + default: + // Specification exception, abort execution. + } + } +*/ +/* Also known as "prno". + If you implement new function codes, please don't forget to update + the "s390x_dirtyhelper_PPNO_query" function. + */ +static const HChar * +s390_irgen_PPNO(UChar r1, UChar r2) +{ + if (!s390_host_has_msa5) { + emulation_failure(EmFail_S390X_ppno); + return "ppno"; + } + + /* These conditions lead to a specification exception */ + vassert(r1 % 2 == 0); + vassert(r2 % 2 == 0); + vassert((r1 != 0) && (r2 != 0)); + + IRDirty *query, *sha512_gen, *sha512_seed, *sha512_loadparam; + IRTemp gpr1num = newTemp(Ity_I64); + IRTemp gpr2num = newTemp(Ity_I64); + + IRTemp funcCode = newTemp(Ity_I8); + IRTemp is_query = newTemp(Ity_I1); + IRTemp is_sha512_gen = newTemp(Ity_I1); + IRTemp is_sha512_seed = newTemp(Ity_I1); + IRTemp is_sha512 = newTemp(Ity_I1); + + assign(funcCode, unop(Iop_64to8, binop(Iop_And64, get_gpr_dw0(0), + mkU64(0xffULL)))); + assign(gpr1num, mkU64(r1)); + assign(gpr2num, mkU64(r2)); + + assign(is_query, binop(Iop_CmpEQ8, mkexpr(funcCode), mkU8(S390_PPNO_QUERY))); + assign(is_sha512_gen, binop(Iop_CmpEQ8, mkexpr(funcCode), + mkU8(S390_PPNO_SHA512_GEN))); + assign(is_sha512_seed, binop(Iop_CmpEQ8, mkexpr(funcCode), + mkU8(S390_PPNO_SHA512_SEED))); + assign(is_sha512, binop(Iop_CmpEQ8, + mkU8(S390_PPNO_SHA512_GEN), + binop(Iop_And8, + mkexpr(funcCode), + mkU8(S390_PPNO_SHA512_GEN) + ) + )); + + query = unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_query", + &s390x_dirtyhelper_PPNO_query, + mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), + mkexpr(gpr2num))); + query->guard = mkexpr(is_query); + query->nFxState = 1; + vex_bzero(&query->fxState, sizeof(query->fxState)); + query->fxState[0].fx = Ifx_Read; + query->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); + query->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ + query->mAddr = get_gpr_dw0(1); + query->mSize = S390_PPNO_PARAM_BLOCK_SIZE_QUERY; + query->mFx = Ifx_Write; + + IRTemp gen_cc = newTemp(Ity_I64); + sha512_gen = unsafeIRDirty_1_N(gen_cc, 0, "s390x_dirtyhelper_PPNO_sha512", + &s390x_dirtyhelper_PPNO_sha512, + mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), + mkexpr(gpr2num))); + sha512_gen->guard = mkexpr(is_sha512_gen); + sha512_gen->nFxState = 3; + vex_bzero(&sha512_gen->fxState, sizeof(sha512_gen->fxState)); + sha512_gen->fxState[0].fx = Ifx_Read; + sha512_gen->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); + sha512_gen->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ + sha512_gen->fxState[1].fx = Ifx_Read; + sha512_gen->fxState[1].offset = S390X_GUEST_OFFSET(guest_r0) + r1 * sizeof(ULong); + sha512_gen->fxState[1].size = sizeof(ULong); + sha512_gen->fxState[2].fx = Ifx_Modify; + sha512_gen->fxState[2].offset = S390X_GUEST_OFFSET(guest_r0) + (r1 + 1) * sizeof(ULong); + sha512_gen->fxState[2].size = sizeof(ULong); + sha512_gen->mAddr =
get_gpr_dw0(r1); + sha512_gen->mSize = S390_PPNO_MAX_SIZE_SHA512_GEN; + sha512_gen->mFx = Ifx_Write; + + IRTemp unused = newTemp(Ity_I64); + sha512_seed = unsafeIRDirty_1_N(unused, 0, "s390x_dirtyhelper_PPNO_sha512", + &s390x_dirtyhelper_PPNO_sha512, + mkIRExprVec_3(IRExpr_GSPTR(), mkexpr(gpr1num), + mkexpr(gpr2num))); + sha512_seed->guard = mkexpr(is_sha512_seed); + sha512_seed->nFxState = 2; + vex_bzero(&sha512_seed->fxState, sizeof(sha512_seed->fxState)); + sha512_seed->fxState[0].fx = Ifx_Read; + sha512_seed->fxState[0].offset = S390X_GUEST_OFFSET(guest_r0); + sha512_seed->fxState[0].size = 2 * sizeof(ULong); /* gpr0 and gpr1 are read */ + sha512_seed->fxState[1].fx = Ifx_Read; + sha512_seed->fxState[1].offset = S390X_GUEST_OFFSET(guest_r0) + r2 * sizeof(ULong); + sha512_seed->fxState[1].size = 2 * sizeof(ULong); /* r2 and r2 + 1 are read */ + sha512_seed->mAddr = get_gpr_dw0(r2); + sha512_seed->mSize = S390_PPNO_MAX_SIZE_SHA512_SEED; + sha512_seed->mFx = Ifx_Write; + + /* Dummy helper which is used to signal VEX library that memory was loaded */ + sha512_loadparam = + unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_load_param_block", + &s390x_dirtyhelper_PPNO_sha512_load_param_block, + mkIRExprVec_0()); + sha512_loadparam->guard = mkexpr(is_sha512); + sha512_loadparam->nFxState = 0; + vex_bzero(&sha512_loadparam->fxState, sizeof(sha512_loadparam->fxState)); + sha512_loadparam->mAddr = get_gpr_dw0(1); + sha512_loadparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; + sha512_loadparam->mFx = Ifx_Read; + + IRDirty* sha512_saveparam = + unsafeIRDirty_0_N(0, "s390x_dirtyhelper_PPNO_sha512_save_param_block", + &s390x_dirtyhelper_PPNO_sha512_load_param_block, + mkIRExprVec_0()); + sha512_saveparam->guard = mkexpr(is_sha512); + sha512_saveparam->nFxState = 0; + vex_bzero(&sha512_saveparam->fxState, sizeof(sha512_saveparam->fxState)); + sha512_saveparam->mAddr = get_gpr_dw0(1); + sha512_saveparam->mSize = S390_PPNO_PARAM_BLOCK_SIZE_SHA512; + sha512_saveparam->mFx = Ifx_Write; + + stmt(IRStmt_Dirty(query)); + stmt(IRStmt_Dirty(sha512_loadparam)); + stmt(IRStmt_Dirty(sha512_gen)); + stmt(IRStmt_Dirty(sha512_seed)); + stmt(IRStmt_Dirty(sha512_saveparam)); + + IRTemp cc = newTemp(Ity_I64); + assign(cc, + mkite(mkexpr(is_sha512_gen), + mkexpr(gen_cc), + mkU64(0) + ) + ); + + s390_cc_thunk_fill(mkU64(S390_CC_OP_SET), mkexpr(cc), mkU64(0), mkU64(0)); + + return "ppno"; +} + +static const HChar * +s390_irgen_VFAE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m4 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VFAE; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m5)) { + s390_cc_set(cc); + } + + return "vfae"; +} + 
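+/* Editorial note: VFAE above, and several of the string/compare insns
+   below, funnel all register numbers and mask fields to the generic
+   vector dirty helper through the single 64-bit 'serialized' member of
+   s390x_vec_op_details_t.  A minimal sketch of how such a union might be
+   declared -- the exact field widths here are assumptions for
+   illustration; the real definition lives in the s390x backend headers:
+
+      typedef union {
+         struct {
+            unsigned int op : 8;         // S390_VEC_OP_* discriminator
+            unsigned int v1 : 5;         // destination vector register
+            unsigned int v2 : 5;         // first source register
+            unsigned int v3 : 5;         // second source register
+            unsigned int v4 : 5;         // third source register, if any
+            unsigned int m4 : 4;         // element-size / mask field
+            unsigned int m5 : 4;         // additional mask field
+            unsigned int read_only : 1;  // helper must not write v1
+         };
+         ULong serialized;               // what mkU64() hands to the helper
+      } s390x_vec_op_details_t;
+
+   Writing the individual bit-fields and then reading 'serialized' lets the
+   helper unpack every field from one IRExpr argument on the other side. */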
+static const HChar * +s390_irgen_VFEE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m4 < 3); + vassert((m5 & 0b1100) == 0); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VFEE; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m5)) { + s390_cc_set(cc); + } + + return "vfee"; +} + +static const HChar * +s390_irgen_VFENE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + const Bool negateComparison = True; + const IRType type = s390_vr_get_type(m4); + + /* Check for specification exception */ + vassert(m4 < 3); + vassert((m5 & 0b1100) == 0); + + static const IROp elementGetters[] = { + Iop_GetElem8x16, Iop_GetElem16x8, Iop_GetElem32x4 + }; + IROp getter = elementGetters[m4]; + + static const IROp elementComparators[] = { + Iop_CmpEQ8, Iop_CmpEQ16, Iop_CmpEQ32 + }; + IROp comparator = elementComparators[m4]; + + static const IROp resultConverter[] = {Iop_64to8, Iop_64to16, Iop_64to32}; + IROp converter = resultConverter[m4]; + + IRTemp isZeroElem; + + IRTemp counter = newTemp(Ity_I64); + assign(counter, get_counter_dw0()); + + IRTemp arg1 = newTemp(type); + assign(arg1, binop(getter, get_vr_qw(v2), unop(Iop_64to8, mkexpr(counter)))); + IRTemp arg2 = newTemp(type); + assign(arg2, binop(getter, get_vr_qw(v3), unop(Iop_64to8, mkexpr(counter)))); + + IRTemp isGoodPair = newTemp(Ity_I1); + if(negateComparison) { + assign(isGoodPair, unop(Iop_Not1, binop(comparator, mkexpr(arg1), + mkexpr(arg2)))); + } else { + assign(isGoodPair, binop(comparator, mkexpr(arg1), mkexpr(arg2))); + } + + if(s390_vr_is_zs_set(m5)) { + isZeroElem = newTemp(Ity_I1); + assign(isZeroElem, binop(comparator, mkexpr(arg1), + unop(converter, mkU64(0)))); + } + + static const UChar invalidIndices[] = {16, 8, 4}; + const UChar invalidIndex = invalidIndices[m4]; + IRTemp endOfVectorIsReached = newTemp(Ity_I1); + assign(endOfVectorIsReached, binop(Iop_CmpEQ64, mkexpr(counter), + mkU64(invalidIndex))); + + put_counter_dw0(binop(Iop_Add64, mkexpr(counter), mkU64(1))); + IRExpr* shouldBreak = binop(Iop_Or32, + unop(Iop_1Uto32, mkexpr(isGoodPair)), + unop(Iop_1Uto32, mkexpr(endOfVectorIsReached)) + ); + if(s390_vr_is_zs_set(m5)) { + shouldBreak = binop(Iop_Or32, + shouldBreak, + unop(Iop_1Uto32, mkexpr(isZeroElem))); + } + iterate_if(binop(Iop_CmpEQ32, shouldBreak, mkU32(0))); + + IRExpr* foundIndex = binop(Iop_Sub64, get_counter_dw0(), mkU64(1)); + if(m4 > 0) { + /* We need to return the index of a byte, but what we have found is + the index of an element in the general case.
+ if byte elem (m4 == 0) then indexOfByte = indexOfElement + if halfword elem (m4 == 1) then indexOfByte = 2 * indexOfElement + = indexOfElement << 1 + if word elem (m4 == 2) then indexOfByte = 4 * indexOfElement + = indexOfElement << 2 + */ + foundIndex = binop(Iop_Shl64, foundIndex, mkU8(m4)); + } + + IRTemp result = newTemp(Ity_I64); + assign(result, mkite(mkexpr(endOfVectorIsReached), + mkU64(16), + foundIndex)); + put_vr_qw(v1, binop(Iop_64HLtoV128, mkexpr(result), mkU64(0))); + + + if (s390_vr_is_cs_set(m5)) { + static const IROp to64Converters[] = {Iop_8Uto64, Iop_16Uto64, Iop_32Uto64}; + IROp to64Converter = to64Converters[m4]; + + IRExpr* arg1IsLessThanArg2 = binop(Iop_CmpLT64U, + unop(to64Converter, mkexpr(arg1)), + unop(to64Converter, mkexpr(arg2))); + + IRExpr* ccexp = mkite(binop(Iop_CmpEQ32, + unop(Iop_1Uto32, mkexpr(isGoodPair)), + mkU32(1)), + mkite(arg1IsLessThanArg2, mkU64(1), mkU64(2)), + mkU64(3)); + + if(s390_vr_is_zs_set(m5)) { + IRExpr* arg2IsZero = binop(comparator, mkexpr(arg2), + unop(converter, mkU64(0))); + IRExpr* bothArgsAreZero = binop(Iop_And32, + unop(Iop_1Uto32, mkexpr(isZeroElem)), + unop(Iop_1Uto32, arg2IsZero)); + ccexp = mkite(binop(Iop_CmpEQ32, bothArgsAreZero, mkU32(1)), + mkU64(0), + ccexp); + } + IRTemp cc = newTemp(Ity_I64); + assign(cc, ccexp); + + s390_cc_set(cc); + } + + + put_counter_dw0(mkU64(0)); + return "vfene"; +} + +static const HChar * +s390_irgen_VISTR(UChar v1, UChar v2, UChar m3, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m3 < 3); + vassert((m5 & 0b1110) == 0); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VISTR; + details.v1 = v1; + details.v2 = v2; + details.m4 = m3; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 2; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Write; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m5)) { + s390_cc_set(cc); + } + + return "vistr"; +} + +static const HChar * +s390_irgen_VSTRC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, UChar m6) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m5 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VSTRC; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; + details.m5 = m6; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) 
+ v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + if (s390_vr_is_cs_set(m6)) { + s390_cc_set(cc); + } + + return "vstrc"; +} + +static const HChar * +s390_irgen_VNC(UChar v1, UChar v2, UChar v3) +{ + put_vr_qw(v1, binop(Iop_AndV128, + get_vr_qw(v2), unop(Iop_NotV128, get_vr_qw(v3))) + ); + + return "vnc"; +} + +static const HChar * +s390_irgen_VA(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, + Iop_Add64x2, Iop_Add128x1 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "va"; +} + +static const HChar * +s390_irgen_VS(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, + Iop_Sub64x2, Iop_Sub128x1 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vs"; +} + +static const HChar * +s390_irgen_VMX(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmx"; +} + +static const HChar * +s390_irgen_VMXL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmxl"; +} + +static const HChar * +s390_irgen_VMN(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmn"; +} + +static const HChar * +s390_irgen_VMNL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmnl"; +} + +static const HChar * +s390_irgen_VAVG(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Avg8Sx16, Iop_Avg16Sx8, Iop_Avg32Sx4, Iop_Avg64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vavg"; +} + +static const HChar * +s390_irgen_VAVGL(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, Iop_Avg64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vavgl"; +} + +static const HChar * +s390_irgen_VLC(UChar v1, UChar v2, UChar m3) +{ + vassert(m3 < 4); + IRType type = s390_vr_get_type(m3); + put_vr_qw(v1, s390_V128_get_complement(get_vr_qw(v2), type)); + return "vlc"; +} + +static const HChar * +s390_irgen_VLP(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vlp"; +} + +static const HChar * +s390_irgen_VCH(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, + Iop_CmpGT64Sx2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + 
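+      /* With the condition-code-set (CS) flag the insn must also report,
+         via cc, whether any or all element comparisons were true; that is
+         cheap in the dirty helper but awkward to express inline in IR, so
+         the whole operation is delegated to s390x_dirtyhelper_vec_op. */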
IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vch"; +} + +static const HChar * +s390_irgen_VCHL(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, + Iop_CmpGT64Ux2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCHL; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vchl"; +} + +static const HChar * +s390_irgen_VCLZ(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4, Iop_Clz64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vclz"; +} + +static const HChar * +s390_irgen_VCTZ(UChar v1, UChar v2, UChar m3) +{ + const IROp ops[] = { Iop_Ctz8x16, Iop_Ctz16x8, Iop_Ctz32x4, Iop_Ctz64x2 }; + vassert(m3 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, unop(ops[m3], get_vr_qw(v2))); + + return "vctz"; +} + +static const HChar * +s390_irgen_VPOPCT(UChar v1, UChar v2, UChar m3) +{ + vassert(m3 == 0); + + put_vr_qw(v1, unop(Iop_Cnt8x16, get_vr_qw(v2))); + + return "vpopct"; +} + +static const HChar * +s390_irgen_VML(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vml"; +} + +static const HChar * +s390_irgen_VMLH(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MulHi8Ux16, Iop_MulHi16Ux8, Iop_MulHi32Ux4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmlh"; +} + +static const HChar * 
+s390_irgen_VMH(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MulHi8Sx16, Iop_MulHi16Sx8, Iop_MulHi32Sx4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmh"; +} + +static const HChar * +s390_irgen_VME(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, Iop_MullEven32Sx4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vme"; +} + +static const HChar * +s390_irgen_VMLE(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, Iop_MullEven32Ux4 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vmle"; +} + +static const HChar * +s390_irgen_VESLV(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Shl8x16, Iop_Shl16x8, Iop_Shl32x4, Iop_Shl64x2}; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "veslv"; +} + +static const HChar * +s390_irgen_VESL(UChar v1, IRTemp op2addr, UChar v3, UChar m4) +{ + IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr)); + const IROp ops[] = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount)); + + return "vesl"; +} + +static const HChar * +s390_irgen_VESRAV(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Sar8x16, Iop_Sar16x8, Iop_Sar32x4, Iop_Sar64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vesrav"; +} + +static const HChar * +s390_irgen_VESRA(UChar v1, IRTemp op2addr, UChar v3, UChar m4) +{ + IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr)); + const IROp ops[] = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount)); + + return "vesra"; +} + +static const HChar * +s390_irgen_VESRLV(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Shr8x16, Iop_Shr16x8, Iop_Shr32x4, Iop_Shr64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "vesrlv"; +} + +static const HChar * +s390_irgen_VESRL(UChar v1, IRTemp op2addr, UChar v3, UChar m4) +{ + IRExpr* shift_amount = unop(Iop_64to8, mkexpr(op2addr)); + const IROp ops[] = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_amount)); + + return "vesrl"; +} + +static const HChar * +s390_irgen_VERLLV(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + return "verllv"; +} + +static const HChar * +s390_irgen_VERLL(UChar v1, IRTemp op2addr, UChar v3, UChar m4) +{ + /* + There are no Iop_RolN?x?? operations, + so we have to use the VECTOR x VECTOR variant.
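+
+      (For illustration: rotating each 32-bit element left by n amounts to
+      the scalar expression
+
+         r = (x << (n & 31)) | (x >> (32 - (n & 31)))
+
+      valid for n & 31 != 0; for 0 the result is just x.  Iop_Dup8x16
+      merely splats the single rotate amount into every byte lane so that
+      the element-wise Iop_Rol ops can consume it as their second,
+      per-lane operand.)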
+ */ + IRExpr* shift_vector = unop(Iop_Dup8x16, unop(Iop_64to8, mkexpr(op2addr))); + const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v3), shift_vector)); + + return "verll"; +} + +static const HChar * +s390_irgen_VSL(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111))); + + put_vr_qw(v1, binop(Iop_ShlV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsl"; +} + +static const HChar * +s390_irgen_VSRL(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111))); + + put_vr_qw(v1, binop(Iop_ShrV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsrl"; +} + +static const HChar * +s390_irgen_VSRA(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b00000111))); + + put_vr_qw(v1, binop(Iop_SarV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsra"; +} + +static const HChar * +s390_irgen_VERIM(UChar v1, UChar v2, UChar v3, UChar i4, UChar m5) +{ + /* + There are no Iop_RolN?x?? operations, + so we have to use the VECTOR x VECTOR variant. + */ + const IROp ops[] = { Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4, Iop_Rol64x2 }; + vassert(m5 < sizeof(ops) / sizeof(ops[0])); + IRExpr* shift_vector = unop(Iop_Dup8x16, mkU8(i4)); + IRExpr* rotated_vector = binop(ops[m5], get_vr_qw(v2), shift_vector); + + /* result = (result & ~mask) | (rotated_vector & mask) */ + IRExpr* mask = get_vr_qw(v3); + IRExpr* result = get_vr_qw(v1); + put_vr_qw(v1, s390_V128_bitwiseITE(mask, rotated_vector, result)); + + return "verim"; +} + +static const HChar * +s390_irgen_VEC(UChar v1, UChar v2, UChar m3) +{ + IRType type = s390_vr_get_type(m3); + IRTemp op1 = newTemp(type); + IRTemp op2 = newTemp(type); + + switch(type) { + case Ity_I8: + assign(op1, get_vr_b7(v1)); + assign(op2, get_vr_b7(v2)); + break; + case Ity_I16: + assign(op1, get_vr_hw3(v1)); + assign(op2, get_vr_hw3(v2)); + break; + case Ity_I32: + assign(op1, get_vr_w1(v1)); + assign(op2, get_vr_w1(v2)); + break; + case Ity_I64: + assign(op1, get_vr_dw0(v1)); + assign(op2, get_vr_dw0(v2)); + break; + default: + vpanic("s390_irgen_VEC: unknown type"); + } + + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "vec"; +} + +static const HChar * +s390_irgen_VECL(UChar v1, UChar v2, UChar m3) +{ + IRType type = s390_vr_get_type(m3); + IRTemp op1 = newTemp(type); + IRTemp op2 = newTemp(type); + + switch(type) { + case Ity_I8: + assign(op1, get_vr_b7(v1)); + assign(op2, get_vr_b7(v2)); + break; + case Ity_I16: + assign(op1, get_vr_hw3(v1)); + assign(op2, get_vr_hw3(v2)); + break; + case Ity_I32: + assign(op1, get_vr_w1(v1)); + assign(op2, get_vr_w1(v2)); + break; + case Ity_I64: + assign(op1, get_vr_dw0(v1)); + assign(op2, get_vr_dw0(v2)); + break; + default: + vpanic("s390_irgen_VECL: unknown type"); + } + + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "vecl"; +} + +static const HChar * +s390_irgen_VCEQ(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (!s390_vr_is_cs_set(m5)) { + const IROp ops[] = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, + Iop_CmpEQ64x2 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + put_vr_qw(v1, binop(ops[m4], get_vr_qw(v2), get_vr_qw(v3))); + + } else { + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + +
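+      /* Same dirty-helper pattern as VCH/VCHL above: pack the operands into
+         the 64-bit 'serialized' immediate and let the helper produce both
+         the result vector and the condition code. */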
s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VCEQ; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + } + + return "vceq"; +} + +static const HChar * +s390_irgen_VSLB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_ShlV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vslb"; +} + +static const HChar * +s390_irgen_VSRLB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_ShrV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsrlb"; +} + +static const HChar * +s390_irgen_VSRAB(UChar v1, UChar v2, UChar v3) +{ + IRTemp shift_amount = newTemp(Ity_I8); + assign(shift_amount, binop(Iop_And8, get_vr_b7(v3), mkU8(0b01111000))); + + put_vr_qw(v1, binop(Iop_SarV128, get_vr_qw(v2), mkexpr(shift_amount))); + return "vsrab"; +} + +static const HChar * +s390_irgen_VSLDB(UChar v1, UChar v2, UChar v3, UChar i4) +{ + UChar imm = i4 & 0b00001111; + + if (imm == 0) { + /* Just copy v2. */ + put_vr_qw(v1, get_vr_qw(v2)); + } else { + /* Concatenate v2's tail with v3's head. 
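+         For example, with imm == 3 the result is bytes v2[3..15] followed
+         by v3[0..2], i.e. (v2 << 3*8) | (v3 >> (16-3)*8) -- matching the
+         shift amounts computed below.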
*/ + put_vr_qw(v1, + binop(Iop_OrV128, + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(imm * 8)), + binop(Iop_ShrV128, get_vr_qw(v3), mkU8((16 - imm) * 8)) + ) + ); + } + + return "vsldb"; +} + +static const HChar * +s390_irgen_VMO(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + IRExpr* result = binop(ops[m4], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m4])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m4])) + ); + put_vr_qw(v1, result); + + return "vmo"; +} + +static const HChar * +s390_irgen_VMLO(UChar v1, UChar v2, UChar v3, UChar m4) +{ + const IROp ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m4 < sizeof(ops) / sizeof(ops[0])); + IRExpr* result = binop(ops[m4], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m4])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m4])) + ); + put_vr_qw(v1, result); + + return "vmlo"; +} + +static const HChar * +s390_irgen_VMAE(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2}; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmae"; +} + +static const HChar * +s390_irgen_VMALE(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmale"; +} + +static const HChar * +s390_irgen_VMAO(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Sx16, Iop_MullEven16Sx8, + Iop_MullEven32Sx4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = + binop(mul_ops[m5], + binop(Iop_ShlV128, get_vr_qw(v2), mkU8(shifts[m5])), + binop(Iop_ShlV128, get_vr_qw(v3), mkU8(shifts[m5]))); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmao"; +} + +static const HChar * +s390_irgen_VMALO(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_MullEven8Ux16, Iop_MullEven16Ux8, + Iop_MullEven32Ux4 }; + const IROp add_ops[] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + UChar shifts[] = { 8, 16, 32 }; + vassert(m5 < sizeof(mul_ops) / sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], + binop(Iop_ShlV128, + get_vr_qw(v2), mkU8(shifts[m5])), + binop(Iop_ShlV128, + get_vr_qw(v3), mkU8(shifts[m5])) + ); + + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmalo"; +} + +static const HChar * +s390_irgen_VMAL(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + const IROp mul_ops[] = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4 }; + const IROp add_ops[] = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 }; + vassert(m5 < sizeof(mul_ops) / 
sizeof(mul_ops[0])); + + IRExpr* mul_result = binop(mul_ops[m5], get_vr_qw(v2), get_vr_qw(v3)); + IRExpr* result = binop(add_ops[m5], mul_result, get_vr_qw(v4)); + put_vr_qw(v1, result); + + return "vmal"; +} + +static const HChar * +s390_irgen_VSUM(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I8: + sum = unop(Iop_PwAddL16Ux8, unop(Iop_PwAddL8Ux16, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0001000100010001)); + break; + case Ity_I16: + sum = unop(Iop_PwAddL16Ux8, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0011001100110011)); + break; + default: + vpanic("s390_irgen_VSUM: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add32x4, sum, addition)); + + return "vsum"; +} + +static const HChar * +s390_irgen_VSUMG(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I16: + sum = unop(Iop_PwAddL32Ux4, unop(Iop_PwAddL16Ux8, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0000001100000011)); + break; + case Ity_I32: + sum = unop(Iop_PwAddL32Ux4, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0000111100001111)); + break; + default: + vpanic("s390_irgen_VSUMG: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add64x2, sum, addition)); + + return "vsumg"; +} + +static const HChar * +s390_irgen_VSUMQ(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* mask; + IRExpr* sum; + switch(type) { + case Ity_I32: + sum = unop(Iop_PwAddL64Ux2, unop(Iop_PwAddL32Ux4, get_vr_qw(v2))); + mask = IRExpr_Const(IRConst_V128(0b0000000000001111)); + break; + case Ity_I64: + sum = unop(Iop_PwAddL64Ux2, get_vr_qw(v2)); + mask = IRExpr_Const(IRConst_V128(0b0000000011111111)); + break; + default: + vpanic("s390_irgen_VSUMQ: invalid type "); + } + + IRExpr* addition = binop(Iop_AndV128, get_vr_qw(v3), mask); + put_vr_qw(v1, binop(Iop_Add128x1, sum, addition)); + + return "vsumq"; +} + +static const HChar * +s390_irgen_VTM(UChar v1, UChar v2) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VTM; + details.v2 = v1; + details.v3 = v2; + details.read_only = 1; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 2; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); + + return "vtm"; +} + +static const HChar * +s390_irgen_VAC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + + IRTemp sum = newTemp(Ity_V128); + assign(sum, binop(Iop_Add128x1, get_vr_qw(v2), get_vr_qw(v3))); + + IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0), mkU64(1)); + IRExpr* carry_in = binop(Iop_AndV128, get_vr_qw(v4), mask); + put_vr_qw(v1, binop(Iop_Add128x1, mkexpr(sum), carry_in)); + + return "vac"; +} + +static const HChar * +s390_irgen_VACC(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = 
s390_vr_get_type(m4); + IRExpr* arg1 = get_vr_qw(v2); + IRExpr* arg2 = get_vr_qw(v3); + + put_vr_qw(v1, s390_V128_calculate_carry_out(arg1, arg2, type, False)); + return "vacc"; +} + +static const HChar * +s390_irgen_VACCC(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + IRExpr* result = + s390_V128_calculate_carry_out_with_carry(get_vr_qw(v2), + get_vr_qw(v3), + get_vr_qw(v4) + ); + + put_vr_qw(v1, result); + return "vaccc"; +} + +static const HChar* +s390_irgen_VCKSM(UChar v1, UChar v2, UChar v3) +{ + + IRTemp sum1 = s390_checksum_add(get_vr_w1(v3), get_vr_w0(v2)); + IRTemp sum2 = s390_checksum_add(mkexpr(sum1), get_vr_w1(v2)); + IRTemp sum3 = s390_checksum_add(mkexpr(sum2), get_vr_w2(v2)); + IRTemp result = s390_checksum_add(mkexpr(sum3), get_vr_w3(v2)); + + put_vr_qw(v1, binop(Iop_64HLtoV128, + unop(Iop_32Uto64, mkexpr(result)), mkU64(0ULL))); + + return "vcksm"; +} + +static const HChar * +s390_irgen_VGFM(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VGFM; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + return "vgfm"; +} + +static const HChar * +s390_irgen_VGFMA(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VGFMA; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + return "vgfma"; +} + +static const HChar * +s390_irgen_VSBI(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + + IRExpr* mask = binop(Iop_64HLtoV128, mkU64(0ULL), mkU64(1ULL)); + IRExpr* carry_in = binop(Iop_AndV128, get_vr_qw(v4), mask); + + IRTemp sum = newTemp(Ity_V128); + assign(sum, binop(Iop_Add128x1, + get_vr_qw(v2), + unop(Iop_NotV128, get_vr_qw(v3)) + ) + ); + + put_vr_qw(v1, 
binop(Iop_Add128x1, mkexpr(sum), carry_in)); + return "vsbi"; +} + +static const HChar * +s390_irgen_VSCBI(UChar v1, UChar v2, UChar v3, UChar m4) +{ + IRType type = s390_vr_get_type(m4); + IRExpr* arg1 = get_vr_qw(v2); + IRExpr* arg2 = s390_V128_get_complement(get_vr_qw(v3), type); + IRExpr* result = s390_V128_calculate_carry_out(arg1, arg2, type, True); + + put_vr_qw(v1, result); + return "vscbi"; +} + +static const HChar * +s390_irgen_VSBCBI(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + vassert(m5 == 4); /* specification exception otherwise */ + IRExpr* result = + s390_V128_calculate_carry_out_with_carry(get_vr_qw(v2), + unop(Iop_NotV128, get_vr_qw(v3)), + get_vr_qw(v4)); + + put_vr_qw(v1, result); + return "vsbcbi"; +} + +static const HChar * +s390_irgen_VMAH(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m5 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VMAH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + return "vmah"; +} + +static const HChar * +s390_irgen_VMALH(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5) +{ + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); + + /* Check for specification exception */ + vassert(m5 < 3); + + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VMALH; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.v4 = v4; + details.m4 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + d->nFxState = 4; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = sizeof(V128); + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = sizeof(V128); + d->fxState[2].fx = Ifx_Read; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v4 * sizeof(V128); + d->fxState[2].size = sizeof(V128); + d->fxState[3].fx = Ifx_Write; + d->fxState[3].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[3].size = sizeof(V128); + + stmt(IRStmt_Dirty(d)); + + return "vmalh"; +} + +static void +s390_vector_fp_convert(IROp op, IRType fromType, IRType toType, + UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + Bool isSingleElementOp = s390_vr_is_single_element_control_set(m4); + UChar maxIndex = isSingleElementOp ? 
0 : 1; + + /* For Iop_F32toF64 we do this: + f32[0] -> f64[0] + f32[2] -> f64[1] + + For Iop_F64toF32 we do this: + f64[0] -> f32[0] + f64[1] -> f32[2] + + The magic below with scaling factors is used to achieve the logic + described above. + */ + const UChar sourceIndexScaleFactor = (op == Iop_F32toF64) ? 2 : 1; + const UChar destinationIndexScaleFactor = (op == Iop_F64toF32) ? 2 : 1; + + const Bool isUnary = (op == Iop_F32toF64); + for (UChar i = 0; i <= maxIndex; i++) { + IRExpr* argument = get_vr(v2, fromType, i * sourceIndexScaleFactor); + IRExpr* result; + if (!isUnary) { + result = binop(op, + mkexpr(encode_bfp_rounding_mode(m5)), + argument); + } else { + result = unop(op, argument); + } + put_vr(v1, toType, i * destinationIndexScaleFactor, result); + } + + if (isSingleElementOp) { + put_vr_dw1(v1, mkU64(0)); + } +} + +static const HChar * +s390_irgen_VCDG(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vcdg", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + emulation_warning(EmWarn_S390X_fpext_rounding); + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_I64StoF64, Ity_I64, Ity_F64, v1, v2, m3, m4, m5); + + return "vcdg"; +} + +static const HChar * +s390_irgen_VCDLG(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vcdlg", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + emulation_warning(EmWarn_S390X_fpext_rounding); + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_I64UtoF64, Ity_I64, Ity_F64, v1, v2, m3, m4, m5); + + return "vcdlg"; +} + +static const HChar * +s390_irgen_VCGD(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vcgd", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + emulation_warning(EmWarn_S390X_fpext_rounding); + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_F64toI64S, Ity_F64, Ity_I64, v1, v2, m3, m4, m5); + + return "vcgd"; +} + +static const HChar * +s390_irgen_VCLGD(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vclgd", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + emulation_warning(EmWarn_S390X_fpext_rounding); + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_F64toI64U, Ity_F64, Ity_I64, v1, v2, m3, m4, m5); + + return "vclgd"; +} + +static const HChar * +s390_irgen_VFI(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vfi", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + emulation_warning(EmWarn_S390X_fpext_rounding); + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_RoundF64toInt, Ity_F64, Ity_F64, + v1, v2, m3, m4, m5); + + return "vfi"; +} + +static const HChar * +s390_irgen_VLDE(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vlde", m3 == 2); + + s390_vector_fp_convert(Iop_F32toF64, Ity_F32, Ity_F64, v1, v2, m3, m4, m5); + + return "vlde"; +} + +static const HChar * +s390_irgen_VLED(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vled", m3 == 3); + + if (!s390_host_has_fpext && m5 != S390_BFP_ROUND_PER_FPC) { + m5 = S390_BFP_ROUND_PER_FPC; + } + + s390_vector_fp_convert(Iop_F64toF32, Ity_F64, Ity_F32, v1, v2, m3, m4, m5); + + return "vled"; +} + +static const HChar * +s390_irgen_VFPSO(UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + s390_insn_assert("vfpso", m3 == 3); + + IRExpr* result; + switch (m5) { + case 0: { + /* Invert sign */ + if
(!s390_vr_is_single_element_control_set(m4)) { + result = unop(Iop_Neg64Fx2, get_vr_qw(v2)); + } + else { + result = binop(Iop_64HLtoV128, + unop(Iop_ReinterpF64asI64, + unop(Iop_NegF64, get_vr(v2, Ity_F64, 0))), + mkU64(0)); + } + break; + } + + case 1: { + /* Set sign to negative */ + IRExpr* highHalf = mkU64(0x8000000000000000ULL); + if (!s390_vr_is_single_element_control_set(m4)) { + IRExpr* lowHalf = highHalf; + IRExpr* mask = binop(Iop_64HLtoV128, highHalf, lowHalf); + result = binop(Iop_OrV128, get_vr_qw(v2), mask); + } + else { + result = binop(Iop_64HLtoV128, + binop(Iop_Or64, get_vr_dw0(v2), highHalf), + mkU64(0ULL)); + } + + break; + } + + case 2: { + /* Set sign to positive */ + if (!s390_vr_is_single_element_control_set(m4)) { + result = unop(Iop_Abs64Fx2, get_vr_qw(v2)); + } + else { + result = binop(Iop_64HLtoV128, + unop(Iop_ReinterpF64asI64, + unop(Iop_AbsF64, get_vr(v2, Ity_F64, 0))), + mkU64(0)); + } + + break; + } + + default: + vpanic("s390_irgen_VFPSO: Invalid m5 value"); + } + + put_vr_qw(v1, result); + if (s390_vr_is_single_element_control_set(m4)) { + put_vr_dw1(v1, mkU64(0ULL)); + } + + return "vfpso"; +} + +static void s390x_vec_fp_binary_op(IROp generalOp, IROp singleElementOp, + UChar v1, UChar v2, UChar v3, UChar m4, + UChar m5) +{ + IRExpr* result; + if (!s390_vr_is_single_element_control_set(m5)) { + result = triop(generalOp, get_bfp_rounding_mode_from_fpc(), + get_vr_qw(v2), get_vr_qw(v3)); + } else { + IRExpr* highHalf = triop(singleElementOp, + get_bfp_rounding_mode_from_fpc(), + get_vr(v2, Ity_F64, 0), + get_vr(v3, Ity_F64, 0)); + result = binop(Iop_64HLtoV128, unop(Iop_ReinterpF64asI64, highHalf), + mkU64(0ULL)); + } + + put_vr_qw(v1, result); +} + +static void s390x_vec_fp_unary_op(IROp generalOp, IROp singleElementOp, + UChar v1, UChar v2, UChar m3, UChar m4) +{ + IRExpr* result; + if (!s390_vr_is_single_element_control_set(m4)) { + result = binop(generalOp, get_bfp_rounding_mode_from_fpc(), + get_vr_qw(v2)); + } + else { + IRExpr* highHalf = binop(singleElementOp, + get_bfp_rounding_mode_from_fpc(), + get_vr(v2, Ity_F64, 0)); + result = binop(Iop_64HLtoV128, unop(Iop_ReinterpF64asI64, highHalf), + mkU64(0ULL)); + } + + put_vr_qw(v1, result); } -static IRExpr * -s390_call_cu12_helper2(IRExpr *byte1, IRExpr *byte2, IRExpr *byte3, - IRExpr *byte4, IRExpr *stuff) -{ - IRExpr **args, *call; - args = mkIRExprVec_5(byte1, byte2, byte3, byte4, stuff); - call = mkIRExprCCall(Ity_I64, 0 /*regparm*/, - "s390_do_cu12_helper2", &s390_do_cu12_helper2, args); - /* Nothing is excluded from definedness checking. 
*/ - call->Iex.CCall.cee->mcx_mask = 0; +static void +s390_vector_fp_mulAddOrSub(IROp singleElementOp, + UChar v1, UChar v2, UChar v3, UChar v4, + UChar m5, UChar m6) +{ + Bool isSingleElementOp = s390_vr_is_single_element_control_set(m5); + IRTemp irrm_temp = newTemp(Ity_I32); + assign(irrm_temp, get_bfp_rounding_mode_from_fpc()); + IRExpr* irrm = mkexpr(irrm_temp); + IRExpr* result; + IRExpr* highHalf = qop(singleElementOp, + irrm, + get_vr(v2, Ity_F64, 0), + get_vr(v3, Ity_F64, 0), + get_vr(v4, Ity_F64, 0)); + + if (isSingleElementOp) { + result = binop(Iop_64HLtoV128, unop(Iop_ReinterpF64asI64, highHalf), + mkU64(0ULL)); + } else { + IRExpr* lowHalf = qop(singleElementOp, + irrm, + get_vr(v2, Ity_F64, 1), + get_vr(v3, Ity_F64, 1), + get_vr(v4, Ity_F64, 1)); + result = binop(Iop_64HLtoV128, unop(Iop_ReinterpF64asI64, highHalf), + unop(Iop_ReinterpF64asI64, lowHalf)); + } - return call; + put_vr_qw(v1, result); } -static IRExpr * -s390_call_cu14_helper2(IRExpr *byte1, IRExpr *byte2, IRExpr *byte3, - IRExpr *byte4, IRExpr *stuff) +static const HChar * +s390_irgen_VFA(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) { - IRExpr **args, *call; - args = mkIRExprVec_5(byte1, byte2, byte3, byte4, stuff); - call = mkIRExprCCall(Ity_I64, 0 /*regparm*/, - "s390_do_cu14_helper2", &s390_do_cu14_helper2, args); + s390_insn_assert("vfa", m4 == 3); + s390x_vec_fp_binary_op(Iop_Add64Fx2, Iop_AddF64, v1, v2, v3, m4, m5); + return "vfa"; +} - /* Nothing is excluded from definedness checking. */ - call->Iex.CCall.cee->mcx_mask = 0; +static const HChar * +s390_irgen_VFS(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + s390_insn_assert("vfs", m4 == 3); + s390x_vec_fp_binary_op(Iop_Sub64Fx2, Iop_SubF64, v1, v2, v3, m4, m5); + return "vfs"; +} - return call; +static const HChar * +s390_irgen_VFM(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + s390_insn_assert("vfm", m4 == 3); + s390x_vec_fp_binary_op(Iop_Mul64Fx2, Iop_MulF64, v1, v2, v3, m4, m5); + return "vfm"; } -static void -s390_irgen_cu12_cu14(UChar m3, UChar r1, UChar r2, Bool is_cu12) +static const HChar * +s390_irgen_VFD(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) { - IRTemp addr1 = newTemp(Ity_I64); - IRTemp addr2 = newTemp(Ity_I64); - IRTemp len1 = newTemp(Ity_I64); - IRTemp len2 = newTemp(Ity_I64); + s390_insn_assert("vfd", m4 == 3); + s390x_vec_fp_binary_op(Iop_Div64Fx2, Iop_DivF64, v1, v2, v3, m4, m5); + return "vfd"; +} - assign(addr1, get_gpr_dw0(r1)); - assign(addr2, get_gpr_dw0(r2)); - assign(len1, get_gpr_dw0(r1 + 1)); - assign(len2, get_gpr_dw0(r2 + 1)); +static const HChar * +s390_irgen_VFSQ(UChar v1, UChar v2, UChar m3, UChar m4) +{ + s390_insn_assert("vfsq", m3 == 3); + s390x_vec_fp_unary_op(Iop_Sqrt64Fx2, Iop_SqrtF64, v1, v2, m3, m4); - UInt extended_checking = s390_host_has_etf3 && (m3 & 0x1) == 1; + return "vfsq"; +} - /* We're processing the 2nd operand 1 byte at a time. Therefore, if - there is less than 1 byte left, then the 2nd operand is exhausted - and we're done here. cc = 0 */ - s390_cc_set(0); - next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkU64(1))); +static const HChar * +s390_irgen_VFMA(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, UChar m6) +{ + s390_insn_assert("vfma", m6 == 3); + s390_vector_fp_mulAddOrSub(Iop_MAddF64, v1, v2, v3, v4, m5, m6); + return "vfma"; +} - /* There is at least one byte there. Read it. 
*/ - IRTemp byte1 = newTemp(Ity_I64); - assign(byte1, unop(Iop_8Uto64, load(Ity_I8, mkexpr(addr2)))); +static const HChar * +s390_irgen_VFMS(UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, UChar m6) +{ + s390_insn_assert("vfms", m6 == 3); + s390_vector_fp_mulAddOrSub(Iop_MSubF64, v1, v2, v3, v4, m5, m6); + return "vfms"; +} - /* Call the helper to get number of bytes and invalid byte indicator */ - IRTemp retval1 = newTemp(Ity_I64); - assign(retval1, s390_call_cu12_cu14_helper1(mkexpr(byte1), - mkU64(extended_checking))); +static const HChar * +s390_irgen_WFC(UChar v1, UChar v2, UChar m3, UChar m4) +{ + s390_insn_assert("wfc", m3 == 3); + s390_insn_assert("wfc", m4 == 0); - /* Check for invalid 1st byte */ - IRExpr *is_invalid = unop(Iop_64to1, mkexpr(retval1)); - s390_cc_set(2); - next_insn_if(is_invalid); + IRTemp cc_vex = newTemp(Ity_I32); + assign(cc_vex, binop(Iop_CmpF64, + get_vr(v1, Ity_F64, 0), get_vr(v2, Ity_F64, 0))); - /* How many bytes do we have to read? */ - IRTemp num_src_bytes = newTemp(Ity_I64); - assign(num_src_bytes, binop(Iop_Shr64, mkexpr(retval1), mkU8(8))); + IRTemp cc_s390 = newTemp(Ity_I32); + assign(cc_s390, convert_vex_bfpcc_to_s390(cc_vex)); + s390_cc_thunk_put1(S390_CC_OP_SET, cc_s390, False); - /* Now test whether the 2nd operand is exhausted */ - s390_cc_set(0); - next_insn_if(binop(Iop_CmpLT64U, mkexpr(len2), mkexpr(num_src_bytes))); + return "wfc"; +} - /* Read the remaining bytes */ - IRExpr *cond, *addr, *byte2, *byte3, *byte4; +static const HChar * +s390_irgen_WFK(UChar v1, UChar v2, UChar m3, UChar m4) +{ + s390_irgen_WFC(v1, v2, m3, m4); - cond = binop(Iop_CmpLE64U, mkU64(2), mkexpr(num_src_bytes)); - addr = binop(Iop_Add64, mkexpr(addr2), mkU64(1)); - byte2 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); - cond = binop(Iop_CmpLE64U, mkU64(3), mkexpr(num_src_bytes)); - addr = binop(Iop_Add64, mkexpr(addr2), mkU64(2)); - byte3 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); - cond = binop(Iop_CmpLE64U, mkU64(4), mkexpr(num_src_bytes)); - addr = binop(Iop_Add64, mkexpr(addr2), mkU64(3)); - byte4 = mkite(cond, unop(Iop_8Uto64, load(Ity_I8, addr)), mkU64(0)); + return "wfk"; +} - /* Call the helper to get the converted value and invalid byte indicator. 
- We can pass at most 5 arguments; therefore some encoding is needed - here */ - IRExpr *stuff = binop(Iop_Or64, - binop(Iop_Shl64, mkexpr(num_src_bytes), mkU8(1)), - mkU64(extended_checking)); - IRTemp retval2 = newTemp(Ity_I64); +static const HChar * +s390_irgen_VFCE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, UChar m6) +{ + s390_insn_assert("vfce", m4 == 3); - if (is_cu12) { - assign(retval2, s390_call_cu12_helper2(mkexpr(byte1), byte2, byte3, - byte4, stuff)); + Bool isSingleElementOp = s390_vr_is_single_element_control_set(m5); + if (!s390_vr_is_cs_set(m6)) { + if (!isSingleElementOp) { + put_vr_qw(v1, binop(Iop_CmpEQ64Fx2, get_vr_qw(v2), get_vr_qw(v3))); + } else { + IRExpr* comparisonResult = binop(Iop_CmpF64, get_vr(v2, Ity_F64, 0), + get_vr(v3, Ity_F64, 0)); + IRExpr* result = mkite(binop(Iop_CmpEQ32, comparisonResult, + mkU32(Ircr_EQ)), + mkU64(0xffffffffffffffffULL), + mkU64(0ULL)); + put_vr_qw(v1, binop(Iop_64HLtoV128, result, mkU64(0ULL))); + } } else { - assign(retval2, s390_call_cu14_helper2(mkexpr(byte1), byte2, byte3, - byte4, stuff)); - } - - /* Check for invalid character */ - s390_cc_set(2); - is_invalid = unop(Iop_64to1, mkexpr(retval2)); - next_insn_if(is_invalid); - - /* Now test whether the 1st operand is exhausted */ - IRTemp num_bytes = newTemp(Ity_I64); - assign(num_bytes, binop(Iop_And64, - binop(Iop_Shr64, mkexpr(retval2), mkU8(8)), - mkU64(0xff))); - s390_cc_set(1); - next_insn_if(binop(Iop_CmpLT64U, mkexpr(len1), mkexpr(num_bytes))); + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); - /* Extract the bytes to be stored at addr1 */ - IRTemp data = newTemp(Ity_I64); - assign(data, binop(Iop_Shr64, mkexpr(retval2), mkU8(16))); + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VFCE; + details.v1 = v1; + details.v2 = v2; + details.v3 = v3; + details.m4 = m4; + details.m5 = m5; + details.m6 = m6; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + const UChar elementSize = isSingleElementOp ? sizeof(ULong) : sizeof(V128); + d->nFxState = 3; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = elementSize; + d->fxState[1].fx = Ifx_Read; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128); + d->fxState[1].size = elementSize; + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[2].size = sizeof(V128); - if (is_cu12) { - /* To store the bytes construct 2 dirty helper calls. The helper calls - are guarded (num_bytes == 2 and num_bytes == 4, respectively) such - that only one of them will be called at runtime. 
*/
+      stmt(IRStmt_Dirty(d));
+      s390_cc_set(cc);
+   }
-
-      Int i;
-      for (i = 2; i <= 4; ++i) {
-         IRDirty *d;
+   return "vfce";
+}
-         if (i == 3) continue;  // skip this one
+static const HChar *
+s390_irgen_VFCH(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, UChar m6)
+{
+   s390_insn_assert("vfch", m4 == 3);
-         d = unsafeIRDirty_0_N(0 /* regparms */, "s390x_dirtyhelper_CUxy",
-                               &s390x_dirtyhelper_CUxy,
-                               mkIRExprVec_3(mkexpr(addr1), mkexpr(data),
-                                             mkexpr(num_bytes)));
-         d->guard = binop(Iop_CmpEQ64, mkexpr(num_bytes), mkU64(i));
-         d->mFx   = Ifx_Write;
-         d->mAddr = mkexpr(addr1);
-         d->mSize = i;
-         stmt(IRStmt_Dirty(d));
+   Bool isSingleElementOp = s390_vr_is_single_element_control_set(m5);
+   if (!s390_vr_is_cs_set(m6)) {
+      if (!isSingleElementOp) {
+         /* "Compare high" is a strict comparison: v2 > v3, elementwise,
+            matching the Ircr_GT test in the single-element path below */
+         put_vr_qw(v1, binop(Iop_CmpLT64Fx2, get_vr_qw(v3), get_vr_qw(v2)));
+      } else {
+         IRExpr* comparisonResult = binop(Iop_CmpF64, get_vr(v2, Ity_F64, 0),
+                                          get_vr(v3, Ity_F64, 0));
+         IRExpr* result = mkite(binop(Iop_CmpEQ32, comparisonResult,
+                                      mkU32(Ircr_GT)),
+                                mkU64(0xffffffffffffffffULL),
+                                mkU64(0ULL));
+         put_vr_qw(v1, binop(Iop_64HLtoV128, result, mkU64(0ULL)));
       }
-   } else {
-      // cu14
-      store(mkexpr(addr1), unop(Iop_64to32, mkexpr(data)));
    }
+   else {
+      IRDirty* d;
+      IRTemp cc = newTemp(Ity_I64);
 
-   /* Update source address and length */
-   put_gpr_dw0(r2, binop(Iop_Add64, mkexpr(addr2), mkexpr(num_src_bytes)));
-   put_gpr_dw0(r2 + 1, binop(Iop_Sub64, mkexpr(len2), mkexpr(num_src_bytes)));
+      s390x_vec_op_details_t details = { .serialized = 0ULL };
+      details.op = S390_VEC_OP_VFCH;
+      details.v1 = v1;
+      details.v2 = v2;
+      details.v3 = v3;
+      details.m4 = m4;
+      details.m5 = m5;
+      details.m6 = m6;
+
+      d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op",
+                            &s390x_dirtyhelper_vec_op,
+                            mkIRExprVec_2(IRExpr_GSPTR(),
+                                          mkU64(details.serialized)));
+
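+      /* Declare the dirty helper's guest-state effects.  When the
+         single-element control in m5 is set, only doubleword 0 of each
+         source register participates, so reading 8 bytes per source is
+         enough; otherwise the full 16-byte registers are read.  The
+         result register is written in full either way. */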
+      const UChar elementSize = isSingleElementOp ? sizeof(ULong) : sizeof(V128);
+      d->nFxState = 3;
+      vex_bzero(&d->fxState, sizeof(d->fxState));
+      d->fxState[0].fx = Ifx_Read;
+      d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128);
+      d->fxState[0].size = elementSize;
+      d->fxState[1].fx = Ifx_Read;
+      d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128);
+      d->fxState[1].size = elementSize;
+      d->fxState[2].fx = Ifx_Write;
+      d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128);
+      d->fxState[2].size = sizeof(V128);
 
-   /* Update destination address and length */
-   put_gpr_dw0(r1, binop(Iop_Add64, mkexpr(addr1), mkexpr(num_bytes)));
-   put_gpr_dw0(r1 + 1, binop(Iop_Sub64, mkexpr(len1), mkexpr(num_bytes)));
+      stmt(IRStmt_Dirty(d));
+      s390_cc_set(cc);
+   }
 
-   iterate();
+   return "vfch";
 }
 
 static const HChar *
-s390_irgen_CU12(UChar m3, UChar r1, UChar r2)
+s390_irgen_VFCHE(UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, UChar m6)
 {
-   s390_irgen_cu12_cu14(m3, r1, r2, /* is_cu12 = */ 1);
+   s390_insn_assert("vfche", m4 == 3);
 
-   return "cu12";
-}
+   Bool isSingleElementOp = s390_vr_is_single_element_control_set(m5);
+   if (!s390_vr_is_cs_set(m6)) {
+      if (!isSingleElementOp) {
+         /* "Compare high or equal" is non-strict: v2 >= v3, elementwise */
+         put_vr_qw(v1, binop(Iop_CmpLE64Fx2, get_vr_qw(v3), get_vr_qw(v2)));
+      }
+      else {
+         IRTemp comparisonResult = newTemp(Ity_I32);
+         assign(comparisonResult, binop(Iop_CmpF64, get_vr(v3, Ity_F64, 0),
+                                        get_vr(v2, Ity_F64, 0)));
+         /* Accept both v3 < v2 and v3 == v2; unordered compares yield 0 */
+         IRExpr* result = mkite(binop(Iop_CmpEQ32, mkexpr(comparisonResult),
+                                      mkU32(Ircr_LT)),
+                                mkU64(0xffffffffffffffffULL),
+                                mkite(binop(Iop_CmpEQ32,
+                                            mkexpr(comparisonResult),
+                                            mkU32(Ircr_EQ)),
+                                      mkU64(0xffffffffffffffffULL),
+                                      mkU64(0ULL)));
+         put_vr_qw(v1, binop(Iop_64HLtoV128, result, mkU64(0ULL)));
+      }
+   }
+   else {
+      IRDirty* d;
+      IRTemp cc = newTemp(Ity_I64);
 
-static const HChar *
-s390_irgen_CU14(UChar m3, UChar r1, UChar r2)
-{
-   s390_irgen_cu12_cu14(m3, r1, r2, /* is_cu12 = */ 0);
+      s390x_vec_op_details_t details = { .serialized = 0ULL };
+      details.op = S390_VEC_OP_VFCHE;
+      details.v1 = v1;
+      details.v2 = v2;
+      details.v3 = v3;
+      details.m4 = m4;
+      details.m5 = m5;
+      details.m6 = m6;
+
+      d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op",
+                            &s390x_dirtyhelper_vec_op,
+                            mkIRExprVec_2(IRExpr_GSPTR(),
+                                          mkU64(details.serialized)));
+
+      const UChar elementSize = isSingleElementOp ? sizeof(ULong) : sizeof(V128);
+      d->nFxState = 3;
+      vex_bzero(&d->fxState, sizeof(d->fxState));
+      d->fxState[0].fx = Ifx_Read;
+      d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128);
+      d->fxState[0].size = elementSize;
+      d->fxState[1].fx = Ifx_Read;
+      d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v3 * sizeof(V128);
+      d->fxState[1].size = elementSize;
+      d->fxState[2].fx = Ifx_Write;
+      d->fxState[2].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128);
+      d->fxState[2].size = sizeof(V128);
 
-   return "cu14";
+      stmt(IRStmt_Dirty(d));
+      s390_cc_set(cc);
+   }
+
+   return "vfche";
 }
 
-static IRExpr *
-s390_call_ecag(IRExpr *op2addr)
+static const HChar *
+s390_irgen_VFTCI(UChar v1, UChar v2, UShort i3, UChar m4, UChar m5)
 {
-   IRExpr **args, *call;
+   s390_insn_assert("vftci", m4 == 3);
 
-   args = mkIRExprVec_1(op2addr);
-   call = mkIRExprCCall(Ity_I64, 0 /*regparm*/,
-                        "s390_do_ecag", &s390_do_ecag, args);
+   Bool isSingleElementOp = s390_vr_is_single_element_control_set(m5);
 
-   /* Nothing is excluded from definedness checking.
*/ - call->Iex.CCall.cee->mcx_mask = 0; + IRDirty* d; + IRTemp cc = newTemp(Ity_I64); - return call; -} + s390x_vec_op_details_t details = { .serialized = 0ULL }; + details.op = S390_VEC_OP_VFTCI; + details.v1 = v1; + details.v2 = v2; + details.i3 = i3; + details.m4 = m4; + details.m5 = m5; + + d = unsafeIRDirty_1_N(cc, 0, "s390x_dirtyhelper_vec_op", + &s390x_dirtyhelper_vec_op, + mkIRExprVec_2(IRExpr_GSPTR(), + mkU64(details.serialized))); + + const UChar elementSize = isSingleElementOp ? sizeof(ULong) : sizeof(V128); + d->nFxState = 2; + vex_bzero(&d->fxState, sizeof(d->fxState)); + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = S390X_GUEST_OFFSET(guest_v0) + v2 * sizeof(V128); + d->fxState[0].size = elementSize; + d->fxState[1].fx = Ifx_Write; + d->fxState[1].offset = S390X_GUEST_OFFSET(guest_v0) + v1 * sizeof(V128); + d->fxState[1].size = sizeof(V128); -static const HChar * -s390_irgen_ECAG(UChar r1, UChar r3 __attribute__((unused)), IRTemp op2addr) -{ - if (! s390_host_has_gie) { - emulation_failure(EmFail_S390X_ecag); - } else { - put_gpr_dw0(r1, s390_call_ecag(mkexpr(op2addr))); - } + stmt(IRStmt_Dirty(d)); + s390_cc_set(cc); - return "ecag"; + return "vftci"; } - /* New insns are added here. If an insn is contingent on a facility being installed also check whether the list of supported facilities in function @@ -14439,17 +18801,9 @@ s390_irgen_call_noredir(void) } static s390_decode_t -s390_decode_2byte_and_irgen(const UChar *bytes, VexEndness host_endness) +s390_decode_2byte_and_irgen(const UChar *bytes) { - UShort ovl; - - if (host_endness == VexEndnessBE) { - ((UChar *)&ovl)[0] = bytes[0]; - ((UChar *)&ovl)[1] = bytes[1]; - } else { - ((UChar *)&ovl)[0] = bytes[1]; - ((UChar *)&ovl)[1] = bytes[0]; - } + UShort ovl = ((UShort)bytes[0] << 8) | (UShort)bytes[1]; switch (ovl & 0xffff) { case 0x0101: /* PR */ goto unimplemented; @@ -14558,21 +18912,10 @@ s390_decode_2byte_and_irgen(const UChar *bytes, VexEndness host_endness) } static s390_decode_t -s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) +s390_decode_4byte_and_irgen(const UChar *bytes) { - UInt ovl; - - if (host_endness == VexEndnessBE) { - ((UChar *)&ovl)[0] = bytes[0]; - ((UChar *)&ovl)[1] = bytes[1]; - ((UChar *)&ovl)[2] = bytes[2]; - ((UChar *)&ovl)[3] = bytes[3]; - } else { - ((UChar *)&ovl)[0] = bytes[3]; - ((UChar *)&ovl)[1] = bytes[2]; - ((UChar *)&ovl)[2] = bytes[1]; - ((UChar *)&ovl)[3] = bytes[0]; - } + UInt ovl = ((UInt)bytes[0] << 24) | ((UInt)bytes[1] << 16) | + ((UInt)bytes[2] << 8) | (UInt)bytes[3]; switch ((ovl & 0xff0f0000) >> 16) { case 0xa500: s390_format_RI_RU(s390_irgen_IIHH, RI_r1(ovl), @@ -15165,6 +19508,7 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xb927: s390_format_RRE_RR(s390_irgen_LHR, RRE_r1(ovl), RRE_r2(ovl)); goto ok; case 0xb928: /* PCKMO */ goto unimplemented; + case 0xb929: /* KMA */ goto unimplemented; case 0xb92a: /* KMF */ goto unimplemented; case 0xb92b: /* KMO */ goto unimplemented; case 0xb92c: /* PCC */ goto unimplemented; @@ -15175,6 +19519,8 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) RRE_r2(ovl)); goto ok; case 0xb931: s390_format_RRE_RR(s390_irgen_CLGFR, RRE_r1(ovl), RRE_r2(ovl)); goto ok; + case 0xb93c: s390_format_RRE_RR(s390_irgen_PPNO, RRE_r1(ovl), + RRE_r2(ovl)); goto ok; case 0xb93e: /* KIMD */ goto unimplemented; case 0xb93f: /* KLMD */ goto unimplemented; case 0xb941: s390_format_RRF_UURF(s390_irgen_CFDTR, RRF2_m3(ovl), @@ -15215,10 +19561,18 @@ 
s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xb95b: s390_format_RRF_UUFR(s390_irgen_CXLFTR, RRF2_m3(ovl), RRF2_m4(ovl), RRF2_r1(ovl), RRF2_r2(ovl)); goto ok; - case 0xb960: /* CGRT */ goto unimplemented; - case 0xb961: /* CLGRT */ goto unimplemented; - case 0xb972: /* CRT */ goto unimplemented; - case 0xb973: /* CLRT */ goto unimplemented; + case 0xb960: s390_format_RRF_U0RR(s390_irgen_CGRT, RRF2_m3(ovl), + RRF2_r1(ovl), RRF2_r2(ovl), + S390_XMNM_CAB); goto ok; + case 0xb961: s390_format_RRF_U0RR(s390_irgen_CLGRT, RRF2_m3(ovl), + RRF2_r1(ovl), RRF2_r2(ovl), + S390_XMNM_CAB); goto ok; + case 0xb972: s390_format_RRF_U0RR(s390_irgen_CRT, RRF2_m3(ovl), + RRF2_r1(ovl), RRF2_r2(ovl), + S390_XMNM_CAB); goto ok; + case 0xb973: s390_format_RRF_U0RR(s390_irgen_CLRT, RRF2_m3(ovl), + RRF2_r1(ovl), RRF2_r2(ovl), + S390_XMNM_CAB); goto ok; case 0xb980: s390_format_RRE_RR(s390_irgen_NGR, RRE_r1(ovl), RRE_r2(ovl)); goto ok; case 0xb981: s390_format_RRE_RR(s390_irgen_OGR, RRE_r1(ovl), @@ -15317,6 +19671,9 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) RRE_r2(ovl)); goto ok; case 0xb9df: s390_format_RRE_RR(s390_irgen_CLHLR, RRE_r1(ovl), RRE_r2(ovl)); goto ok; + case 0xb9e0: s390_format_RRF_U0RR(s390_irgen_LOCFHR, RRF3_r3(ovl), + RRF3_r1(ovl), RRF3_r2(ovl), + S390_XMNM_LOCFHR); goto ok; case 0xb9e1: s390_format_RRE_RR(s390_irgen_POPCNT, RRE_r1(ovl), RRE_r2(ovl)); goto ok; case 0xb9e2: s390_format_RRF_U0RR(s390_irgen_LOCGR, RRF3_r3(ovl), @@ -15343,6 +19700,8 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xb9eb: s390_format_RRF_R0RR2(s390_irgen_SLGRK, RRF4_r3(ovl), RRF4_r1(ovl), RRF4_r2(ovl)); goto ok; + case 0xb9ec: /* MGRK */ goto unimplemented; + case 0xb9ed: /* MSGRKC */ goto unimplemented; case 0xb9f2: s390_format_RRF_U0RR(s390_irgen_LOCR, RRF3_r3(ovl), RRF3_r1(ovl), RRF3_r2(ovl), S390_XMNM_LOCR); goto ok; @@ -15367,6 +19726,7 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xb9fb: s390_format_RRF_R0RR2(s390_irgen_SLRK, RRF4_r3(ovl), RRF4_r1(ovl), RRF4_r2(ovl)); goto ok; + case 0xb9fd: /* MSRKC */ goto unimplemented; } switch ((ovl & 0xff000000) >> 24) { @@ -15378,7 +19738,7 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) RX_b2(ovl), RX_d2(ovl)); goto ok; case 0x43: s390_format_RX_RRRD(s390_irgen_IC, RX_r1(ovl), RX_x2(ovl), RX_b2(ovl), RX_d2(ovl)); goto ok; - case 0x44: s390_format_RX_RRRD(host_endness == VexEndnessBE ? 
s390_irgen_EX_BE : s390_irgen_EX_LE, RX_r1(ovl), RX_x2(ovl), + case 0x44: s390_format_RX_RRRD(s390_irgen_EX, RX_r1(ovl), RX_x2(ovl), RX_b2(ovl), RX_d2(ovl)); goto ok; case 0x45: /* BAL */ goto unimplemented; case 0x46: s390_format_RX_RRRD(s390_irgen_BCT, RX_r1(ovl), RX_x2(ovl), @@ -15535,29 +19895,11 @@ s390_decode_4byte_and_irgen(const UChar *bytes, VexEndness host_endness) } static s390_decode_t -s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) -{ - ULong ovl; - - if (host_endness == VexEndnessBE) { - ((UChar *)&ovl)[0] = bytes[0]; - ((UChar *)&ovl)[1] = bytes[1]; - ((UChar *)&ovl)[2] = bytes[2]; - ((UChar *)&ovl)[3] = bytes[3]; - ((UChar *)&ovl)[4] = bytes[4]; - ((UChar *)&ovl)[5] = bytes[5]; - ((UChar *)&ovl)[6] = 0x0; - ((UChar *)&ovl)[7] = 0x0; - } else { - ((UChar *)&ovl)[0] = 0x0; - ((UChar *)&ovl)[1] = 0x0; - ((UChar *)&ovl)[2] = bytes[5]; - ((UChar *)&ovl)[3] = bytes[4]; - ((UChar *)&ovl)[4] = bytes[3]; - ((UChar *)&ovl)[5] = bytes[2]; - ((UChar *)&ovl)[6] = bytes[1]; - ((UChar *)&ovl)[7] = bytes[0]; - } +s390_decode_6byte_and_irgen(const UChar *bytes) +{ + ULong ovl = ((ULong)bytes[0] << 56) | ((ULong)bytes[1] << 48) | + ((ULong)bytes[2] << 40) | ((ULong)bytes[3] << 32) | + ((ULong)bytes[4] << 24) | ((ULong)bytes[5] << 16); switch ((ovl >> 16) & 0xff00000000ffULL) { case 0xe30000000002ULL: s390_format_RXY_RRRD(s390_irgen_LTG, RXY_r1(ovl), @@ -15672,6 +20014,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; + case 0xe3000000002aULL: s390_format_RXY_RRRD(s390_irgen_LZRG, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; case 0xe3000000002eULL: /* CVDG */ goto unimplemented; case 0xe3000000002fULL: s390_format_RXY_RRRD(s390_irgen_STRVG, RXY_r1(ovl), RXY_x2(ovl), @@ -15697,6 +20043,17 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; + case 0xe30000000038ULL: /* AGH */ goto unimplemented; + case 0xe30000000039ULL: /* SGH */ goto unimplemented; + case 0xe3000000003aULL: s390_format_RXY_RRRD(s390_irgen_LLZRGF, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; + case 0xe3000000003bULL: s390_format_RXY_RRRD(s390_irgen_LZRF, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; + case 0xe3000000003cULL: /* MGH */ goto unimplemented; case 0xe3000000003eULL: s390_format_RXY_RRRD(s390_irgen_STRV, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15709,6 +20066,11 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; + case 0xe30000000047ULL: /* BIC */ goto unimplemented; + case 0xe30000000048ULL: /* LLGFSG */ goto unimplemented; + case 0xe30000000049ULL: /* STGSC */ goto unimplemented; + case 0xe3000000004cULL: /* LGG */ goto unimplemented; + case 0xe3000000004dULL: /* LGSC */ goto unimplemented; case 0xe30000000050ULL: s390_format_RXY_RRRD(s390_irgen_STY, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15717,6 +20079,7 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; + case 0xe30000000053ULL: /* MSC */ goto unimplemented; case 0xe30000000054ULL: s390_format_RXY_RRRD(s390_irgen_NY, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15821,7 +20184,13 @@ s390_decode_6byte_and_irgen(const UChar *bytes, 
VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; - case 0xe30000000085ULL: /* LGAT */ goto unimplemented; + case 0xe30000000083ULL: /* MSGC */ goto unimplemented; + case 0xe30000000084ULL: /* MG */ goto unimplemented; + case 0xe30000000085ULL: s390_format_RXY_RRRD(s390_irgen_LGAT, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; + case 0xe30000000086ULL: s390_format_RXY_RRRD(s390_irgen_MLG, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15878,9 +20247,18 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; - case 0xe3000000009cULL: /* LLGTAT */ goto unimplemented; - case 0xe3000000009dULL: /* LLGFAT */ goto unimplemented; - case 0xe3000000009fULL: /* LAT */ goto unimplemented; + case 0xe3000000009cULL: s390_format_RXY_RRRD(s390_irgen_LLGTAT, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; + case 0xe3000000009dULL: s390_format_RXY_RRRD(s390_irgen_LLGFAT, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; + case 0xe3000000009fULL: s390_format_RXY_RRRD(s390_irgen_LAT, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; case 0xe300000000c0ULL: s390_format_RXY_RRRD(s390_irgen_LBH, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15905,7 +20283,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; - case 0xe300000000c8ULL: /* LFHAT */ goto unimplemented; + case 0xe300000000c8ULL: s390_format_RXY_RRRD(s390_irgen_LFHAT, RXY_r1(ovl), + RXY_x2(ovl), RXY_b2(ovl), + RXY_dl2(ovl), + RXY_dh2(ovl)); goto ok; case 0xe300000000caULL: s390_format_RXY_RRRD(s390_irgen_LFH, RXY_r1(ovl), RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), @@ -15922,6 +20303,518 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RXY_x2(ovl), RXY_b2(ovl), RXY_dl2(ovl), RXY_dh2(ovl)); goto ok; + case 0xe60000000034ULL: /* VPKZ */ goto unimplemented; + case 0xe60000000035ULL: /* VLRL */ goto unimplemented; + case 0xe60000000037ULL: /* VLRLR */ goto unimplemented; + case 0xe6000000003cULL: /* VUPKZ */ goto unimplemented; + case 0xe6000000003dULL: /* VSTRL */ goto unimplemented; + case 0xe6000000003fULL: /* VSTRLR */ goto unimplemented; + case 0xe60000000049ULL: /* VLIP */ goto unimplemented; + case 0xe60000000050ULL: /* VCVB */ goto unimplemented; + case 0xe60000000052ULL: /* VCVBG */ goto unimplemented; + case 0xe60000000058ULL: /* VCVD */ goto unimplemented; + case 0xe60000000059ULL: /* VSRP */ goto unimplemented; + case 0xe6000000005aULL: /* VCVDG */ goto unimplemented; + case 0xe6000000005bULL: /* VPSOP */ goto unimplemented; + case 0xe6000000005fULL: /* VTP */ goto unimplemented; + case 0xe60000000071ULL: /* VAP */ goto unimplemented; + case 0xe60000000073ULL: /* VSP */ goto unimplemented; + case 0xe60000000077ULL: /* VCP */ goto unimplemented; + case 0xe60000000078ULL: /* VMP */ goto unimplemented; + case 0xe60000000079ULL: /* VMSP */ goto unimplemented; + case 0xe6000000007aULL: /* VDP */ goto unimplemented; + case 0xe6000000007bULL: /* VRP */ goto unimplemented; + case 0xe6000000007eULL: /* VSDP */ goto unimplemented; + case 0xe70000000000ULL: s390_format_VRX_VRRDM(s390_irgen_VLEB, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000001ULL: s390_format_VRX_VRRDM(s390_irgen_VLEH, VRX_v1(ovl), + 
VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000002ULL: s390_format_VRX_VRRDM(s390_irgen_VLEG, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000003ULL: s390_format_VRX_VRRDM(s390_irgen_VLEF, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000004ULL: s390_format_VRX_VRRDM(s390_irgen_VLLEZ, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000005ULL: s390_format_VRX_VRRDM(s390_irgen_VLREP, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000006ULL: s390_format_VRX_VRRD(s390_irgen_VL, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_rxb(ovl)); goto ok; + case 0xe70000000007ULL: s390_format_VRX_VRRDM(s390_irgen_VLBB, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000008ULL: s390_format_VRX_VRRDM(s390_irgen_VSTEB, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe70000000009ULL: s390_format_VRX_VRRDM(s390_irgen_VSTEH, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe7000000000aULL: s390_format_VRX_VRRDM(s390_irgen_VSTEG, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe7000000000bULL: s390_format_VRX_VRRDM(s390_irgen_VSTEF, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl)); goto ok; + case 0xe7000000000eULL: s390_format_VRX_VRRD(s390_irgen_VST, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_rxb(ovl)); goto ok; + case 0xe70000000012ULL: s390_format_VRV_VVRDMT(s390_irgen_VGEG, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl), Ity_I64); goto ok; + case 0xe70000000013ULL: s390_format_VRV_VVRDMT(s390_irgen_VGEF, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl), Ity_I32); goto ok; + case 0xe7000000001aULL: s390_format_VRV_VVRDMT(s390_irgen_VSCEG, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl), Ity_I64); goto ok; + case 0xe7000000001bULL: s390_format_VRV_VVRDMT(s390_irgen_VSCEF, VRX_v1(ovl), + VRX_x2(ovl), VRX_b2(ovl), + VRX_d2(ovl), VRX_m3(ovl), + VRX_rxb(ovl), Ity_I32); goto ok; + case 0xe70000000021ULL: s390_format_VRS_RRDVM(s390_irgen_VLGV, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl), + VRS_m4(ovl), VRS_rxb(ovl)); goto ok; + case 0xe70000000022ULL: s390_format_VRS_VRRDM(s390_irgen_VLVG, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl), + VRS_m4(ovl), VRS_rxb(ovl)); goto ok; + case 0xe70000000027ULL: s390_format_RXE_RRRDR(s390_irgen_LCBB, RXE_r1(ovl), + RXE_x2(ovl), RXE_b2(ovl), + RXE_d2(ovl), RXE_m3(ovl)); goto ok; + case 0xe70000000030ULL: s390_format_VRS_VRDVM(s390_irgen_VESL, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), + VRS_v3(ovl), VRS_m4(ovl), + VRS_rxb(ovl)); goto ok; + case 0xe70000000033ULL: s390_format_VRS_VRDVM(s390_irgen_VERLL, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), + VRS_v3(ovl), VRS_m4(ovl), + VRS_rxb(ovl)); goto ok; + case 0xe70000000036ULL: s390_format_VRS_VRDV(s390_irgen_VLM, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl), + VRS_rxb(ovl)); goto ok; + case 0xe70000000037ULL: s390_format_VRS_VRRD(s390_irgen_VLL, VRS_v1(ovl), + VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl), + VRS_rxb(ovl)); 
goto ok;
+   case 0xe70000000038ULL: s390_format_VRS_VRDVM(s390_irgen_VESRL, VRS_v1(ovl),
+                                                 VRS_b2(ovl), VRS_d2(ovl),
+                                                 VRS_v3(ovl), VRS_m4(ovl),
+                                                 VRS_rxb(ovl));  goto ok;
+   case 0xe7000000003aULL: s390_format_VRS_VRDVM(s390_irgen_VESRA, VRS_v1(ovl),
+                                                 VRS_b2(ovl), VRS_d2(ovl),
+                                                 VRS_v3(ovl), VRS_m4(ovl),
+                                                 VRS_rxb(ovl));  goto ok;
+   case 0xe7000000003eULL: s390_format_VRS_VRDV(s390_irgen_VSTM, VRS_v1(ovl),
+                                                VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl),
+                                                VRS_rxb(ovl));  goto ok;
+   case 0xe7000000003fULL: s390_format_VRS_VRRD(s390_irgen_VSTL, VRS_v1(ovl),
+                                                VRS_b2(ovl), VRS_d2(ovl), VRS_v3(ovl),
+                                                VRS_rxb(ovl));  goto ok;
+   case 0xe70000000040ULL: s390_format_VRI_VIM(s390_irgen_VLEIB, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000041ULL: s390_format_VRI_VIM(s390_irgen_VLEIH, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000042ULL: s390_format_VRI_VIM(s390_irgen_VLEIG, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000043ULL: s390_format_VRI_VIM(s390_irgen_VLEIF, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000044ULL: s390_format_VRI_VIM(s390_irgen_VGBM, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000045ULL: s390_format_VRI_VIM(s390_irgen_VREPI, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe70000000046ULL: s390_format_VRI_VIM(s390_irgen_VGM, VRI_v1(ovl),
+                                               VRI_i2(ovl), VRI_m3(ovl),
+                                               VRI_rxb(ovl));  goto ok;
+   case 0xe7000000004aULL: s390_format_VRI_VVIMM(s390_irgen_VFTCI, VRIe_v1(ovl),
+                                                 VRIe_v2(ovl), VRIe_i3(ovl),
+                                                 VRIe_m4(ovl), VRIe_m5(ovl),
+                                                 VRIe_rxb(ovl));  goto ok;
+   case 0xe7000000004dULL: s390_format_VRI_VVIM(s390_irgen_VREP, VRI_v1(ovl),
+                                                VRI_v3(ovl), VRI_i2(ovl),
+                                                VRI_m3(ovl), VRI_rxb(ovl));  goto ok;
+   case 0xe70000000050ULL: s390_format_VRR_VVM(s390_irgen_VPOPCT, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_m4(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000052ULL: s390_format_VRR_VVM(s390_irgen_VCTZ, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_m4(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000053ULL: s390_format_VRR_VVM(s390_irgen_VCLZ, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_m4(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000056ULL: s390_format_VRR_VV(s390_irgen_VLR, VRR_v1(ovl),
+                                              VRR_v2(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe7000000005cULL: s390_format_VRR_VVMM(s390_irgen_VISTR, VRR_v1(ovl),
+                                                VRR_v2(ovl), VRR_m4(ovl),
+                                                VRR_m5(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe7000000005fULL: s390_format_VRR_VVM(s390_irgen_VSEG, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_m4(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000060ULL: s390_format_VRR_VVVM(s390_irgen_VMRL, VRR_v1(ovl),
+                                                VRR_v2(ovl), VRR_r3(ovl),
+                                                VRR_m4(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe70000000061ULL: s390_format_VRR_VVVM(s390_irgen_VMRH, VRR_v1(ovl),
+                                                VRR_v2(ovl), VRR_r3(ovl),
+                                                VRR_m4(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe70000000062ULL: s390_format_VRR_VRR(s390_irgen_VLVGP, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_r3(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000064ULL: s390_format_VRR_VVVM(s390_irgen_VSUM, VRR_v1(ovl),
+                                                VRR_v2(ovl), VRR_r3(ovl),
+                                                VRR_m4(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe70000000065ULL: s390_format_VRR_VVVM(s390_irgen_VSUMG, VRR_v1(ovl),
+                                                VRR_v2(ovl), VRR_r3(ovl),
+                                                VRR_m4(ovl), VRR_rxb(ovl));  goto ok;
+   case 0xe70000000066ULL: s390_format_VRR_VVV(s390_irgen_VCKSM, VRR_v1(ovl),
+                                               VRR_v2(ovl), VRR_r3(ovl),
+                                               VRR_rxb(ovl));  goto ok;
+   case 0xe70000000067ULL: s390_format_VRR_VVVM(s390_irgen_VSUMQ, VRR_v1(ovl),
+
VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000068ULL: s390_format_VRR_VVV(s390_irgen_VN, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000069ULL: s390_format_VRR_VVV(s390_irgen_VNC, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000006aULL: s390_format_VRR_VVV(s390_irgen_VO, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000006bULL: s390_format_VRR_VVV(s390_irgen_VNO, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000006cULL: /* VNX */ goto unimplemented; + case 0xe7000000006dULL: s390_format_VRR_VVV(s390_irgen_VX, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000006eULL: /* VNN */ goto unimplemented; + case 0xe7000000006fULL: /* VOC */ goto unimplemented; + case 0xe70000000070ULL: s390_format_VRR_VVVM(s390_irgen_VESLV, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000072ULL: s390_format_VRId_VVVIM(s390_irgen_VERIM, VRId_v1(ovl), + VRId_v2(ovl), VRId_v3(ovl), + VRId_i4(ovl), VRId_m5(ovl), + VRId_rxb(ovl)); goto ok; + case 0xe70000000073ULL: s390_format_VRR_VVVM(s390_irgen_VERLLV, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000074ULL: s390_format_VRR_VVV(s390_irgen_VSL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000075ULL: s390_format_VRR_VVV(s390_irgen_VSLB, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000077ULL: s390_format_VRId_VVVI(s390_irgen_VSLDB, VRId_v1(ovl), + VRId_v2(ovl), VRId_v3(ovl), + VRId_i4(ovl), VRId_rxb(ovl)); goto ok; + case 0xe70000000078ULL: s390_format_VRR_VVVM(s390_irgen_VESRLV, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe7000000007aULL: s390_format_VRR_VVVM(s390_irgen_VESRAV, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe7000000007cULL: s390_format_VRR_VVV(s390_irgen_VSRL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000007dULL: s390_format_VRR_VVV(s390_irgen_VSRLB, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000007eULL: s390_format_VRR_VVV(s390_irgen_VSRA, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe7000000007fULL: s390_format_VRR_VVV(s390_irgen_VSRAB, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000080ULL: s390_format_VRR_VVVMM(s390_irgen_VFEE, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000081ULL: s390_format_VRR_VVVMM(s390_irgen_VFENE, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000082ULL: s390_format_VRR_VVVMM(s390_irgen_VFAE, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe70000000084ULL: s390_format_VRR_VVVM(s390_irgen_VPDI, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000085ULL: /* VBPERM */ goto unimplemented; + case 0xe7000000008aULL: s390_format_VRR_VVVVMM(s390_irgen_VSTRC, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_m6(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe7000000008cULL: s390_format_VRR_VVVV(s390_irgen_VPERM, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), 
VRR_rxb(ovl)); goto ok; + case 0xe7000000008dULL: s390_format_VRR_VVVV(s390_irgen_VSEL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe7000000008eULL: s390_format_VRR_VVVVMM(s390_irgen_VFMS, VRRe_v1(ovl), + VRRe_v2(ovl), VRRe_v3(ovl), + VRRe_v4(ovl), VRRe_m5(ovl), + VRRe_m6(ovl), + VRRe_rxb(ovl)); goto ok; + case 0xe7000000008fULL: s390_format_VRR_VVVVMM(s390_irgen_VFMA, VRRe_v1(ovl), + VRRe_v2(ovl), VRRe_v3(ovl), + VRRe_v4(ovl), VRRe_m5(ovl), + VRRe_m6(ovl), + VRRe_rxb(ovl)); goto ok; + case 0xe70000000094ULL: s390_format_VRR_VVVM(s390_irgen_VPK, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000095ULL: s390_format_VRR_VVVMM(s390_irgen_VPKLS, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), VRR_rxb(ovl)); goto ok; + case 0xe70000000097ULL: s390_format_VRR_VVVMM(s390_irgen_VPKS, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), VRR_rxb(ovl)); goto ok; + case 0xe7000000009eULL: /* VFNMS */ goto unimplemented; + case 0xe7000000009fULL: /* VFNMA */ goto unimplemented; + case 0xe700000000a1ULL: s390_format_VRR_VVVM(s390_irgen_VMLH, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a2ULL: s390_format_VRR_VVVM(s390_irgen_VML, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a3ULL: s390_format_VRR_VVVM(s390_irgen_VMH, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a4ULL: s390_format_VRR_VVVM(s390_irgen_VMLE, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a5ULL: s390_format_VRR_VVVM(s390_irgen_VMLO, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a6ULL: s390_format_VRR_VVVM(s390_irgen_VME, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a7ULL: s390_format_VRR_VVVM(s390_irgen_VMO, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000a9ULL: s390_format_VRRd_VVVVM(s390_irgen_VMALH, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000aaULL: s390_format_VRRd_VVVVM(s390_irgen_VMAL, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000abULL: s390_format_VRRd_VVVVM(s390_irgen_VMAH, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000acULL: s390_format_VRRd_VVVVM(s390_irgen_VMALE, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000adULL: s390_format_VRRd_VVVVM(s390_irgen_VMALO, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000aeULL: s390_format_VRRd_VVVVM(s390_irgen_VMAE, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000afULL: s390_format_VRRd_VVVVM(s390_irgen_VMAO, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000b4ULL: s390_format_VRR_VVVM(s390_irgen_VGFM, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000b8ULL: /* VMSL */ goto unimplemented; + case 0xe700000000b9ULL: s390_format_VRRd_VVVVM(s390_irgen_VACCC, 
VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000bbULL: s390_format_VRRd_VVVVM(s390_irgen_VAC, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000bcULL: s390_format_VRRd_VVVVM(s390_irgen_VGFMA, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000bdULL: s390_format_VRRd_VVVVM(s390_irgen_VSBCBI, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000bfULL: s390_format_VRRd_VVVVM(s390_irgen_VSBI, VRRd_v1(ovl), + VRRd_v2(ovl), VRRd_v3(ovl), + VRRd_v4(ovl), VRRd_m5(ovl), + VRRd_rxb(ovl)); goto ok; + case 0xe700000000c0ULL: s390_format_VRRa_VVMMM(s390_irgen_VCLGD, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c1ULL: s390_format_VRRa_VVMMM(s390_irgen_VCDLG, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c2ULL: s390_format_VRRa_VVMMM(s390_irgen_VCGD, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c3ULL: s390_format_VRRa_VVMMM(s390_irgen_VCDG, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c4ULL: s390_format_VRRa_VVMMM(s390_irgen_VLDE, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c5ULL: s390_format_VRRa_VVMMM(s390_irgen_VLED, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000c7ULL: s390_format_VRRa_VVMMM(s390_irgen_VFI, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000caULL: s390_format_VRRa_VVMM(s390_irgen_WFK, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000cbULL: s390_format_VRRa_VVMM(s390_irgen_WFC, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000ccULL: s390_format_VRRa_VVMMM(s390_irgen_VFPSO, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000ceULL: s390_format_VRRa_VVMM(s390_irgen_VFSQ, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_m3(ovl), + VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000d4ULL: s390_format_VRR_VVM(s390_irgen_VUPLL, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000d5ULL: s390_format_VRR_VVM(s390_irgen_VUPLH, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000d6ULL: s390_format_VRR_VVM(s390_irgen_VUPL, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000d7ULL: s390_format_VRR_VVM(s390_irgen_VUPH, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000d8ULL: s390_format_VRR_VV(s390_irgen_VTM, VRR_v1(ovl), + VRR_v2(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000d9ULL: s390_format_VRR_VVM(s390_irgen_VECL, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000dbULL: s390_format_VRR_VVM(s390_irgen_VEC, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000deULL: s390_format_VRR_VVM(s390_irgen_VLC, VRR_v1(ovl), + VRR_v2(ovl), 
VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000dfULL: s390_format_VRR_VVM(s390_irgen_VLP, VRR_v1(ovl), + VRR_v2(ovl), VRR_m4(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000e2ULL: s390_format_VRRa_VVVMM(s390_irgen_VFS, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000e3ULL: s390_format_VRRa_VVVMM(s390_irgen_VFA, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000e5ULL: s390_format_VRRa_VVVMM(s390_irgen_VFD, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000e7ULL: s390_format_VRRa_VVVMM(s390_irgen_VFM, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000e8ULL: s390_format_VRRa_VVVMMM(s390_irgen_VFCE, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000eaULL: s390_format_VRRa_VVVMMM(s390_irgen_VFCHE, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000ebULL: s390_format_VRRa_VVVMMM(s390_irgen_VFCH, VRRa_v1(ovl), + VRRa_v2(ovl), VRRa_v3(ovl), + VRRa_m3(ovl), VRRa_m4(ovl), + VRRa_m5(ovl), + VRRa_rxb(ovl)); goto ok; + case 0xe700000000eeULL: /* VFMIN */ goto unimplemented; + case 0xe700000000efULL: /* VFMAX */ goto unimplemented; + case 0xe700000000f0ULL: s390_format_VRR_VVVM(s390_irgen_VAVGL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f1ULL: s390_format_VRR_VVVM(s390_irgen_VACC, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f2ULL: s390_format_VRR_VVVM(s390_irgen_VAVG, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f3ULL: s390_format_VRR_VVVM(s390_irgen_VA, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f5ULL: s390_format_VRR_VVVM(s390_irgen_VSCBI, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f7ULL: s390_format_VRR_VVVM(s390_irgen_VS, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000f8ULL: s390_format_VRR_VVVMM(s390_irgen_VCEQ, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000f9ULL: s390_format_VRR_VVVMM(s390_irgen_VCHL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000fbULL: s390_format_VRR_VVVMM(s390_irgen_VCH, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_m5(ovl), + VRR_rxb(ovl)); goto ok; + case 0xe700000000fcULL: s390_format_VRR_VVVM(s390_irgen_VMNL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000fdULL: s390_format_VRR_VVVM(s390_irgen_VMXL, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000feULL: s390_format_VRR_VVVM(s390_irgen_VMN, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; + case 0xe700000000ffULL: s390_format_VRR_VVVM(s390_irgen_VMX, VRR_v1(ovl), + VRR_v2(ovl), VRR_r3(ovl), + VRR_m4(ovl), VRR_rxb(ovl)); goto ok; case 0xeb0000000004ULL: s390_format_RSY_RRRD(s390_irgen_LMG, RSY_r1(ovl), RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), @@ -15963,7 +20856,10 @@ 
s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), RSY_dh2(ovl)); goto ok; - case 0xeb0000000023ULL: /* CLT */ goto unimplemented; + case 0xeb0000000023ULL: s390_format_RSY_RURD(s390_irgen_CLT, RSY_r1(ovl), + RSY_r3(ovl), RSY_b2(ovl), + RSY_dl2(ovl), + RSY_dh2(ovl)); goto ok; case 0xeb0000000024ULL: s390_format_RSY_RRRD(s390_irgen_STMG, RSY_r1(ovl), RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), @@ -15973,7 +20869,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), RSY_dh2(ovl)); goto ok; - case 0xeb000000002bULL: /* CLGT */ goto unimplemented; + case 0xeb000000002bULL: s390_format_RSY_RURD(s390_irgen_CLGT, RSY_r1(ovl), + RSY_r3(ovl), RSY_b2(ovl), + RSY_dl2(ovl), + RSY_dh2(ovl)); goto ok; case 0xeb000000002cULL: s390_format_RSY_RURD(s390_irgen_STCMH, RSY_r1(ovl), RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), @@ -16005,7 +20904,7 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RSY_dh2(ovl)); goto ok; case 0xeb000000004cULL: s390_format_RSY_RRRD(s390_irgen_ECAG, RSY_r1(ovl), RSY_r3(ovl), RSY_b2(ovl), - RSY_dl2(ovl), + RSY_dl2(ovl), RSY_dh2(ovl)); goto ok; case 0xeb0000000051ULL: s390_format_SIY_URD(s390_irgen_TMY, SIY_i2(ovl), SIY_b1(ovl), SIY_dl1(ovl), @@ -16084,6 +20983,16 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), RSY_dh2(ovl)); goto ok; + case 0xeb00000000e0ULL: s390_format_RSY_RDRM(s390_irgen_LOCFH, RSY_r1(ovl), + RSY_r3(ovl), RSY_b2(ovl), + RSY_dl2(ovl), + RSY_dh2(ovl), + S390_XMNM_LOCFH); goto ok; + case 0xeb00000000e1ULL: s390_format_RSY_RDRM(s390_irgen_STOCFH, RSY_r1(ovl), + RSY_r3(ovl), RSY_b2(ovl), + RSY_dl2(ovl), + RSY_dh2(ovl), + S390_XMNM_STOCFH); goto ok; case 0xeb00000000e2ULL: s390_format_RSY_RDRM(s390_irgen_LOCG, RSY_r1(ovl), RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), @@ -16144,12 +21053,30 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RSY_r3(ovl), RSY_b2(ovl), RSY_dl2(ovl), RSY_dh2(ovl)); goto ok; + case 0xec0000000042ULL: s390_format_RIE_RUPIX(s390_irgen_LOCHI, + RIEv3_r1(ovl), + RIEv3_m3(ovl), + RIEv3_i4(ovl), + RIEv3_i2(ovl), + S390_XMNM_LOCHI); goto ok; case 0xec0000000044ULL: s390_format_RIE_RRP(s390_irgen_BRXHG, RIE_r1(ovl), RIE_r3(ovl), RIE_i2(ovl)); goto ok; case 0xec0000000045ULL: s390_format_RIE_RRP(s390_irgen_BRXLG, RIE_r1(ovl), RIE_r3(ovl), RIE_i2(ovl)); goto ok; + case 0xec0000000046ULL: s390_format_RIE_RUPIX(s390_irgen_LOCGHI, + RIEv3_r1(ovl), + RIEv3_m3(ovl), + RIEv3_i4(ovl), + RIEv3_i2(ovl), + S390_XMNM_LOCGHI); goto ok; + case 0xec000000004eULL: s390_format_RIE_RUPIX(s390_irgen_LOCHHI, + RIEv3_r1(ovl), + RIEv3_m3(ovl), + RIEv3_i4(ovl), + RIEv3_i2(ovl), + S390_XMNM_LOCHHI); goto ok; case 0xec0000000051ULL: s390_format_RIE_RRUUU(s390_irgen_RISBLG, RIE_RRUUU_r1(ovl), RIE_RRUUU_r2(ovl), @@ -16209,10 +21136,22 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) RIE_RRPU_r2(ovl), RIE_RRPU_i4(ovl), RIE_RRPU_m3(ovl)); goto ok; - case 0xec0000000070ULL: /* CGIT */ goto unimplemented; - case 0xec0000000071ULL: /* CLGIT */ goto unimplemented; - case 0xec0000000072ULL: /* CIT */ goto unimplemented; - case 0xec0000000073ULL: /* CLFIT */ goto unimplemented; + case 0xec0000000070ULL: s390_format_RIEv1(s390_irgen_CGIT, + RIEv1_r1(ovl), + RIEv1_i2(ovl), + RIEv1_m3(ovl)); goto ok; + case 0xec0000000071ULL: s390_format_RIEv1(s390_irgen_CLGIT, + RIEv1_r1(ovl), + RIEv1_i2(ovl), + RIEv1_m3(ovl)); goto ok; + case 
0xec0000000072ULL: s390_format_RIEv1(s390_irgen_CIT, + RIEv1_r1(ovl), + RIEv1_i2(ovl), + RIEv1_m3(ovl)); goto ok; + case 0xec0000000073ULL: s390_format_RIEv1(s390_irgen_CLFIT, + RIEv1_r1(ovl), + RIEv1_i2(ovl), + RIEv1_m3(ovl)); goto ok; case 0xec0000000076ULL: s390_format_RIE_RRPU(s390_irgen_CRJ, RIE_RRPU_r1(ovl), RIE_RRPU_r2(ovl), @@ -16434,6 +21373,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xed00000000a9ULL: /* CZXT */ goto unimplemented; case 0xed00000000aaULL: /* CDZT */ goto unimplemented; case 0xed00000000abULL: /* CXZT */ goto unimplemented; + case 0xed00000000acULL: /* CPDT */ goto unimplemented; + case 0xed00000000adULL: /* CPXT */ goto unimplemented; + case 0xed00000000aeULL: /* CDPT */ goto unimplemented; + case 0xed00000000afULL: /* CXPT */ goto unimplemented; } switch (((ovl >> 16) & 0xff0f00000000ULL) >> 32) { @@ -16512,7 +21455,7 @@ s390_decode_6byte_and_irgen(const UChar *bytes, VexEndness host_endness) case 0xc40fULL: s390_format_RIL_RP(s390_irgen_STRL, RIL_r1(ovl), RIL_i2(ovl)); goto ok; case 0xc600ULL: exrl_bytes = bytes; - s390_format_RIL_RP(host_endness == VexEndnessBE ? s390_irgen_EXRL_BE : s390_irgen_EXRL_LE, RIL_r1(ovl), + s390_format_RIL_RP(s390_irgen_EXRL, RIL_r1(ovl), RIL_i2(ovl)); goto ok; case 0xc602ULL: s390_format_RIL_UP(s390_irgen_PFDRL, RIL_r1(ovl), RIL_i2(ovl)); goto ok; @@ -16700,7 +21643,7 @@ s390_decode_special_and_irgen(const UChar *bytes) /* Function returns # bytes that were decoded or 0 in case of failure */ static UInt -s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres, VexEndness host_endness) +s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres) { s390_decode_t status; @@ -16726,15 +21669,15 @@ s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres, Vex /* Handle normal instructions. */ switch (insn_length) { case 2: - status = s390_decode_2byte_and_irgen(bytes, host_endness); + status = s390_decode_2byte_and_irgen(bytes); break; case 4: - status = s390_decode_4byte_and_irgen(bytes, host_endness); + status = s390_decode_4byte_and_irgen(bytes); break; case 6: - status = s390_decode_6byte_and_irgen(bytes, host_endness); + status = s390_decode_6byte_and_irgen(bytes); break; default: @@ -16749,7 +21692,13 @@ s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres, Vex dis_res->jk_StopHere = Ijk_Boring; } - if (status == S390_DECODE_OK) return insn_length; /* OK */ + if (status == S390_DECODE_OK) { + /* Adjust status if a specification exception was indicated. */ + if (is_specification_exception()) + status = S390_DECODE_SPECIFICATION_EXCEPTION; + else + return insn_length; /* OK */ + } /* Decoding failed somehow */ if (sigill_diag) { @@ -16767,6 +21716,10 @@ s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres, Vex vex_printf("unimplemented special insn: "); break; + case S390_DECODE_SPECIFICATION_EXCEPTION: + vex_printf("specification exception: "); + break; + case S390_DECODE_ERROR: vex_printf("decoding error: "); break; @@ -16791,7 +21744,7 @@ s390_decode_and_irgen(const UChar *bytes, UInt insn_length, DisResult *dres, Vex /* Disassemble a single instruction INSN into IR. 
*/ static DisResult -disInstr_S390_WRK(const UChar *insn, VexEndness host_endness) +disInstr_S390_WRK(const UChar *insn) { UChar byte; UInt insn_length; @@ -16817,11 +21770,12 @@ disInstr_S390_WRK(const UChar *insn, VexEndness host_endness) dres.len = insn_length; dres.continueAt = 0; dres.jk_StopHere = Ijk_INVALID; + dres.hint = Dis_HintNone; /* fixs390: consider chasing of conditional jumps */ /* Normal and special instruction handling starts here. */ - if (s390_decode_and_irgen(insn, insn_length, &dres, host_endness) == 0) { + if (s390_decode_and_irgen(insn, insn_length, &dres) == 0) { /* All decode failures end up here. The decoder has already issued an error message. Tell the dispatcher that this insn cannot be decoded, and so has @@ -16893,7 +21847,7 @@ disInstr_S390(IRSB *irsb_IN, resteer_data = callback_opaque; sigill_diag = sigill_diag_IN; - return disInstr_S390_WRK(guest_code + delta, host_endness); + return disInstr_S390_WRK(guest_code + delta); } /*---------------------------------------------------------------*/ diff --git a/priv/host_s390_defs.c b/priv/host_s390_defs.c index 9885d474f..22cdd0425 100644 --- a/priv/host_s390_defs.c +++ b/priv/host_s390_defs.c @@ -8,8 +8,8 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 - Copyright (C) 2012-2015 Florian Krohm (britzel@acm.org) + Copyright IBM Corp. 2010-2017 + Copyright (C) 2012-2017 Florian Krohm (britzel@acm.org) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -48,7 +48,6 @@ /*--- Forward declarations ---*/ /*------------------------------------------------------------*/ -static Bool s390_insn_is_reg_reg_move(const s390_insn *, HReg *src, HReg *dst); static void s390_insn_map_regs(HRegRemap *, s390_insn *); static void s390_insn_get_reg_usage(HRegUsage *u, const s390_insn *); static UInt s390_tchain_load64_len(void); @@ -60,7 +59,7 @@ static UInt s390_tchain_load64_len(void); /* A mapping from register number to register index */ static Int gpr_index[16]; // GPR regno -> register index -static Int fpr_index[16]; // FPR regno -> register index +static Int vr_index[32]; // VR regno -> register index HReg s390_hreg_gpr(UInt regno) @@ -73,11 +72,19 @@ s390_hreg_gpr(UInt regno) HReg s390_hreg_fpr(UInt regno) { - Int ix = fpr_index[regno]; + Int ix = vr_index[regno]; vassert(ix >= 0); return mkHReg(/*virtual*/False, HRcFlt64, regno, ix); } +HReg +s390_hreg_vr(UInt regno) +{ + Int ix = vr_index[regno]; + vassert(ix >= 0); + return mkHReg(/*virtual*/False, HRcVec128, regno, ix); +} + static __inline__ UInt hregNumber(HReg reg) { @@ -100,6 +107,13 @@ s390_hreg_as_string(HReg reg) "%f8", "%f9", "%f10", "%f11", "%f12", "%f13", "%f14", "%f15" }; + static const HChar vreg_names[32][5] = { + "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7", + "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15", + "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23", + "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31" + }; + UInt r; /* hregNumber() returns an UInt */ r = hregNumber(reg); @@ -110,18 +124,18 @@ s390_hreg_as_string(HReg reg) switch (hregClass(reg)) { case HRcInt64: vex_sprintf(buf, "%%vR%u", r); break; case HRcFlt64: vex_sprintf(buf, "%%vF%u", r); break; + case HRcVec128: vex_sprintf(buf, "%%vV%u", r); break; default: goto fail; } return buf; } /* But specific for real regs. 
*/ - vassert(r < 16); - switch (hregClass(reg)) { - case HRcInt64: return ireg_names[r]; - case HRcFlt64: return freg_names[r]; - default: goto fail; + case HRcInt64: vassert(r < 16); return ireg_names[r]; + case HRcFlt64: vassert(r < 16); return freg_names[r]; + case HRcVec128: vassert(r < 32); return vreg_names[r]; + default: goto fail; } fail: vpanic("s390_hreg_as_string"); @@ -135,6 +149,13 @@ s390_hreg_guest_state_pointer(void) return s390_hreg_gpr(S390_REGNO_GUEST_STATE_POINTER); } +/* Return the real register that holds the stack pointer */ +HReg +s390_hreg_stack_pointer(void) +{ + return s390_hreg_gpr(S390_REGNO_STACK_POINTER); +} + /* Is VALUE within the domain of a 20-bit signed integer. */ static __inline__ Bool @@ -246,6 +267,24 @@ s390_amode_for_guest_state(Int offset) } +/* Construct an AMODE for accessing stack pointer at OFFSET. + OFFSET can be at most 3 * sizeof(VexGuestS390XState) + LibVEX_N_SPILL_BYTES + which may be too large for a B12 addressing mode. + Use a B20 amode as a fallback which will be safe for any offset. +*/ +s390_amode * +s390_amode_for_stack_pointer(Int offset) +{ + if (fits_unsigned_12bit(offset)) + return s390_amode_b12(offset, s390_hreg_stack_pointer()); + + if (fits_signed_20bit(offset)) + return s390_amode_b20(offset, s390_hreg_stack_pointer()); + + vpanic("invalid stack pointer offset"); +} + + /* Decompile the given amode into a static buffer and return it. */ const HChar * s390_amode_as_string(const s390_amode *am) @@ -285,6 +324,39 @@ is_virtual_gpr(HReg reg) return hregIsVirtual(reg) && hregClass(reg) == HRcInt64; } +/* Helper function for all vector operations */ +static UChar +s390_getM_from_size(const UChar size) { + switch(size) { + case 1: + return 0; + case 2: + return 1; + case 4: + return 2; + case 8: + return 3; + case 16: + return 4; + default: + vex_printf("size=%d\n", size); + vpanic("s390_getM_from_size: unknown size"); + } +} + +/* Helper for generating RXB field in vector instructions */ +static UChar +s390_update_rxb(const UChar rxb, const UChar index, UChar* vr) { + vassert((index >= 1) && (index <= 4)); + UChar result = rxb; + if(vr != NULL) { + if(*vr >= 16) { + result |= 1 << (4 - index); + *vr -= 16; + } + } + return result; +} /* Sanity check for an amode */ Bool @@ -366,10 +438,10 @@ ppS390Instr(const s390_insn *insn, Bool mode64) vex_printf("%s", s390_insn_as_string(insn)); } -void +UInt ppHRegS390(HReg reg) { - vex_printf("%s", s390_hreg_as_string(reg)); + return vex_printf("%s", s390_hreg_as_string(reg)); } /*------------------------------------------------------------*/ @@ -390,27 +462,50 @@ getRRegUniverse_S390(void) RRegUniverse__init(ru); - /* Assign invalid values to the gpr/fpr_index */ + /* Assign invalid values to the gpr/vr_index */ for (UInt i = 0; i < sizeof gpr_index / sizeof gpr_index[0]; ++i) gpr_index[i] = -1; - for (UInt i = 0; i < sizeof fpr_index / sizeof fpr_index[0]; ++i) - fpr_index[i] = -1; + for (UInt i = 0; i < sizeof vr_index / sizeof vr_index[0]; ++i) + vr_index[i] = -1; + /* Add the registers that are available to the register allocator. 
- GPRs: registers 1..11 are available - FPRs: registers 0..15 are available + GPRs: registers 6..11 are callee saved, list them first + registers 1..5 are caller saved, list them after + FPRs: registers 8..15 are callee saved, list them first + registers 0..7 are caller saved, list them after FPR12 - FPR15 are also used as register pairs for 128-bit floating point operations + VRs: registers 0..31 are available */ - UInt regno; - for (regno = 1; regno <= 11; ++regno) { + ru->allocable_start[HRcInt64] = ru->size; + for (UInt regno = 6; regno <= 11; ++regno) { + gpr_index[regno] = ru->size; + ru->regs[ru->size++] = s390_hreg_gpr(regno); + } + for (UInt regno = 1; regno <= 5; ++regno) { gpr_index[regno] = ru->size; ru->regs[ru->size++] = s390_hreg_gpr(regno); } - for (regno = 0; regno <= 15; ++regno) { - fpr_index[regno] = ru->size; + ru->allocable_end[HRcInt64] = ru->size - 1; + + ru->allocable_start[HRcFlt64] = ru->size; + for (UInt regno = 8; regno <= 15; ++regno) { + vr_index[regno] = ru->size; + ru->regs[ru->size++] = s390_hreg_fpr(regno); + } + for (UInt regno = 0; regno <= 7; ++regno) { + vr_index[regno] = ru->size; ru->regs[ru->size++] = s390_hreg_fpr(regno); } + ru->allocable_end[HRcFlt64] = ru->size - 1; + + ru->allocable_start[HRcVec128] = ru->size; + for (UInt regno = 16; regno <= 31; ++regno) { + vr_index[regno] = ru->size; + ru->regs[ru->size++] = s390_hreg_vr(regno); + } + ru->allocable_end[HRcVec128] = ru->size - 1; ru->allocable = ru->size; /* Add the registers that are not available for allocation. @@ -429,10 +524,12 @@ getRRegUniverse_S390(void) /* Sanity checking */ for (UInt i = 0; i < sizeof gpr_index / sizeof gpr_index[0]; ++i) vassert(gpr_index[i] >= 0); - for (UInt i = 0; i < sizeof fpr_index / sizeof fpr_index[0]; ++i) - vassert(fpr_index[i] >= 0); + for (UInt i = 0; i < sizeof vr_index / sizeof vr_index[0]; ++i) + vassert(vr_index[i] >= 0); initialised = True; + + RRegUniverse__check_is_sane(ru); return ru; } @@ -453,16 +550,6 @@ mapRegs_S390Instr(HRegRemap *m, s390_insn *insn, Bool mode64) } -/* Figure out if the given insn represents a reg-reg move, and if so - assign the source and destination to *src and *dst. If in doubt say No. - Used by the register allocator to do move coalescing. */ -Bool -isMove_S390Instr(const s390_insn *insn, HReg *src, HReg *dst) -{ - return s390_insn_is_reg_reg_move(insn, src, dst); -} - - /* Generate s390 spill/reload instructions under the direction of the register allocator. Note it's critical these don't write the condition codes. 
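   (With the new HRcVec128 class a spill becomes a 16-byte store and a
   reload a 16-byte load. A sketch of what the allocator triggers, with
   vreg and offsetB standing for hypothetical allocator-chosen values:
 
      HInstr *i1 = NULL, *i2 = NULL;
      genSpill_S390(&i1, &i2, vreg, offsetB, True);
 
   leaves i1 holding s390_insn_store(16, am, vreg), which is later emitted
   as VST.)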
This is like an Ist_Put */ @@ -483,7 +570,9 @@ genSpill_S390(HInstr **i1, HInstr **i2, HReg rreg, Int offsetB, Bool mode64) case HRcFlt64: *i1 = s390_insn_store(8, am, rreg); return; - + case HRcVec128: + *i1 = s390_insn_store(16, am, rreg); + return; default: ppHRegClass(hregClass(rreg)); vpanic("genSpill_S390: unimplemented regclass"); @@ -509,13 +598,28 @@ genReload_S390(HInstr **i1, HInstr **i2, HReg rreg, Int offsetB, Bool mode64) case HRcFlt64: *i1 = s390_insn_load(8, rreg, am); return; - + case HRcVec128: + *i1 = s390_insn_load(16, rreg, am); + return; default: ppHRegClass(hregClass(rreg)); vpanic("genReload_S390: unimplemented regclass"); } } +s390_insn* genMove_S390(HReg from, HReg to, Bool mode64) +{ + switch (hregClass(from)) { + case HRcInt64: + return s390_insn_move(sizeofIRType(Ity_I64), to, from); + case HRcVec128: + return s390_insn_move(sizeofIRType(Ity_V128), to, from); + default: + ppHRegClass(hregClass(from)); + vpanic("genMove_S390: unimplemented regclass"); + } +} + /* Helper function for s390_insn_get_reg_usage */ static void s390_opnd_RMI_get_reg_usage(HRegUsage *u, s390_opnd_RMI op) @@ -562,6 +666,12 @@ s390_insn_get_reg_usage(HRegUsage *u, const s390_insn *insn) case S390_INSN_MOVE: addHRegUse(u, HRmRead, insn->variant.move.src); addHRegUse(u, HRmWrite, insn->variant.move.dst); + + if (hregClass(insn->variant.move.src) == hregClass(insn->variant.move.dst)) { + u->isRegRegMove = True; + u->regMoveSrc = insn->variant.move.src; + u->regMoveDst = insn->variant.move.dst; + } break; case S390_INSN_MEMCPY: @@ -839,6 +949,32 @@ s390_insn_get_reg_usage(HRegUsage *u, const s390_insn *insn) s390_amode_get_reg_usage(u, insn->variant.xassisted.guest_IA); break; + case S390_INSN_VEC_AMODEOP: + addHRegUse(u, HRmWrite, insn->variant.vec_amodeop.dst); + addHRegUse(u, HRmRead, insn->variant.vec_amodeop.op1); + s390_amode_get_reg_usage(u, insn->variant.vec_amodeop.op2); + break; + + case S390_INSN_VEC_AMODEINTOP: + addHRegUse(u, HRmRead, insn->variant.vec_amodeintop.dst); + addHRegUse(u, HRmWrite, insn->variant.vec_amodeintop.dst); + s390_amode_get_reg_usage(u, insn->variant.vec_amodeintop.op2); + addHRegUse(u, HRmRead, insn->variant.vec_amodeintop.op3); + break; + + case S390_INSN_VEC_BINOP: + addHRegUse(u, HRmWrite, insn->variant.vec_binop.dst); + addHRegUse(u, HRmRead, insn->variant.vec_binop.op1); + addHRegUse(u, HRmRead, insn->variant.vec_binop.op2); + break; + + case S390_INSN_VEC_TRIOP: + addHRegUse(u, HRmWrite, insn->variant.vec_triop.dst); + addHRegUse(u, HRmRead, insn->variant.vec_triop.op1); + addHRegUse(u, HRmRead, insn->variant.vec_triop.op2); + addHRegUse(u, HRmRead, insn->variant.vec_triop.op3); + break; + default: vpanic("s390_insn_get_reg_usage"); } @@ -1187,26 +1323,44 @@ s390_insn_map_regs(HRegRemap *m, s390_insn *insn) lookupHRegRemap(m, insn->variant.xassisted.dst); break; - default: - vpanic("s390_insn_map_regs"); - } -} + case S390_INSN_VEC_AMODEOP: + insn->variant.vec_amodeop.dst = + lookupHRegRemap(m, insn->variant.vec_amodeop.dst); + insn->variant.vec_amodeop.op1 = + lookupHRegRemap(m, insn->variant.vec_amodeop.op1); + s390_amode_map_regs(m, insn->variant.vec_amodeop.op2); + break; + case S390_INSN_VEC_AMODEINTOP: + insn->variant.vec_amodeintop.dst = + lookupHRegRemap(m, insn->variant.vec_amodeintop.dst); + s390_amode_map_regs(m, insn->variant.vec_amodeintop.op2); + insn->variant.vec_amodeintop.op3 = + lookupHRegRemap(m, insn->variant.vec_amodeintop.op3); + break; -/* Return True, if INSN is a move between two registers of the same class. 
- In that case assign the source and destination registers to SRC and DST, - respectively. */ -static Bool -s390_insn_is_reg_reg_move(const s390_insn *insn, HReg *src, HReg *dst) -{ - if (insn->tag == S390_INSN_MOVE && - hregClass(insn->variant.move.src) == hregClass(insn->variant.move.dst)) { - *src = insn->variant.move.src; - *dst = insn->variant.move.dst; - return True; - } + case S390_INSN_VEC_BINOP: + insn->variant.vec_binop.dst = + lookupHRegRemap(m, insn->variant.vec_binop.dst); + insn->variant.vec_binop.op1 = + lookupHRegRemap(m, insn->variant.vec_binop.op1); + insn->variant.vec_binop.op2 = + lookupHRegRemap(m, insn->variant.vec_binop.op2); + break; - return False; + case S390_INSN_VEC_TRIOP: + insn->variant.vec_triop.dst = + lookupHRegRemap(m, insn->variant.vec_triop.dst); + insn->variant.vec_triop.op1 = + lookupHRegRemap(m, insn->variant.vec_triop.op1); + insn->variant.vec_triop.op2 = + lookupHRegRemap(m, insn->variant.vec_triop.op2); + insn->variant.vec_triop.op3 = + lookupHRegRemap(m, insn->variant.vec_triop.op3); + break; + default: + vpanic("s390_insn_map_regs"); + } } @@ -1494,6 +1648,199 @@ emit_SSa(UChar *p, ULong op, UChar l, UChar b1, UShort d1, UChar b2, UShort d2) } +static UChar * +emit_VRI_VI(UChar *p, ULong op, UChar v1, UShort i2) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)i2) << 16; + the_insn |= ((ULong)rxb)<< 8; + + return emit_6bytes(p, the_insn); +} + + +static UChar * +emit_VRX(UChar *p, ULong op, UChar v1, UChar x2, UChar b2, UShort d2) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)x2) << 32; + the_insn |= ((ULong)b2) << 28; + the_insn |= ((ULong)d2) << 16; + the_insn |= ((ULong)rxb)<< 8; + + return emit_6bytes(p, the_insn); +} + + +static UChar * +emit_VRS(UChar *p, ULong op, UChar reg1, UChar b2, UShort d2, UChar reg3, UChar m4) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, ®1); + rxb = s390_update_rxb(rxb, 2, ®3); + + the_insn |= ((ULong)reg1) << 36; + the_insn |= ((ULong)reg3) << 32; + the_insn |= ((ULong)b2) << 28; + the_insn |= ((ULong)d2) << 16; + the_insn |= ((ULong)m4) << 12; + the_insn |= ((ULong)rxb) << 8; + + return emit_6bytes(p, the_insn); +} + + +static UChar * +emit_VRR_VVM(UChar *p, ULong op, UChar v1, UChar v2, UChar m4) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + rxb = s390_update_rxb(rxb, 2, &v2); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)v2) << 32; + the_insn |= ((ULong)m4) << 12; + the_insn |= ((ULong)rxb)<< 8; + + return emit_6bytes(p, the_insn); +} + +static UChar * +emit_VRR_VVMMM(UChar *p, ULong op, UChar v1, UChar v2, UChar m3, UChar m4, + UChar m5) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + rxb = s390_update_rxb(rxb, 2, &v2); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)v2) << 32; + the_insn |= ((ULong)m5) << 20; + the_insn |= ((ULong)m4) << 16; + the_insn |= ((ULong)m3) << 12; + the_insn |= ((ULong)rxb) << 8; + + return emit_6bytes(p, the_insn); +} + +static UChar * +emit_VRR_VVVM(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar m4) +{ + ULong the_insn = op; + ULong rxb = s390_update_rxb(0, 1, &v1); + rxb = s390_update_rxb(rxb, 2, &v2); + rxb = s390_update_rxb(rxb, 3, &v3); + + the_insn |= ((ULong)v1) << 36; + the_insn |= ((ULong)v2) << 32; + the_insn |= ((ULong)v3) << 28; + the_insn |= ((ULong)m4) << 12; + the_insn |= ((ULong)rxb)<< 8; + + return emit_6bytes(p, 
the_insn);
+}
+
+
+static UChar *
+emit_VRR_VVV(UChar *p, ULong op, UChar v1, UChar v2, UChar v3)
+{
+   return emit_VRR_VVVM(p, op, v1, v2, v3, 0);
+}
+
+
+static UChar *
+emit_VRR_VV(UChar *p, ULong op, UChar v1, UChar v2)
+{
+   return emit_VRR_VVM(p, op, v1, v2, 0);
+}
+
+
+static UChar *
+emit_VRR_VVVV(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar v4)
+{
+   ULong the_insn = op;
+   ULong rxb = s390_update_rxb(0, 1, &v1);
+   rxb = s390_update_rxb(rxb, 2, &v2);
+   rxb = s390_update_rxb(rxb, 3, &v3);
+   rxb = s390_update_rxb(rxb, 4, &v4);
+
+   the_insn |= ((ULong)v1) << 36;
+   the_insn |= ((ULong)v2) << 32;
+   the_insn |= ((ULong)v3) << 28;
+   the_insn |= ((ULong)v4) << 12;
+   the_insn |= ((ULong)rxb)<< 8;
+
+   return emit_6bytes(p, the_insn);
+}
+
+static UChar *
+emit_VRRe_VVVVMM(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar v4,
+                 UChar m5, UChar m6)
+{
+   ULong the_insn = op;
+   ULong rxb = s390_update_rxb(0, 1, &v1);
+   rxb = s390_update_rxb(rxb, 2, &v2);
+   rxb = s390_update_rxb(rxb, 3, &v3);
+   rxb = s390_update_rxb(rxb, 4, &v4);
+
+   the_insn |= ((ULong)v1) << 36;
+   the_insn |= ((ULong)v2) << 32;
+   the_insn |= ((ULong)v3) << 28;
+   the_insn |= ((ULong)m6) << 24;
+   the_insn |= ((ULong)m5) << 16;
+   the_insn |= ((ULong)v4) << 12;
+   the_insn |= ((ULong)rxb) << 8;
+
+   return emit_6bytes(p, the_insn);
+}
+
+static UChar *
+emit_VRR_VRR(UChar *p, ULong op, UChar v1, UChar r2, UChar r3)
+{
+   ULong the_insn = op;
+   ULong rxb = s390_update_rxb(0, 1, &v1);
+
+   the_insn |= ((ULong)v1) << 36;
+   the_insn |= ((ULong)r2) << 32;
+   the_insn |= ((ULong)r3) << 28;
+   the_insn |= ((ULong)rxb)<< 8;
+
+   return emit_6bytes(p, the_insn);
+}
+
+static UChar *
+emit_VRR_VVVMMM(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar m4,
+                UChar m5, UChar m6)
+{
+   ULong the_insn = op;
+   ULong rxb = s390_update_rxb(0, 1, &v1);
+   rxb = s390_update_rxb(rxb, 2, &v2);
+   rxb = s390_update_rxb(rxb, 3, &v3);
+
+   the_insn |= ((ULong)v1) << 36;
+   the_insn |= ((ULong)v2) << 32;
+   the_insn |= ((ULong)v3) << 28;
+   the_insn |= ((ULong)m6) << 20;
+   the_insn |= ((ULong)m5) << 16;
+   the_insn |= ((ULong)m4) << 12;
+   the_insn |= ((ULong)rxb) << 8;
+
+   return emit_6bytes(p, the_insn);
+}
+
+static UChar*
+emit_VRR_VVVMM(UChar *p, ULong op, UChar v1, UChar v2, UChar v3, UChar m4,
+               UChar m5)
+{
+   return emit_VRR_VVVMMM(p, op, v1, v2, v3, m4, m5, 0);
+}
+
 /*------------------------------------------------------------*/
 /*--- Functions to emit particular instructions            ---*/
 /*------------------------------------------------------------*/
@@ -5256,83 +5603,711 @@ s390_emit_LDGRw(UChar *p, UChar r1, UChar r2)
 }
 
-/*---------------------------------------------------------------*/
-/*--- Constructors for the various s390_insn kinds            ---*/
-/*---------------------------------------------------------------*/
-
-s390_insn *
-s390_insn_load(UChar size, HReg dst, s390_amode *src)
+static UChar *
+s390_emit_VL(UChar *p, UChar v1, UChar x2, UChar b2, UShort d2)
 {
-   s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn));
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
+      s390_disasm(ENC3(MNM, VR, UDXB), "vl", v1, d2, x2, b2);
 
-   insn->tag = S390_INSN_LOAD;
-   insn->size = size;
-   insn->variant.load.src = src;
-   insn->variant.load.dst = dst;
+   return emit_VRX(p, 0xE70000000006ULL, v1, x2, b2, d2);
+}
 
-   vassert(size == 1 || size == 2 || size == 4 || size == 8);
+static UChar *
+s390_emit_VLR(UChar *p, UChar v1, UChar v2)
+{
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
+      s390_disasm(ENC3(MNM, VR, VR), "vlr", v1, v2);
 
-   return insn;
+   return 
emit_VRR_VV(p, 0xE70000000056ULL, v1, v2); } -s390_insn * -s390_insn_store(UChar size, s390_amode *dst, HReg src) +static UChar * +s390_emit_VST(UChar *p, UChar v1, UChar x2, UChar b2, UShort d2) { - s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); - - insn->tag = S390_INSN_STORE; - insn->size = size; - insn->variant.store.src = src; - insn->variant.store.dst = dst; - - vassert(size == 1 || size == 2 || size == 4 || size == 8); + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC3(MNM, VR, UDXB), "vst", v1, d2, x2, b2); - return insn; + return emit_VRX(p, 0xE7000000000eULL, v1, x2, b2, d2); } -s390_insn * -s390_insn_move(UChar size, HReg dst, HReg src) +static UChar * +s390_emit_VLGV(UChar *p, UChar r1, UChar b2, UShort d2, UChar v3, UChar m4) { - s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); - - insn->tag = S390_INSN_MOVE; - insn->size = size; - insn->variant.move.src = src; - insn->variant.move.dst = dst; - - vassert(size == 1 || size == 2 || size == 4 || size == 8); + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, GPR, UDXB, VR, UINT), "vlgv", r1, d2, 0, b2, v3, m4); - return insn; + return emit_VRS(p, 0xE70000000021ULL, r1, b2, d2, v3, m4); } -s390_insn * -s390_insn_memcpy(UChar size, s390_amode *dst, s390_amode *src) +static UChar * +s390_emit_VLVG(UChar *p, UChar v1, UChar b2, UShort d2, UChar r3, UChar m4) { - s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, GPR, UINT), "vlvg", v1, d2, 0, b2, r3, m4); - /* This insn will be mapped to MVC which requires base register - plus 12-bit displacement */ - vassert(src->tag == S390_AMODE_B12); - vassert(dst->tag == S390_AMODE_B12); + return emit_VRS(p, 0xE70000000022ULL, v1, b2, d2, r3, m4); +} - insn->tag = S390_INSN_MEMCPY; - insn->size = size; - insn->variant.memcpy.src = src; - insn->variant.memcpy.dst = dst; - vassert(size == 1 || size == 2 || size == 4 || size == 8); +static UChar * +s390_emit_VPERM(UChar *p, UChar v1, UChar v2, UChar v3, UChar v4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, VR), "vperm", v1, v2, v3, v4); - return insn; + return emit_VRR_VVVV(p, 0xE7000000008cULL, v1, v2, v3, v4); } - -s390_insn * -s390_insn_cond_move(UChar size, s390_cc_t cond, HReg dst, s390_opnd_RMI src) +static UChar * +s390_emit_VO(UChar *p, UChar v1, UChar v2, UChar v3) { - s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vo", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000006aULL, v1, v2, v3); +} + +static UChar * +s390_emit_VX(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vx", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000006dULL, v1, v2, v3); +} + +static UChar * +s390_emit_VN(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vn", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE70000000068ULL, v1, v2, v3); +} + +static UChar* +s390_emit_VCEQ(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vceq", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f8ULL, v1, v2, v3, m4); +} + + +static UChar * +s390_emit_VGBM(UChar *p, UChar v1, UShort i2) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + 
s390_disasm(ENC3(MNM, VR, UINT), "vgbm", v1, i2); + + return emit_VRI_VI(p, 0xE70000000044ULL, v1, i2); +} + + +static UChar * +s390_emit_VPK(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vpk", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000094ULL, v1, v2, v3, m4); +} + + +static UChar * +s390_emit_VPKS(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vpks", v1, v2, v3, m4, 0); + + return emit_VRR_VVVM(p, 0xE70000000097ULL, v1, v2, v3, m4); +} + + +static UChar * +s390_emit_VPKLS(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vpkls", v1, v2, v3, m4, 0); + + return emit_VRR_VVVM(p, 0xE70000000095ULL, v1, v2, v3, m4); +} + + +static UChar * +s390_emit_VREP(UChar *p, UChar v1, UChar v3, UChar m3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), "vrep", v1, v3, 0, m3); + + return emit_VRR_VVM(p, 0xE7000000004DULL, v1, v3, m3); +} + + + +static UChar * +s390_emit_VUPH(UChar *p, UChar v1, UChar v3, UChar m3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vuph", v1, v3, m3); + + return emit_VRR_VVM(p, 0xE700000000D7ULL, v1, v3, m3); +} + + +static UChar * +s390_emit_VUPLH(UChar *p, UChar v1, UChar v3, UChar m3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vuplh", v1, v3, m3); + + return emit_VRR_VVM(p, 0xE700000000D5ULL, v1, v3, m3); +} + + +static UChar* +s390_emit_VMRH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmrh", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000061ULL, v1, v2, v3, m4); +} + + +static UChar* +s390_emit_VMRL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmrl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000060ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VA(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "va", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f3ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VS(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vs", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f7ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VNO(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vno", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000006bULL, v1, v2, v3); +} + +static UChar * +s390_emit_VCH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vch", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fbULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VCHL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vchl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f9ULL, v1, v2, v3, m4); 
+} + +static UChar * +s390_emit_VCLZ(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vclz", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000053ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VCTZ(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vctz", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000052ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VPOPCT(UChar *p, UChar v1, UChar v2, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vpopct", v1, v2, m4); + + return emit_VRR_VVM(p, 0xE70000000050ULL, v1, v2, m4); +} + +static UChar * +s390_emit_VMX(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmx", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000ffULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMXL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmxl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fdULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMN(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmn", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000feULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMNL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmnl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000fcULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VAVG(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vavg", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f2ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VAVGL(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vavgl", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000f0ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VLP(UChar *p, UChar v1, UChar v2, UChar m3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, UINT), "vlp", v1, v2, m3); + + return emit_VRR_VVM(p, 0xE700000000DFULL, v1, v2, m3); +} + +static UChar * +s390_emit_VMH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmh", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a3ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMLH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmlh", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a1ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VML(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vml", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a2ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VME(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + 
s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vme", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a6ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VMLE(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vmle", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE700000000a4ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "veslv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000070ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESRAV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vesrav", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE7000000007aULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESRLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vesrlv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000078ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VESL(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesl", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE70000000030ULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VESRA(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesra", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE7000000003aULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VESRL(UChar *p, UChar v1, UChar b2, UShort d2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, UDXB, VR, UINT), "vesrl", v1, d2, 0, b2, v3, m4); + + return emit_VRS(p, 0xE70000000038ULL, v1, b2, d2, v3, m4); +} + +static UChar * +s390_emit_VERLLV(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "verllv", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000073ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSL(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsl", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE70000000074ULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRL(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrl", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007cULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRA(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsra", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007eULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSLB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vslb", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE70000000075ULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRLB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrlb", v1, v2, v3); + + return 
emit_VRR_VVV(p, 0xE7000000007dULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSRAB(UChar *p, UChar v1, UChar v2, UChar v3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, VR, VR), "vsrab", v1, v2, v3); + + return emit_VRR_VVV(p, 0xE7000000007fULL, v1, v2, v3); +} + +static UChar * +s390_emit_VSUM(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsum", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000064ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSUMG(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsumg", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000065ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VSUMQ(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, VR, UINT), "vsumq", v1, v2, v3, m4); + + return emit_VRR_VVVM(p, 0xE70000000067ULL, v1, v2, v3, m4); +} + +static UChar * +s390_emit_VLVGP(UChar *p, UChar v1, UChar r2, UChar r3) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC4(MNM, VR, GPR, GPR), "vlvgp", v1, r2, r3); + + return emit_VRR_VRR(p, 0xE70000000062ULL, v1, r2, r3); +} + +static UChar * +s390_emit_VFPSO(UChar *p, UChar v1, UChar v2, UChar m3, UChar m4, UChar m5) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, UINT, UINT, UINT), "vfpso", v1, v2, m3, m4, + m5); + + return emit_VRR_VVMMM(p, 0xE700000000CCULL, v1, v2, m3, m4, m5); +} + +static UChar * +s390_emit_VFA(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vfa", v1, v2, v3, m4, m5); + + return emit_VRR_VVVMM(p, 0xE700000000e3ULL, v1, v2, v3, m4, m5); +} + +static UChar * +s390_emit_VFS(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vfs", v1, v2, v3, m4, m5); + + return emit_VRR_VVVMM(p, 0xE700000000e2ULL, v1, v2, v3, m4, m5); +} + +static UChar * +s390_emit_VFM(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vfm", v1, v2, v3, m4, m5); + + return emit_VRR_VVVMM(p, 0xE700000000e7ULL, v1, v2, v3, m4, m5); +} + +static UChar * +s390_emit_VFD(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC6(MNM, VR, VR, VR, UINT, UINT), "vfd", v1, v2, v3, m4, m5); + + return emit_VRR_VVVMM(p, 0xE700000000e5ULL, v1, v2, v3, m4, m5); +} + +static UChar * +s390_emit_VFSQ(UChar *p, UChar v1, UChar v2, UChar m3, UChar m4) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC5(MNM, VR, VR, UINT, UINT), "vfsq", v1, v2, m3, m4); + + return emit_VRR_VVMMM(p, 0xE700000000CEULL, v1, v2, m3, m4, 0); +} + +static UChar * +s390_emit_VFMA(UChar *p, UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar m6) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC7(MNM, VR, VR, VR, VR, UINT, UINT), "vfma", + v1, v2, v3, v4, m5, m6); + + return emit_VRRe_VVVVMM(p, 0xE7000000008fULL, v1, v2, v3, v4, m5, m6); +} + +static UChar * +s390_emit_VFMS(UChar *p, UChar v1, UChar v2, UChar v3, UChar v4, UChar m5, + UChar m6) +{ + if 
(UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC7(MNM, VR, VR, VR, VR, UINT, UINT), "vfms", + v1, v2, v3, v4, m5, m6); + + return emit_VRRe_VVVVMM(p, 0xE7000000008eULL, v1, v2, v3, v4, m5, m6); +} + +static UChar * +s390_emit_VFCE(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, + UChar m6) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC7(MNM, VR, VR, VR, UINT, UINT, UINT), "vfce", + v1, v2, v3, m4, m5, m6); + + return emit_VRR_VVVMMM(p, 0xE700000000e8ULL, v1, v2, v3, m4, m5, m6); +} + +static UChar * +s390_emit_VFCH(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, + UChar m6) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC7(MNM, VR, VR, VR, UINT, UINT, UINT), "vfch", + v1, v2, v3, m4, m5, m6); + + return emit_VRR_VVVMMM(p, 0xE700000000ebULL, v1, v2, v3, m4, m5, m6); +} + +static UChar * +s390_emit_VFCHE(UChar *p, UChar v1, UChar v2, UChar v3, UChar m4, UChar m5, + UChar m6) +{ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) + s390_disasm(ENC7(MNM, VR, VR, VR, UINT, UINT, UINT), "vfche", + v1, v2, v3, m4, m5, m6); + + return emit_VRR_VVVMMM(p, 0xE700000000eaULL, v1, v2, v3, m4, m5, m6); +} + +/*---------------------------------------------------------------*/ +/*--- Constructors for the various s390_insn kinds ---*/ +/*---------------------------------------------------------------*/ + +s390_insn * +s390_insn_load(UChar size, HReg dst, s390_amode *src) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + insn->tag = S390_INSN_LOAD; + insn->size = size; + insn->variant.load.src = src; + insn->variant.load.dst = dst; + + vassert(size == 1 || size == 2 || size == 4 || size == 8 || size == 16); + + return insn; +} + + +s390_insn * +s390_insn_store(UChar size, s390_amode *dst, HReg src) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + insn->tag = S390_INSN_STORE; + insn->size = size; + insn->variant.store.src = src; + insn->variant.store.dst = dst; + + vassert(size == 1 || size == 2 || size == 4 || size == 8 || size == 16); + + return insn; +} + + +s390_insn * +s390_insn_move(UChar size, HReg dst, HReg src) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + insn->tag = S390_INSN_MOVE; + insn->size = size; + insn->variant.move.src = src; + insn->variant.move.dst = dst; + + vassert(size == 1 || size == 2 || size == 4 || size == 8 || size ==16); + + return insn; +} + + +s390_insn * +s390_insn_memcpy(UChar size, s390_amode *dst, s390_amode *src) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + /* This insn will be mapped to MVC which requires base register + plus 12-bit displacement */ + vassert(src->tag == S390_AMODE_B12); + vassert(dst->tag == S390_AMODE_B12); + + insn->tag = S390_INSN_MEMCPY; + insn->size = size; + insn->variant.memcpy.src = src; + insn->variant.memcpy.dst = dst; + + vassert(size == 1 || size == 2 || size == 4 || size == 8); + + return insn; +} + + +s390_insn * +s390_insn_cond_move(UChar size, s390_cc_t cond, HReg dst, s390_opnd_RMI src) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); insn->tag = S390_INSN_COND_MOVE; insn->size = size; @@ -6343,6 +7318,75 @@ s390_insn_profinc(void) } +s390_insn * +s390_insn_vec_amodeop(UChar size, s390_vec_amodeop_t tag, HReg dst, HReg op1, + s390_amode *op2) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + vassert(size == 1 || size == 2 || size == 4 || size == 8); + + insn->tag = S390_INSN_VEC_AMODEOP; + insn->size = size; + insn->variant.vec_amodeop.tag = tag; + 
insn->variant.vec_amodeop.dst = dst; + insn->variant.vec_amodeop.op1 = op1; + insn->variant.vec_amodeop.op2 = op2; + + return insn; +} + +s390_insn *s390_insn_vec_amodeintop(UChar size, s390_vec_amodeintop_t tag, HReg dst, + s390_amode* op2, HReg op3) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + vassert(size == 1 || size == 2 || size == 4 || size == 8); + + insn->tag = S390_INSN_VEC_AMODEINTOP; + insn->size = size; + insn->variant.vec_amodeintop.tag = tag; + insn->variant.vec_amodeintop.dst = dst; + insn->variant.vec_amodeintop.op2 = op2; + insn->variant.vec_amodeintop.op3 = op3; + + return insn; +} + +s390_insn *s390_insn_vec_binop(UChar size, s390_vec_binop_t tag, HReg dst, + HReg op1, HReg op2) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + vassert(size == 1 || size == 2 || size == 4 || size == 8 || size == 16); + + insn->tag = S390_INSN_VEC_BINOP; + insn->size = size; + insn->variant.vec_binop.tag = tag; + insn->variant.vec_binop.dst = dst; + insn->variant.vec_binop.op1 = op1; + insn->variant.vec_binop.op2 = op2; + + return insn; +} + +s390_insn *s390_insn_vec_triop(UChar size, s390_vec_triop_t tag, HReg dst, + HReg op1, HReg op2, HReg op3) +{ + s390_insn *insn = LibVEX_Alloc_inline(sizeof(s390_insn)); + + + insn->tag = S390_INSN_VEC_TRIOP; + insn->size = size; + insn->variant.vec_triop.tag = tag; + insn->variant.vec_triop.dst = dst; + insn->variant.vec_triop.op1 = op1; + insn->variant.vec_triop.op2 = op2; + insn->variant.vec_triop.op3 = op3; + + return insn; +} + /*---------------------------------------------------------------*/ /*--- Debug print ---*/ /*---------------------------------------------------------------*/ @@ -6389,6 +7433,7 @@ s390_jump_kind_as_string(IRJumpKind kind) case Ijk_InvalICache: return "Invalidate"; case Ijk_NoRedir: return "NoRedir"; case Ijk_SigTRAP: return "SigTRAP"; + case Ijk_SigFPE: return "SigFPE"; case Ijk_SigSEGV: return "SigSEGV"; case Ijk_SigBUS: return "SigBUS"; case Ijk_Sys_syscall: return "Sys_syscall"; @@ -6620,6 +7665,34 @@ s390_insn_as_string(const s390_insn *insn) op = "v-neg"; break; + case S390_VEC_FILL: + op = "v-vfill"; + break; + + case S390_VEC_DUPLICATE: + op = "v-vdup"; + break; + + case S390_VEC_UNPACKLOWS: + op = "v-vunpacks"; + break; + + case S390_VEC_UNPACKLOWU: + op = "v-vunpacku"; + break; + + case S390_VEC_FLOAT_NEG: + op = "v-vfloatneg"; + break; + + case S390_VEC_FLOAT_SQRT: + op = "v-vfloatsqrt"; + break; + + case S390_VEC_FLOAT_ABS: + op = "v-vfloatabs"; + break; + default: goto fail; } @@ -6928,6 +8001,95 @@ s390_insn_as_string(const s390_insn *insn) insn->variant.xassisted.dst); return buf; /* avoid printing "size = ..." 
which is meaningless */ + case S390_INSN_VEC_AMODEOP: + switch (insn->variant.vec_amodeop.tag) { + case S390_VEC_GET_ELEM: op = "v-vgetelem"; break; + case S390_VEC_ELEM_SHL_INT: op = "v-veshl"; break; + case S390_VEC_ELEM_SHRA_INT: op = "v-veshra"; break; + case S390_VEC_ELEM_SHRL_INT: op = "v-veshrl"; break; + default: goto fail; + } + s390_sprintf(buf, "%M %R, %R, %A", op, insn->variant.vec_amodeop.dst, + insn->variant.vec_amodeop.op1, + insn->variant.vec_amodeop.op2); + break; + + case S390_INSN_VEC_AMODEINTOP: + switch (insn->variant.vec_amodeintop.tag) { + case S390_VEC_SET_ELEM: op = "v-vsetelem"; break; + default: goto fail; + } + s390_sprintf(buf, "%M %R, %A, %R", op, insn->variant.vec_amodeintop.dst, + insn->variant.vec_amodeintop.op2, + insn->variant.vec_amodeintop.op3); + break; + + case S390_INSN_VEC_BINOP: + switch (insn->variant.vec_binop.tag) { + case S390_VEC_PACK: op = "v-vpack"; break; + case S390_VEC_PACK_SATURS: op = "v-vpacksaturs"; break; + case S390_VEC_PACK_SATURU: op = "v-vpacksaturu"; break; + case S390_VEC_COMPARE_EQUAL: op = "v-vcmpeq"; break; + case S390_VEC_OR: op = "v-vor"; break; + case S390_VEC_XOR: op = "v-vxor"; break; + case S390_VEC_AND: op = "v-vand"; break; + case S390_VEC_MERGEL: op = "v-vmergel"; break; + case S390_VEC_MERGEH: op = "v-vmergeh"; break; + case S390_VEC_NOR: op = "v-vnor"; break; + case S390_VEC_INT_ADD: op = "v-vintadd"; break; + case S390_VEC_INT_SUB: op = "v-vintsub"; break; + case S390_VEC_MAXU: op = "v-vmaxu"; break; + case S390_VEC_MAXS: op = "v-vmaxs"; break; + case S390_VEC_MINU: op = "v-vminu"; break; + case S390_VEC_MINS: op = "v-vmins"; break; + case S390_VEC_AVGU: op = "v-vavgu"; break; + case S390_VEC_AVGS: op = "v-vavgs"; break; + case S390_VEC_COMPARE_GREATERS: op = "v-vcmpgts"; break; + case S390_VEC_COMPARE_GREATERU: op = "v-vcmpgtu"; break; + case S390_VEC_INT_MUL_HIGHS: op = "v-vintmulhis"; break; + case S390_VEC_INT_MUL_HIGHU: op = "v-vintmulhiu"; break; + case S390_VEC_INT_MUL_LOW: op = "v-vintmullo"; break; + case S390_VEC_INT_MUL_EVENS: op = "v-vintmulevens"; break; + case S390_VEC_INT_MUL_EVENU: op = "v-vintmulevenu"; break; + case S390_VEC_ELEM_SHL_V: op = "v-velemshl"; break; + case S390_VEC_ELEM_SHRA_V: op = "v-vshrav"; break; + case S390_VEC_ELEM_SHRL_V: op = "v-vshrlv"; break; + case S390_VEC_ELEM_ROLL_V: op = "v-vrollv"; break; + case S390_VEC_SHL_BITS: op = "v-vshlbits"; break; + case S390_VEC_SHRL_BITS: op = "v-vshrlbits"; break; + case S390_VEC_SHRA_BITS: op = "v-vshrabits"; break; + case S390_VEC_SHL_BYTES: op = "v-vshlbytes"; break; + case S390_VEC_SHRL_BYTES: op = "v-vshrlbytes"; break; + case S390_VEC_SHRA_BYTES: op = "v-vshrabytes"; break; + case S390_VEC_PWSUM_W: op = "v-vpwsumw"; break; + case S390_VEC_PWSUM_DW: op = "v-vpwsumdw"; break; + case S390_VEC_PWSUM_QW: op = "v-vpwsumqw"; break; + case S390_VEC_INIT_FROM_GPRS: op = "v-vinitfromgprs"; break; + case S390_VEC_FLOAT_ADD: op = "v-vfloatadd"; break; + case S390_VEC_FLOAT_SUB: op = "v-vfloatsub"; break; + case S390_VEC_FLOAT_MUL: op = "v-vfloatmul"; break; + case S390_VEC_FLOAT_DIV: op = "v-vfloatdiv"; break; + case S390_VEC_FLOAT_COMPARE_EQUAL: op = "v-vfloatcmpeq"; break; + case S390_VEC_FLOAT_COMPARE_LESS_OR_EQUAL: op = "v-vfloatcmple"; break; + case S390_VEC_FLOAT_COMPARE_LESS: op = "v-vfloatcmpl"; break; + default: goto fail; + } + s390_sprintf(buf, "%M %R, %R, %R", op, insn->variant.vec_binop.dst, + insn->variant.vec_binop.op1, insn->variant.vec_binop.op2); + break; + + case S390_INSN_VEC_TRIOP: + switch (insn->variant.vec_triop.tag) { 
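+      /* Sketch: with virtual registers a 128-bit permute prints as, e.g.,
+         "v-vperm %vV0, %vV1, %vV2, %vV3". */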
+ case S390_VEC_PERM: op = "v-vperm"; break; + case S390_VEC_FLOAT_MADD: op = "v-vfloatmadd"; break; + case S390_VEC_FLOAT_MSUB: op = "v-vfloatmsub"; break; + default: goto fail; + } + s390_sprintf(buf, "%M %R, %R, %R, %R", op, insn->variant.vec_triop.dst, + insn->variant.vec_triop.op1, insn->variant.vec_triop.op2, + insn->variant.vec_triop.op3); + break; + default: goto fail; } @@ -7079,10 +8241,11 @@ s390_emit_load_mem(UChar *p, UInt num, UChar reg, const s390_amode *am) case S390_AMODE_B12: case S390_AMODE_BX12: switch (num) { - case 1: return s390_emit_IC(p, reg, x, b, d); - case 2: return s390_emit_LH(p, reg, x, b, d); - case 4: return s390_emit_L(p, reg, x, b, d); - case 8: return s390_emit_LG(p, reg, x, b, DISP20(d)); + case 1: return s390_emit_IC(p, reg, x, b, d); + case 2: return s390_emit_LH(p, reg, x, b, d); + case 4: return s390_emit_L(p, reg, x, b, d); + case 8: return s390_emit_LG(p, reg, x, b, DISP20(d)); + case 16: return s390_emit_VL(p, reg, x, b, d); default: goto fail; } break; @@ -7214,6 +8377,17 @@ s390_insn_store_emit(UChar *buf, const s390_insn *insn) vpanic("s390_insn_store_emit"); } + if (hregClass(insn->variant.store.src) == HRcVec128) { + vassert(insn->size == 16); + switch (dst->tag) { + case S390_AMODE_B12: + case S390_AMODE_BX12: + return s390_emit_VST(buf, r, x, b, d); + + default: + vpanic("s390_insn_store_emit: unknown dst->tag for HRcVec128"); + } + } /* Integer stuff */ switch (insn->size) { case 1: @@ -7280,6 +8454,9 @@ s390_insn_move_emit(UChar *buf, const s390_insn *insn) return s390_emit_LGR(buf, dst, src); if (dst_class == HRcFlt64) return s390_emit_LDR(buf, dst, src); + if (dst_class == HRcVec128) { + return s390_emit_VLR(buf, dst, src); + } } else { if (dst_class == HRcFlt64 && src_class == HRcInt64) { if (insn->size == 4) { @@ -7998,9 +9175,85 @@ s390_insn_unop_emit(UChar *buf, const s390_insn *insn) case S390_SIGN_EXTEND_32: return s390_widen_emit(buf, insn, 4, 1); case S390_NEGATE: return s390_negate_emit(buf, insn); + case S390_VEC_FILL: { + vassert(insn->variant.unop.src.tag == S390_OPND_IMMEDIATE); + UChar v1 = hregNumber(insn->variant.unop.dst); + UShort i2 = insn->variant.unop.src.variant.imm; + return s390_emit_VGBM(buf, v1, i2); + } + case S390_VEC_DUPLICATE: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VREP(buf, v1, v2, s390_getM_from_size(insn->size)); + } + case S390_VEC_UNPACKLOWS: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + vassert(insn->size < 8); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VUPH(buf, v1, v2, s390_getM_from_size(insn->size)); + } + case S390_VEC_UNPACKLOWU: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + vassert(insn->size < 8); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VUPLH(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_ABS:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VLP(buf, v1, v2, s390_getM_from_size(insn->size)); } - vpanic("s390_insn_unop_emit"); + case S390_VEC_COUNT_LEADING_ZEROES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = 
hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VCLZ(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_COUNT_TRAILING_ZEROES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VCTZ(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_COUNT_ONES:{ + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VPOPCT(buf, v1, v2, s390_getM_from_size(insn->size)); + } + + case S390_VEC_FLOAT_NEG: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + vassert(insn->size == 8); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VFPSO(buf, v1, v2, s390_getM_from_size(insn->size), 0, 0); + } + case S390_VEC_FLOAT_ABS: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + vassert(insn->size == 8); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VFPSO(buf, v1, v2, s390_getM_from_size(insn->size), 0, 2); + } + case S390_VEC_FLOAT_SQRT: { + vassert(insn->variant.unop.src.tag == S390_OPND_REG); + vassert(insn->size == 8); + UChar v1 = hregNumber(insn->variant.unop.dst); + UChar v2 = hregNumber(insn->variant.unop.src.variant.reg); + return s390_emit_VFSQ(buf, v1, v2, s390_getM_from_size(insn->size), 0); + } + default: + vpanic("s390_insn_unop_emit"); + } } @@ -8654,36 +9907,37 @@ s390_insn_helper_call_emit(UChar *buf, const s390_insn *insn) { s390_cc_t cond; ULong target; - UChar *ptmp = buf; + Int delta; s390_helper_call *helper_call = insn->variant.helper_call.details; cond = helper_call->cond; target = helper_call->target; - if (cond != S390_CC_ALWAYS - && helper_call->rloc.pri != RLPri_None) { - /* The call might not happen (it isn't unconditional) and it - returns a result. In this case we will need to generate a - control flow diamond to put 0x555..555 in the return - register(s) in the case where the call doesn't happen. If - this ever becomes necessary, maybe copy code from the ARM - equivalent. Until that day, just give up. */ - return buf; /* To denote failure. */ - } + const Bool not_always = (cond != S390_CC_ALWAYS); + const Bool not_void_return = (helper_call->rloc.pri != RLPri_None); + + /* We have this situation: + ( *** code in this braces is for not_always && not_void_return*** ) + ... + before: + brc{!cond} else + call_helper + preElse: + *** j after *** + else: + *** load_64imm $0x5555555555555555, %%r2 *** // e.g. for Int RetLoc + after: + ... + */ - if (cond != S390_CC_ALWAYS) { - /* So we have something like this - if (cond) call X; - Y: ... - We convert this into - if (! 
cond) goto Y; // BRC opcode; 4 bytes - call X; - Y: - */ + // before: + UChar *pBefore = buf; + if (not_always) { /* 4 bytes (a BRC insn) to be filled in here */ buf += 4; } + // call_helper /* Load the target address into a register, that (a) is not used for passing parameters to the helper and (b) can be clobbered by the callee @@ -8701,12 +9955,45 @@ s390_insn_helper_call_emit(UChar *buf, const s390_insn *insn) buf = s390_emit_LFPC(buf, S390_REGNO_STACK_POINTER, // restore FPC S390_OFFSET_SAVED_FPC_C); - if (cond != S390_CC_ALWAYS) { - Int delta = buf - ptmp; + // preElse: + UChar* pPreElse = buf; + if (not_always && not_void_return) { + /* 4 bytes (a BRC insn) to be filled in here */ + buf += 4; + } + + // else: + UChar* pElse = buf; + if (not_always && not_void_return) { + switch (helper_call->rloc.pri) { + case RLPri_Int: + buf = s390_emit_load_64imm(buf, S390_REGNO_RETURN_VALUE, 0x5555555555555555ULL); + break; + default: + ppS390Instr(insn, True); + vpanic("s390_insn_helper_call_emit: invalid conditional RetLoc."); + } + } + + // after: + UChar* pAfter = buf; + // fill "brc{!cond} else" + if(not_always) + { + delta = pElse - pBefore; delta >>= 1; /* immediate constant is #half-words */ vassert(delta > 0 && delta < (1 << 16)); - s390_emit_BRC(ptmp, s390_cc_invert(cond), delta); + s390_emit_BRC(pBefore, s390_cc_invert(cond), delta); + } + + // fill "brc{ALWAYS} after" + if (not_always && not_void_return) + { + delta = pAfter - pPreElse; + delta >>= 1; /* immediate constant is #half-words */ + vassert(delta > 0 && delta < (1 << 16)); + s390_emit_BRC(pPreElse, S390_CC_ALWAYS, delta); } return buf; @@ -9719,6 +11006,7 @@ s390_insn_xassisted_emit(UChar *buf, const s390_insn *insn, case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break; case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; + case Ijk_SigFPE: trcval = VEX_TRC_JMP_SIGFPE; break; case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; case Ijk_Boring: trcval = VEX_TRC_JMP_BORING; break; /* We don't expect to see the following being assisted. 
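   (For orientation -- an assumption about the consumer side, not code in
   this file: the trcval written here travels back to Valgrind's scheduler,
   which is expected to turn VEX_TRC_JMP_SIGFPE into a synthetic SIGFPE for
   the current guest insn, just as it already does for the SIGTRAP, SIGSEGV
   and SIGBUS translations above.)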
*/ @@ -9828,6 +11116,208 @@ s390_insn_profinc_emit(UChar *buf, } +static UChar * +s390_insn_vec_amodeop_emit(UChar *buf, const s390_insn *insn) +{ + UChar v1 = hregNumber(insn->variant.vec_amodeop.dst); + UChar v2 = hregNumber(insn->variant.vec_amodeop.op1); + s390_amode* op2 = insn->variant.vec_amodeop.op2; + + vassert(hregNumber(op2->x) == 0); + vassert(fits_unsigned_12bit(op2->d)); + + UChar b = hregNumber(op2->b); + UShort d = op2->d; + + + switch (insn->variant.vec_amodeop.tag) { + case S390_VEC_GET_ELEM: + return s390_emit_VLGV(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHL_INT: + return s390_emit_VESL(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHRA_INT: + return s390_emit_VESRA(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + case S390_VEC_ELEM_SHRL_INT: + return s390_emit_VESRL(buf, v1, b, d, v2, s390_getM_from_size(insn->size)); + + default: goto fail; + } + + fail: + vpanic("s390_insn_vec_amodeop_emit"); +} + + +static UChar * +s390_insn_vec_amodeintop_emit(UChar *buf, const s390_insn *insn) +{ + UChar v1 = hregNumber(insn->variant.vec_amodeintop.dst); + s390_amode* op2 = insn->variant.vec_amodeintop.op2; + UChar r3 = hregNumber(insn->variant.vec_amodeintop.op3); + + vassert(hregNumber(op2->x) == 0); + UChar b = hregNumber(op2->b); + UShort d = op2->d; + + switch (insn->variant.vec_amodeintop.tag) { + case S390_VEC_SET_ELEM: + return s390_emit_VLVG(buf, v1, b, d, r3, s390_getM_from_size(insn->size)); + default: goto fail; + } + + fail: + vpanic("s390_insn_vec_amodeop_emit"); +} + + +static UChar * +s390_insn_vec_binop_emit(UChar *buf, const s390_insn *insn) +{ + s390_vec_binop_t tag = insn->variant.vec_binop.tag; + UChar size = insn->size; + UChar v1 = hregNumber(insn->variant.vec_binop.dst); + UChar v2 = hregNumber(insn->variant.vec_binop.op1); + UChar v3 = hregNumber(insn->variant.vec_binop.op2); + + switch (tag) { + case S390_VEC_PACK: + return s390_emit_VPK(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_PACK_SATURU: + return s390_emit_VPKLS(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_PACK_SATURS: + return s390_emit_VPKS(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_COMPARE_EQUAL: + return s390_emit_VCEQ(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_OR: + return s390_emit_VO(buf, v1, v2, v3); + case S390_VEC_XOR: + return s390_emit_VX(buf, v1, v2, v3); + case S390_VEC_AND: + return s390_emit_VN(buf, v1, v2, v3); + case S390_VEC_MERGEL: + return s390_emit_VMRL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MERGEH: + return s390_emit_VMRH(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_NOR: + return s390_emit_VNO(buf, v1, v2, v3); + case S390_VEC_INT_ADD: + return s390_emit_VA(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_INT_SUB: + return s390_emit_VS(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MAXU: + return s390_emit_VMXL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MAXS: + return s390_emit_VMX(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MINU: + return s390_emit_VMNL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_MINS: + return s390_emit_VMN(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_AVGU: + return s390_emit_VAVGL(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_AVGS: + return s390_emit_VAVG(buf, v1, v2, v3, s390_getM_from_size(size)); + case S390_VEC_COMPARE_GREATERS: + return s390_emit_VCH(buf, v1, v2, v3, 
s390_getM_from_size(size));
+   case S390_VEC_COMPARE_GREATERU:
+      return s390_emit_VCHL(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INT_MUL_HIGHS:
+      return s390_emit_VMH(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INT_MUL_HIGHU:
+      return s390_emit_VMLH(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INT_MUL_LOW:
+      return s390_emit_VML(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INT_MUL_EVENS:
+      return s390_emit_VME(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INT_MUL_EVENU:
+      return s390_emit_VMLE(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_ELEM_SHL_V:
+      return s390_emit_VESLV(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_ELEM_SHRA_V:
+      return s390_emit_VESRAV(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_ELEM_SHRL_V:
+      return s390_emit_VESRLV(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_ELEM_ROLL_V:
+      return s390_emit_VERLLV(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_SHL_BITS:
+      return s390_emit_VSL(buf, v1, v2, v3);
+   case S390_VEC_SHRL_BITS:
+      return s390_emit_VSRL(buf, v1, v2, v3);
+   case S390_VEC_SHRA_BITS:
+      return s390_emit_VSRA(buf, v1, v2, v3);
+   case S390_VEC_SHL_BYTES:
+      return s390_emit_VSLB(buf, v1, v2, v3);
+   case S390_VEC_SHRL_BYTES:
+      return s390_emit_VSRLB(buf, v1, v2, v3);
+   case S390_VEC_SHRA_BYTES:
+      return s390_emit_VSRAB(buf, v1, v2, v3);
+   case S390_VEC_PWSUM_W:
+      vassert((size == 1) || (size == 2));
+      return s390_emit_VSUM(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_PWSUM_DW:
+      vassert((size == 2) || (size == 4));
+      return s390_emit_VSUMG(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_PWSUM_QW:
+      vassert((size == 4) || (size == 8));
+      return s390_emit_VSUMQ(buf, v1, v2, v3, s390_getM_from_size(size));
+   case S390_VEC_INIT_FROM_GPRS:
+      return s390_emit_VLVGP(buf, v1, v2, v3);
+   case S390_VEC_FLOAT_ADD:
+      return s390_emit_VFA(buf, v1, v2, v3, s390_getM_from_size(size), 0);
+   case S390_VEC_FLOAT_SUB:
+      return s390_emit_VFS(buf, v1, v2, v3, s390_getM_from_size(size), 0);
+   case S390_VEC_FLOAT_MUL:
+      return s390_emit_VFM(buf, v1, v2, v3, s390_getM_from_size(size), 0);
+   case S390_VEC_FLOAT_DIV:
+      return s390_emit_VFD(buf, v1, v2, v3, s390_getM_from_size(size), 0);
+   case S390_VEC_FLOAT_COMPARE_EQUAL:
+      return s390_emit_VFCE(buf, v1, v2, v3, s390_getM_from_size(size), 0, 0);
+   case S390_VEC_FLOAT_COMPARE_LESS_OR_EQUAL:
+      /* a <= b  <=>  b >= a, hence compare "high or equal" with the
+         operands swapped */
+      return s390_emit_VFCHE(buf, v1, v3, v2, s390_getM_from_size(size), 0, 0);
+   case S390_VEC_FLOAT_COMPARE_LESS:
+      /* a < b  <=>  b > a, hence compare "high" with the operands swapped */
+      return s390_emit_VFCH(buf, v1, v3, v2, s390_getM_from_size(size), 0, 0);
+
+   default:
+      goto fail;
+   }
+
+ fail:
+   ppS390Instr(insn, True);
+   vpanic("s390_insn_vec_binop_emit");
+}
+
+
+static UChar *
+s390_insn_vec_triop_emit(UChar *buf, const s390_insn *insn)
+{
+   s390_vec_triop_t tag = insn->variant.vec_triop.tag;
+   UChar v1 = hregNumber(insn->variant.vec_triop.dst);
+   UChar v2 = hregNumber(insn->variant.vec_triop.op1);
+   UChar v3 = hregNumber(insn->variant.vec_triop.op2);
+   UChar v4 = hregNumber(insn->variant.vec_triop.op3);
+
+   switch (tag) {
+   case S390_VEC_PERM: {
+      vassert(insn->size == 16);
+      return s390_emit_VPERM(buf, v1, v2, v3, v4);
+   }
+   case S390_VEC_FLOAT_MADD:
+      return s390_emit_VFMA(buf, v1, v2, v3, v4, 0, 3);
+   case S390_VEC_FLOAT_MSUB:
+      return s390_emit_VFMS(buf, v1, v2, v3, v4, 0, 3);
+   default:
+      goto fail;
+   }
+
+ fail:
+   vpanic("s390_insn_vec_triop_emit");
+}
+
 Int emit_S390Instr(Bool *is_profinc, UChar *buf, Int nbuf, const 
s390_insn *insn, Bool mode64, VexEndness endness_host, @@ -10009,6 +11499,21 @@ emit_S390Instr(Bool *is_profinc, UChar *buf, Int nbuf, const s390_insn *insn, end = s390_insn_xassisted_emit(buf, insn, disp_cp_xassisted); break; + case S390_INSN_VEC_AMODEOP: + end = s390_insn_vec_amodeop_emit(buf, insn); + break; + + case S390_INSN_VEC_AMODEINTOP: + end = s390_insn_vec_amodeintop_emit(buf, insn); + break; + + case S390_INSN_VEC_BINOP: + end = s390_insn_vec_binop_emit(buf, insn); + break; + + case S390_INSN_VEC_TRIOP: + end = s390_insn_vec_triop_emit(buf, insn); + break; fail: default: vpanic("emit_S390Instr"); diff --git a/priv/host_s390_defs.h b/priv/host_s390_defs.h index e876cb2a4..ed1f3cfb2 100644 --- a/priv/host_s390_defs.h +++ b/priv/host_s390_defs.h @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 + Copyright IBM Corp. 2010-2017 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -42,9 +42,11 @@ const HChar *s390_hreg_as_string(HReg); HReg s390_hreg_gpr(UInt regno); HReg s390_hreg_fpr(UInt regno); +HReg s390_hreg_vr(UInt regno); /* Dedicated registers */ HReg s390_hreg_guest_state_pointer(void); +HReg s390_hreg_stack_pointer(void); /* Given the index of a function argument, return the number of the @@ -86,6 +88,7 @@ s390_amode *s390_amode_b20(Int d, HReg b); s390_amode *s390_amode_bx12(Int d, HReg b, HReg x); s390_amode *s390_amode_bx20(Int d, HReg b, HReg x); s390_amode *s390_amode_for_guest_state(Int d); +s390_amode *s390_amode_for_stack_pointer(Int d); Bool s390_amode_is_sane(const s390_amode *); const HChar *s390_amode_as_string(const s390_amode *); @@ -160,7 +163,12 @@ typedef enum { S390_INSN_XINDIR, /* indirect transfer to guest address */ S390_INSN_XASSISTED, /* assisted transfer to guest address */ S390_INSN_EVCHECK, /* Event check */ - S390_INSN_PROFINC /* 64-bit profile counter increment */ + S390_INSN_PROFINC, /* 64-bit profile counter increment */ + S390_INSN_VEC_AMODEOP, + S390_INSN_VEC_AMODEINTOP, + S390_INSN_VEC_UNOP, + S390_INSN_VEC_BINOP, + S390_INSN_VEC_TRIOP } s390_insn_tag; @@ -186,7 +194,19 @@ typedef enum { S390_SIGN_EXTEND_8, S390_SIGN_EXTEND_16, S390_SIGN_EXTEND_32, - S390_NEGATE + S390_NEGATE, + S390_VEC_FILL, + S390_VEC_DUPLICATE, + S390_VEC_UNPACKLOWS, + S390_VEC_UNPACKLOWU, + S390_VEC_ABS, + S390_VEC_COUNT_LEADING_ZEROES, + S390_VEC_COUNT_TRAILING_ZEROES, + S390_VEC_COUNT_ONES, + S390_VEC_FLOAT_NEG, + S390_VEC_FLOAT_ABS, + S390_VEC_FLOAT_SQRT, + S390_UNOP_T_INVALID } s390_unop_t; /* The kind of ternary BFP operations */ @@ -323,6 +343,79 @@ typedef enum { S390_DFP_COMPARE_EXP, } s390_dfp_cmp_t; +/* The vector operations with 2 operands one of them being amode */ +typedef enum { + S390_VEC_GET_ELEM, + S390_VEC_ELEM_SHL_INT, + S390_VEC_ELEM_SHRA_INT, + S390_VEC_ELEM_SHRL_INT, + S390_VEC_AMODEOP_T_INVALID +} s390_vec_amodeop_t; + +/* The vector operations with three (vector, amode and integer) operands */ +typedef enum { + S390_VEC_SET_ELEM +} s390_vec_amodeintop_t; + +/* The vector operations with two operands */ +typedef enum { + S390_VEC_PACK, + S390_VEC_PACK_SATURS, + S390_VEC_PACK_SATURU, + S390_VEC_COMPARE_EQUAL, + S390_VEC_OR, + S390_VEC_XOR, + S390_VEC_AND, + S390_VEC_MERGEL, + S390_VEC_MERGEH, + S390_VEC_NOR, + S390_VEC_INT_ADD, + S390_VEC_INT_SUB, + S390_VEC_MAXU, + S390_VEC_MAXS, + S390_VEC_MINU, + S390_VEC_MINS, + S390_VEC_AVGU, + S390_VEC_AVGS, + S390_VEC_COMPARE_GREATERS, + 
S390_VEC_COMPARE_GREATERU, + S390_VEC_INT_MUL_HIGHS, + S390_VEC_INT_MUL_HIGHU, + S390_VEC_INT_MUL_LOW, + S390_VEC_INT_MUL_EVENS, + S390_VEC_INT_MUL_EVENU, + S390_VEC_ELEM_SHL_V, + S390_VEC_ELEM_SHRA_V, + S390_VEC_ELEM_SHRL_V, + S390_VEC_ELEM_ROLL_V, + + /* host_s390_isel depends on this order. */ + S390_VEC_SHL_BITS, S390_VEC_SHL_BYTES, + S390_VEC_SHRL_BITS, S390_VEC_SHRL_BYTES, + S390_VEC_SHRA_BITS, S390_VEC_SHRA_BYTES, + + S390_VEC_PWSUM_W, + S390_VEC_PWSUM_DW, + S390_VEC_PWSUM_QW, + + S390_VEC_INIT_FROM_GPRS, + S390_VEC_FLOAT_ADD, + S390_VEC_FLOAT_SUB, + S390_VEC_FLOAT_MUL, + S390_VEC_FLOAT_DIV, + S390_VEC_FLOAT_COMPARE_EQUAL, + S390_VEC_FLOAT_COMPARE_LESS_OR_EQUAL, + S390_VEC_FLOAT_COMPARE_LESS, + S390_VEC_BINOP_T_INVALID +} s390_vec_binop_t; + +/* The vector operations with three operands */ +typedef enum { + S390_VEC_PERM, + S390_VEC_FLOAT_MADD, + S390_VEC_FLOAT_MSUB +} s390_vec_triop_t; + /* The details of a CDAS insn. Carved out to keep the size of s390_insn low */ typedef struct { @@ -618,9 +711,32 @@ typedef struct { /* No fields. The address of the counter to increment is installed later, post-translation, by patching it in, as it is not known at translation time. */ - Int nop; } profinc; - + struct { + s390_vec_amodeop_t tag; + HReg dst; /* 64-bit result */ + HReg op1; /* 128-bit operand */ + s390_amode *op2; /* amode operand */ + } vec_amodeop; + struct { + s390_vec_amodeintop_t tag; + HReg dst; /* 128-bit result */ + s390_amode *op2; /* amode operand */ + HReg op3; /* integer operand */ + } vec_amodeintop; + struct { + s390_vec_binop_t tag; + HReg dst; /* 128-bit result */ + HReg op1; /* 128-bit first operand */ + HReg op2; /* 128-bit second operand */ + } vec_binop; + struct { + s390_vec_triop_t tag; + HReg dst; /* 128-bit result */ + HReg op1; /* 128-bit first operand */ + HReg op2; /* 128-bit second operand */ + HReg op3; /* 128-bit third operand */ + } vec_triop; } variant; } s390_insn; @@ -728,6 +844,14 @@ s390_insn *s390_insn_xassisted(s390_cc_t cond, HReg dst, s390_amode *guest_IA, IRJumpKind kind); s390_insn *s390_insn_evcheck(s390_amode *counter, s390_amode *fail_addr); s390_insn *s390_insn_profinc(void); +s390_insn *s390_insn_vec_amodeop(UChar size, s390_vec_amodeop_t, HReg dst, + HReg op1, s390_amode* op2); +s390_insn *s390_insn_vec_amodeintop(UChar size, s390_vec_amodeintop_t, HReg dst, + s390_amode* op2, HReg op3); +s390_insn *s390_insn_vec_binop(UChar size, s390_vec_binop_t, HReg dst, HReg op1, + HReg op2); +s390_insn *s390_insn_vec_triop(UChar size, s390_vec_triop_t, HReg dst, HReg op1, + HReg op2, HReg op3); const HChar *s390_insn_as_string(const s390_insn *); @@ -737,19 +861,19 @@ const HChar *s390_insn_as_string(const s390_insn *); void ppS390AMode(const s390_amode *); void ppS390Instr(const s390_insn *, Bool mode64); -void ppHRegS390(HReg); +UInt ppHRegS390(HReg); /* Some functions that insulate the register allocator from details of the underlying instruction set. 
*/ void getRegUsage_S390Instr( HRegUsage *, const s390_insn *, Bool ); void mapRegs_S390Instr ( HRegRemap *, s390_insn *, Bool ); -Bool isMove_S390Instr ( const s390_insn *, HReg *, HReg * ); Int emit_S390Instr ( Bool *, UChar *, Int, const s390_insn *, Bool, VexEndness, const void *, const void *, const void *, const void *); const RRegUniverse *getRRegUniverse_S390( void ); void genSpill_S390 ( HInstr **, HInstr **, HReg , Int , Bool ); void genReload_S390 ( HInstr **, HInstr **, HReg , Int , Bool ); +extern s390_insn* genMove_S390(HReg from, HReg to, Bool mode64); HInstrArray *iselSB_S390 ( const IRSB *, VexArch, const VexArchInfo *, const VexAbiInfo *, Int, Int, Bool, Bool, Addr); @@ -800,7 +924,10 @@ extern UInt s390_host_hwcaps; (s390_host_hwcaps & (VEX_HWCAPS_S390X_LSC)) #define s390_host_has_pfpo \ (s390_host_hwcaps & (VEX_HWCAPS_S390X_PFPO)) - +#define s390_host_has_vx \ + (s390_host_hwcaps & (VEX_HWCAPS_S390X_VX)) +#define s390_host_has_msa5 \ + (s390_host_hwcaps & (VEX_HWCAPS_S390X_MSA5)) #endif /* ndef __VEX_HOST_S390_DEFS_H */ /*---------------------------------------------------------------*/ diff --git a/priv/host_s390_isel.c b/priv/host_s390_isel.c index 2310b0a7f..38989f217 100644 --- a/priv/host_s390_isel.c +++ b/priv/host_s390_isel.c @@ -8,8 +8,8 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 - Copyright (C) 2012-2015 Florian Krohm (britzel@acm.org) + Copyright IBM Corp. 2010-2017 + Copyright (C) 2012-2017 Florian Krohm (britzel@acm.org) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -135,6 +135,7 @@ static HReg s390_isel_float_expr(ISelEnv *, IRExpr *); static void s390_isel_float128_expr(HReg *, HReg *, ISelEnv *, IRExpr *); static HReg s390_isel_dfp_expr(ISelEnv *, IRExpr *); static void s390_isel_dfp128_expr(HReg *, HReg *, ISelEnv *, IRExpr *); +static HReg s390_isel_vec_expr(ISelEnv *, IRExpr *); static Int @@ -151,61 +152,15 @@ get_guest_reg(Int offset) /* Also make sure there is never a partial write to one of these registers. That would complicate matters. 
 */
-   case S390X_GUEST_OFFSET(guest_IA)+1:
-   case S390X_GUEST_OFFSET(guest_IA)+2:
-   case S390X_GUEST_OFFSET(guest_IA)+3:
-   case S390X_GUEST_OFFSET(guest_IA)+4:
-   case S390X_GUEST_OFFSET(guest_IA)+5:
-   case S390X_GUEST_OFFSET(guest_IA)+6:
-   case S390X_GUEST_OFFSET(guest_IA)+7:
-
-   case S390X_GUEST_OFFSET(guest_CC_OP)+1:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+2:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+3:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+4:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+5:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+6:
-   case S390X_GUEST_OFFSET(guest_CC_OP)+7:
-
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+1:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+2:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+3:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+4:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+5:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+6:
-   case S390X_GUEST_OFFSET(guest_CC_DEP1)+7:
-
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+1:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+2:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+3:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+4:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+5:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+6:
-   case S390X_GUEST_OFFSET(guest_CC_DEP2)+7:
-
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+1:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+2:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+3:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+4:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+5:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+6:
-   case S390X_GUEST_OFFSET(guest_CC_NDEP)+7:
-
-   case S390X_GUEST_OFFSET(guest_SYSNO)+1:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+2:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+3:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+4:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+5:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+6:
-   case S390X_GUEST_OFFSET(guest_SYSNO)+7:
-
+   case S390X_GUEST_OFFSET(guest_IA)+1 ... S390X_GUEST_OFFSET(guest_IA)+7:
+   case S390X_GUEST_OFFSET(guest_CC_OP)+1 ... S390X_GUEST_OFFSET(guest_CC_OP)+7:
+   case S390X_GUEST_OFFSET(guest_CC_DEP1)+1 ... S390X_GUEST_OFFSET(guest_CC_DEP1)+7:
+   case S390X_GUEST_OFFSET(guest_CC_DEP2)+1 ... S390X_GUEST_OFFSET(guest_CC_DEP2)+7:
+   case S390X_GUEST_OFFSET(guest_CC_NDEP)+1 ... S390X_GUEST_OFFSET(guest_CC_NDEP)+7:
+   case S390X_GUEST_OFFSET(guest_SYSNO)+1 ... S390X_GUEST_OFFSET(guest_SYSNO)+7:
    /* counter is used both as 4-byte and as 8-byte entity */
-   case S390X_GUEST_OFFSET(guest_counter)+1:
-   case S390X_GUEST_OFFSET(guest_counter)+2:
-   case S390X_GUEST_OFFSET(guest_counter)+3:
-   case S390X_GUEST_OFFSET(guest_counter)+5:
-   case S390X_GUEST_OFFSET(guest_counter)+6:
-   case S390X_GUEST_OFFSET(guest_counter)+7:
+   case S390X_GUEST_OFFSET(guest_counter)+1 ... S390X_GUEST_OFFSET(guest_counter)+3:
+   case S390X_GUEST_OFFSET(guest_counter)+5 ... S390X_GUEST_OFFSET(guest_counter)+7:
       vpanic("partial update of this guest state register is not allowed");
       break;
@@ -288,6 +243,18 @@ newVRegF(ISelEnv *env)
    return mkVRegF(env->vreg_ctr++);
 }
 
+/* Allocate a new virtual vector register */
+static HReg
+mkVRegV(UInt ix)
+{
+   return mkHReg(/*virtual*/True, HRcVec128, /*encoding*/0, ix);
+}
+
+static HReg
+newVRegV(ISelEnv *env)
+{
+   return mkVRegV(env->vreg_ctr++);
+}
 
 /* Construct a non-virtual general purpose register */
 static __inline__ HReg
@@ -528,6 +495,68 @@ get_const_value_as_ulong(const IRConst *con)
 }
 
+
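The two stack-pointer helpers that follow are always used as a bracket around a short-lived 16-byte spill slot. A hedged usage sketch (the function name example_v128_low64 is hypothetical; the helpers and insn constructors are the ones this patch adds, and the pattern mirrors the Iop_V128to64 lowering later in this file):

/* Hypothetical sketch, not part of the patch: fetch the low doubleword
   of a vector register via a 16-byte stack temporary. */
static HReg
example_v128_low64(ISelEnv *env, HReg vec)
{
   HReg dst = newVRegI(env);

   sub_from_SP(env, 16);   /* reserve the slot below the stack pointer */
   addInstr(env, s390_insn_store(sizeof(V128),
                                 s390_amode_for_stack_pointer(0), vec));
   /* Big-endian layout: the low doubleword lives at offset 8. */
   addInstr(env, s390_insn_load(sizeof(ULong), dst,
                                s390_amode_for_stack_pointer(8)));
   add_to_SP(env, 16);     /* release the slot again */
   return dst;
}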
+/* Subtract n from the stack pointer. Assumes 0 <= n < 256 && n % 8 == 0. */
+static void
+sub_from_SP ( ISelEnv* env, UInt n )
+{
+   HReg sp;
+   vassert(n < 256 && (n % 8) == 0);
+   sp = s390_hreg_stack_pointer();
+   addInstr(env, s390_insn_alu(sizeof(ULong), S390_ALU_SUB, sp, s390_opnd_imm(n)));
+}
+
+
+/* Add n to the stack pointer. Assumes 0 <= n < 256 && n % 8 == 0. */
+static void
+add_to_SP ( ISelEnv* env, UInt n )
+{
+   HReg sp;
+   vassert(n < 256 && (n % 8) == 0);
+   sp = s390_hreg_stack_pointer();
+   addInstr(env, s390_insn_alu(sizeof(ULong), S390_ALU_ADD, sp, s390_opnd_imm(n)));
+}
+
+
+static HReg
+vec_generate_zeroes(ISelEnv* env)
+{
+   HReg dst = newVRegV(env);
+   addInstr(env, s390_insn_unop(16, S390_VEC_FILL, dst, s390_opnd_imm(0x00)));
+   return dst;
+}
+
+static HReg
+vec_do_notV128(ISelEnv* env, HReg arg)
+{
+   HReg dst = newVRegV(env);
+   addInstr(env, s390_insn_vec_binop(16, S390_VEC_NOR, dst, arg, arg));
+   return dst;
+}
+
+#define IRCONST_IS_EQUAL_U8(arg, val)                \
+   ( ((arg)->tag == Iex_Const)                       \
+     && ((arg)->Iex.Const.con->tag == Ico_U8)        \
+     && ((arg)->Iex.Const.con->Ico.U8 == (val)) )
+
+/* Returns true if the shift amount is known to satisfy
+   (expr & 0x7) == 0, i.e. the shift is a whole number of bytes. */
+static Bool
+vec_is_bytes_only_shift(const IRExpr* expr)
+{
+   const Bool is_good_const =
+      (expr->tag == Iex_Const) &&
+      ((expr->Iex.Const.con->Ico.U8 & 0b00000111) == 0);
+
+   const Bool good_mask_applied =
+      (expr->tag == Iex_Binop) && (expr->Iex.Binop.op == Iop_And8) &&
+      (IRCONST_IS_EQUAL_U8(expr->Iex.Binop.arg1, 0b01111000)
+       ||
+       IRCONST_IS_EQUAL_U8(expr->Iex.Binop.arg2, 0b01111000)
+      );
+
+   return is_good_const || good_mask_applied;
+}
+#undef IRCONST_IS_EQUAL_U8
+
 /* Call a helper (clean or dirty)
 
    Arguments must satisfy the following conditions:
@@ -570,8 +599,7 @@ doHelperCall(/*OUT*/UInt *stackAdjustAfterCall,
 
    /* The return type can be I{64,32,16,8} or V{128,256}. In the latter two
       cases, it is expected that |args| will contain the
-      special node IRExpr_VECRET(). For s390, however, V128 and V256 return
-      values do not occur as we generally do not support vector types.
+      special node IRExpr_VECRET().
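      On s390x the patch realises a V128 return as a hidden pointer
      argument: doHelperCall reserves sizeof(V128) bytes below the stack
      pointer, passes that address in the next free argument register,
      and the caller then loads the result from rloc.spOff and releases
      the 16 bytes again (see the RLPri_V128SpRel handling below).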
|args| may also contain IRExpr_GSPTR(), in which case the value in the guest state pointer register is passed as the @@ -607,7 +635,7 @@ doHelperCall(/*OUT*/UInt *stackAdjustAfterCall, ++arg_errors; vex_printf("calling %s: argument #%u has type ", callee->name, i); ppIRType(type); - vex_printf("; Ity_I64 is required\n"); + vex_printf("; Ity_I64 or Ity_V128 is required\n"); } } } @@ -617,7 +645,11 @@ doHelperCall(/*OUT*/UInt *stackAdjustAfterCall, /* If these fail, the IR is ill-formed */ vassert(nGSPTRs == 0 || nGSPTRs == 1); - vassert(nVECRETs == 0); + if (UNLIKELY(retTy == Ity_V128)) { + vassert(nVECRETs == 1); + } else { + vassert(nVECRETs == 0); + } argreg = 0; @@ -629,6 +661,11 @@ doHelperCall(/*OUT*/UInt *stackAdjustAfterCall, tmpregs[argreg] = newVRegI(env); addInstr(env, s390_insn_move(sizeof(ULong), tmpregs[argreg], s390_hreg_guest_state_pointer())); + } else if(UNLIKELY(arg->tag == Iex_VECRET)) { + /* Return vector via stack */ + tmpregs[argreg] = newVRegI(env); + sub_from_SP(env, sizeofIRType(Ity_V128)); + addInstr(env, s390_insn_move(sizeof(ULong), tmpregs[argreg], s390_hreg_stack_pointer())); } else { tmpregs[argreg] = s390_isel_int_expr(env, args[i]); } @@ -671,6 +708,10 @@ doHelperCall(/*OUT*/UInt *stackAdjustAfterCall, case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: *retloc = mk_RetLoc_simple(RLPri_Int); break; + case Ity_V128: + *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0); + *stackAdjustAfterCall = sizeof(V128); + break; default: /* IR can denote other possible return types, but we don't handle those here. */ @@ -746,10 +787,12 @@ get_bfp_rounding_mode(ISelEnv *env, IRExpr *irrm) IRRoundingMode mode = irrm->Iex.Const.con->Ico.U32; switch (mode) { - case Irrm_NEAREST: return S390_BFP_ROUND_NEAREST_EVEN; - case Irrm_ZERO: return S390_BFP_ROUND_ZERO; - case Irrm_PosINF: return S390_BFP_ROUND_POSINF; - case Irrm_NegINF: return S390_BFP_ROUND_NEGINF; + case Irrm_NEAREST_TIE_AWAY_0: return S390_BFP_ROUND_NEAREST_AWAY; + case Irrm_PREPARE_SHORTER: return S390_BFP_ROUND_PREPARE_SHORT; + case Irrm_NEAREST: return S390_BFP_ROUND_NEAREST_EVEN; + case Irrm_ZERO: return S390_BFP_ROUND_ZERO; + case Irrm_PosINF: return S390_BFP_ROUND_POSINF; + case Irrm_NegINF: return S390_BFP_ROUND_NEGINF; default: vpanic("get_bfp_rounding_mode"); } @@ -1533,6 +1576,32 @@ s390_isel_int_expr_wrk(ISelEnv *env, IRExpr *expr) is_commutative = False; break; + case Iop_GetElem8x16: + case Iop_GetElem16x8: + case Iop_GetElem32x4: + case Iop_GetElem64x2:{ + HReg dst = newVRegI(env); + HReg vec = s390_isel_vec_expr(env, arg1); + s390_amode* operand = s390_isel_amode(env,IRExpr_Unop(Iop_8Uto64, arg2)); + switch (expr->Iex.Binop.op) { + case Iop_GetElem8x16: + size = 1; + break; + case Iop_GetElem16x8: + size = 2; + break; + case Iop_GetElem32x4: + size = 4; + break; + case Iop_GetElem64x2: + size = 8; + break; + default: + vpanic("s390_isel_int_expr: impossible Iop_GetElem type"); + } + addInstr(env, s390_insn_vec_amodeop(size, S390_VEC_GET_ELEM, dst, vec, operand)); + return dst; + } default: goto irreducible; } @@ -1726,6 +1795,38 @@ s390_isel_int_expr_wrk(ISelEnv *env, IRExpr *expr) return dst_hi; } + if(unop == Iop_V128to64 || unop == Iop_V128HIto64 || unop == Iop_V128to32) { + dst = newVRegI(env); + HReg vec = s390_isel_vec_expr(env, arg); + /* This is big-endian machine */ + Int off; + switch (unop) { + case Iop_V128HIto64: + off = 0; + break; + case Iop_V128to64: + off = 8; + break; + case Iop_V128to32: + off = 12; + break; + default: + ppIROp(unop); + vpanic("s390_isel_int_expr: unhandled 
V128toSMTH operation"); + } + s390_amode* m16_sp = s390_amode_for_stack_pointer(0); + s390_amode* off_sp = s390_amode_for_stack_pointer(off); + + /* We could use negative displacement but vector instructions + require 12bit unsigned ones. So we have to allocate space on + stack just for one load and free it after. */ + sub_from_SP(env, 16); + addInstr(env, s390_insn_store(sizeof(V128), m16_sp, vec)); + addInstr(env, s390_insn_load(sizeof(ULong), dst, off_sp)); + add_to_SP(env, 16); + return dst; + } + dst = newVRegI(env); /* Result goes into a new register */ opnd = s390_isel_int_expr_RMI(env, arg); /* Process the operand */ @@ -3549,6 +3650,1017 @@ s390_isel_cc(ISelEnv *env, IRExpr *cond) } +/*---------------------------------------------------------*/ +/*--- ISEL: Vector expressions (128 bit) ---*/ +/*---------------------------------------------------------*/ + +static HReg +s390_isel_vec_expr_wrk(ISelEnv *env, IRExpr *expr) +{ + IRType ty = typeOfIRExpr(env->type_env, expr); + UChar size; + + vassert(ty == Ity_V128); + + size = sizeofIRType(ty); + + switch (expr->tag) { + case Iex_RdTmp: + /* Return the virtual register that holds the temporary. */ + return lookupIRTemp(env, expr->Iex.RdTmp.tmp); + + /* --------- LOAD --------- */ + case Iex_Load: { + HReg dst = newVRegV(env); + s390_amode *am = s390_isel_amode(env, expr->Iex.Load.addr); + + if (expr->Iex.Load.end != Iend_BE) + goto irreducible; + + addInstr(env, s390_insn_load(size, dst, am)); + + return dst; + } + + /* --------- GET --------- */ + case Iex_Get: { + HReg dst = newVRegV(env); + s390_amode *am = s390_amode_for_guest_state(expr->Iex.Get.offset); + + addInstr(env, s390_insn_load(size, dst, am)); + + return dst; + } + + case Iex_Const: { + HReg dst = newVRegV(env); + vassert(expr->Iex.Const.con->tag == Ico_V128); + + addInstr(env, s390_insn_unop(16, S390_VEC_FILL, dst, s390_opnd_imm(expr->Iex.Const.con->Ico.V128))); + return dst; + } + /* --------- UNARY OP --------- */ + case Iex_Unop: { + UChar size_for_int_arg = 0; + HReg dst = INVALID_HREG; + HReg reg1 = INVALID_HREG; + s390_unop_t vec_unop = S390_UNOP_T_INVALID; + s390_vec_binop_t vec_binop = S390_VEC_BINOP_T_INVALID; + IROp op = expr->Iex.Unop.op; + IRExpr* arg = expr->Iex.Unop.arg; + switch(op) { + case Iop_NotV128: + /* Not(Or(arg1, arg2)) -> Nor(arg1, arg2) */ + if(UNLIKELY((arg->tag == Iex_Binop ) && (arg->Iex.Binop.op == Iop_OrV128))) + { + dst = newVRegV(env); + addInstr(env, + s390_insn_vec_binop(16, + S390_VEC_NOR, + dst, + s390_isel_vec_expr(env, arg->Iex.Binop.arg1), + s390_isel_vec_expr(env, arg->Iex.Binop.arg2) + ) + ); + return dst; + } + reg1 = s390_isel_vec_expr(env, arg); + return vec_do_notV128(env, reg1); + + case Iop_CmpNEZ8x16: + size = 1; + goto Iop_CmpNEZ_wrk; + case Iop_CmpNEZ16x8: + size = 2; + goto Iop_CmpNEZ_wrk; + case Iop_CmpNEZ32x4: + size = 4; + goto Iop_CmpNEZ_wrk; + case Iop_CmpNEZ64x2: + size = 8; + + Iop_CmpNEZ_wrk: { + dst = newVRegV(env); + reg1 = s390_isel_vec_expr(env, arg); + addInstr(env, s390_insn_vec_binop(size, S390_VEC_COMPARE_EQUAL, dst, + reg1, vec_generate_zeroes(env))); + return vec_do_notV128(env, dst); + } + + case Iop_CmpNEZ128x1: { + IRExpr* low64 = IRExpr_Unop(Iop_V128to64, arg); + IRExpr* high64 = IRExpr_Unop(Iop_V128HIto64, arg); + IRExpr* both = IRExpr_Binop(Iop_Or64, low64, high64); + IRExpr* anyNonZ = IRExpr_Unop(Iop_CmpNEZ64, both); + IRExpr* anyNonZ64 = IRExpr_Unop(Iop_1Sto64, anyNonZ); + reg1 = s390_isel_int_expr(env, anyNonZ64); + + dst = newVRegV(env); + addInstr(env, s390_insn_vec_binop(size, 
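                                         /* S390_VEC_INIT_FROM_GPRS is
                                            emitted as VLVGP, which packs
                                            two GPRs into one vector
                                            register. Passing reg1 twice
                                            replicates the 0 or -1 computed
                                            by anyNonZ64 above into both
                                            doublewords, giving the
                                            all-zeroes or all-ones vector
                                            that CmpNEZ128x1 requires. */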
S390_VEC_INIT_FROM_GPRS, + dst, reg1, reg1)); + return dst; + } + + case Iop_Dup8x16: + size = size_for_int_arg = 1; + vec_unop = S390_VEC_DUPLICATE; + goto Iop_V_int_wrk; + case Iop_Dup16x8: + size = size_for_int_arg = 2; + vec_unop = S390_VEC_DUPLICATE; + goto Iop_V_int_wrk; + case Iop_Dup32x4: + size = size_for_int_arg = 4; + vec_unop = S390_VEC_DUPLICATE; + goto Iop_V_int_wrk; + + case Iop_Widen8Sto16x8: + size = 1; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWS; + goto Iop_V_int_wrk; + case Iop_Widen16Sto32x4: + size = 2; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWS; + goto Iop_V_int_wrk; + case Iop_Widen32Sto64x2: + size = 4; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWS; + goto Iop_V_int_wrk; + case Iop_Widen8Uto16x8: + size = 1; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWU; + goto Iop_V_int_wrk; + case Iop_Widen16Uto32x4: + size = 2; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWU; + goto Iop_V_int_wrk; + case Iop_Widen32Uto64x2: + size = 4; + size_for_int_arg = 8; + vec_unop = S390_VEC_UNPACKLOWU; + goto Iop_V_int_wrk; + + Iop_V_int_wrk: { + HReg vr1 = vec_generate_zeroes(env); + s390_amode* amode2 = s390_isel_amode(env, IRExpr_Const(IRConst_U64(0))); + reg1 = s390_isel_int_expr(env, arg); + + vassert(vec_unop != S390_UNOP_T_INVALID); + addInstr(env, + s390_insn_vec_amodeintop(size_for_int_arg, S390_VEC_SET_ELEM, + vr1, amode2, reg1)); + + dst = newVRegV(env); + addInstr(env, s390_insn_unop(size, vec_unop, dst, s390_opnd_reg(vr1))); + return dst; + } + + case Iop_Abs8x16: + size = 1; + vec_unop = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs16x8: + size = 2; + vec_unop = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs32x4: + size = 4; + vec_unop = S390_VEC_ABS; + goto Iop_V_wrk; + case Iop_Abs64x2: + size = 8; + vec_unop = S390_VEC_ABS; + goto Iop_V_wrk; + + case Iop_Clz8x16: + size = 1; + vec_unop = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz8x16: + size = 1; + vec_unop = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz16x8: + size = 2; + vec_unop = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz16x8: + size = 2; + vec_unop = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz32x4: + size = 4; + vec_unop = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz32x4: + size = 4; + vec_unop = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + case Iop_Clz64x2: + size = 8; + vec_unop = S390_VEC_COUNT_LEADING_ZEROES; + goto Iop_V_wrk; + case Iop_Ctz64x2: + size = 8; + vec_unop = S390_VEC_COUNT_TRAILING_ZEROES; + goto Iop_V_wrk; + + case Iop_Cnt8x16: + size = 1; + vec_unop = S390_VEC_COUNT_ONES; + goto Iop_V_wrk; + + case Iop_Neg64Fx2: + size = 8; + vec_unop = S390_VEC_FLOAT_NEG; + goto Iop_V_wrk; + + case Iop_Abs64Fx2: + size = 8; + vec_unop = S390_VEC_FLOAT_ABS; + goto Iop_V_wrk; + + + Iop_V_wrk: { + dst = newVRegV(env); + reg1 = s390_isel_vec_expr(env, arg); + + vassert(vec_unop != S390_UNOP_T_INVALID); + addInstr(env, + s390_insn_unop(size, vec_unop, dst, s390_opnd_reg(reg1))); + return dst; + } + + case Iop_PwAddL8Ux16: { + /* There is no such instruction. We have to emulate it. 
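         Interleaving the argument with a zero vector zero-extends its
         even-numbered and odd-numbered bytes into 16-bit lanes, so the
         Add16x8 below sums exactly the adjacent byte pairs.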
*/ + IRExpr *even = IRExpr_Binop(Iop_InterleaveEvenLanes8x16, + IRExpr_Const(IRConst_V128(0x0000)), + arg); + IRExpr *odd = IRExpr_Binop(Iop_InterleaveOddLanes8x16, + IRExpr_Const(IRConst_V128(0x0000)), + arg); + dst = s390_isel_vec_expr(env, IRExpr_Binop(Iop_Add16x8, even, odd)); + return dst; + } + + case Iop_PwAddL16Ux8: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL8Ux16) { + size = 1; + arg = arg->Iex.Unop.arg; + } else { + size = 2; + } + vec_binop = S390_VEC_PWSUM_W; + goto Iop_Pairwise_wrk; + + case Iop_PwAddL32Ux4: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL16Ux8) { + size = 2; + arg = arg->Iex.Unop.arg; + } else { + size = 4; + } + vec_binop = S390_VEC_PWSUM_DW; + goto Iop_Pairwise_wrk; + + case Iop_PwAddL64Ux2: + if (arg->tag == Iex_Unop && arg->Iex.Unop.op == Iop_PwAddL32Ux4) { + size = 4; + arg = arg->Iex.Unop.arg; + } else { + size = 8; + } + vec_binop = S390_VEC_PWSUM_QW; + goto Iop_Pairwise_wrk; + + Iop_Pairwise_wrk: { + dst = newVRegV(env); + reg1 = s390_isel_vec_expr(env, arg); + vassert(vec_binop != S390_VEC_BINOP_T_INVALID); + addInstr(env, + s390_insn_vec_binop(size, vec_binop, dst, reg1, + vec_generate_zeroes(env))); + return dst; + } + + default: + goto irreducible; + } + } + + /* --------- BINARY OP --------- */ + case Iex_Binop: { + HReg dst = newVRegV(env); + HReg reg1 = INVALID_HREG, reg2 = INVALID_HREG; + IROp op = expr->Iex.Binop.op; + s390_unop_t vec_unop = S390_UNOP_T_INVALID; + s390_vec_binop_t vec_binop = S390_VEC_BINOP_T_INVALID; + s390_vec_amodeop_t shift_op = S390_VEC_AMODEOP_T_INVALID; + IRExpr* arg1 = expr->Iex.Binop.arg1; + IRExpr* arg2 = expr->Iex.Binop.arg2; + switch(op) { + case Iop_QNarrowBin16Uto8Ux16: + size = 2; + vec_binop = S390_VEC_PACK_SATURU; + goto Iop_VV_wrk; + case Iop_QNarrowBin16Sto8Sx16: + size = 2; + vec_binop = S390_VEC_PACK_SATURS; + goto Iop_VV_wrk; + case Iop_QNarrowBin32Uto16Ux8: + size = 4; + vec_binop = S390_VEC_PACK_SATURU; + goto Iop_VV_wrk; + case Iop_QNarrowBin32Sto16Sx8: + size = 4; + vec_binop = S390_VEC_PACK_SATURS; + goto Iop_VV_wrk; + case Iop_QNarrowBin64Uto32Ux4: + size = 8; + vec_binop = S390_VEC_PACK_SATURU; + goto Iop_VV_wrk; + case Iop_QNarrowBin64Sto32Sx4: + size = 8; + vec_binop = S390_VEC_PACK_SATURS; + goto Iop_VV_wrk; + + case Iop_NarrowBin16to8x16: + size = 2; + vec_binop = S390_VEC_PACK; + goto Iop_VV_wrk; + case Iop_NarrowBin32to16x8: + size = 4; + vec_binop = S390_VEC_PACK; + goto Iop_VV_wrk; + case Iop_NarrowBin64to32x4: + size = 8; + vec_binop = S390_VEC_PACK; + goto Iop_VV_wrk; + + case Iop_OrV128: + size = 16; + vec_binop = S390_VEC_OR; + goto Iop_VV_wrk; + + case Iop_XorV128: + size = 16; + vec_binop = S390_VEC_XOR; + goto Iop_VV_wrk; + + case Iop_AndV128: + size = 16; + vec_binop = S390_VEC_AND; + goto Iop_VV_wrk; + + case Iop_InterleaveLO8x16: + size = 1; + vec_binop = S390_VEC_MERGEL; + goto Iop_VV_wrk; + case Iop_InterleaveLO16x8: + size = 2; + vec_binop = S390_VEC_MERGEL; + goto Iop_VV_wrk; + case Iop_InterleaveLO32x4: + size = 4; + vec_binop = S390_VEC_MERGEL; + goto Iop_VV_wrk; + case Iop_InterleaveLO64x2: + size = 8; + vec_binop = S390_VEC_MERGEL; + goto Iop_VV_wrk; + + case Iop_InterleaveHI8x16: + size = 1; + vec_binop = S390_VEC_MERGEH; + goto Iop_VV_wrk; + case Iop_InterleaveHI16x8: + size = 2; + vec_binop = S390_VEC_MERGEH; + goto Iop_VV_wrk; + case Iop_InterleaveHI32x4: + size = 4; + vec_binop = S390_VEC_MERGEH; + goto Iop_VV_wrk; + case Iop_InterleaveHI64x2: + size = 8; + vec_binop = S390_VEC_MERGEH; + goto Iop_VV_wrk; + + case 
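      /* VPERM selects each result byte from the 32-byte concatenation
         of the two source operands: mask byte values 0x00..0x0f pick
         from the first source, 0x10..0x1f from the second. The mask
         0x00,0x10,0x02,0x12,... below therefore alternates the
         even-numbered bytes of both sources, and the odd-lane variant
         uses 0x01,0x11,0x03,0x13,... accordingly. */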
Iop_InterleaveEvenLanes8x16: { + /* There is no such instruction. We have to emulate it. */ + IRExpr* mask = IRExpr_Binop(Iop_64HLtoV128, + mkU64(0x0010021204140616ULL), + mkU64(0x08180a1a0c1c0e1eULL)); + HReg reg_mask = s390_isel_vec_expr(env, mask); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + + addInstr(env, + s390_insn_vec_triop(16, S390_VEC_PERM, dst, reg1, reg2, + reg_mask) + ); + + return dst; + } + case Iop_InterleaveOddLanes8x16: { + /* There is no such instruction. We have to emulate it. */ + IRExpr* mask = IRExpr_Binop(Iop_64HLtoV128, + mkU64(0x0111031305150717ULL), + mkU64(0x09190b1b0d1d0f1fULL)); + HReg reg_mask = s390_isel_vec_expr(env, mask); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + + addInstr(env, + s390_insn_vec_triop(16, S390_VEC_PERM, dst, reg1, reg2, reg_mask) + ); + + return dst; + } + + case Iop_CmpEQ8x16: + size = 1; + vec_binop = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ16x8: + size = 2; + vec_binop = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ32x4: + size = 4; + vec_binop = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + case Iop_CmpEQ64x2: + size = 8; + vec_binop = S390_VEC_COMPARE_EQUAL; + goto Iop_VV_wrk; + + case Iop_Add8x16: + size = 1; + vec_binop = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add16x8: + size = 2; + vec_binop = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add32x4: + size = 4; + vec_binop = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add64x2: + size = 8; + vec_binop = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + case Iop_Add128x1: + size = 16; + vec_binop = S390_VEC_INT_ADD; + goto Iop_VV_wrk; + + case Iop_Sub8x16: + size = 1; + vec_binop = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub16x8: + size = 2; + vec_binop = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub32x4: + size = 4; + vec_binop = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub64x2: + size = 8; + vec_binop = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + case Iop_Sub128x1: + size = 16; + vec_binop = S390_VEC_INT_SUB; + goto Iop_VV_wrk; + + case Iop_Max8Ux16: + size = 1; + vec_binop = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max8Sx16: + size = 1; + vec_binop = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max16Ux8: + size = 2; + vec_binop = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max16Sx8: + size = 2; + vec_binop = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max32Ux4: + size = 4; + vec_binop = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max32Sx4: + size = 4; + vec_binop = S390_VEC_MAXS; + goto Iop_VV_wrk; + case Iop_Max64Ux2: + size = 8; + vec_binop = S390_VEC_MAXU; + goto Iop_VV_wrk; + case Iop_Max64Sx2: + size = 8; + vec_binop = S390_VEC_MAXS; + goto Iop_VV_wrk; + + case Iop_Min8Ux16: + size = 1; + vec_binop = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min8Sx16: + size = 1; + vec_binop = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min16Ux8: + size = 2; + vec_binop = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min16Sx8: + size = 2; + vec_binop = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min32Ux4: + size = 4; + vec_binop = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min32Sx4: + size = 4; + vec_binop = S390_VEC_MINS; + goto Iop_VV_wrk; + case Iop_Min64Ux2: + size = 8; + vec_binop = S390_VEC_MINU; + goto Iop_VV_wrk; + case Iop_Min64Sx2: + size = 8; + vec_binop = S390_VEC_MINS; + goto Iop_VV_wrk; + + case Iop_Avg8Ux16: + size = 1; + vec_binop = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg8Sx16: + size = 1; + vec_binop = S390_VEC_AVGS; + goto 
Iop_VV_wrk; + case Iop_Avg16Ux8: + size = 2; + vec_binop = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg16Sx8: + size = 2; + vec_binop = S390_VEC_AVGS; + goto Iop_VV_wrk; + case Iop_Avg32Ux4: + size = 4; + vec_binop = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg32Sx4: + size = 4; + vec_binop = S390_VEC_AVGS; + goto Iop_VV_wrk; + case Iop_Avg64Ux2: + size = 8; + vec_binop = S390_VEC_AVGU; + goto Iop_VV_wrk; + case Iop_Avg64Sx2: + size = 8; + vec_binop = S390_VEC_AVGS; + goto Iop_VV_wrk; + + case Iop_CmpGT8Ux16: + size = 1; + vec_binop = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT8Sx16: + size = 1; + vec_binop = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT16Ux8: + size = 2; + vec_binop = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT16Sx8: + size = 2; + vec_binop = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT32Ux4: + size = 4; + vec_binop = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT32Sx4: + size = 4; + vec_binop = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + case Iop_CmpGT64Ux2: + size = 8; + vec_binop = S390_VEC_COMPARE_GREATERU; + goto Iop_VV_wrk; + case Iop_CmpGT64Sx2: + size = 8; + vec_binop = S390_VEC_COMPARE_GREATERS; + goto Iop_VV_wrk; + + case Iop_MulHi8Ux16: + size = 1; + vec_binop = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi8Sx16: + size = 1; + vec_binop = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + case Iop_MulHi16Ux8: + size = 2; + vec_binop = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi16Sx8: + size = 2; + vec_binop = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + case Iop_MulHi32Ux4: + size = 4; + vec_binop = S390_VEC_INT_MUL_HIGHU; + goto Iop_VV_wrk; + case Iop_MulHi32Sx4: + size = 4; + vec_binop = S390_VEC_INT_MUL_HIGHS; + goto Iop_VV_wrk; + + case Iop_Mul8x16: + size = 1; + vec_binop = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + case Iop_Mul16x8: + size = 2; + vec_binop = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + case Iop_Mul32x4: + size = 4; + vec_binop = S390_VEC_INT_MUL_LOW; + goto Iop_VV_wrk; + + case Iop_MullEven8Sx16: + size = 1; + vec_binop = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven8Ux16: + size = 1; + vec_binop = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + case Iop_MullEven16Sx8: + size = 2; + vec_binop = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven16Ux8: + size = 2; + vec_binop = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + case Iop_MullEven32Sx4: + size = 4; + vec_binop = S390_VEC_INT_MUL_EVENS; + goto Iop_VV_wrk; + case Iop_MullEven32Ux4: + size = 4; + vec_binop = S390_VEC_INT_MUL_EVENU; + goto Iop_VV_wrk; + + case Iop_Shl8x16: + size = 1; + vec_binop = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl16x8: + size = 2; + vec_binop = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl32x4: + size = 4; + vec_binop = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + case Iop_Shl64x2: + size = 8; + vec_binop = S390_VEC_ELEM_SHL_V; + goto Iop_VV_wrk; + + case Iop_Shr8x16: + size = 1; + vec_binop = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr16x8: + size = 2; + vec_binop = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr32x4: + size = 4; + vec_binop = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + case Iop_Shr64x2: + size = 8; + vec_binop = S390_VEC_ELEM_SHRL_V; + goto Iop_VV_wrk; + + case Iop_Sar8x16: + size = 1; + vec_binop = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar16x8: + size = 2; + vec_binop = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar32x4: + size = 4; + 
vec_binop = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + case Iop_Sar64x2: + size = 8; + vec_binop = S390_VEC_ELEM_SHRA_V; + goto Iop_VV_wrk; + + case Iop_Rol8x16: + size = 1; + vec_binop = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol16x8: + size = 2; + vec_binop = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol32x4: + size = 4; + vec_binop = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + case Iop_Rol64x2: + size = 8; + vec_binop = S390_VEC_ELEM_ROLL_V; + goto Iop_VV_wrk; + + case Iop_CmpEQ64Fx2: + size = 8; + vec_binop = S390_VEC_FLOAT_COMPARE_EQUAL; + goto Iop_VV_wrk; + + case Iop_CmpLE64Fx2: { + size = 8; + vec_binop = S390_VEC_FLOAT_COMPARE_LESS_OR_EQUAL; + goto Iop_VV_wrk; + } + + case Iop_CmpLT64Fx2: { + size = 8; + vec_binop = S390_VEC_FLOAT_COMPARE_LESS; + goto Iop_VV_wrk; + } + + case Iop_Sqrt64Fx2: + size = 8; + vec_unop = S390_VEC_FLOAT_SQRT; + goto Iop_irrm_V_wrk; + + case Iop_ShlN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShlN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShlN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShlN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHL_INT; + goto Iop_ShiftN_wrk; + + case Iop_ShrN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + case Iop_ShrN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHRL_INT; + goto Iop_ShiftN_wrk; + + case Iop_SarN8x16: + size = 1; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN16x8: + size = 2; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN32x4: + size = 4; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + case Iop_SarN64x2: + size = 8; + shift_op = S390_VEC_ELEM_SHRA_INT; + goto Iop_ShiftN_wrk; + + Iop_ShiftN_wrk: { + HReg vec = s390_isel_vec_expr(env, arg1); + s390_amode* number = s390_isel_amode(env,IRExpr_Unop(Iop_8Uto64, arg2)); + + addInstr(env, + s390_insn_vec_amodeop(size, shift_op, dst, vec, number)); + + return dst; + } + + case Iop_ShlV128: + vec_binop = S390_VEC_SHL_BITS; + goto Iop_ShiftVV_wrk; + case Iop_ShrV128: + vec_binop = S390_VEC_SHRL_BITS; + goto Iop_ShiftVV_wrk; + case Iop_SarV128: + vec_binop = S390_VEC_SHRA_BITS; + goto Iop_ShiftVV_wrk; + + Iop_ShiftVV_wrk: { + vassert(vec_binop != S390_VEC_BINOP_T_INVALID); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, IRExpr_Unop(Iop_Dup8x16, arg2)); + + /* Handle special case */ + if (vec_is_bytes_only_shift(arg2)) + { + /* In this case we skip the BITS shift step. 
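            The '(vec_binop + 1)' used below selects the matching BYTES
            variant; this relies on the deliberate BITS/BYTES pairing of
            the s390_vec_binop_t enumerators, which host_s390_defs.h
            marks with 'host_s390_isel depends on this order'.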
*/ + addInstr(env, s390_insn_vec_binop(16, (vec_binop + 1), + dst, reg1, reg2)); + + return dst; + } + + /* General case (BYTES shift & BITS shift) */ + addInstr(env, s390_insn_vec_binop(16, (vec_binop + 1), + dst, reg1, reg2)); + + addInstr(env, s390_insn_vec_binop(16, vec_binop, + dst, dst, reg2)); + + return dst; + } + + Iop_VV_wrk: { + vassert(vec_binop != S390_VEC_BINOP_T_INVALID); + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + + addInstr(env, s390_insn_vec_binop(size, vec_binop, + dst, reg1, reg2)); + + return dst; + } + + Iop_irrm_V_wrk: { + vassert(vec_unop != S390_UNOP_T_INVALID); + set_bfp_rounding_mode_in_fpc(env, arg1); + reg1 = s390_isel_vec_expr(env, arg2); + + addInstr(env, s390_insn_unop(size, vec_unop, dst, s390_opnd_reg(reg1))); + return dst; + } + + case Iop_64HLtoV128: + reg1 = s390_isel_int_expr(env, arg1); + reg2 = s390_isel_int_expr(env, arg2); + + addInstr(env, s390_insn_vec_binop(size, S390_VEC_INIT_FROM_GPRS, + dst, reg1, reg2)); + + return dst; + + default: + goto irreducible; + } + } + + /* --------- TERNARY OP --------- */ + case Iex_Triop: { + HReg dst = newVRegV(env); + s390_amode* amode2 = NULL; + HReg reg1 = INVALID_HREG, reg2 = INVALID_HREG, reg3 = INVALID_HREG; + IROp op = expr->Iex.Triop.details->op; + IRExpr* arg1 = expr->Iex.Triop.details->arg1; + IRExpr* arg2 = expr->Iex.Triop.details->arg2; + IRExpr* arg3 = expr->Iex.Triop.details->arg3; + s390_vec_binop_t vec_binop = S390_VEC_BINOP_T_INVALID; + switch (op) { + case Iop_SetElem8x16: + size = 1; + goto Iop_SetElem_wrk; + case Iop_SetElem16x8: + size = 2; + goto Iop_SetElem_wrk; + case Iop_SetElem32x4: + size = 4; + goto Iop_SetElem_wrk; + case Iop_SetElem64x2: { + size = 8; + + Iop_SetElem_wrk:{ + reg1 = s390_isel_vec_expr(env, arg1); + amode2 = s390_isel_amode(env, IRExpr_Unop(Iop_8Uto64, arg2)); + reg3 = s390_isel_int_expr(env, arg3); + + addInstr(env, s390_insn_move(16, dst, reg1)); + addInstr(env, s390_insn_vec_amodeintop(size, S390_VEC_SET_ELEM, + dst, amode2, reg3)); + return dst; + } + } + + case Iop_Perm8x16x2: + size = 16; + reg1 = s390_isel_vec_expr(env, arg1); + reg2 = s390_isel_vec_expr(env, arg2); + reg3 = s390_isel_vec_expr(env, arg3); + + addInstr(env, s390_insn_vec_triop(size, S390_VEC_PERM, + dst, reg1, reg2, reg3)); + return dst; + + case Iop_Add64Fx2: + size = 8; + vec_binop = S390_VEC_FLOAT_ADD; + goto Iop_irrm_VV_wrk; + + case Iop_Sub64Fx2: + size = 8; + vec_binop = S390_VEC_FLOAT_SUB; + goto Iop_irrm_VV_wrk; + + case Iop_Mul64Fx2: + size = 8; + vec_binop = S390_VEC_FLOAT_MUL; + goto Iop_irrm_VV_wrk; + case Iop_Div64Fx2: + size = 8; + vec_binop = S390_VEC_FLOAT_DIV; + goto Iop_irrm_VV_wrk; + + Iop_irrm_VV_wrk: { + vassert(vec_binop != S390_VEC_BINOP_T_INVALID); + set_bfp_rounding_mode_in_fpc(env, arg1); + reg1 = s390_isel_vec_expr(env, arg2); + reg2 = s390_isel_vec_expr(env, arg3); + + addInstr(env, s390_insn_vec_binop(size, vec_binop, + dst, reg1, reg2)); + + return dst; + } + + default: + goto irreducible; + } + } + + default: + goto irreducible; + } + + /* We get here if no pattern matched. */ + irreducible: + ppIRExpr(expr); + vpanic("s390_isel_vec_expr: cannot reduce tree"); +} + +static HReg +s390_isel_vec_expr(ISelEnv *env, IRExpr *expr) +{ + HReg dst = s390_isel_vec_expr_wrk(env, expr); + + /* Sanity checks ... 
*/ + vassert(hregClass(dst) == HRcVec128); + vassert(hregIsVirtual(dst)); + + return dst; +} + + /*---------------------------------------------------------*/ /*--- ISEL: Statements ---*/ /*---------------------------------------------------------*/ @@ -3618,6 +4730,9 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) /* Cannot occur. No such instruction */ vpanic("Ist_Store with 128-bit floating point data"); + case Ity_V128: + src = s390_isel_vec_expr(env, stmt->Ist.Store.data); + break; default: goto stmt_fail; } @@ -3764,6 +4879,9 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) src = s390_isel_dfp_expr(env, stmt->Ist.Put.data); break; + case Ity_V128: + src = s390_isel_vec_expr(env, stmt->Ist.Put.data); + break; default: goto stmt_fail; } @@ -3839,6 +4957,11 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) return; } + case Ity_V128: + src = s390_isel_vec_expr(env, stmt->Ist.WrTmp.data); + dst = lookupIRTemp(env, tmp); + break; + default: goto stmt_fail; } @@ -3883,7 +5006,7 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) retty = typeOfIRTemp(env->type_env, d->tmp); if (retty == Ity_I64 || retty == Ity_I32 - || retty == Ity_I16 || retty == Ity_I8) { + || retty == Ity_I16 || retty == Ity_I8 || retty == Ity_V128) { /* Move the returned value to the destination register */ HReg ret = make_gpr(S390_REGNO_RETURN_VALUE); @@ -3891,10 +5014,28 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) doHelperCall(&addToSp, &rloc, env, d->guard, d->cee, retty, d->args); vassert(is_sane_RetLoc(rloc)); - vassert(rloc.pri == RLPri_Int); - vassert(addToSp == 0); - addInstr(env, s390_insn_move(sizeof(ULong), dst, ret)); + switch(retty) + { + case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: + vassert(rloc.pri == RLPri_Int); + vassert(addToSp == 0); + addInstr(env, s390_insn_move(sizeof(ULong), dst, ret)); + break; + case Ity_V128: + /* The returned value is on the stack, and rloc.spOff + tells us where. Fish it off the stack and then move + the stack pointer upwards to clear it, as directed by + doHelperCall. */ + vassert(rloc.pri == RLPri_V128SpRel); + vassert(addToSp == sizeof(V128)); + s390_amode* am = s390_amode_b12(rloc.spOff, s390_hreg_stack_pointer()); + addInstr(env, s390_insn_load(sizeof(V128), dst, am)); + add_to_SP(env, addToSp); + break; + default: + vpanic("s390_isel_stmt: invalid return type from dirty helper"); + } return; } break; @@ -4000,7 +5141,8 @@ s390_isel_stmt(ISelEnv *env, IRStmt *stmt) case Ijk_ClientReq: case Ijk_NoRedir: case Ijk_Yield: - case Ijk_SigTRAP: { + case Ijk_SigTRAP: + case Ijk_SigFPE: { HReg dst = s390_isel_int_expr(env, IRExpr_Const(stmt->Ist.Exit.dst)); addInstr(env, s390_insn_xassisted(cond, dst, guest_IA, stmt->Ist.Exit.jk)); @@ -4115,7 +5257,8 @@ iselNext(ISelEnv *env, IRExpr *next, IRJumpKind jk, Int offsIP) case Ijk_ClientReq: case Ijk_NoRedir: case Ijk_Yield: - case Ijk_SigTRAP: { + case Ijk_SigTRAP: + case Ijk_SigFPE: { HReg dst = s390_isel_int_expr(env, next); addInstr(env, s390_insn_xassisted(S390_CC_ALWAYS, dst, guest_IA, jk)); return; @@ -4220,7 +5363,10 @@ iselSB_S390(const IRSB *bb, VexArch arch_host, const VexArchInfo *archinfo_host, hregHI = mkVRegF(j++); break; - case Ity_V128: /* fall through */ + case Ity_V128: + hreg = mkVRegV(j++); + break; + default: ppIRType(bb->tyenv->types[i]); vpanic("iselSB_S390: IRTemp type"); diff --git a/priv/s390_defs.h b/priv/s390_defs.h index 6751d8063..56886dbe4 100644 --- a/priv/s390_defs.h +++ b/priv/s390_defs.h @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 
2010-2015
+   Copyright IBM Corp. 2010-2017
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
@@ -146,6 +146,28 @@ typedef enum {
    S390_PFPO_D128_TO_F128 = 0x01070A
 } s390_pfpo_function_t;
 
+/* PPNO function code as it is encoded in bits [57:63] of GR0
+   when the PPNO insn is executed. */
+typedef enum
+{
+   S390_PPNO_QUERY       = 0x00,
+   S390_PPNO_SHA512_GEN  = 0x03,
+   S390_PPNO_SHA512_SEED = 0x83
+} s390_ppno_function_t;
+
+/* Size of the parameter block for PPNO functions.
+   All values are in bytes.
+*/
+#define S390_PPNO_PARAM_BLOCK_SIZE_QUERY  16
+#define S390_PPNO_PARAM_BLOCK_SIZE_SHA512 240
+
+/* Maximum length of modified memory for PPNO functions.
+   All values are in bytes.
+*/
+#define S390_PPNO_MAX_SIZE_SHA512_SEED 512
+#define S390_PPNO_MAX_SIZE_SHA512_GEN  64
+
+
 /* The length of the longest mnemonic: locgrnhe */
 #define S390_MAX_MNEMONIC_LEN  8
 
diff --git a/priv/s390_disasm.c b/priv/s390_disasm.c
index fa18ca5d4..58189f123 100644
--- a/priv/s390_disasm.c
+++ b/priv/s390_disasm.c
@@ -8,7 +8,7 @@
    This file is part of Valgrind, a dynamic binary instrumentation
    framework.
 
-   Copyright IBM Corp. 2010-2015
+   Copyright IBM Corp. 2010-2017
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
@@ -135,6 +135,26 @@ cab_operand(const HChar *base, UInt mask)
 }
 
 
+/* Return the name of a vector register for dis-assembly purposes. */
+static const HChar *
+vr_operand(UInt archreg)
+{
+   static const HChar names[32][5] = {
+      "%v0", "%v1", "%v2", "%v3",
+      "%v4", "%v5", "%v6", "%v7",
+      "%v8", "%v9", "%v10", "%v11",
+      "%v12", "%v13", "%v14", "%v15",
+      "%v16", "%v17", "%v18", "%v19",
+      "%v20", "%v21", "%v22", "%v23",
+      "%v24", "%v25", "%v26", "%v27",
+      "%v28", "%v29", "%v30", "%v31",
+   };
+
+   vassert(archreg < 32);
+
+   return names[archreg];
+}
+
+
 /* Common function used to construct a mnemonic based on a condition
    code mask. */
 static const HChar *
@@ -231,6 +251,12 @@ cls_operand(Int kind, UInt mask)
    case S390_XMNM_LOCG:   prefix = "locg";   break;
    case S390_XMNM_STOC:   prefix = "stoc";   break;
    case S390_XMNM_STOCG:  prefix = "stocg";  break;
+   case S390_XMNM_STOCFH: prefix = "stocfh"; break;
+   case S390_XMNM_LOCFH:  prefix = "locfh";  break;
+   case S390_XMNM_LOCFHR: prefix = "locfhr"; break;
+   case S390_XMNM_LOCHI:  prefix = "lochi";  break;
+   case S390_XMNM_LOCGHI: prefix = "locghi"; break;
+   case S390_XMNM_LOCHHI: prefix = "lochhi"; break;
    default: vpanic("cls_operand");
    }
 
@@ -284,13 +310,42 @@ udlb_operand(HChar *p, UInt d, UInt length, UInt b)
 }
 
 
+/* An operand with a base register, a vector register, and a
+   displacement. If the displacement is signed, the rightmost 20 bits
+   of D need to be sign extended. */
+static HChar *
+dvb_operand(HChar *p, UInt d, UInt v, UInt b, Bool displacement_is_signed)
+{
+   if (displacement_is_signed) {
+      Int displ = (Int)(d << 12) >> 12;  /* sign extend */
+
+      p += vex_sprintf(p, "%d", displ);
+   } else {
+      p += vex_sprintf(p, "%u", d);
+   }
+   if (v != 0) {
+      p += vex_sprintf(p, "(%s", vr_operand(v));
+      if (b != 0) {
+         p += vex_sprintf(p, ",%s", gpr_operand(b));
+      }
+      p += vex_sprintf(p, ")");
+   } else {
+      if (b != 0) {
+         p += vex_sprintf(p, "(%s)", gpr_operand(b));
+      }
+   }
+
+   return p;
+}
+
+
 /* The first argument is the command that says how to write the disassembled
    insn. It is understood that the mnemonic comes first and that arguments
    are separated by a ','. The command holds the arguments. Each argument is
    encoded using a 4-bit S390_ARG_xyz value.
The first argument is placed in the least significant bits of the command and so on. There are at most - 5 arguments in an insn and a sentinel (S390_ARG_DONE) is needed to identify - the end of the argument list. 6 * 4 = 24 bits are required for the + 7 arguments in an insn and a sentinel (S390_ARG_DONE) is needed to identify + the end of the argument list. 8 * 4 = 32 bits are required for the command. */ void s390_disasm(UInt command, ...) @@ -367,6 +422,12 @@ s390_disasm(UInt command, ...) case S390_XMNM_LOCG: case S390_XMNM_STOC: case S390_XMNM_STOCG: + case S390_XMNM_STOCFH: + case S390_XMNM_LOCFH: + case S390_XMNM_LOCFHR: + case S390_XMNM_LOCHI: + case S390_XMNM_LOCGHI: + case S390_XMNM_LOCHHI: mask = va_arg(args, UInt); mnm = cls_operand(kind, mask); p += vex_sprintf(p, "%s", mnemonic(mnm)); @@ -455,6 +516,21 @@ s390_disasm(UInt command, ...) } break; } + + case S390_ARG_VR: + p += vex_sprintf(p, "%s", vr_operand(va_arg(args, UInt))); + break; + + case S390_ARG_UDVB: { + UInt d, v, b; + + d = va_arg(args, UInt); + v = va_arg(args, UInt); + b = va_arg(args, UInt); + + p = dvb_operand(p, d, v, b, 0 /* signed_displacement */); + break; + } } separator = ','; diff --git a/priv/s390_disasm.h b/priv/s390_disasm.h index 2334f01e0..3cccceadf 100644 --- a/priv/s390_disasm.h +++ b/priv/s390_disasm.h @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 + Copyright IBM Corp. 2010-2017 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -51,6 +51,10 @@ #undef ENC6 #define ENC6(a,b,c,d,e,f) ((P(DONE) << 24) | (P(f) << 20) | (P(e) << 16) | \ (P(d) << 12) | (P(c) << 8) | (P(b) << 4) | P(a)) +#undef ENC7 +#define ENC7(a,b,c,d,e,f,g) ((P(DONE) << 28) | (P(g) << 24) | (P(f) << 20) | \ + (P(e) << 16) | (P(d) << 12) | (P(c) << 8) | \ + (P(b) << 4) | P(a)) /* The different kinds of operands in an asm insn */ enum { @@ -66,7 +70,9 @@ enum { S390_ARG_UDLB = 9, S390_ARG_CABM = 10, S390_ARG_MNM = 11, - S390_ARG_XMNM = 12 + S390_ARG_XMNM = 12, + S390_ARG_VR = 13, + S390_ARG_UDVB = 14, }; /* The different kinds of extended mnemonics */ @@ -81,7 +87,13 @@ enum { S390_XMNM_LOC = 7, S390_XMNM_LOCG = 8, S390_XMNM_STOC = 9, - S390_XMNM_STOCG = 10 + S390_XMNM_STOCG = 10, + S390_XMNM_STOCFH = 11, + S390_XMNM_LOCFH = 12, + S390_XMNM_LOCFHR = 13, + S390_XMNM_LOCHI = 14, + S390_XMNM_LOCGHI = 15, + S390_XMNM_LOCHHI = 16 }; void s390_disasm(UInt command, ...); diff --git a/pub/libvex_guest_s390x.h b/pub/libvex_guest_s390x.h index 99d5947b6..602f1a13b 100644 --- a/pub/libvex_guest_s390x.h +++ b/pub/libvex_guest_s390x.h @@ -8,7 +8,7 @@ This file is part of Valgrind, a dynamic binary instrumentation framework. - Copyright IBM Corp. 2010-2015 + Copyright IBM Corp. 
2010-2017
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
@@ -61,98 +61,122 @@ typedef struct {
    /* 60 */  UInt guest_a15;
 
 /*------------------------------------------------------------*/
-/*--- fpr registers                                        ---*/
-/*------------------------------------------------------------*/
-
-   /* 64 */  ULong guest_f0;
-   /* 72 */  ULong guest_f1;
-   /* 80 */  ULong guest_f2;
-   /* 88 */  ULong guest_f3;
-   /* 96 */  ULong guest_f4;
-   /* 104 */ ULong guest_f5;
-   /* 112 */ ULong guest_f6;
-   /* 120 */ ULong guest_f7;
-   /* 128 */ ULong guest_f8;
-   /* 136 */ ULong guest_f9;
-   /* 144 */ ULong guest_f10;
-   /* 152 */ ULong guest_f11;
-   /* 160 */ ULong guest_f12;
-   /* 168 */ ULong guest_f13;
-   /* 176 */ ULong guest_f14;
-   /* 184 */ ULong guest_f15;
+/*--- fpr & vr registers                                   ---*/
+/*------------------------------------------------------------*/
+
+   /*
+      FPRs[0-15] are mapped to the first doublewords of VRs[0-15].
+      According to the documentation, modifying fpr1 with an FP insn
+      leaves bits 64..127 of vr1 unpredictable, and modifying bits
+      64..127 of vr1 likewise leaves fpr1 unpredictable. In our
+      implementation writing to one half of a vr does not affect the
+      other half, but applications shouldn't rely on that.
+   */
+
+   /* 64 */  V128 guest_v0;
+   /* 80 */  V128 guest_v1;
+   /* 96 */  V128 guest_v2;
+   /* 112 */ V128 guest_v3;
+   /* 128 */ V128 guest_v4;
+   /* 144 */ V128 guest_v5;
+   /* 160 */ V128 guest_v6;
+   /* 176 */ V128 guest_v7;
+   /* 192 */ V128 guest_v8;
+   /* 208 */ V128 guest_v9;
+   /* 224 */ V128 guest_v10;
+   /* 240 */ V128 guest_v11;
+   /* 256 */ V128 guest_v12;
+   /* 272 */ V128 guest_v13;
+   /* 288 */ V128 guest_v14;
+   /* 304 */ V128 guest_v15;
+   /* 320 */ V128 guest_v16;
+   /* 336 */ V128 guest_v17;
+   /* 352 */ V128 guest_v18;
+   /* 368 */ V128 guest_v19;
+   /* 384 */ V128 guest_v20;
+   /* 400 */ V128 guest_v21;
+   /* 416 */ V128 guest_v22;
+   /* 432 */ V128 guest_v23;
+   /* 448 */ V128 guest_v24;
+   /* 464 */ V128 guest_v25;
+   /* 480 */ V128 guest_v26;
+   /* 496 */ V128 guest_v27;
+   /* 512 */ V128 guest_v28;
+   /* 528 */ V128 guest_v29;
+   /* 544 */ V128 guest_v30;
+   /* 560 */ V128 guest_v31;
 
 /*------------------------------------------------------------*/
 /*--- gpr registers                                        ---*/
 /*------------------------------------------------------------*/
 
-   /* 192 */ ULong guest_r0;
-   /* 200 */ ULong guest_r1;
-   /* 208 */ ULong guest_r2;
-   /* 216 */ ULong guest_r3;
-   /* 224 */ ULong guest_r4;
-   /* 232 */ ULong guest_r5;
-   /* 240 */ ULong guest_r6;
-   /* 248 */ ULong guest_r7;
-   /* 256 */ ULong guest_r8;
-   /* 264 */ ULong guest_r9;
-   /* 272 */ ULong guest_r10;
-   /* 280 */ ULong guest_r11;
-   /* 288 */ ULong guest_r12;
-   /* 296 */ ULong guest_r13;
-   /* 304 */ ULong guest_r14;
-   /* 312 */ ULong guest_r15;
+   /* 576 */ ULong guest_r0;
+   /* 584 */ ULong guest_r1;
+   /* 592 */ ULong guest_r2;
+   /* 600 */ ULong guest_r3;
+   /* 608 */ ULong guest_r4;
+   /* 616 */ ULong guest_r5;
+   /* 624 */ ULong guest_r6;
+   /* 632 */ ULong guest_r7;
+   /* 640 */ ULong guest_r8;
+   /* 648 */ ULong guest_r9;
+   /* 656 */ ULong guest_r10;
+   /* 664 */ ULong guest_r11;
+   /* 672 */ ULong guest_r12;
+   /* 680 */ ULong guest_r13;
+   /* 688 */ ULong guest_r14;
+   /* 696 */ ULong guest_r15;
 
 /*------------------------------------------------------------*/
 /*--- S390 miscellaneous registers                         ---*/
 /*------------------------------------------------------------*/
 
-   /* 320 */ ULong guest_counter;
-   /* 328 */ UInt guest_fpc;
-   /* 332 */ UChar unused[4]; /* 4-byte hole to get 8-byte alignment */
-   /* 336 */
ULong guest_IA;
+   /* 704 */ ULong guest_counter;
+   /* 712 */ UInt guest_fpc;
+   /* 716 */ UChar unused[4]; /* 4-byte hole to get 8-byte alignment */
+   /* 720 */ ULong guest_IA;
 
 /*------------------------------------------------------------*/
 /*--- S390 pseudo registers                                ---*/
 /*------------------------------------------------------------*/
 
-   /* 344 */ ULong guest_SYSNO;
+   /* 728 */ ULong guest_SYSNO;
 
 /*------------------------------------------------------------*/
 /*--- 4-word thunk used to calculate the condition code    ---*/
 /*------------------------------------------------------------*/
 
-   /* 352 */ ULong guest_CC_OP;
-   /* 360 */ ULong guest_CC_DEP1;
-   /* 368 */ ULong guest_CC_DEP2;
-   /* 376 */ ULong guest_CC_NDEP;
+   /* 736 */ ULong guest_CC_OP;
+   /* 744 */ ULong guest_CC_DEP1;
+   /* 752 */ ULong guest_CC_DEP2;
+   /* 760 */ ULong guest_CC_NDEP;
 
 /*------------------------------------------------------------*/
 /*--- Pseudo registers. Required by all architectures      ---*/
 /*------------------------------------------------------------*/
 
    /* See comments at bottom of libvex.h */
-   /* 384 */ ULong guest_NRADDR;
-   /* 392 */ ULong guest_CMSTART;
-   /* 400 */ ULong guest_CMLEN;
+   /* 768 */ ULong guest_NRADDR;
+   /* 776 */ ULong guest_CMSTART;
+   /* 784 */ ULong guest_CMLEN;
 
    /* Used when backing up to restart a syscall that has
       been interrupted by a signal. See also comment in
      libvex_ir.h */
-   /* 408 */ ULong guest_IP_AT_SYSCALL;
+   /* 792 */ ULong guest_IP_AT_SYSCALL;
 
    /* Emulation notes; see comments in libvex_emnote.h */
-   /* 416 */ UInt guest_EMNOTE;
+   /* 800 */ UInt guest_EMNOTE;
 
    /* For translation chaining */
-   /* 420 */ UInt host_EvC_COUNTER;
-   /* 424 */ ULong host_EvC_FAILADDR;
+   /* 804 */ UInt host_EvC_COUNTER;
+   /* 808 */ ULong host_EvC_FAILADDR;
 
 /*------------------------------------------------------------*/
 /*--- Force alignment to 16 bytes                          ---*/
 /*------------------------------------------------------------*/
-   /* 432 */ UChar padding[0];
+   /* 816 */ UChar padding[0];
 
-   /* 432 */ /* This is the size of the guest state */
+   /* 816 */ /* This is the size of the guest state */
 } VexGuestS390XState;
diff --git a/pub/libvex_s390x_common.h b/pub/libvex_s390x_common.h
index ecdc11be1..8723ee21d 100644
--- a/pub/libvex_s390x_common.h
+++ b/pub/libvex_s390x_common.h
@@ -8,7 +8,7 @@
    This file is part of Valgrind, a dynamic binary instrumentation
    framework.
 
-   Copyright IBM Corp. 2010-2015
+   Copyright IBM Corp. 2010-2017
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
@@ -99,8 +99,11 @@
 #define S390_FAC_DFPZC   48  // DFP zoned-conversion
 #define S390_FAC_MISC    49  // miscellaneous insn
 #define S390_FAC_CTREXE  50  // constrained transactional execution
+#define S390_FAC_LSC2    53  // load/store on condition 2 and load and zero rightmost byte
+#define S390_FAC_MSA5    57  // message-security-assist 5
 #define S390_FAC_TREXE   73  // transactional execution
 #define S390_FAC_MSA4    77  // message-security-assist 4
+#define S390_FAC_VX     129  // vector facility
 
 
 /*--------------------------------------------------------------*/
@@ -111,7 +114,7 @@
 #define S390_NUM_GPRPARMS 5
 
 /* Number of double words needed to store all facility bits.
From 2bbbc71f946a031666bdb5904a4db06c572a6faf Mon Sep 17 00:00:00 2001
From: mephi42
Date: Fri, 10 May 2019 13:18:20 +0200
Subject: [PATCH 9/9] s390x: update to upstream revision 379c62017

---
 priv/guest_s390_defs.h    |   8 +-
 priv/guest_s390_helpers.c |  20 ++-
 priv/guest_s390_toIR.c    | 247 +++++++++++++++++++++++++++++++++-----
 priv/host_s390_defs.c     |  30 ++++-
 priv/host_s390_defs.h     |   4 +-
 priv/host_s390_isel.c     |  11 +-
 priv/s390_defs.h          |   4 +-
 priv/s390_disasm.c        |  14 ++-
 priv/s390_disasm.h        |   7 +-
 pub/libvex.h              |   4 +-
 10 files changed, 287 insertions(+), 62 deletions(-)

diff --git a/priv/guest_s390_defs.h b/priv/guest_s390_defs.h
index d72cc9f6d..1470558ce 100644
--- a/priv/guest_s390_defs.h
+++ b/priv/guest_s390_defs.h
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -160,7 +158,9 @@ enum {
    S390_CC_OP_DFP_128_TO_INT_64 = 57,
    S390_CC_OP_PFPO_32 = 58,
    S390_CC_OP_PFPO_64 = 59,
-   S390_CC_OP_PFPO_128 = 60
+   S390_CC_OP_PFPO_128 = 60,
+   S390_CC_OP_MUL_32 = 61,
+   S390_CC_OP_MUL_64 = 62
 };
 
 /*------------------------------------------------------------*/
diff --git a/priv/guest_s390_helpers.c b/priv/guest_s390_helpers.c
index 5877743c9..525e7000c 100644
--- a/priv/guest_s390_helpers.c
+++ b/priv/guest_s390_helpers.c
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -992,6 +990,16 @@ decode_bfp_rounding_mode(UInt irrm)
    psw >> 28;   /* cc */ \
 })
 
+#define S390_CC_FOR_TERNARY(opcode,cc_dep1,cc_dep2) \
+({ \
+   __asm__ volatile ( \
+        opcode ",%[op1],%[op1],%[op2],0\n\t" \
+        "ipm %[psw]\n\t" : [psw] "=d"(psw), [op1] "+d"(cc_dep1) \
+        : [op2] "d"(cc_dep2) \
+        : "cc");\
+   psw >> 28;   /* cc */ \
+})
+
 #define S390_CC_FOR_TERNARY_SUBB(opcode,cc_dep1,cc_dep2,cc_ndep) \
 ({ \
    /* Recover the original DEP2 value. See comment near s390_cc_thunk_put3 \
@@ -1804,6 +1812,12 @@ s390_calculate_cc(ULong cc_op, ULong cc_dep1, ULong cc_dep2, ULong cc_ndep)
       return psw >> 28;   /* cc */
    }
 
+   case S390_CC_OP_MUL_32:
+      return S390_CC_FOR_TERNARY(".insn rrf,0xb9fd0000", cc_dep1, cc_dep2);
+
+   case S390_CC_OP_MUL_64:
+      return S390_CC_FOR_TERNARY(".insn rrf,0xb9ed0000", cc_dep1, cc_dep2);
+
    default:
       break;
    }
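
The S390_CC_FOR_TERNARY macro above re-executes the multiply on the host and reads the resulting condition code back with ipm. A standalone restatement for the 32-bit case, purely a sketch and not part of the patch: it only builds and runs on an s390x host whose CPU has the miscellaneous-instruction-extensions facility 2 (the opcode goes through .insn because older assemblers lack the msrkc mnemonic):

   static unsigned int cc_for_msrkc(unsigned long a, unsigned long b)
   {
      unsigned int psw;
      __asm__ volatile(
         ".insn rrf,0xb9fd0000,%[op1],%[op1],%[op2],0\n\t"  /* msrkc */
         "ipm %[psw]\n\t"
         : [psw] "=d" (psw), [op1] "+d" (a)
         : [op2] "d" (b)
         : "cc");
      return psw >> 28;  /* cc: 0 zero, 1 negative, 2 positive, 3 overflow */
   }

This is the same trick the existing S390_CC_FOR_* macros use: rather than modelling the cc computation in C, let the hardware compute it.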
diff --git a/priv/guest_s390_toIR.c b/priv/guest_s390_toIR.c
index 8599e5e2e..06ec27fae 100644
--- a/priv/guest_s390_toIR.c
+++ b/priv/guest_s390_toIR.c
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -51,7 +49,7 @@ static UInt s390_decode_and_irgen(const UChar *, UInt, DisResult *);
 static void s390_irgen_xonc(IROp, IRTemp, IRTemp, IRTemp);
 static void s390_irgen_CLC_EX(IRTemp, IRTemp, IRTemp);
 
-
+static const HChar *s390_irgen_BIC(UChar r1, IRTemp op2addr);
 
 /*------------------------------------------------------------*/
 /*--- Globals                                              ---*/
@@ -3316,8 +3314,12 @@ s390_format_RXY_RRRD(const HChar *(*irgen)(UChar r1, IRTemp op2addr),
 
    mnm = irgen(r1, op2addr);
 
-   if (UNLIKELY(vex_traceflags & VEX_TRACE_FE))
-      s390_disasm(ENC3(MNM, GPR, SDXB), mnm, r1, dh2, dl2, x2, b2);
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) {
+      if (irgen == s390_irgen_BIC)
+         s390_disasm(ENC2(XMNM, SDXB), S390_XMNM_BIC, r1, dh2, dl2, x2, b2);
+      else
+         s390_disasm(ENC3(MNM, GPR, SDXB), mnm, r1, dh2, dl2, x2, b2);
+   }
 }
 
 static void
@@ -4282,6 +4284,22 @@ s390_irgen_AHIK(UChar r1, UChar r3, UShort i2)
    return "ahik";
 }
 
+static const HChar *
+s390_irgen_AGH(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I64);
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I64);
+
+   assign(op1, get_gpr_dw0(r1));
+   assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkexpr(op2addr))));
+   assign(result, binop(Iop_Add64, mkexpr(op1), mkexpr(op2)));
+   s390_cc_thunk_putSS(S390_CC_OP_SIGNED_ADD_64, op1, op2);
+   put_gpr_dw0(r1, mkexpr(result));
+
+   return "agh";
+}
+
 static const HChar *
 s390_irgen_AGHIK(UChar r1, UChar r3, UShort i2)
 {
@@ -5199,6 +5217,24 @@ s390_irgen_BCTG(UChar r1, IRTemp op2addr)
    return "bctg";
 }
 
+static const HChar *
+s390_irgen_BIC(UChar r1, IRTemp op2addr)
+{
+   IRTemp cond = newTemp(Ity_I32);
+
+   if (r1 == 0) {
+      /* nothing */
+   } else if (r1 == 15) {
+      always_goto(load(Ity_I64, mkexpr(op2addr)));
+   } else {
+      assign(cond, s390_call_calculate_cond(r1));
+      if_condition_goto_computed(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)),
+                                 load(Ity_I64, mkexpr(op2addr)));
+   }
+
+   return "bic";
+}
+
 static const HChar *
 s390_irgen_BXH(UChar r1, UChar r3, IRTemp op2addr)
 {
@@ -8335,6 +8371,54 @@ s390_irgen_MFY(UChar r1, IRTemp op2addr)
    return "mfy";
 }
 
+static const HChar *
+s390_irgen_MG(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I64);
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I128);
+
+   assign(op1, get_gpr_dw0(r1 + 1));
+   assign(op2, load(Ity_I64, mkexpr(op2addr)));
+   assign(result, binop(Iop_MullS64, mkexpr(op1), mkexpr(op2)));
+   put_gpr_dw0(r1, unop(Iop_128HIto64, mkexpr(result)));
+   put_gpr_dw0(r1 + 1, unop(Iop_128to64, mkexpr(result)));
+
+   return "mg";
+}
+
+static const HChar *
+s390_irgen_MGH(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I64);
+   IRTemp op2 = newTemp(Ity_I16);
+   IRTemp result = newTemp(Ity_I128);
+
+   assign(op1, get_gpr_dw0(r1));
+   assign(op2, load(Ity_I16, mkexpr(op2addr)));
+   assign(result, binop(Iop_MullS64, mkexpr(op1),
+                        unop(Iop_16Sto64, mkexpr(op2))));
+   put_gpr_dw0(r1, unop(Iop_128to64, mkexpr(result)));
+
+   return "mgh";
+}
+
+static const HChar *
+s390_irgen_MGRK(UChar r3, UChar r1, UChar r2)
+{
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp op3 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I128);
+
+   assign(op2, get_gpr_dw0(r2));
+   assign(op3, get_gpr_dw0(r3));
+   assign(result, binop(Iop_MullS64, mkexpr(op2), mkexpr(op3)));
+   put_gpr_dw0(r1, unop(Iop_128HIto64, mkexpr(result)));
+   put_gpr_dw0(r1 + 1, unop(Iop_128to64, mkexpr(result)));
+
+   return "mgrk";
+}
+
 static const HChar *
 s390_irgen_MH(UChar r1, IRTemp op2addr)
 {
@@ -8524,6 +8608,38 @@ s390_irgen_MS(UChar r1, IRTemp op2addr)
    return "ms";
 }
 
+static const HChar *
+s390_irgen_MSC(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I32);
+   IRTemp op2 = newTemp(Ity_I32);
+   IRTemp result = newTemp(Ity_I64);
+
+   assign(op1, get_gpr_w1(r1));
+   assign(op2, load(Ity_I32, mkexpr(op2addr)));
+   assign(result, binop(Iop_MullS32, mkexpr(op1), mkexpr(op2)));
+   s390_cc_thunk_putSS(S390_CC_OP_MUL_32, op1, op2);
+   put_gpr_w1(r1, unop(Iop_64to32, mkexpr(result)));
+
+   return "msc";
+}
+
+static const HChar *
+s390_irgen_MSRKC(UChar r3, UChar r1, UChar r2)
+{
+   IRTemp op2 = newTemp(Ity_I32);
+   IRTemp op3 = newTemp(Ity_I32);
+   IRTemp result = newTemp(Ity_I64);
+
+   assign(op2, get_gpr_w1(r2));
+   assign(op3, get_gpr_w1(r3));
+   assign(result, binop(Iop_MullS32, mkexpr(op2), mkexpr(op3)));
+   s390_cc_thunk_putSS(S390_CC_OP_MUL_32, op2, op3);
+   put_gpr_w1(r1, unop(Iop_64to32, mkexpr(result)));
+
+   return "msrkc";
+}
+
 static const HChar *
 s390_irgen_MSY(UChar r1, IRTemp op2addr)
 {
@@ -8554,6 +8670,22 @@ s390_irgen_MSG(UChar r1, IRTemp op2addr)
    return "msg";
 }
 
+static const HChar *
+s390_irgen_MSGC(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I64);
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I128);
+
+   assign(op1, get_gpr_dw0(r1));
+   assign(op2, load(Ity_I64, mkexpr(op2addr)));
+   assign(result, binop(Iop_MullS64, mkexpr(op1), mkexpr(op2)));
+   s390_cc_thunk_putSS(S390_CC_OP_MUL_64, op1, op2);
+   put_gpr_dw0(r1, unop(Iop_128to64, mkexpr(result)));
+
+   return "msgc";
+}
+
 static const HChar *
 s390_irgen_MSGF(UChar r1, IRTemp op2addr)
 {
@@ -8601,6 +8733,22 @@ s390_irgen_MSGFI(UChar r1, UInt i2)
    return "msgfi";
 }
 
+static const HChar *
+s390_irgen_MSGRKC(UChar r3, UChar r1, UChar r2)
+{
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp op3 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I128);
+
+   assign(op2, get_gpr_dw0(r2));
+   assign(op3, get_gpr_dw0(r3));
+   assign(result, binop(Iop_MullS64, mkexpr(op2), mkexpr(op3)));
+   s390_cc_thunk_putSS(S390_CC_OP_MUL_64, op2, op3);
+   put_gpr_dw0(r1, unop(Iop_128to64, mkexpr(result)));
+
+   return "msgrkc";
+}
+
 static const HChar *
 s390_irgen_OR(UChar r1, UChar r2)
 {
@@ -10062,6 +10210,22 @@ s390_irgen_SGF(UChar r1, IRTemp op2addr)
    return "sgf";
 }
 
+static const HChar *
+s390_irgen_SGH(UChar r1, IRTemp op2addr)
+{
+   IRTemp op1 = newTemp(Ity_I64);
+   IRTemp op2 = newTemp(Ity_I64);
+   IRTemp result = newTemp(Ity_I64);
+
+   assign(op1, get_gpr_dw0(r1));
+   assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkexpr(op2addr))));
+   assign(result, binop(Iop_Sub64, mkexpr(op1), mkexpr(op2)));
+   s390_cc_thunk_putSS(S390_CC_OP_SIGNED_SUB_64, op1, op2);
+   put_gpr_dw0(r1, mkexpr(result));
+
+   return "sgh";
+}
+
 static const HChar *
 s390_irgen_SH(UChar r1, IRTemp op2addr)
 {
@@ -12745,7 +12909,7 @@ s390_irgen_EX_SS(UChar r, IRTemp addr2,
    IRTemp cond;
    IRDirty *d;
    IRTemp torun;
-   unsigned long ovl;
+   ULong ovl;
 
    IRTemp start1 = newTemp(Ity_I64);
    IRTemp start2 = newTemp(Ity_I64);
@@ -12890,24 +13054,17 @@ s390_irgen_EX(UChar r1, IRTemp addr2)
    return "ex";
 }
 
-static const UChar *exrl_bytes;
-
 static const HChar *
 s390_irgen_EXRL(UChar r1, UInt offset)
 {
-   const UChar *exrl_target;
    IRTemp addr = newTemp(Ity_I64);
    Addr64 bytes_addr = guest_IA_curr_instr + offset * 2UL;
+   UChar *bytes = (UChar *)(HWord)bytes_addr;
 
    /* we might save one round trip because we know the target */
-   if (!last_execute_target) {
-      exrl_target = exrl_bytes + offset * 2UL;
-      last_execute_target = ((ULong)exrl_target[0] << 56) |
-                            ((ULong)exrl_target[1] << 48) |
-                            ((ULong)exrl_target[2] << 40) |
-                            ((ULong)exrl_target[3] << 32) |
-                            ((ULong)exrl_target[4] << 24) |
-                            ((ULong)exrl_target[5] << 16);
-   }
+   if (!last_execute_target)
+      last_execute_target = ((ULong)bytes[0] << 56) | ((ULong)bytes[1] << 48) |
+                            ((ULong)bytes[2] << 40) | ((ULong)bytes[3] << 32) |
+                            ((ULong)bytes[4] << 24) | ((ULong)bytes[5] << 16);
 
    assign(addr, mkU64(bytes_addr));
    s390_irgen_EX(r1, addr);
    return "exrl";
@@ -19622,8 +19779,10 @@ s390_decode_4byte_and_irgen(const UChar *bytes)
    case 0xb99d: /* ESEA */ goto unimplemented;
    case 0xb99e: /* PTI */ goto unimplemented;
    case 0xb99f: /* SSAIR */ goto unimplemented;
+   case 0xb9a1: /* TPEI */ goto unimplemented;
    case 0xb9a2: /* PTF */ goto unimplemented;
    case 0xb9aa: /* LPTEA */ goto unimplemented;
+   case 0xb9ac: /* IRBM */ goto unimplemented;
    case 0xb9ae: /* RRBM */ goto unimplemented;
    case 0xb9af: /* PFMF */ goto unimplemented;
    case 0xb9b0: s390_format_RRF_M0RERE(s390_irgen_CU14, RRF3_r3(ovl),
@@ -19700,8 +19859,12 @@ s390_decode_4byte_and_irgen(const UChar *bytes)
    case 0xb9eb: s390_format_RRF_R0RR2(s390_irgen_SLGRK, RRF4_r3(ovl),
                                       RRF4_r1(ovl), RRF4_r2(ovl));
                 goto ok;
-   case 0xb9ec: /* MGRK */ goto unimplemented;
-   case 0xb9ed: /* MSGRKC */ goto unimplemented;
+   case 0xb9ec: s390_format_RRF_R0RR2(s390_irgen_MGRK, RRF4_r3(ovl),
+                                      RRF4_r1(ovl), RRF4_r2(ovl));
+                goto ok;
+   case 0xb9ed: s390_format_RRF_R0RR2(s390_irgen_MSGRKC, RRF4_r3(ovl),
+                                      RRF4_r1(ovl), RRF4_r2(ovl));
+                goto ok;
    case 0xb9f2: s390_format_RRF_U0RR(s390_irgen_LOCR, RRF3_r3(ovl),
                                      RRF3_r1(ovl), RRF3_r2(ovl),
                                      S390_XMNM_LOCR);  goto ok;
@@ -19726,7 +19889,9 @@ s390_decode_4byte_and_irgen(const UChar *bytes)
    case 0xb9fb: s390_format_RRF_R0RR2(s390_irgen_SLRK, RRF4_r3(ovl),
                                       RRF4_r1(ovl), RRF4_r2(ovl));
                 goto ok;
-   case 0xb9fd: /* MSRKC */ goto unimplemented;
+   case 0xb9fd: s390_format_RRF_R0RR2(s390_irgen_MSRKC, RRF4_r3(ovl),
+                                      RRF4_r1(ovl), RRF4_r2(ovl));
+                goto ok;
    }
 
    switch ((ovl & 0xff000000) >> 24) {
@@ -20043,8 +20208,14 @@ s390_decode_6byte_and_irgen(const UChar *bytes)
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
                                                 RXY_dh2(ovl));  goto ok;
-   case 0xe30000000038ULL: /* AGH */ goto unimplemented;
-   case 0xe30000000039ULL: /* SGH */ goto unimplemented;
+   case 0xe30000000038ULL: s390_format_RXY_RRRD(s390_irgen_AGH, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
+   case 0xe30000000039ULL: s390_format_RXY_RRRD(s390_irgen_SGH, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
    case 0xe3000000003aULL: s390_format_RXY_RRRD(s390_irgen_LLZRGF, RXY_r1(ovl),
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
@@ -20053,7 +20224,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes)
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
                                                 RXY_dh2(ovl));  goto ok;
-   case 0xe3000000003cULL: /* MGH */ goto unimplemented;
+   case 0xe3000000003cULL: s390_format_RXY_RRRD(s390_irgen_MGH, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
    case 0xe3000000003eULL: s390_format_RXY_RRRD(s390_irgen_STRV, RXY_r1(ovl),
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
@@ -20066,7 +20240,10 @@ s390_decode_6byte_and_irgen(const UChar *bytes)
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
                                                 RXY_dh2(ovl));  goto ok;
-   case 0xe30000000047ULL: /* BIC */ goto unimplemented;
+   case 0xe30000000047ULL: s390_format_RXY_RRRD(s390_irgen_BIC, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
    case 0xe30000000048ULL: /* LLGFSG */ goto unimplemented;
    case 0xe30000000049ULL: /* STGSC */ goto unimplemented;
    case 0xe3000000004cULL: /* LGG */ goto unimplemented;
@@ -20079,7 +20256,10 @@
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
                                                 RXY_dh2(ovl));  goto ok;
-   case 0xe30000000053ULL: /* MSC */ goto unimplemented;
+   case 0xe30000000053ULL: s390_format_RXY_RRRD(s390_irgen_MSC, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
    case 0xe30000000054ULL: s390_format_RXY_RRRD(s390_irgen_NY, RXY_r1(ovl),
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
@@ -20184,8 +20364,14 @@ s390_decode_6byte_and_irgen(const UChar *bytes)
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
                                                 RXY_dh2(ovl));  goto ok;
-   case 0xe30000000083ULL: /* MSGC */ goto unimplemented;
-   case 0xe30000000084ULL: /* MG */ goto unimplemented;
+   case 0xe30000000083ULL: s390_format_RXY_RRRD(s390_irgen_MSGC, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
+   case 0xe30000000084ULL: s390_format_RXY_RRRD(s390_irgen_MG, RXY_r1(ovl),
+                                                RXY_x2(ovl), RXY_b2(ovl),
+                                                RXY_dl2(ovl),
+                                                RXY_dh2(ovl));  goto ok;
    case 0xe30000000085ULL: s390_format_RXY_RRRD(s390_irgen_LGAT, RXY_r1(ovl),
                                                 RXY_x2(ovl), RXY_b2(ovl),
                                                 RXY_dl2(ovl),
@@ -21454,8 +21640,7 @@ s390_decode_6byte_and_irgen(const UChar *bytes)
                                       RIL_i2(ovl));  goto ok;
    case 0xc40fULL: s390_format_RIL_RP(s390_irgen_STRL, RIL_r1(ovl),
                                       RIL_i2(ovl));  goto ok;
-   case 0xc600ULL: exrl_bytes = bytes;
-                   s390_format_RIL_RP(s390_irgen_EXRL, RIL_r1(ovl),
+   case 0xc600ULL: s390_format_RIL_RP(s390_irgen_EXRL, RIL_r1(ovl),
                                       RIL_i2(ovl));  goto ok;
    case 0xc602ULL: s390_format_RIL_UP(s390_irgen_PFDRL, RIL_r1(ovl),
                                       RIL_i2(ovl));  goto ok;
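
The new EXRL code above caches the first six bytes of the execute target in a 64-bit value, most significant byte first (s390 insns are at most 6 bytes long, so the low 16 bits stay zero). A self-contained sketch of that packing; the sample bytes are arbitrary:

   #include <stdio.h>

   /* Pack insn bytes big-endian into the top of a 64-bit value, the
      same layout the patch stores in last_execute_target. */
   static unsigned long long pack_target(const unsigned char *bytes)
   {
      return ((unsigned long long)bytes[0] << 56) |
             ((unsigned long long)bytes[1] << 48) |
             ((unsigned long long)bytes[2] << 40) |
             ((unsigned long long)bytes[3] << 32) |
             ((unsigned long long)bytes[4] << 24) |
             ((unsigned long long)bytes[5] << 16);
   }

   int main(void)
   {
      const unsigned char insn[6] = { 0xe3, 0x10, 0x20, 0x00, 0x00, 0x47 };
      printf("%016llx\n", pack_target(insn));   /* e310200000470000 */
      return 0;
   }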
diff --git a/priv/host_s390_defs.c b/priv/host_s390_defs.c
index 22cdd0425..162550fd7 100644
--- a/priv/host_s390_defs.c
+++ b/priv/host_s390_defs.c
@@ -22,9 +22,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -3025,6 +3023,26 @@ s390_emit_MFY(UChar *p, UChar r1, UChar x2, UChar b2, UShort dl2, UChar dh2)
 }
 
 
+static UChar *
+s390_emit_MG(UChar *p, UChar r1, UChar x2, UChar b2, UShort dl2, UChar dh2)
+{
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
+      s390_disasm(ENC3(MNM, GPR, SDXB), "mg", r1, dh2, dl2, x2, b2);
+
+   return emit_RXY(p, 0xe30000000084ULL, r1, x2, b2, dl2, dh2);
+}
+
+
+static UChar *
+s390_emit_MGRK(UChar *p, UChar r3, UChar r1, UChar r2)
+{
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
+      s390_disasm(ENC4(MNM, GPR, GPR, GPR), "mgrk", r1, r2, r3);
+
+   return emit_RRF3(p, 0xb9ec0000, r3, r1, r2);
+}
+
+
 static UChar *
 s390_emit_MH(UChar *p, UChar r1, UChar x2, UChar b2, UShort d2)
 {
@@ -9597,7 +9615,7 @@ s390_insn_mul_emit(UChar *buf, const s390_insn *insn)
 
       case 8:
          if (signed_multiply)
-            vpanic("s390_insn_mul_emit");
+            return s390_emit_MGRK(buf, r1 + 1, r1, r2);
          else
             return s390_emit_MLGR(buf, r1, r2);
 
@@ -9642,7 +9660,7 @@ s390_insn_mul_emit(UChar *buf, const s390_insn *insn)
 
      case 8:
         if (signed_multiply)
-           vpanic("s390_insn_mul_emit");
+           return s390_emit_MG(buf, r1, x, b, DISP20(d));
        else
           return s390_emit_MLG(buf, r1, x, b, DISP20(d));
 
@@ -9667,7 +9685,7 @@ s390_insn_mul_emit(UChar *buf, const s390_insn *insn)
      case 8:
         buf = s390_emit_load_64imm(buf, R0, value);
         if (signed_multiply)
-           vpanic("s390_insn_mul_emit");
+           return s390_emit_MGRK(buf, r1 + 1, r1, R0);
        else
          return s390_emit_MLGR(buf, r1, R0);
 
diff --git a/priv/host_s390_defs.h b/priv/host_s390_defs.h
index ed1f3cfb2..e79b990e4 100644
--- a/priv/host_s390_defs.h
+++ b/priv/host_s390_defs.h
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
diff --git a/priv/host_s390_isel.c b/priv/host_s390_isel.c
index 38989f217..30e5c7620 100644
--- a/priv/host_s390_isel.c
+++ b/priv/host_s390_isel.c
@@ -22,9 +22,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -1020,6 +1018,8 @@ s390_isel_int128_expr_wrk(HReg *dst_hi, HReg *dst_lo, ISelEnv *env,
          goto do_multiply64;
 
       case Iop_MullS64:
+         if (!(env->hwcaps & VEX_HWCAPS_S390X_MI2))
+            goto irreducible;
          is_signed_multiply = True;
         goto do_multiply64;
 
@@ -1127,7 +1127,10 @@ s390_isel_int128_expr_wrk(HReg *dst_hi, HReg *dst_lo, ISelEnv *env,
       }
    }
 
-   vpanic("s390_isel_int128_expr");
+   /* We get here if no pattern matched. */
+ irreducible:
+   ppIRExpr(expr);
+   vpanic("s390_isel_int128_expr: cannot reduce tree");
 }
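
The instruction selector above now accepts Iop_MullS64 only when the MI2 facility is present, because MG/MGRK deliver the full 128-bit signed product into an even/odd register pair. A sketch of the semantics being modelled, in plain C; it assumes a compiler with __int128 support (gcc or clang on a 64-bit target):

   #include <stdio.h>

   /* hi/lo mirror Iop_128HIto64 / Iop_128to64 applied to Iop_MullS64. */
   static void mulls64(long long a, long long b,
                       unsigned long long *hi, unsigned long long *lo)
   {
      __int128 prod = (__int128)a * b;          /* full 128-bit product */
      *hi = (unsigned long long)(prod >> 64);   /* even register, r1    */
      *lo = (unsigned long long)prod;           /* odd register, r1+1   */
   }

   int main(void)
   {
      unsigned long long hi, lo;
      mulls64(-2, 3, &hi, &lo);
      printf("hi=%016llx lo=%016llx\n", hi, lo);
      /* prints hi=ffffffffffffffff lo=fffffffffffffffa, i.e. -6 */
      return 0;
   }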
diff --git a/priv/s390_defs.h b/priv/s390_defs.h
index 56886dbe4..80d733606 100644
--- a/priv/s390_defs.h
+++ b/priv/s390_defs.h
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
diff --git a/priv/s390_disasm.c b/priv/s390_disasm.c
index 58189f123..e3fbc11af 100644
--- a/priv/s390_disasm.c
+++ b/priv/s390_disasm.c
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -435,6 +433,16 @@ s390_disasm(UInt command, ...)
             the integer mask is appended as the final operand */
          if (mask == 0 || mask == 15) mask_suffix = mask;
          break;
+      case S390_XMNM_BIC:
+         mask = va_arg(args, UInt);
+         if (mask == 0) {
+            /* There is no special opcode when mask == 0. */
+            p += vex_sprintf(p, "bic");
+            mask_suffix = mask;
+         } else {
+            p += vex_sprintf(p, "%s", construct_mnemonic("bi", "", mask));
+         }
+         break;
       }
    }
    continue;
diff --git a/priv/s390_disasm.h b/priv/s390_disasm.h
index 3cccceadf..eec41f8ac 100644
--- a/priv/s390_disasm.h
+++ b/priv/s390_disasm.h
@@ -21,9 +21,7 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 
    The GNU General Public License is contained in the file COPYING.
 */
@@ -93,7 +91,8 @@ enum {
    S390_XMNM_LOCFHR = 13,
    S390_XMNM_LOCHI = 14,
    S390_XMNM_LOCGHI = 15,
-   S390_XMNM_LOCHHI = 16
+   S390_XMNM_LOCHHI = 16,
+   S390_XMNM_BIC = 17
 };
 
 void s390_disasm(UInt command, ...);
diff --git a/pub/libvex.h b/pub/libvex.h
index 223da74bc..fed8ee8e1 100644
--- a/pub/libvex.h
+++ b/pub/libvex.h
@@ -161,6 +161,7 @@ typedef
 #define VEX_HWCAPS_S390X_PFPO  (1<<17)  /* Perform floating point ops facility */
 #define VEX_HWCAPS_S390X_VX    (1<<18)  /* Vector facility */
 #define VEX_HWCAPS_S390X_MSA5  (1<<19)  /* message security assistance facility */
+#define VEX_HWCAPS_S390X_MI2   (1<<20)  /* miscellaneous-instruction-extensions facility 2 */
 
 /* Special value representing all available s390x hwcaps */
@@ -177,7 +178,8 @@ typedef
                                  VEX_HWCAPS_S390X_ETF2  | \
                                  VEX_HWCAPS_S390X_PFPO  | \
                                  VEX_HWCAPS_S390X_VX    | \
-                                 VEX_HWCAPS_S390X_MSA5)
+                                 VEX_HWCAPS_S390X_MSA5  | \
+                                 VEX_HWCAPS_S390X_MI2)
 
 #define VEX_HWCAPS_S390X(x)  ((x) & ~VEX_S390X_MODEL_MASK)
 #define VEX_S390X_MODEL(x)   ((x) & VEX_S390X_MODEL_MASK)
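
For reference, the r1 field that s390_irgen_BIC receives is a standard s390 branch mask: each of its four bits enables one condition code, mask 15 is an unconditional branch and mask 0 a no-op, which is exactly the three-way split in the translation above. A one-line sketch of the mask test, illustrative rather than VEX code:

   /* Mask bit with value 8 selects cc 0, ..., value 1 selects cc 3. */
   static int branch_taken(unsigned int mask /* 0..15 */,
                           unsigned int cc   /* 0..3  */)
   {
      return (mask >> (3 - cc)) & 1;
   }

With that reading, the s390_disasm change above is just naming: construct_mnemonic appends the condition-name suffix implied by the mask to the "bi" stem, and only the mask-0 form has to fall back to the plain "bic" spelling.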