## OpenLane Colab

This Google Colab notebook will:
* Install OpenLane and its dependencies
* Run a simple design, namely a pipelined 3-qubit Quantum Fourier Transform (QFT)
  datapath, through the flow, targeting the
  [open source gf180mcu PDK](https://github.com/google/gf180mcu-pdk/)
  by Google and GlobalFoundries.

In [None]:
# Run the setup_openlane.py helper script in a subprocess via the shell.
# NOTE(review): the script is not part of this notebook; presumably it installs
# OpenLane and its dependencies as promised in the introduction - confirm it is
# present in the working directory before running.
!python setup_openlane.py

In [None]:
# Importing the package raises ImportError right here if the setup cell above
# did not leave OpenLane importable.
import openlane

# Record the installed OpenLane version in the notebook output.
print(openlane.__version__)

### Creating the design

Now that OpenLane is set up, we can write the Verilog sources as follows: first a
header with the shared fixed-point parameters, then the design file that includes it.

In [None]:
%%writefile fixed_point_params.vh

//
// fixed_point_params.vh
// Defines the fixed-point data type for the QFT project.
//
// The whole header is wrapped in an include guard: qft3_top_pipelined.v pulls
// it in once per module, and without the guard every macro below would be
// redefined on each additional `include.
//
`ifndef FIXED_POINT_PARAMS_VH
`define FIXED_POINT_PARAMS_VH

// Total bits for our signed fixed-point number
`define TOTAL_WIDTH 8

// Number of fractional bits
// (with 8 total bits this leaves 1 sign bit and 3 integer bits)
`define FRAC_WIDTH 4

// Width for intermediate multiplication results (before scaling):
// the product of two TOTAL_WIDTH-bit operands
`define MULT_WIDTH (`TOTAL_WIDTH * 2)

// Width for intermediate addition results:
// one extra bit so the sum/difference of two TOTAL_WIDTH-bit values cannot wrap
`define ADD_WIDTH (`TOTAL_WIDTH + 1)

`endif // FIXED_POINT_PARAMS_VH


In [None]:
%%writefile qft3_top_pipelined.v

`include "fixed_point_params.vh"

//======================================================================
// 3-Qubit QFT Top Level (fully pipelined)
//======================================================================
// Applies the 3-qubit Quantum Fourier Transform gate sequence to an
// 8-entry complex state vector. Amplitude index bits are {q2, q1, q0},
// so e.g. i100_* is the amplitude of the basis state with only q2 set.
//
//   Stage 1  H on q2                        3 cycles
//   Stage 2  CROT(pi/2), control q1 -> q2   3 cycles
//   Stage 3  CROT(pi/4), control q0 -> q2   3 cycles
//   Stage 4  H on q1                        3 cycles
//   Stage 5  CROT(pi/2), control q0 -> q1   3 cycles
//   Stage 6  H on q0                        3 cycles
//   Stage 7  SWAP q0 <-> q2                 1 cycle
//
// Input-to-output latency is therefore 6*3 + 1 = 19 clock cycles.
// Every register clears to zero on the asynchronous, active-low rst_n.
// All values are signed fixed point: `TOTAL_WIDTH bits in total, of which
// `FRAC_WIDTH are fractional. There is no saturation anywhere in the
// datapath; results are truncated to `TOTAL_WIDTH bits.
module qft3_top_pipelined(
    input clk,
    input rst_n,

    // Initial 3-qubit state vector: real (_r) and imaginary (_i) parts of
    // the amplitudes of |000> ... |111>
    input  signed [`TOTAL_WIDTH-1:0] i000_r, i000_i, i001_r, i001_i, i010_r, i010_i, i011_r, i011_i,
    input  signed [`TOTAL_WIDTH-1:0] i100_r, i100_i, i101_r, i101_i, i110_r, i110_i, i111_r, i111_i,

    // Final state vector after the QFT
    output signed [`TOTAL_WIDTH-1:0] f000_r, f000_i, f001_r, f001_i, f010_r, f010_i, f011_r, f011_i,
    output signed [`TOTAL_WIDTH-1:0] f100_r, f100_i, f101_r, f101_i, f110_r, f110_i, f111_r, f111_i
);

    // --- Pre-calculated rotation constants exp(j*theta) ---
    // NOTE: the literal values assume `FRAC_WIDTH == 4 (1.0 is encoded as 16).
    // For theta = pi/2: cos = 0, sin = 1.0
    localparam signed [`TOTAL_WIDTH-1:0] C_PI_2_R = 0;
    localparam signed [`TOTAL_WIDTH-1:0] C_PI_2_I = 16;
    // For theta = pi/4: cos = sin = 0.707, encoded as 11/16 = 0.6875
    localparam signed [`TOTAL_WIDTH-1:0] C_PI_4_R = 11;
    localparam signed [`TOTAL_WIDTH-1:0] C_PI_4_I = 11;

    // --- Latency Definition ---
    // Latency of every H / CROT stage; the pass-through delay lines must match it.
    localparam STAGE_LATENCY = 3;
    // Each CROT stage rotates 2 of the 8 amplitudes; the other 6 are only delayed.
    localparam PASS_LANES = 6;

    // --- Intermediate Wires for Pipeline Stages ---
    // sN_r[k] / sN_i[k]: real / imaginary part of amplitude k after stage N.
    wire signed [`TOTAL_WIDTH-1:0] s1_r[0:7], s1_i[0:7];
    wire signed [`TOTAL_WIDTH-1:0] s2_r[0:7], s2_i[0:7];
    wire signed [`TOTAL_WIDTH-1:0] s3_r[0:7], s3_i[0:7];
    wire signed [`TOTAL_WIDTH-1:0] s4_r[0:7], s4_i[0:7];
    wire signed [`TOTAL_WIDTH-1:0] s5_r[0:7], s5_i[0:7];
    wire signed [`TOTAL_WIDTH-1:0] s6_r[0:7], s6_i[0:7];

    // --- STAGE 1: H on q2 (bit 2) --- Latency: 3 ---
    // Pairs amplitudes whose indices differ only in bit 2: (0,4) (1,5) (2,6) (3,7).
    h_gate_simplified h_q2_p0 (.clk(clk), .rst_n(rst_n), .alpha_r(i000_r), .alpha_i(i000_i), .beta_r(i100_r), .beta_i(i100_i), .new_alpha_r(s1_r[0]), .new_alpha_i(s1_i[0]), .new_beta_r(s1_r[4]), .new_beta_i(s1_i[4]));
    h_gate_simplified h_q2_p1 (.clk(clk), .rst_n(rst_n), .alpha_r(i001_r), .alpha_i(i001_i), .beta_r(i101_r), .beta_i(i101_i), .new_alpha_r(s1_r[1]), .new_alpha_i(s1_i[1]), .new_beta_r(s1_r[5]), .new_beta_i(s1_i[5]));
    h_gate_simplified h_q2_p2 (.clk(clk), .rst_n(rst_n), .alpha_r(i010_r), .alpha_i(i010_i), .beta_r(i110_r), .beta_i(i110_i), .new_alpha_r(s1_r[2]), .new_alpha_i(s1_i[2]), .new_beta_r(s1_r[6]), .new_beta_i(s1_i[6]));
    h_gate_simplified h_q2_p3 (.clk(clk), .rst_n(rst_n), .alpha_r(i011_r), .alpha_i(i011_i), .beta_r(i111_r), .beta_i(i111_i), .new_alpha_r(s1_r[3]), .new_alpha_i(s1_i[3]), .new_beta_r(s1_r[7]), .new_beta_i(s1_i[7]));

    // --- STAGE 2: CROT(pi/2) from q1 to q2 --- Latency: 3 ---
    // Only amplitudes with q2 = q1 = 1 (indices 6 and 7) pick up the phase.
    ccmult_pipelined c21_p0 (.clk(clk), .rst_n(rst_n), .ar(s1_r[6]), .ai(s1_i[6]), .br(C_PI_2_R), .bi(C_PI_2_I), .pr(s2_r[6]), .pi(s2_i[6]));
    ccmult_pipelined c21_p1 (.clk(clk), .rst_n(rst_n), .ar(s1_r[7]), .ai(s1_i[7]), .br(C_PI_2_R), .bi(C_PI_2_I), .pr(s2_r[7]), .pi(s2_i[7]));
    // Pass-through with 3-cycle delay for indices 0..5 (lane k carries index k).
    // Second array index is the tap: [0] is the newest sample, [STAGE_LATENCY-1] the output.
    reg signed [`TOTAL_WIDTH-1:0] s1_passthru_s2_r [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    reg signed [`TOTAL_WIDTH-1:0] s1_passthru_s2_i [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    // Loop variables private to this always block, so no other process writes them.
    integer s2_lane, s2_tap;
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (s2_lane = 0; s2_lane < PASS_LANES; s2_lane = s2_lane + 1)
                for (s2_tap = 0; s2_tap < STAGE_LATENCY; s2_tap = s2_tap + 1)
                    {s1_passthru_s2_r[s2_lane][s2_tap], s1_passthru_s2_i[s2_lane][s2_tap]} <= 0;
        end else begin
            // Tap 0 captures the stage-1 outputs ...
            {s1_passthru_s2_r[0][0], s1_passthru_s2_i[0][0]} <= {s1_r[0], s1_i[0]};
            {s1_passthru_s2_r[1][0], s1_passthru_s2_i[1][0]} <= {s1_r[1], s1_i[1]};
            {s1_passthru_s2_r[2][0], s1_passthru_s2_i[2][0]} <= {s1_r[2], s1_i[2]};
            {s1_passthru_s2_r[3][0], s1_passthru_s2_i[3][0]} <= {s1_r[3], s1_i[3]};
            {s1_passthru_s2_r[4][0], s1_passthru_s2_i[4][0]} <= {s1_r[4], s1_i[4]};
            {s1_passthru_s2_r[5][0], s1_passthru_s2_i[5][0]} <= {s1_r[5], s1_i[5]};
            // ... and every later tap shifts along by one position per clock.
            for (s2_lane = 0; s2_lane < PASS_LANES; s2_lane = s2_lane + 1)
                for (s2_tap = 1; s2_tap < STAGE_LATENCY; s2_tap = s2_tap + 1)
                    {s1_passthru_s2_r[s2_lane][s2_tap], s1_passthru_s2_i[s2_lane][s2_tap]} <= {s1_passthru_s2_r[s2_lane][s2_tap-1], s1_passthru_s2_i[s2_lane][s2_tap-1]};
        end
    end
    assign {s2_r[0], s2_i[0]} = {s1_passthru_s2_r[0][STAGE_LATENCY-1], s1_passthru_s2_i[0][STAGE_LATENCY-1]};
    assign {s2_r[1], s2_i[1]} = {s1_passthru_s2_r[1][STAGE_LATENCY-1], s1_passthru_s2_i[1][STAGE_LATENCY-1]};
    assign {s2_r[2], s2_i[2]} = {s1_passthru_s2_r[2][STAGE_LATENCY-1], s1_passthru_s2_i[2][STAGE_LATENCY-1]};
    assign {s2_r[3], s2_i[3]} = {s1_passthru_s2_r[3][STAGE_LATENCY-1], s1_passthru_s2_i[3][STAGE_LATENCY-1]};
    assign {s2_r[4], s2_i[4]} = {s1_passthru_s2_r[4][STAGE_LATENCY-1], s1_passthru_s2_i[4][STAGE_LATENCY-1]};
    assign {s2_r[5], s2_i[5]} = {s1_passthru_s2_r[5][STAGE_LATENCY-1], s1_passthru_s2_i[5][STAGE_LATENCY-1]};

    // --- STAGE 3: CROT(pi/4) from q0 to q2 --- Latency: 3 ---
    // Only amplitudes with q2 = q0 = 1 (indices 5 and 7) pick up the phase.
    ccmult_pipelined c20_p0 (.clk(clk), .rst_n(rst_n), .ar(s2_r[5]), .ai(s2_i[5]), .br(C_PI_4_R), .bi(C_PI_4_I), .pr(s3_r[5]), .pi(s3_i[5]));
    ccmult_pipelined c20_p1 (.clk(clk), .rst_n(rst_n), .ar(s2_r[7]), .ai(s2_i[7]), .br(C_PI_4_R), .bi(C_PI_4_I), .pr(s3_r[7]), .pi(s3_i[7]));
    // Pass-through with 3-cycle delay for indices 0,1,2,3,4,6
    // (lanes 0..4 carry indices 0..4, lane 5 carries index 6).
    reg signed [`TOTAL_WIDTH-1:0] s2_passthru_s3_r [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    reg signed [`TOTAL_WIDTH-1:0] s2_passthru_s3_i [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    integer s3_lane, s3_tap;
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (s3_lane = 0; s3_lane < PASS_LANES; s3_lane = s3_lane + 1)
                for (s3_tap = 0; s3_tap < STAGE_LATENCY; s3_tap = s3_tap + 1)
                    {s2_passthru_s3_r[s3_lane][s3_tap], s2_passthru_s3_i[s3_lane][s3_tap]} <= 0;
        end else begin
            {s2_passthru_s3_r[0][0], s2_passthru_s3_i[0][0]} <= {s2_r[0], s2_i[0]};
            {s2_passthru_s3_r[1][0], s2_passthru_s3_i[1][0]} <= {s2_r[1], s2_i[1]};
            {s2_passthru_s3_r[2][0], s2_passthru_s3_i[2][0]} <= {s2_r[2], s2_i[2]};
            {s2_passthru_s3_r[3][0], s2_passthru_s3_i[3][0]} <= {s2_r[3], s2_i[3]};
            {s2_passthru_s3_r[4][0], s2_passthru_s3_i[4][0]} <= {s2_r[4], s2_i[4]};
            {s2_passthru_s3_r[5][0], s2_passthru_s3_i[5][0]} <= {s2_r[6], s2_i[6]};
            for (s3_lane = 0; s3_lane < PASS_LANES; s3_lane = s3_lane + 1)
                for (s3_tap = 1; s3_tap < STAGE_LATENCY; s3_tap = s3_tap + 1)
                    {s2_passthru_s3_r[s3_lane][s3_tap], s2_passthru_s3_i[s3_lane][s3_tap]} <= {s2_passthru_s3_r[s3_lane][s3_tap-1], s2_passthru_s3_i[s3_lane][s3_tap-1]};
        end
    end
    assign {s3_r[0], s3_i[0]} = {s2_passthru_s3_r[0][STAGE_LATENCY-1], s2_passthru_s3_i[0][STAGE_LATENCY-1]};
    assign {s3_r[1], s3_i[1]} = {s2_passthru_s3_r[1][STAGE_LATENCY-1], s2_passthru_s3_i[1][STAGE_LATENCY-1]};
    assign {s3_r[2], s3_i[2]} = {s2_passthru_s3_r[2][STAGE_LATENCY-1], s2_passthru_s3_i[2][STAGE_LATENCY-1]};
    assign {s3_r[3], s3_i[3]} = {s2_passthru_s3_r[3][STAGE_LATENCY-1], s2_passthru_s3_i[3][STAGE_LATENCY-1]};
    assign {s3_r[4], s3_i[4]} = {s2_passthru_s3_r[4][STAGE_LATENCY-1], s2_passthru_s3_i[4][STAGE_LATENCY-1]};
    assign {s3_r[6], s3_i[6]} = {s2_passthru_s3_r[5][STAGE_LATENCY-1], s2_passthru_s3_i[5][STAGE_LATENCY-1]};

    // --- STAGE 4: H on q1 (bit 1) --- Latency: 3 ---
    // Pairs amplitudes whose indices differ only in bit 1: (0,2) (1,3) (4,6) (5,7).
    h_gate_simplified h_q1_p0 (.clk(clk), .rst_n(rst_n), .alpha_r(s3_r[0]), .alpha_i(s3_i[0]), .beta_r(s3_r[2]), .beta_i(s3_i[2]), .new_alpha_r(s4_r[0]), .new_alpha_i(s4_i[0]), .new_beta_r(s4_r[2]), .new_beta_i(s4_i[2]));
    h_gate_simplified h_q1_p1 (.clk(clk), .rst_n(rst_n), .alpha_r(s3_r[1]), .alpha_i(s3_i[1]), .beta_r(s3_r[3]), .beta_i(s3_i[3]), .new_alpha_r(s4_r[1]), .new_alpha_i(s4_i[1]), .new_beta_r(s4_r[3]), .new_beta_i(s4_i[3]));
    h_gate_simplified h_q1_p2 (.clk(clk), .rst_n(rst_n), .alpha_r(s3_r[4]), .alpha_i(s3_i[4]), .beta_r(s3_r[6]), .beta_i(s3_i[6]), .new_alpha_r(s4_r[4]), .new_alpha_i(s4_i[4]), .new_beta_r(s4_r[6]), .new_beta_i(s4_i[6]));
    h_gate_simplified h_q1_p3 (.clk(clk), .rst_n(rst_n), .alpha_r(s3_r[5]), .alpha_i(s3_i[5]), .beta_r(s3_r[7]), .beta_i(s3_i[7]), .new_alpha_r(s4_r[5]), .new_alpha_i(s4_i[5]), .new_beta_r(s4_r[7]), .new_beta_i(s4_i[7]));

    // --- STAGE 5: CROT(pi/2) from q0 to q1 --- Latency: 3 ---
    // Only amplitudes with q1 = q0 = 1 (indices 3 and 7) pick up the phase.
    ccmult_pipelined c10_p0 (.clk(clk), .rst_n(rst_n), .ar(s4_r[3]), .ai(s4_i[3]), .br(C_PI_2_R), .bi(C_PI_2_I), .pr(s5_r[3]), .pi(s5_i[3]));
    ccmult_pipelined c10_p1 (.clk(clk), .rst_n(rst_n), .ar(s4_r[7]), .ai(s4_i[7]), .br(C_PI_2_R), .bi(C_PI_2_I), .pr(s5_r[7]), .pi(s5_i[7]));
    // Pass-through with 3-cycle delay for indices 0,1,2,4,5,6
    // (lanes 0..2 carry indices 0..2, lanes 3..5 carry indices 4..6).
    reg signed [`TOTAL_WIDTH-1:0] s4_passthru_s5_r [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    reg signed [`TOTAL_WIDTH-1:0] s4_passthru_s5_i [0:PASS_LANES-1][STAGE_LATENCY-1:0];
    integer s5_lane, s5_tap;
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (s5_lane = 0; s5_lane < PASS_LANES; s5_lane = s5_lane + 1)
                for (s5_tap = 0; s5_tap < STAGE_LATENCY; s5_tap = s5_tap + 1)
                    {s4_passthru_s5_r[s5_lane][s5_tap], s4_passthru_s5_i[s5_lane][s5_tap]} <= 0;
        end else begin
            {s4_passthru_s5_r[0][0], s4_passthru_s5_i[0][0]} <= {s4_r[0], s4_i[0]};
            {s4_passthru_s5_r[1][0], s4_passthru_s5_i[1][0]} <= {s4_r[1], s4_i[1]};
            {s4_passthru_s5_r[2][0], s4_passthru_s5_i[2][0]} <= {s4_r[2], s4_i[2]};
            {s4_passthru_s5_r[3][0], s4_passthru_s5_i[3][0]} <= {s4_r[4], s4_i[4]};
            {s4_passthru_s5_r[4][0], s4_passthru_s5_i[4][0]} <= {s4_r[5], s4_i[5]};
            {s4_passthru_s5_r[5][0], s4_passthru_s5_i[5][0]} <= {s4_r[6], s4_i[6]};
            for (s5_lane = 0; s5_lane < PASS_LANES; s5_lane = s5_lane + 1)
                for (s5_tap = 1; s5_tap < STAGE_LATENCY; s5_tap = s5_tap + 1)
                    {s4_passthru_s5_r[s5_lane][s5_tap], s4_passthru_s5_i[s5_lane][s5_tap]} <= {s4_passthru_s5_r[s5_lane][s5_tap-1], s4_passthru_s5_i[s5_lane][s5_tap-1]};
        end
    end
    assign {s5_r[0], s5_i[0]} = {s4_passthru_s5_r[0][STAGE_LATENCY-1], s4_passthru_s5_i[0][STAGE_LATENCY-1]};
    assign {s5_r[1], s5_i[1]} = {s4_passthru_s5_r[1][STAGE_LATENCY-1], s4_passthru_s5_i[1][STAGE_LATENCY-1]};
    assign {s5_r[2], s5_i[2]} = {s4_passthru_s5_r[2][STAGE_LATENCY-1], s4_passthru_s5_i[2][STAGE_LATENCY-1]};
    assign {s5_r[4], s5_i[4]} = {s4_passthru_s5_r[3][STAGE_LATENCY-1], s4_passthru_s5_i[3][STAGE_LATENCY-1]};
    assign {s5_r[5], s5_i[5]} = {s4_passthru_s5_r[4][STAGE_LATENCY-1], s4_passthru_s5_i[4][STAGE_LATENCY-1]};
    assign {s5_r[6], s5_i[6]} = {s4_passthru_s5_r[5][STAGE_LATENCY-1], s4_passthru_s5_i[5][STAGE_LATENCY-1]};

    // --- STAGE 6: H on q0 (bit 0) --- Latency: 3 ---
    // Pairs amplitudes whose indices differ only in bit 0: (0,1) (2,3) (4,5) (6,7).
    h_gate_simplified h_q0_p0 (.clk(clk), .rst_n(rst_n), .alpha_r(s5_r[0]), .alpha_i(s5_i[0]), .beta_r(s5_r[1]), .beta_i(s5_i[1]), .new_alpha_r(s6_r[0]), .new_alpha_i(s6_i[0]), .new_beta_r(s6_r[1]), .new_beta_i(s6_i[1]));
    h_gate_simplified h_q0_p1 (.clk(clk), .rst_n(rst_n), .alpha_r(s5_r[2]), .alpha_i(s5_i[2]), .beta_r(s5_r[3]), .beta_i(s5_i[3]), .new_alpha_r(s6_r[2]), .new_alpha_i(s6_i[2]), .new_beta_r(s6_r[3]), .new_beta_i(s6_i[3]));
    h_gate_simplified h_q0_p2 (.clk(clk), .rst_n(rst_n), .alpha_r(s5_r[4]), .alpha_i(s5_i[4]), .beta_r(s5_r[5]), .beta_i(s5_i[5]), .new_alpha_r(s6_r[4]), .new_alpha_i(s6_i[4]), .new_beta_r(s6_r[5]), .new_beta_i(s6_i[5]));
    h_gate_simplified h_q0_p3 (.clk(clk), .rst_n(rst_n), .alpha_r(s5_r[6]), .alpha_i(s5_i[6]), .beta_r(s5_r[7]), .beta_i(s5_i[7]), .new_alpha_r(s6_r[6]), .new_alpha_i(s6_i[6]), .new_beta_r(s6_r[7]), .new_beta_i(s6_i[7]));

    // --- STAGE 7: SWAP q0 and q2 (Bit Reversal) --- Latency: 1 ---
    // Exchanges 001 <-> 100 and 011 <-> 110; the other four indices read the
    // same with bits 0 and 2 reversed and therefore stay in place.
    swap_gate_pipelined final_swap (
        .clk(clk), .rst_n(rst_n),
        .in_001_r(s6_r[1]), .in_001_i(s6_i[1]), .in_100_r(s6_r[4]), .in_100_i(s6_i[4]),
        .in_011_r(s6_r[3]), .in_011_i(s6_i[3]), .in_110_r(s6_r[6]), .in_110_i(s6_i[6]),
        .out_001_r(f001_r), .out_001_i(f001_i),
        .out_100_r(f100_r), .out_100_i(f100_i),
        .out_011_r(f011_r), .out_011_i(f011_i),
        .out_110_r(f110_r), .out_110_i(f110_i)
    );
    // Pass-through the amplitudes not affected by swap, with a 1-cycle delay to match SWAP latency.
    reg signed [`TOTAL_WIDTH-1:0] f000_r_reg, f000_i_reg;
    reg signed [`TOTAL_WIDTH-1:0] f010_r_reg, f010_i_reg;
    reg signed [`TOTAL_WIDTH-1:0] f101_r_reg, f101_i_reg;
    reg signed [`TOTAL_WIDTH-1:0] f111_r_reg, f111_i_reg;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            {f000_r_reg, f000_i_reg} <= 0;
            {f010_r_reg, f010_i_reg} <= 0;
            {f101_r_reg, f101_i_reg} <= 0;
            {f111_r_reg, f111_i_reg} <= 0;
        end else begin
            {f000_r_reg, f000_i_reg} <= {s6_r[0], s6_i[0]};
            {f010_r_reg, f010_i_reg} <= {s6_r[2], s6_i[2]};
            {f101_r_reg, f101_i_reg} <= {s6_r[5], s6_i[5]};
            {f111_r_reg, f111_i_reg} <= {s6_r[7], s6_i[7]};
        end
    end

    assign f000_r = f000_r_reg; assign f000_i = f000_i_reg;
    assign f010_r = f010_r_reg; assign f010_i = f010_i_reg;
    assign f101_r = f101_r_reg; assign f101_i = f101_i_reg;
    assign f111_r = f111_r_reg; assign f111_i = f111_i_reg;

endmodule

`include "fixed_point_params.vh"

//======================================================================
// Complex-Complex Multiplier (Pipelined)
//======================================================================
// Computes p = (ar + j*ai) * (br + j*bi) on signed fixed-point operands:
//   pr = (ar*br - ai*bi) >>> `FRAC_WIDTH
//   pi = (ar*bi + ai*br) >>> `FRAC_WIDTH
// Used for the CROT gates. Results are truncated to `TOTAL_WIDTH bits
// without saturation.
// Latency: 3 cycles (multiply -> add/subtract -> scale)
// Reset:   asynchronous, active-low; all pipeline registers clear to 0.
module ccmult_pipelined(
    input                         clk,
    input                         rst_n,
    input  signed [`TOTAL_WIDTH-1:0] ar, ai,
    input  signed [`TOTAL_WIDTH-1:0] br, bi,
    output signed [`TOTAL_WIDTH-1:0] pr, pi
);

    // Stage 1 registers: the four full-width partial products.
    reg signed [`MULT_WIDTH-1:0] prod_rr_q, prod_ii_q, prod_ri_q, prod_ir_q;

    // Stage 2 registers: one bit wider than a product so that the
    // sum/difference of two products cannot wrap.
    reg signed [`MULT_WIDTH:0] re_acc_q, im_acc_q;

    // Stage 3 registers: accumulators scaled back to the operand format.
    reg signed [`TOTAL_WIDTH-1:0] re_out_q, im_out_q;

    // All three stages share one clocked process; because every assignment
    // is non-blocking, each stage still reads the values its predecessor
    // held before the current clock edge.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            {prod_rr_q, prod_ii_q, prod_ri_q, prod_ir_q} <= 0;
            {re_acc_q, im_acc_q}                         <= 0;
            {re_out_q, im_out_q}                         <= 0;
        end else begin
            // Stage 1: multiplication
            prod_rr_q <= ar * br;
            prod_ii_q <= ai * bi;
            prod_ri_q <= ar * bi;
            prod_ir_q <= ai * br;

            // Stage 2: addition/subtraction
            re_acc_q  <= prod_rr_q - prod_ii_q;
            im_acc_q  <= prod_ri_q + prod_ir_q;

            // Stage 3: arithmetic right shift drops the extra fractional bits
            re_out_q  <= re_acc_q >>> `FRAC_WIDTH;
            im_out_q  <= im_acc_q >>> `FRAC_WIDTH;
        end
    end

    assign pr = re_out_q;
    assign pi = im_out_q;

endmodule


`include "fixed_point_params.vh"

//======================================================================
// Simplified Hadamard Gate (Pipelined)
//======================================================================
// Applies a Hadamard butterfly to one pair of complex amplitudes:
//   new_alpha = (alpha + beta) * 1/sqrt(2)
//   new_beta  = (alpha - beta) * 1/sqrt(2)
// Real and imaginary parts are processed independently. Results are
// truncated to `TOTAL_WIDTH bits without saturation.
// Latency: 3 cycles (add/subtract -> multiply -> scale)
// Reset:   asynchronous, active-low; all pipeline registers clear to 0.
module h_gate_simplified(
    input                         clk,
    input                         rst_n,
    input  signed [`TOTAL_WIDTH-1:0] alpha_r, alpha_i,
    input  signed [`TOTAL_WIDTH-1:0] beta_r,  beta_i,
    output signed [`TOTAL_WIDTH-1:0] new_alpha_r, new_alpha_i,
    output signed [`TOTAL_WIDTH-1:0] new_beta_r,  new_beta_i
);

    // 1/sqrt(2) as a fixed-point constant: 11/16 = 0.6875.
    // The literal assumes `FRAC_WIDTH == 4.
    localparam signed [`TOTAL_WIDTH-1:0] ONE_OVER_SQRT2 = 11;

    // Product of an `ADD_WIDTH-bit sum and the `TOTAL_WIDTH-bit constant.
    localparam H_MULT_WIDTH = `ADD_WIDTH + `TOTAL_WIDTH;

    // Stage 1 registers: sums and differences, kept at full adder width.
    reg signed [`ADD_WIDTH-1:0]   sum_re_q, sum_im_q, dif_re_q, dif_im_q;

    // Stage 2 registers: stage-1 values multiplied by 1/sqrt(2).
    reg signed [H_MULT_WIDTH-1:0] sum_re_x_q, sum_im_x_q, dif_re_x_q, dif_im_x_q;

    // Stage 3 registers: products scaled back to the port format.
    reg signed [`TOTAL_WIDTH-1:0] alpha_re_q, alpha_im_q, beta_re_q, beta_im_q;

    // One clocked process for the whole pipeline; non-blocking assignments
    // keep the three stages one clock apart.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            {sum_re_q,   sum_im_q,   dif_re_q,   dif_im_q}   <= 0;
            {sum_re_x_q, sum_im_x_q, dif_re_x_q, dif_im_x_q} <= 0;
            {alpha_re_q, alpha_im_q, beta_re_q,  beta_im_q}  <= 0;
        end else begin
            // Stage 1: addition/subtraction
            sum_re_q   <= alpha_r + beta_r;
            sum_im_q   <= alpha_i + beta_i;
            dif_re_q   <= alpha_r - beta_r;
            dif_im_q   <= alpha_i - beta_i;

            // Stage 2: multiply the full-width adder results by 1/sqrt(2)
            sum_re_x_q <= sum_re_q * ONE_OVER_SQRT2;
            sum_im_x_q <= sum_im_q * ONE_OVER_SQRT2;
            dif_re_x_q <= dif_re_q * ONE_OVER_SQRT2;
            dif_im_x_q <= dif_im_q * ONE_OVER_SQRT2;

            // Stage 3: scale the wider product back down to the target width
            alpha_re_q <= sum_re_x_q >>> `FRAC_WIDTH;
            alpha_im_q <= sum_im_x_q >>> `FRAC_WIDTH;
            beta_re_q  <= dif_re_x_q >>> `FRAC_WIDTH;
            beta_im_q  <= dif_im_x_q >>> `FRAC_WIDTH;
        end
    end

    assign new_alpha_r = alpha_re_q;
    assign new_alpha_i = alpha_im_q;
    assign new_beta_r  = beta_re_q;
    assign new_beta_i  = beta_im_q;

endmodule


`include "fixed_point_params.vh"

//======================================================================
// SWAP Gate (Pipelined)
//======================================================================
// Registers four complex amplitudes while exchanging them pairwise:
//   001 <-> 100   and   011 <-> 110
// Latency: 1 cycle
// Reset:   asynchronous, active-low; all output registers clear to 0.
module swap_gate_pipelined(
    input                         clk,
    input                         rst_n,
    input  signed [`TOTAL_WIDTH-1:0] in_001_r, in_001_i,
    input  signed [`TOTAL_WIDTH-1:0] in_100_r, in_100_i,
    input  signed [`TOTAL_WIDTH-1:0] in_011_r, in_011_i,
    input  signed [`TOTAL_WIDTH-1:0] in_110_r, in_110_i,
    output signed [`TOTAL_WIDTH-1:0] out_001_r, out_001_i,
    output signed [`TOTAL_WIDTH-1:0] out_100_r, out_100_i,
    output signed [`TOTAL_WIDTH-1:0] out_011_r, out_011_i,
    output signed [`TOTAL_WIDTH-1:0] out_110_r, out_110_i
);

    // One register pair (real, imaginary) per output amplitude.
    reg signed [`TOTAL_WIDTH-1:0] q001_re, q001_im;
    reg signed [`TOTAL_WIDTH-1:0] q100_re, q100_im;
    reg signed [`TOTAL_WIDTH-1:0] q011_re, q011_im;
    reg signed [`TOTAL_WIDTH-1:0] q110_re, q110_im;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            {q001_re, q001_im} <= 0;
            {q100_re, q100_im} <= 0;
            {q011_re, q011_im} <= 0;
            {q110_re, q110_im} <= 0;
        end else begin
            // Each output register loads the amplitude of its swap partner.
            {q001_re, q001_im} <= {in_100_r, in_100_i};
            {q100_re, q100_im} <= {in_001_r, in_001_i};
            {q011_re, q011_im} <= {in_110_r, in_110_i};
            {q110_re, q110_im} <= {in_011_r, in_011_i};
        end
    end

    assign {out_001_r, out_001_i} = {q001_re, q001_im};
    assign {out_100_r, out_100_i} = {q100_re, q100_im};
    assign {out_011_r, out_011_i} = {q011_re, q011_im};
    assign {out_110_r, out_110_i} = {q110_re, q110_im};

endmodule


### Setting up the configuration

OpenLane requires you to configure any Flow before using it. This is done using
the `config` module.

For colaboratories, REPLs and other interactive environments where there is no
concrete Flow object, the Configuration may be initialized using `Config.interactive`,
which will automatically propagate the configuration to any future steps.

You can find the documentation for `Config.interactive` [here](https://openlane2.readthedocs.io/en/latest/reference/api/config/index.html#openlane.config.Config.interactive).



In [None]:
from openlane.config import Config

# Design-wide configuration; per the text above, Config.interactive propagates
# it to every step run later in this notebook.
Config.interactive(
    "qft3_top_pipelined",  # design name; matches the top-level Verilog module
    PDK="gf180mcuC",
    CLOCK_PORT="clk",  # clock input port of qft3_top_pipelined
    CLOCK_NET="clk",
    # NOTE(review): CLOCK_PERIOD is presumably in nanoseconds, which would make
    # this a 0.8 MHz target - confirm such a relaxed constraint is intended.
    CLOCK_PERIOD=1250,
    PRIMARY_GDSII_STREAMOUT_TOOL="klayout",
)

### Running implementation steps

There are two ways to obtain OpenLane's built-in implementation steps:

* via directly importing from the `steps` module using its category:
    * `from openlane.steps import Yosys` then `Synthesis = Yosys.Synthesis`
* by using the step's id from the registry:
    * `from openlane.steps import Step` then `Synthesis = Step.factory.get("Yosys.Synthesis")`

You can find a full list of included steps here: https://openlane2.readthedocs.io/en/latest/reference/step_config_vars.html

In [None]:
from openlane.steps import Step

* First, get the step (and display its help)...

In [None]:
# Look up the synthesis step class in the step registry by its id.
Synthesis = Step.factory.get("Yosys.Synthesis")

# Show the step's documentation, including its configuration variables.
Synthesis.display_help()

* Then run it. Note you can pass step-specific configs using Python keyword
  arguments.

### Synthesis

We need to start by converting our high-level Verilog into a netlist that just
shows the connections between small silicon patterns called "standard cells", in
a process called Synthesis. We can do this by passing the Verilog files as a
configuration variable to `Yosys.Synthesis` as follows, then running it.

As this is the first step, we need to create an empty state and pass it to it.

In [None]:
from openlane.state import State

# Only qft3_top_pipelined.v is listed: it defines all four modules and pulls in
# fixed_point_params.vh through its `include directives.
synthesis = Synthesis(
    VERILOG_FILES=["qft3_top_pipelined.v"],
    state_in=State(),  # first step of the flow, so it starts from an empty state
)
synthesis.start()

In [None]:
display(synthesis)

### Floorplanning

Floorplanning does two things:

* Determines the dimensions of the final chip.
* Creates the "cell placement grid" which placed cells must be aligned to.
    * Each cell in the grid is called a "site." Cells can occupy multiple
      sites, with the overwhelming majority of cells occupying multiple sites
      by width, and some standard cell libraries supporting varying heights as well.

> Don't forget- you may call `display_help()` on any Step class to get a full
> list of configuration variables.


In [None]:
# Fetch the floorplanning step by its registry id.
Floorplan = Step.factory.get("OpenROAD.Floorplan")

# Each step consumes the output state of the step before it; here, synthesis.
floorplan = Floorplan(state_in=synthesis.state_out)
floorplan.start()

In [None]:
display(floorplan)

### Tap/Endcap Cell Insertion

This places two kinds of cells on the floorplan:

* End cap/boundary cells: Added at the beginning and end of each row. True to
  their name, they "cap off" the core area of a design.
* Tap cells: Placed in a polka dot-ish fashion across the rows. Tap cells
  connect VDD to the nwell and the psubstrate to VSS, which the majority of cells
  do not do themselves to save area. However, if you go long enough without one
  such connection, you end up with the cell "latching up", i.e., refusing to
  switch back to LO from HI.

  There is a maximum distance between tap cells enforced as part of every
  foundry process.

In [None]:
# Fetch the tap/endcap insertion step by its registry id.
TapEndcapInsertion = Step.factory.get("OpenROAD.TapEndcapInsertion")

# Runs on the floorplanned design.
tdi = TapEndcapInsertion(state_in=floorplan.state_out)
tdi.start()

In [None]:
display(tdi)

### I/O Placement

This places metal pins at the edges of the design corresponding to the top-level
inputs and outputs of your design. These pins act as the interface when you
integrate your design with other designs.

In [None]:
# Fetch the I/O pin placement step by its registry id.
IOPlacement = Step.factory.get("OpenROAD.IOPlacement")

# Runs on the output of tap/endcap insertion.
ioplace = IOPlacement(state_in=tdi.state_out)
ioplace.start()

In [None]:
display(ioplace)

### Generating the Power Distribution Network (PDN)

This creates the power distribution network for your design, which is essentially
a plaid pattern of horizontal and vertical "straps" across the design that is
then connected to the rails' VDD and VSS (via the tap cells.)

You can find an explanation of how the power distribution network works at this
link: https://openlane2.readthedocs.io/en/latest/usage/hardening_macros.html#pdn-generation

While we typically don't need to mess with the PDN too much, this is a small
design, so we're going to need to make the plaid pattern formed by the PDN a bit
smaller.

In [None]:
# Fetch the PDN generation step by its registry id.
GeneratePDN = Step.factory.get("OpenROAD.GeneratePDN")

# Step-specific overrides are passed as keyword arguments.
# NOTE(review): the values below presumably set the vertical/horizontal strap
# width and pitch in microns - confirm units and that they suit the chosen PDK.
pdn = GeneratePDN(
    state_in=ioplace.state_out,
    FP_PDN_VWIDTH=2,
    FP_PDN_HWIDTH=2,
    FP_PDN_VPITCH=30,
    FP_PDN_HPITCH=30,
)
pdn.start()

In [None]:
display(pdn)

### Global Placement

Global Placement is deciding on a fuzzy, non-final location for each of the cells,
with the aim of minimizing the distance between cells that are connected
together (more specifically, the total length of the not-yet-created wires that
will connect them).

As you will see in the `display()` output of the second cell below, the placement
is considered "illegal", i.e., not properly aligned with the cell placement grid.
This is addressed by "Detailed Placement", also referred to as "placement
legalization", which is the next step.

In [None]:
# Fetch the global placement step by its registry id.
GlobalPlacement = Step.factory.get("OpenROAD.GlobalPlacement")

# Runs on the design with its power distribution network in place.
gpl = GlobalPlacement(state_in=pdn.state_out)
gpl.start()

In [None]:
display(gpl)

### Detailed Placement

This aligns the fuzzy placement from before with the grid, "legalizing" it.

In [None]:
# Fetch the detailed placement (legalization) step by its registry id.
DetailedPlacement = Step.factory.get("OpenROAD.DetailedPlacement")

# Runs on the global placement result.
dpl = DetailedPlacement(state_in=gpl.state_out)
dpl.start()

In [None]:
display(dpl)

### Clock Tree Synthesis (CTS)

With the cells now having a final placement, we can go ahead and create what
is known as the clock tree, i.e., the hierarchical set of buffers used
to distribute the clock signal while minimizing what is known as "clock skew":
the variation in clock arrival time from register to register because of factors
such as metal wire length, clock load (the number of gates connected to the same
clock buffer), et cetera.

The CTS step creates the clock buffer cells and places them in the gaps left by
the detailed placement above.

In [None]:
# Fetch the clock tree synthesis step by its registry id.
CTS = Step.factory.get("OpenROAD.CTS")

# Runs on the legalized placement.
cts = CTS(state_in=dpl.state_out)
cts.start()

In [None]:
display(cts)

### Global Routing

Global routing "plans" the routes the wires between two gates (or gates and
I/O pins/the PDN) will take. The results of global routing (which are called
"routing guides") are stored in internal data structures and have no effect on
the actual design, so there is no `display()` statement.

In [None]:
# Fetch the global routing step by its registry id.
GlobalRouting = Step.factory.get("OpenROAD.GlobalRouting")

# Runs on the design after clock tree synthesis.
grt = GlobalRouting(state_in=cts.state_out)
grt.start()

### Detailed Routing

Detailed routing uses the guides from Global Routing to actually create wires
on the metal layers and connect the gates, making the connections finally physical.

This is typically the longest step in the flow.

In [None]:
# Fetch the detailed routing step by its registry id.
DetailedRouting = Step.factory.get("OpenROAD.DetailedRouting")

# Runs on the global routing result.
drt = DetailedRouting(state_in=grt.state_out)
drt.start()

In [None]:
display(drt)

### Fill Insertion

Finally, as we're done placing all the essential cells, the only thing left to
do is fill in the gaps.

We prioritize the use of decap (decoupling capacitor) cells, which
further supports the power distribution network, but when there aren't any
small enough cells, we just use regular fill cells.

In [None]:
# Fetch the fill insertion step by its registry id.
FillInsertion = Step.factory.get("OpenROAD.FillInsertion")

# Runs on the fully routed design.
fill = FillInsertion(state_in=drt.state_out)
fill.start()

In [None]:
display(fill)

### Parasitics Extraction a.k.a. Resistance/Capacitance Extraction (RCX)

This step does not alter the design; rather, it computes the
[parasitic elements](https://en.wikipedia.org/wiki/Parasitic_element_(electrical_networks))
of the circuit, which have an effect on timing, as we prepare to do the final
timing analysis.

The parasitic elements are saved in the **Standard Parasitics Exchange Format**,
or SPEF. OpenLane creates a SPEF file for each interconnect corner as described in
the [Corners and STA](https://openlane2.readthedocs.io/en/latest/usage/corners_and_sta.html)
section of the documentation.

In [None]:
# Fetch the parasitics extraction step by its registry id.
RCX = Step.factory.get("OpenROAD.RCX")

# Runs on the design after fill insertion.
rcx = RCX(state_in=fill.state_out)
rcx.start()

### Static Timing Analysis (Post-PnR)

STA is a process that verifies that a chip meets certain constraints on clock
and data timings to run at its rated clock speed. See [Corners and STA](https://openlane2.readthedocs.io/en/latest/usage/corners_and_sta.html)
in the documentation for more info.

---

This step generates two kinds of files:
* `.lib`: Liberty™-compatible Library files. Can be used to do static timing
  analysis when creating a design with this design as a sub-macro.
* `.sdf`: Standard Delay Format. Can be used with certain simulation software
  to do *dynamic* timing analysis.

Unfortunately, the `.lib` files coming out of OpenLane right now are not super
reliable for timing purposes and are only provided for completeness.
When using OpenLane-created macros within other designs, it is best to use the
macro's final netlist and extracted parasitics instead.

In [None]:
# Fetch the post-PnR static timing analysis step by its registry id.
STAPostPNR = Step.factory.get("OpenROAD.STAPostPNR")

# Runs on the state produced by parasitics extraction.
sta_post_pnr = STAPostPNR(state_in=rcx.state_out)
sta_post_pnr.start()

### Stream-out

Stream-out is the process of converting the designs from the abstract formats
used during floorplanning, placement and routing into a concrete format called
GDSII (lit. Graphic Design System 2), which is the final file that is then sent
for fabrication.

In [None]:
# Fetch the KLayout stream-out step by its registry id; this matches
# PRIMARY_GDSII_STREAMOUT_TOOL="klayout" in the configuration cell.
StreamOut = Step.factory.get("KLayout.StreamOut")

# Runs on the state produced by post-PnR timing analysis.
gds = StreamOut(state_in=sta_post_pnr.state_out)
gds.start()

In [None]:
display(gds)

### Design Rule Checks (DRC)

DRC determines that the final layout does not violate any of the rules set by
the foundry to ensure the design is actually manufacturable- for example,
not enough space between two wires, *too much* space between tap cells, and so
on.

A design not passing DRC will typically be rejected by the foundry, who
also run DRC on their side.

In [None]:
# Fetch the Magic design rule check step by its registry id.
DRC = Step.factory.get("Magic.DRC")

# Runs on the streamed-out layout.
drc = DRC(state_in=gds.state_out)
drc.start()

### SPICE Extraction for Layout vs. Schematic Check

This step tries to reconstruct a SPICE netlist from the GDSII file, so it can
later be used for the **Layout vs. Schematic** (LVS) check.

In [None]:
# Fetch the Magic SPICE extraction step by its registry id.
SpiceExtraction = Step.factory.get("Magic.SpiceExtraction")

# Runs on the state produced by the DRC step.
spx = SpiceExtraction(state_in=drc.state_out)
spx.start()

### Layout vs. Schematic (LVS)

A comparison between the final Verilog netlist (from PnR) and the final
SPICE netlist (extracted.)

This check effectively compares the physically implemented circuit to the final
Verilog netlist output by OpenROAD.

The idea is, if there are any disconnects, shorts or other mismatches in the
physical implementation that do not exist in the logical view of the design,
they would be caught at this step.

Common issues that result in LVS violations include:
* Lack of fill cells or tap cells in the design
* Two unrelated signals being shorted, or a wire being disconnected (most
  commonly seen with a misconfigured PDN)

Chips with LVS errors are typically dead on arrival.

In [None]:
# Fetch the Netgen layout-vs-schematic step by its registry id.
LVS = Step.factory.get("Netgen.LVS")

# Runs on the state produced by SPICE extraction.
lvs = LVS(state_in=spx.state_out)
lvs.start()