# Part 3 VGG16 4bit

In [1]:
import os
import time
import shutil

import torch
import torch.nn as nn

import torchvision
import torchvision.transforms as transforms

# Include parent dir in path
import sys
from pathlib import Path
parent_dir = str(Path.cwd().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from models import *

# Notebook-wide setup: build the quantized model and the CIFAR-10 loaders.
use_gpu = torch.cuda.is_available()
print('=> Building model...')

batch_size = 128
model_name = "VGG16_project_part1"
# VGG is not defined in this notebook; presumably it is exported by the
# project's `models` package via the star import -- TODO confirm.
model = VGG(vgg_name=model_name, w_bits=4, a_bits=4)
print(model)

# Per-channel mean/std normalisation shared by the train and test pipelines.
normalize = transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])

# Single source of truth for where CIFAR-10 is stored / downloaded.
data_root = '../data'

# Training split: random crop (with 4-pixel padding) and horizontal flip
# augmentation before tensor conversion and normalisation.
train_dataset = torchvision.datasets.CIFAR10(
    root=data_root,
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)


# Test split: no augmentation, deterministic order.
test_dataset = torchvision.datasets.CIFAR10(
    root=data_root,
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ]))

testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


print_freq = 100 # every 100 batches, accuracy printed. Here, each batch includes "batch_size" data points
# CIFAR10 has 50,000 training data, and 10,000 validation data.

def train(trainloader, model, criterion, optimizer, epoch):
    """Train `model` for one pass over `trainloader`.

    Args:
        trainloader: iterable yielding ``(inputs, target)`` batches.
        model: network to optimise; it is switched to training mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` run per batch.
        epoch: epoch index, used only in the progress printout.

    Returns:
        None. Progress is printed every ``print_freq`` batches
        (``print_freq`` is a module-level setting).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    model.train()

    # Move batches to wherever the model lives instead of hard-coding CUDA,
    # so the same loop works on CUDA, MPS and CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    end = time.time()
    for i, (inputs, target) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        if device is not None:
            inputs, target = inputs.to(device), target.to(device)

        # compute output
        output = model(inputs)
        loss = criterion(output, target)

        # measure accuracy and record loss (weighted by the batch size)
        prec = accuracy(output, target)[0]
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec.item(), inputs.size(0))

        # compute gradient and take one optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()


        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec {top1.val:.3f}% ({top1.avg:.3f}%)'.format(
                   epoch, i, len(trainloader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1))

            

def validate(val_loader, model, criterion ):
    """Evaluate `model` on `val_loader` without tracking gradients.

    Args:
        val_loader: iterable yielding ``(inputs, target)`` batches.
        model: network to evaluate; it is switched to eval mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.

    Returns:
        The sample-weighted average top-1 precision (in percent) over the
        whole loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    # Move batches to wherever the model lives instead of hard-coding CUDA,
    # so the same loop works on CUDA, MPS and CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    end = time.time()
    with torch.no_grad():
        for i, (inputs, target) in enumerate(val_loader):

            if device is not None:
                inputs, target = inputs.to(device), target.to(device)

            # compute output
            output = model(inputs)
            loss = criterion(output, target)

            # measure accuracy and record loss (weighted by the batch size)
            prec = accuracy(output, target)[0]
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec.item(), inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:  # print the status every `print_freq` batches
                print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec {top1.val:.3f}% ({top1.avg:.3f}%)'.format(
                   i, len(val_loader), batch_time=batch_time, loss=losses,
                   top1=top1))

    print(' * Prec {top1.avg:.3f}% '.format(top1=top1))
    return top1.avg


def accuracy(output, target, topk=(1,)):
    """Compute the precision@k for each k in `topk`.

    Args:
        output: 2-D score tensor, one row per sample and one column per class
            (``topk`` is taken along dimension 1).
        target: 1-D tensor of ground-truth class indices, one per sample.
        topk: iterable of k values to evaluate.

    Returns:
        A list of 0-dim tensors, one per entry of `topk`, each holding the
        percentage of samples whose target is among the k highest scores.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk best classes per sample, transposed to (maxk, batch)
    # so that row r holds every sample's rank-r prediction.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` inherits the transposed (non-contiguous) layout, so
        # `.view(-1)` raises for k > 1; `.reshape(-1)` copies when needed.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


class AverageMeter(object):
    """Keeps the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as seen `n` times and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

        
def save_checkpoint(state, is_best, fdir):
    """Write `state` to ``<fdir>/checkpoint.pth``; duplicate it when it is the best.

    When `is_best` is truthy the freshly written file is also copied to
    ``<fdir>/model_best.pth.tar``.
    """
    latest_path = os.path.join(fdir, 'checkpoint.pth')
    torch.save(state, latest_path)
    if not is_best:
        return
    best_path = os.path.join(fdir, 'model_best.pth.tar')
    shutil.copyfile(latest_path, best_path)
     

#model = nn.DataParallel(model).cuda()
#all_params = checkpoint['state_dict']
#model.load_state_dict(all_params, strict=False)
#criterion = nn.CrossEntropyLoss().cuda()
#validate(testloader, model, criterion)

=> Building model...
VGG_quant(
  (features): Sequential(
    (0): QuantConv2d(
      3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): QuantConv2d(
      64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): QuantConv2d(
      64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
      (weight_quant): weight_quantize_fn()
    )
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): QuantConv2d(
      128, 128, kernel_size=(3, 3), stride

In [None]:
# This cell won't be given, but students will complete the training

# Hyper-parameters for the run.
lr = 1e-3
weight_decay = 1e-4
epochs = 100
best_prec = 0

#model = nn.DataParallel(model).cuda()
model.cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# Cosine decay of the learning rate from `lr` down to 0 over `epochs` epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=epochs,
    eta_min=0
)
#cudnn.benchmark = True

# Checkpoints go to result/<model_name>/. makedirs creates the intermediate
# 'result' directory too, and exist_ok avoids the check-then-create race.
fdir = 'result/'+str(model_name)
os.makedirs(fdir, exist_ok=True)


for epoch in range(0, epochs):
    #adjust_learning_rate(optimizer, epoch)

    train(trainloader, model, criterion, optimizer, epoch)

    # evaluate on test set
    print("Validation starts")
    prec = validate(testloader, model, criterion)

    # Advance the schedule once per epoch, after the optimizer steps.
    scheduler.step()

    # remember best precision and save checkpoint
    is_best = prec > best_prec
    best_prec = max(prec,best_prec)
    print('best acc: {:1f}'.format(best_prec))
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_prec': best_prec,
        'optimizer': optimizer.state_dict(),
        # Without the scheduler state a resumed run would restart the cosine
        # schedule from epoch 0; existing readers ignore this extra key.
        'scheduler': scheduler.state_dict(),
    }, is_best, fdir)

In [3]:
# Reload the best checkpoint written during training and measure test accuracy.
PATH = f"result/{model_name}/model_best.pth.tar"
# Prefer CUDA, then Apple MPS, then fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

model.to(device)
model.eval()

# Only accuracy is reported; the previous `test_loss` accumulator was never
# updated inside the loop, so it has been removed.
correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device) # move batch to the selected device
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)  # index of the highest score per sample
        correct += pred.eq(target.view_as(pred)).sum().item()

test_acc = 100. * correct / len(testloader.dataset)
print('\nTest set: Accuracy: {}/{} ({:.2f}%)\n'.format(
        correct, len(testloader.dataset),
        test_acc))


Test set: Accuracy: 9225/10000 (92.25%)



In [4]:
class SaveOutput:
    """Callable recorder: stores the second argument of every call, in order."""

    def __init__(self):
        self.clear()

    def __call__(self, module, module_in):
        # `module` is accepted so the object fits a two-argument hook
        # signature; only `module_in` is kept.
        self.outputs += [module_in]

    def clear(self):
        """Start a fresh, empty record list."""
        self.outputs = list()
        
######### Save inputs from Layer 0 (Part 3 Requirement) ##########
save_output = SaveOutput()

# Keep the handles so the hooks can be detached again; otherwise every later
# forward pass through `model` would keep appending to `save_output.outputs`.
hook_handles = [
    # features[0] is the first conv layer (3 input channels -> 64 output
    # channels); its pre-hook records the arguments passed to that layer.
    model.features[0].register_forward_pre_hook(save_output),
    # features[2] is the first ReLU (0=Conv, 1=BN, 2=ReLU). Because this is a
    # *pre*-hook it records the ReLU's input (the BN output), not the
    # post-ReLU activation.
    model.features[2].register_forward_pre_hook(save_output),
]

# Run one forward pass on the first test batch (the whole batch, not a
# single image); the hooks fire in layer order, so outputs[0] belongs to
# features[0] and outputs[1] to features[2].
dataiter = iter(testloader)
images, labels = next(dataiter)
images = images.to(device)
try:
    out = model(images)
finally:
    for handle in hook_handles:
        handle.remove()

In [5]:
# Part 3: Extract Layer 0 Weights
# Original Weight Shape: [64, 3, 3, 3] (OC, IC, K, K)
# Target Shape for 8x8 Array: [8, 8, 3, 3] (Slice 8 OCs, Pad ICs from 3 to 8)

weight_q = model.features[0].weight_q
w_alpha = model.features[0].weight_quant.wgt_alpha
w_bit = 4

# Recover Integer Weights: divide by the quantization step
# alpha / (2^(w_bit-1) - 1). This is an export step, so it runs without
# autograd tracking, and the result is rounded so that float division error
# can never leave a value like 2.9999 in the "integer" tensor.
with torch.no_grad():
    weight_int_full = torch.round(weight_q / (w_alpha / (2**(w_bit-1)-1)))

# 1. Slice first 8 Output Channels
weight_int_sliced = weight_int_full[0:8, :, :, :]

# 2. Pad Input Channels from 3 to 8 (Zero padding)
# The hardware array is 8x8, but input image only has 3 channels.
weight_int = torch.zeros(8, 8, 3, 3).to(device)
weight_int[:, 0:3, :, :] = weight_int_sliced

print(f"Original Layer 0 shape: {weight_int_full.shape}")
print(f"Padded/Sliced Weight shape for HW: {weight_int.shape}")
print(f"Unique values: {torch.unique(weight_int)}")

Original Layer 0 shape: torch.Size([64, 3, 3, 3])
Padded/Sliced Weight shape for HW: torch.Size([8, 8, 3, 3])
Unique values: tensor([-7.0000, -6.0000, -5.0000, -4.0000, -3.0000, -2.0000, -1.0000, -0.0000,
         1.0000,  2.0000,  3.0000,  4.0000,  5.0000,  6.0000,  7.0000],
       device='mps:0', grad_fn=<Unique2Backward0>)


In [6]:
# Part 3: Extract Input Image and Quantize
# Input is usually Normalized Float. We need Int representation.

# outputs[0] is what the hook on features[0] recorded (the layer's positional
# arguments); element [0] of it is the input batch.
raw_input = save_output.outputs[0][0] # The batch of images
act_alpha = model.features[0].act_alpha # Use Layer 0's activation alpha
act_bit = 4

# Quantize the input image
# `act_quantization` is not defined in this notebook -- presumably provided by
# the project's `models` package; TODO confirm its clamping behaviour.
act_quant_fn = act_quantization(act_bit)
act_q = act_quant_fn(raw_input, act_alpha)
# Divide by the step alpha / (2^act_bit - 1) to get integer codes.
# NOTE(review): the recorded output of this cell lists codes from -13 to 15,
# which is more than 2^act_bit distinct values -- confirm that the downstream
# 4-bit encoding of negative inputs is what the hardware expects.
act_int_full = act_q / (act_alpha / (2**act_bit-1))

# Slice single image from batch [3, 32, 32]
a_int_raw = act_int_full[0,:,:,:] 

# Pad Input Channels from 3 to 8 (extra channels stay zero)
a_int = torch.zeros(8, a_int_raw.size(1), a_int_raw.size(2)).to(device)
a_int[0:3, :, :] = a_int_raw

print(f"Input Image Shape (Padded): {a_int.shape}")
print(f"Unique values: {torch.unique(a_int)}")

Input Image Shape (Padded): torch.Size([8, 32, 32])
Unique values: tensor([-13.0000, -12.0000, -11.0000, -10.0000,  -9.0000,  -8.0000,  -7.0000,
         -6.0000,  -5.0000,  -4.0000,  -3.0000,  -2.0000,  -1.0000,  -0.0000,
          1.0000,   2.0000,   3.0000,   4.0000,   5.0000,   6.0000,   7.0000,
          8.0000,   9.0000,  10.0000,  11.0000,  12.0000,  13.0000,  14.0000,
         15.0000], device='mps:0', grad_fn=<Unique2Backward0>)


In [11]:
# Create a SW Convolution with the padded weights and inputs for verification
# This simulates what the Hardware 8x8 array should output
# (8 in / 8 out channels, 3x3 kernel, padding 1 -> same spatial size as input).
conv_int = torch.nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, padding=1, bias=True)
# Replace the randomly initialised kernel with the recovered integer weights.
conv_int.weight = torch.nn.parameter.Parameter(weight_int)

# Use the bias from the actual model (Slice first 8)
# NOTE(review): the printed model shows features[0] with bias=False, so the
# `else` branch (all-zero bias) is presumably the one taken -- confirm.
if model.features[0].bias is not None:
    # The model bias is used as-is (no integer quantization is applied here);
    # only its first 8 entries are kept to match the 8 sliced output channels.
    conv_int.bias = torch.nn.parameter.Parameter(model.features[0].bias[0:8])
else:
    conv_int.bias = torch.nn.parameter.Parameter(torch.zeros(8).to(device))

relu = torch.nn.ReLU(inplace=True)

# Compute Expected Output: conv followed by ReLU on the single padded image
output_int = relu(conv_int(a_int.unsqueeze(0))) # Add batch dim

print(f"Expected Output Shape: {output_int.shape}")

# Fetch the second hook record, sliced to the first 8 channels.
# NOTE(review): `hook_output` is only captured here; this cell performs no
# comparison against `output_int`. It is also the *input* of features[2]
# (recorded by a forward pre-hook), so a direct equality check would not be
# like-for-like -- confirm how verification is meant to be done.
print(len(save_output.outputs))
hook_output = save_output.outputs[1][0][:, 0:8, :, :]
print("Part 3 Verification Setup Complete.")

Expected Output Shape: torch.Size([1, 8, 32, 32])
2
Part 3 Verification Setup Complete.


## Generating Activation and Kernel Weight Files

In [None]:
# `math.sqrt` is used below; import it here so the cell does not depend on a
# later cell having been executed first.
import math

# a_int.size() = [input_ch, ni, nj]  (a single image, channels padded to 8)
print(a_int.shape)

# weight_int.size() = [output_ch, input_ch, ki, kj]
w_int = torch.reshape(weight_int, (weight_int.size(0), weight_int.size(1), -1))  # merge ki, kj index to kij
# w_int.size() = [output_ch, input_ch, ki*kj]
print(w_int.shape)

padding = 1
stride = 1
array_size = 8 # row and column number

nig = range(a_int.size(1))  ## ni group
njg = range(a_int.size(2))  ## nj group

icg = range(int(w_int.size(1)))  ## input channel
ocg = range(int(w_int.size(0)))  ## output channel

# Number of array_size-wide tiles along each channel axis.
ic_tileg = range(len(icg) // array_size)
oc_tileg = range(len(ocg) // array_size)

kijg = range(w_int.size(2))
ki_dim = int(math.sqrt(w_int.size(2)))  ## Kernel's 1 dim size

######## Padding before Convolution #######
# The width must be sized from njg (not nig) so non-square inputs also work.
a_pad = torch.zeros(len(icg), len(nig)+padding*2, len(njg)+padding*2).to(device)
# a_pad.size() = [input_ch, ni+2pad, nj+2pad]
a_pad[ :, padding:padding+len(nig), padding:padding+len(njg)] = a_int.to(device)
a_pad = torch.reshape(a_pad, (a_pad.size(0), -1))
# a_pad.size() = [input_ch, (ni+2pad)*(nj+2pad)]

a_tile = torch.zeros(len(ic_tileg), array_size,    a_pad.size(1)).to(device)
w_tile = torch.zeros(len(oc_tileg)*len(ic_tileg), array_size, array_size, len(kijg)).to(device)

for ic_tile in ic_tileg:
    a_tile[ic_tile,:,:] = a_pad[ic_tile*array_size:(ic_tile+1)*array_size,:]

# Tiles are stored flat, row-major over (oc_tile, ic_tile). The stride of the
# oc axis is therefore the number of ic tiles; using len(oc_tileg) here would
# make distinct tiles collide whenever the two tile counts differ.
for ic_tile in ic_tileg:
    for oc_tile in oc_tileg:
        w_tile[oc_tile*len(ic_tileg) + ic_tile,:,:,:] = w_int[oc_tile*array_size:(oc_tile+1)*array_size, ic_tile*array_size:(ic_tile+1)*array_size, :]



###########################################

p_nijg = range(a_pad.size(1)) ## psum nij group

psum = torch.zeros(len(ic_tileg), len(oc_tileg), array_size, len(p_nijg), len(kijg)).to(device)

for kij in kijg:
    for ic_tile in ic_tileg:       # Tiling into array_sizeXarray_size array
        for oc_tile in oc_tileg:   # Tiling into array_sizeXarray_size array
            # [oc, ic] weight matrix of this tile for kernel position kij.
            w_kij = w_tile[oc_tile*len(ic_tileg) + ic_tile, :, :, kij]
            # One matrix product covers every time step: column nij of the
            # result equals w_kij @ a_tile[ic_tile, :, nij], i.e. what a
            # bias-free linear layer with weight w_kij would output for the
            # input vector presented at time nij.
            psum[ic_tile, oc_tile, :, :, kij] = w_kij @ a_tile[ic_tile]

torch.Size([8, 32, 32])
torch.Size([8, 8, 9])


In [None]:
import math

# Side length of the (square) padded input, recovered from the flattened size.
a_pad_ni_dim = int(math.sqrt(a_pad.size(1))) #34 

# Output side length from the usual convolution size formula.
o_ni_dim = int((a_pad_ni_dim - (ki_dim- 1) - 1)/stride + 1)
o_nijg = range(o_ni_dim**2)    
print(len(o_nijg))
    
# out[oc, o_nij]: accumulated result per output channel and output pixel.
out = torch.zeros(len(ocg), len(o_nijg)).to(device)
  
   
### Accumulate partial sums over kernel positions and input-channel tiles ###
for o_nij in o_nijg: 
    for kij in kijg:
        for ic_tile in ic_tileg:    
            for oc_tile in oc_tileg:   
                out[oc_tile*array_size:(oc_tile+1)*array_size, o_nij] = out[oc_tile*array_size:(oc_tile+1)*array_size, o_nij] + \
                psum[ic_tile, oc_tile, :, int(o_nij/o_ni_dim)*a_pad_ni_dim + o_nij%o_ni_dim + int(kij/ki_dim)*a_pad_ni_dim + kij%ki_dim, kij]
                ## 4th index = (int(o_nij/o_ni_dim)*a_pad_ni_dim + o_nij%o_ni_dim)   <- flattened padded position of the output pixel
                ##           + (int(kij/ki_dim)*a_pad_ni_dim + kij%ki_dim)           <- flattened offset of kernel position kij
                ## NOTE(review): `stride` does not appear in this index, so the mapping only holds for stride == 1.
                ## 4th index = (int(o_nij/30)*32 + o_nij%30) + (int(kij/3)*32 + kij%3)

1156
1024


In [None]:
# # ### show this cell partially. The following cells should be printed by students ###
# tile_id = 0 
# start_nij = 0
# num_steps = 64 
# X = a_tile[tile_id,:,start_nij:start_nij+num_steps]  # [tile_num, array row num, time_steps]
# print(X.shape)

# bit_precision = 4
# file = open('activation_tile0.txt', 'w') #write to file
# file.write('#time0row7[msb-lsb],time0row6[msb-lst],....,time0row0[msb-lst]#\n')
# file.write('#time1row7[msb-lsb],time1row6[msb-lst],....,time1row0[msb-lst]#\n')
# file.write('#................#\n')

# for t in range(X.size(1)):  # time step
#     for r in range(X.size(0)): # row #
#         val = int(round(X[X.size(0)-1-r,t].item()))
#         if val < 0:
#             val = (1 << bit_precision) + val
#         bits = format(val, f"0{bit_precision}b")
#         file.write(bits)
#         bits = format(val, f"0{bit_precision//4}x")
#     file.write('\n')
# file.close() #close file    


torch.Size([8, 64])


In [19]:
# Collapse the two kernel axes: [OC, IC, K, K] -> [OC, IC, K*K]
w_int_flat = weight_int.reshape(weight_int.size(0), weight_int.size(1), -1)

bit_precision = 4
len_kij = 9

# One text file per kernel position
for kij in range(len_kij):
    # [OC, IC] weight matrix for this kernel position
    W = w_int_flat[:, :, kij]

    filename = f"weight_itile0_otile0_kij{kij}.txt"
    with open(filename, "w") as f:
        f.write('#col0row7[msb-lsb]...col0row0[msb-lsb]#\n')
        f.write('#................#\n')

        # One line per output channel; within a line the input channels are
        # emitted from index 7 down to 0.
        for c in range(8):
            codes = [int(round(W[c, ic].item())) for ic in range(7, -1, -1)]
            # Negative values are stored as bit_precision-wide two's complement.
            line = "".join(
                format((1 << bit_precision) + v if v < 0 else v, f"0{bit_precision}b")
                for v in codes
            )
            f.write(line)
            f.write("\n")

print("Part 3 weight files generated.")

Part 3 weight files generated.


In [20]:
import math

# a_pad is [in_channels, side*side] after the reshape; recover the side length.
a_pad_ni_dim = int(math.sqrt(a_pad.size(1)))  # padded input side
ki_dim = int(math.sqrt(w_int.size(2)))  # kernel side
stride = 1

o_ni_dim = int((a_pad_ni_dim - (ki_dim - 1) - 1)/stride + 1)  # output side
o_nijg = range(o_ni_dim*o_ni_dim)   # every flattened output position

len_onij_tb = 8      # parameter len_onij in core_tb.v
psum_bw = 16

bit_precision = psum_bw

with open("out.txt", "w") as f:
    for o_idx in range(len_onij_tb):
        # Only oc_tile 0 is exported: the first array_size output channels.
        vals = out[0:array_size, o_idx]  # shape [array_size]

        # Build one line from the highest row down to row 0, each value as a
        # bit_precision-wide two's-complement field (masking a Python int
        # yields exactly that encoding for negative numbers).
        fields = []
        for r in reversed(range(array_size)):
            val = int(round(vals[r].item()))
            fields.append(format(val & ((1 << bit_precision) - 1), f"0{bit_precision}b"))
        bits_128 = "".join(fields)

        assert len(bits_128) == 128
        f.write(bits_128 + "\n")

In [21]:
# Write acc.txt: for every output position and kernel position, one line with
# the 11-bit address computed below (kij * len_pmem_per_kij + nij_psum).
len_kij = w_int.size(2)   # number of kernel positions
# NOTE(review): len_onij_tb is assigned but not used anywhere in this cell;
# the loop below runs over `o_nijg`, which is defined by an earlier cell.
len_onij_tb = 16          # matches core_tb.v

a_pad_ni_dim = int(math.sqrt(a_pad.size(1))) # padded input side length
ki_dim = int(math.sqrt(w_int.size(2)))         # kernel side length
stride = 1
o_ni_dim = int((a_pad_ni_dim - (ki_dim - 1) - 1)/stride + 1)
# NOTE(review): 36 equals 6*6, i.e. a 6x6 padded input. `a_pad_ni_dim` is
# derived from `a_pad` and need not be 6, in which case the per-kij regions
# addressed below overlap -- confirm against the psum memory layout.
len_pmem_per_kij = 36

with open("acc.txt", "w") as f:
    for o_idx in o_nijg:
        # (row, col) of the output pixel
        o_row = o_idx // o_ni_dim
        o_col = o_idx % o_ni_dim

        for kij in range(len_kij):
            # (row, col) offset of this kernel position
            k_row = kij // ki_dim
            k_col = kij % ki_dim

            # Flattened padded-input position contributing to this output
            # pixel for kernel position kij (stride is not applied here).
            nij_psum = (o_row * a_pad_ni_dim + o_col) + \
                       (k_row * a_pad_ni_dim + k_col)
            
            addr = kij * len_pmem_per_kij + nij_psum

            # 11-bit address
            # NOTE(review): the mask silently wraps any address >= 2048.
            addr = addr & ((1 << 11) - 1)
            bits_11 = format(addr, "011b")
            f.write(bits_11 + "\n")

In [22]:
# Handle Padding manually: add `padding` zero rows/columns on every side.
# Shapes are taken from a_int instead of being hard-coded, and the slice uses
# explicit end indices so that padding == 0 also works.
padding = 1
num_ch, in_h, in_w = a_int.shape
padded_input = torch.zeros(num_ch, in_h + 2*padding, in_w + 2*padding).to(device)
padded_input[:, padding:padding+in_h, padding:padding+in_w] = a_int

# Flatten spatial dims: [channels, (H+2pad)*(W+2pad)]
flat_input = torch.reshape(padded_input, (num_ch, -1))

# Part 3 spec says: "only 1st eight nij", so only the first 8 time steps are
# exported.
num_steps = 8
X = flat_input[:, 0:num_steps]
bit_precision = 4

with open('activation_tile0.txt', 'w') as file:
    file.write('#time0row7[msb-lsb]...time0row0[msb-lsb]#\n')

    for t in range(X.size(1)):  # time steps
        fields = []
        # Rows (input channels) are emitted from the highest index down to 0.
        for r in reversed(range(X.size(0))):
            val = int(round(X[r, t].item()))
            # Negative values are mapped to bit_precision-wide two's complement.
            # NOTE(review): this is only unambiguous for values in
            # [-2**(bit_precision-1), 2**(bit_precision-1) - 1]; confirm the
            # range of a_int matches what the hardware expects.
            if val < 0:
                val = (1 << bit_precision) + val
            fields.append(format(val, f"0{bit_precision}b"))
        file.write("".join(fields))
        file.write('\n')

# Report the file name that was actually written.
print("Part 3 activation_tile0.txt generated.")

Part 3 activation.txt generated.


In [23]:
# output_int is [1, C, H, W]; drop the batch axis and merge H, W -> [C, H*W]
out_flat = output_int[0].flatten(1)

# Spec: "Map only the first 8 output channel and only 1st eight nij"
out_ref = out_flat[:, 0:8]

# Width of each exported value, in bits
bit_precision = 16

with open("out_part3.txt", "w") as f:
    # One line per output position (time step)
    for t in range(8):
        # Channels are written from index 7 down to 0; masking the Python int
        # gives the bit_precision-wide two's-complement encoding.
        words = []
        for oc in range(7, -1, -1):
            val = int(round(out_ref[oc, t].item())) & ((1 << bit_precision) - 1)
            words.append(format(val, f"0{bit_precision}b"))
        f.write("".join(words) + "\n")

print("Part 3 out_part3.txt generated (First 8 output channels, First 8 nijs).")

Part 3 out_part3.txt generated (First 8 output channels, First 8 nijs).
