In [1]:
import numpy as np

In [2]:
# Original numbers in full precision
numbers = [2.3888888, 0, 34.444, 12.3486e4, -1223.4566]
print("Original numbers:", numbers)
print("=" * 60)

Original numbers: [2.3888888, 0, 34.444, 123486.0, -1223.4566]


In [3]:
def quantize_and_compute(numbers, dtype_name):
    print(f"\n{dtype_name.upper()} Results:")
    print("-" * 30)
    
    if dtype_name == "float32":
        quantized = [np.float32(x) for x in numbers]
    elif dtype_name == "float16":
        quantized = [np.float16(x) for x in numbers]
    elif dtype_name == "bfloat16":
        # Simulate bfloat16 by truncating mantissa (keep sign + 8 exp + 7 mantissa bits)
        quantized = []
        for x in numbers:
            if x == 0:
                quantized.append(0.0)
            else:
                # Convert to float32 first, then simulate bfloat16 precision loss
                f32 = np.float32(x)
                # Simple bfloat16 simulation - round to fewer significant digits
                if abs(f32) >= 1:
                    # Keep ~3-4 significant digits for bfloat16
                    rounded = round(f32, max(0, 3 - len(str(int(abs(f32))))))
                else:
                    rounded = round(f32, 4)
                quantized.append(rounded)
    elif dtype_name == "int8":
        # Scale and clip to int8 range [-128, 127]
        # Using simple scaling - find max absolute value for scaling factor
        max_val = max(abs(x) for x in numbers if x != 0)
        scale = 127 / max_val
        quantized = [np.clip(round(x * scale), -128, 127) / scale for x in numbers]
    
    print("Quantized values:", quantized)
    
    # Perform arithmetic operations
    if len(quantized) >= 5:
        a, b, c, d, e = quantized[:5]
        
        # Basic operations
        add_result = a + c
        mult_result = a * d
        div_result = d / c if c != 0 else float('inf')
        complex_result = (a + b) * c - e
        
        print(f"a + c = {a} + {c} = {add_result}")
        print(f"a * d = {a} * {d} = {mult_result}")
        print(f"d / c = {d} / {c} = {div_result}")
        print(f"(a + b) * c - e = {complex_result}")
        
        return {
            'quantized': quantized,
            'add': add_result,
            'mult': mult_result,
            'div': div_result,
            'complex': complex_result
        }

In [4]:
# Test all quantization levels
results = {}
for dtype in ["float32", "float16", "bfloat16", "int8"]:
    results[dtype] = quantize_and_compute(numbers, dtype)

# Compare precision loss
print("\n" + "=" * 60)
print("PRECISION LOSS COMPARISON")
print("=" * 60)

original_float32 = results["float32"]

for dtype in ["float16", "bfloat16", "int8"]:
    print(f"\n{dtype.upper()} vs FLOAT32:")
    print("-" * 25)
    
    # Compare quantized values
    for i, (orig, quant) in enumerate(zip(numbers, results[dtype]['quantized'])):
        orig_f32 = np.float32(orig)
        error = abs(orig_f32 - quant)
        rel_error = (error / abs(orig_f32) * 100) if orig_f32 != 0 else 0
        print(f"Value {i+1}: {orig_f32:.6f} → {quant:.6f} (error: {error:.6f}, {rel_error:.2f}%)")
    
    # Compare operation results
    print("\nOperation errors:")
    for op in ['add', 'mult', 'div', 'complex']:
        if op in original_float32 and op in results[dtype]:
            orig_val = original_float32[op]
            quant_val = results[dtype][op]
            error = abs(orig_val - quant_val)
            rel_error = (error / abs(orig_val) * 100) if orig_val != 0 and abs(orig_val) != float('inf') else 0
            print(f"{op}: {orig_val:.6f} → {quant_val:.6f} (error: {error:.6f}, {rel_error:.2f}%)")

print("\n" + "=" * 60)
print("SUMMARY:")
print("• Float32: Full precision baseline")
print("• Float16: ~3-4 decimal digits, can handle large ranges")
print("• BFloat16: ~3-4 decimal digits, better for large numbers")
print("• Int8: Severe quantization, only 256 possible values")
print("=" * 60)


FLOAT32 Results:
------------------------------
Quantized values: [np.float32(2.3888888), np.float32(0.0), np.float32(34.444), np.float32(123486.0), np.float32(-1223.4565)]
a + c = 2.3888888359069824 + 34.444000244140625 = 36.832889556884766
a * d = 2.3888888359069824 * 123486.0 = 294994.3125
d / c = 123486.0 / 34.444000244140625 = 3585.12353515625
(a + b) * c - e = 1305.7393798828125

FLOAT16 Results:
------------------------------
Quantized values: [np.float16(2.389), np.float16(0.0), np.float16(34.44), np.float16(inf), np.float16(-1223.0)]
a + c = 2.388671875 + 34.4375 = 36.8125
a * d = 2.388671875 * inf = inf
d / c = inf / 34.4375 = inf
(a + b) * c - e = 1305.0

BFLOAT16 Results:
------------------------------
Quantized values: [np.float32(2.39), 0.0, np.float32(34.4), np.float32(123486.0), np.float32(-1223.0)]
a + c = 2.390000104904175 + 34.400001525878906 = 36.790000915527344
a * d = 2.390000104904175 * 123486.0 = 295131.5625
d / c = 123486.0 / 34.400001525878906 = 3589.70922851

  quantized = [np.float16(x) for x in numbers]
