# Performance Optimization - Solutions

NumPy vs Python performance, vectorization, and optimization techniques.

## Question 1
Use `%timeit` to compare the performance of summing the elements of a Python list versus a NumPy array.

In [None]:
import numpy as np

# Create test data
size = 100000
python_list = list(range(size))
numpy_array = np.arange(size)

print(f"Data size: {size:,} elements")
print(f"\nPython list sum:")
%timeit sum(python_list)

print(f"\nNumPy array sum:")
%timeit numpy_array.sum()

print(f"\nNumPy np.sum():")
%timeit np.sum(numpy_array)

# Manual calculation for comparison
import time

start = time.time()
result_python = sum(python_list)
python_time = time.time() - start

start = time.time()
result_numpy = numpy_array.sum()
numpy_time = time.time() - start

print(f"\nResults:")
print(f"Python sum: {result_python}")
print(f"NumPy sum: {result_numpy}")
print(f"Speed improvement: {python_time / numpy_time:.1f}x faster with NumPy")

## Question 2
Demonstrate the performance difference between using loops and vectorized operations for element-wise multiplication.

In [None]:
import numpy as np
import time

# Two large random operands shared by every method below
size = 1_000_000
arr1, arr2 = np.random.rand(size), np.random.rand(size)

print(f"Array size: {size:,} elements")

# Method 1: Using explicit Python loop (slow)
def multiply_with_loop(a, b):
    """Multiply two arrays element by element with an explicit Python loop.

    This is the deliberately slow baseline that the vectorized version is
    compared against, so the index loop is kept on purpose.

    Args:
        a: First operand (array-like); its shape defines the output shape.
        b: Second operand (array-like), indexable at every index of ``a``.

    Returns:
        A new array shaped like ``a`` holding ``a[i] * b[i]``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Allocate with the promoted dtype of both operands: np.empty_like(a) alone
    # would silently truncate the products when ``a`` is integer and ``b`` is float.
    result = np.empty_like(a, dtype=np.result_type(a, b))
    for i in range(len(a)):
        result[i] = a[i] * b[i]
    return result

# Method 2: Using list comprehension (still slow)
def multiply_with_list_comp(a, b):
    """Multiply element-wise by collecting per-index products in a Python list.

    Each product is computed one index at a time in Python; the collected
    list is converted to a NumPy array only at the end.
    """
    products = []
    for idx in range(len(a)):
        products.append(a[idx] * b[idx])
    return np.array(products)

# Method 3: Vectorized NumPy operation (fast)
def multiply_vectorized(a, b):
    """Return the element-wise product of ``a`` and ``b`` as one expression."""
    product = a * b
    return product

# Time each method
print("\nTiming different methods:")

# Use smaller arrays for loop methods to avoid long wait times
small_size = 10000
small_arr1 = arr1[:small_size]
small_arr2 = arr2[:small_size]

start = time.time()
result_loop = multiply_with_loop(small_arr1, small_arr2)
loop_time = time.time() - start
print(f"Loop method ({small_size:,} elements): {loop_time:.4f} seconds")

start = time.time()
result_vectorized = multiply_vectorized(arr1, arr2)
vectorized_time = time.time() - start
print(f"Vectorized method ({size:,} elements): {vectorized_time:.4f} seconds")

# Calculate relative performance
extrapolated_loop_time = loop_time * (size / small_size)
print(f"\nExtrapolated loop time for {size:,} elements: {extrapolated_loop_time:.4f} seconds")
print(f"Speed improvement: {extrapolated_loop_time / vectorized_time:.0f}x faster with vectorization")

# Verify results are the same
print(f"\nResults are equivalent: {np.allclose(result_loop, result_vectorized[:small_size])}")

# Using %timeit for more accurate timing
print(f"\nMore precise timing with %timeit:")
print(f"Vectorized operation:")
%timeit arr1 * arr2

## Question 3
Compare memory usage between Python lists and NumPy arrays for the same data.

In [None]:
import numpy as np
import sys

# Create test data
size = 100000
python_list = list(range(size))
numpy_array = np.arange(size, dtype=np.int64)
numpy_array_int32 = np.arange(size, dtype=np.int32)

# Calculate memory usage
# sys.getsizeof() is shallow: add the list object itself (its reference table)
# to the size of every int object it points to.
list_memory = sys.getsizeof(python_list) + sum(sys.getsizeof(item) for item in python_list)
# nbytes covers the element data only, not the array object's own header.
numpy_memory = numpy_array.nbytes
numpy_int32_memory = numpy_array_int32.nbytes

print(f"Data size: {size:,} integers")
print("\nMemory usage:")
print(f"Python list: {list_memory:,} bytes ({list_memory / 1024 / 1024:.2f} MB)")
print(f"NumPy array (int64): {numpy_memory:,} bytes ({numpy_memory / 1024 / 1024:.2f} MB)")
print(f"NumPy array (int32): {numpy_int32_memory:,} bytes ({numpy_int32_memory / 1024 / 1024:.2f} MB)")

print("\nMemory efficiency:")
print(f"NumPy (int64) uses {list_memory / numpy_memory:.1f}x less memory than Python list")
print(f"NumPy (int32) uses {list_memory / numpy_int32_memory:.1f}x less memory than Python list")

# Show per-element memory usage
print("\nPer-element memory usage:")
print(f"Python list: {list_memory / size:.1f} bytes per element")
print(f"NumPy int64: {numpy_array.itemsize} bytes per element")
print(f"NumPy int32: {numpy_array_int32.itemsize} bytes per element")

# Demonstrate with different data types
print(f"\nMemory usage for different NumPy data types ({size:,} elements):")
dtypes = [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64]
for dtype in dtypes:
    # Only the footprint matters here, so allocate zeros: np.arange(size) would
    # generate values outside the range of int8 and int16 at this size.
    arr = np.zeros(size, dtype=dtype)
    print(f"{dtype.__name__:>8}: {arr.nbytes:>8,} bytes ({arr.itemsize} bytes per element)")

print("\nKey takeaway: NumPy arrays are much more memory-efficient than Python lists")
print("Choose appropriate data types to optimize memory usage further")

## Question 4
Show the performance impact of data type choices by comparing operations on int32 vs int64 arrays.

In [None]:
import numpy as np
import time

size = 5_000_000
data = np.random.randint(0, 1000, size)

# The same values stored under four different element types
arr_int32, arr_int64 = data.astype(np.int32), data.astype(np.int64)
arr_float32, arr_float64 = data.astype(np.float32), data.astype(np.float64)

print(f"Array size: {size:,} elements")
print("\nMemory usage:")
print(f"int32:   {arr_int32.nbytes:>10,} bytes")
print(f"int64:   {arr_int64.nbytes:>10,} bytes")
print(f"float32: {arr_float32.nbytes:>10,} bytes")
print(f"float64: {arr_float64.nbytes:>10,} bytes")

def time_operation(arr, operation_name, func):
    """Run ``func(arr)`` once and measure how long the call takes.

    Args:
        arr: Value passed to ``func``.
        operation_name: Label for the operation; accepted for the caller's
            convenience and not used by the measurement itself.
        func: Callable taking ``arr`` as its only argument.

    Returns:
        Tuple ``(elapsed_seconds, result)`` where ``result`` is ``func(arr)``.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can report
    # 0.0 for fast operations on clocks with coarse granularity.
    start = time.perf_counter()
    result = func(arr)
    elapsed = time.perf_counter() - start
    return elapsed, result

# Test different operations
operations = [
    ("Sum", lambda x: x.sum()),
    ("Mean", lambda x: x.mean()),
    ("Square", lambda x: x ** 2),
    ("Sort", lambda x: np.sort(x))
]

print(f"\nPerformance comparison:")
print(f"{'Operation':<10} {'int32':<10} {'int64':<10} {'float32':<10} {'float64':<10} {'Speedup':<10}")
print("-" * 70)

for op_name, op_func in operations:
    time_int32, _ = time_operation(arr_int32, op_name, op_func)
    time_int64, _ = time_operation(arr_int64, op_name, op_func)
    time_float32, _ = time_operation(arr_float32, op_name, op_func)
    time_float64, _ = time_operation(arr_float64, op_name, op_func)
    
    speedup = time_int64 / time_int32
    
    print(f"{op_name:<10} {time_int32:<10.4f} {time_int64:<10.4f} {time_float32:<10.4f} {time_float64:<10.4f} {speedup:<10.2f}x")

# More detailed timing using %timeit
print(f"\nDetailed timing with %timeit:")
print(f"\nSum operation:")
print(f"int32:")
%timeit arr_int32.sum()
print(f"int64:")
%timeit arr_int64.sum()

print(f"\nSquare operation:")
print(f"int32:")
%timeit arr_int32 ** 2
print(f"int64:")
%timeit arr_int64 ** 2

print(f"\nKey insights:")
print(f"- Smaller data types often perform better due to:")
print(f"  * Better cache utilization")
print(f"  * Less memory bandwidth required")
print(f"  * More elements fit in CPU registers")
print(f"- Choose the smallest data type that fits your data range")
print(f"- Trade-off between precision and performance")

## Question 5
Demonstrate the performance benefit of avoiding array copies by using views when possible.

In [None]:
import numpy as np
import time

large_array = np.random.rand(10000, 1000)
print(f"Large array shape: {large_array.shape}")
print(f"Array size: {large_array.nbytes / 1024 / 1024:.1f} MB")

# Operations that create views (fast)
# time.perf_counter() is used throughout: these operations finish so quickly
# that time.time() can report them as 0.000000 seconds.
print("\nOperations that create views (no copying):")

start = time.perf_counter()
view_slice = large_array[1000:2000, :]
time_view_slice = time.perf_counter() - start
print(f"Slicing: {time_view_slice:.6f} seconds")

start = time.perf_counter()
view_transpose = large_array.T
time_view_transpose = time.perf_counter() - start
print(f"Transpose: {time_view_transpose:.6f} seconds")

start = time.perf_counter()
view_reshape = large_array.reshape(-1)
time_view_reshape = time.perf_counter() - start
print(f"Reshape: {time_view_reshape:.6f} seconds")

# Operations that create copies (slow)
print("\nOperations that create copies (memory allocation):")

start = time.perf_counter()
copy_slice = large_array[1000:2000, :].copy()
time_copy_slice = time.perf_counter() - start
print(f"Slicing + copy(): {time_copy_slice:.6f} seconds")

start = time.perf_counter()
copy_explicit = np.copy(large_array)
time_copy_explicit = time.perf_counter() - start
print(f"np.copy(): {time_copy_explicit:.6f} seconds")

start = time.perf_counter()
copy_fancy = large_array[[1, 3, 5, 7], :]
time_copy_fancy = time.perf_counter() - start
print(f"Fancy indexing: {time_copy_fancy:.6f} seconds")

# Verify views vs copies: np.shares_memory reports whether two arrays overlap in memory
print("\nVerifying views vs copies:")
print(f"Slice shares memory: {np.shares_memory(large_array, view_slice)}")
print(f"Transpose shares memory: {np.shares_memory(large_array, view_transpose)}")
print(f"Reshape shares memory: {np.shares_memory(large_array, view_reshape)}")
print(f"Copy shares memory: {np.shares_memory(large_array, copy_explicit)}")
print(f"Fancy indexing shares memory: {np.shares_memory(large_array, copy_fancy)}")

# Writing through a view changes the array it was sliced from
print("\nDemonstrating view modification:")
test_array = np.arange(12).reshape(3, 4)
print(f"Original array:\n{test_array}")

# Slice out the lower rows / middle columns, then write into that slice
view = test_array[1:, 1:3]
print(f"View (rows 1:, cols 1:3):\n{view}")

view[0, 0] = 999
print("After modifying view[0,0] = 999:")
print(f"Original array:\n{test_array}")
print(f"View:\n{view}")

# Summary of what the measurements above imply
print("\nPerformance implications:")
print("- Views are created instantly (no memory allocation)")
print("- Copies require time proportional to data size")
print("- Use views when you don't need independent data")
print("- Use copies when you need to modify data independently")

# Timing with %timeit
print(f"\nTiming with %timeit:")
print(f"View creation (transpose):")
%timeit large_array.T

print(f"Copy creation:")
%timeit large_array.copy()

## Question 6
Compare the performance of using np.sum() vs Python's built-in sum() function on NumPy arrays.

In [None]:
import numpy as np
import time

# Create test arrays of different sizes
sizes = [1000, 10000, 100000, 1000000]
repeats = 10  # every measurement is averaged over this many runs

print("Performance comparison: np.sum() vs Python sum()")
print(f"{'Size':<10} {'np.sum()':<12} {'Python sum()':<15} {'Speedup':<10}")
print("-" * 55)

for size in sizes:
    arr = np.random.rand(size)

    # Time np.sum(); perf_counter() resolves the microsecond-scale runs that
    # time.time() can report as 0.0
    start = time.perf_counter()
    for _ in range(repeats):
        result_numpy = np.sum(arr)
    numpy_time = (time.perf_counter() - start) / repeats

    # Time Python sum() - but only for smaller arrays as it's very slow
    if size <= 100000:
        start = time.perf_counter()
        for _ in range(repeats):
            result_python = sum(arr)
        python_time = (time.perf_counter() - start) / repeats
        # Guard the ratio so a 0.0 NumPy timing cannot raise ZeroDivisionError
        speedup = python_time / numpy_time if numpy_time > 0 else float("nan")

        # The "x" suffix is attached directly to the number instead of after the padding
        print(f"{size:<10,} {numpy_time:<12.6f} {python_time:<15.6f} {speedup:.1f}x")
    else:
        # Python sum() was not run at this size, so no speedup figure is reported
        print(f"{size:<10,} {numpy_time:<12.6f} {'too slow':<15} n/a")

# Detailed analysis with %timeit for a specific size
test_size = 50000
test_array = np.random.rand(test_size)

print(f"\nDetailed timing for {test_size:,} elements:")
print(f"\nnp.sum(array):")
%timeit np.sum(test_array)

print(f"\narray.sum():")
%timeit test_array.sum()

print(f"\nsum(array) [Python built-in]:")
%timeit sum(test_array)

# Test with different data types
print("\nPerformance with different data types:")
size = 100000
dtypes = [np.int32, np.int64, np.float32, np.float64]

for dtype in dtypes:
    arr = np.random.randint(0, 100, size).astype(dtype)

    # perf_counter() instead of time.time(): ten sums of this size can finish
    # faster than a coarse wall clock is able to resolve.
    start = time.perf_counter()
    for _ in range(10):
        result = np.sum(arr)
    avg_time = (time.perf_counter() - start) / 10

    print(f"{dtype.__name__:<8}: {avg_time:.6f} seconds")

# Both summation routes must agree up to floating-point rounding
arr_small = np.random.rand(1000)
numpy_result, python_result = np.sum(arr_small), sum(arr_small)

print("\nResult verification:")
print(f"np.sum() result: {numpy_result}")
print(f"sum() result: {python_result}")
print(f"Results are close: {np.isclose(numpy_result, python_result)}")
print(f"Difference: {abs(numpy_result - python_result)}")

print("\nKey takeaways:")
print("- np.sum() is orders of magnitude faster than Python's sum()")
print("- The speed difference increases with array size")
print("- Both array.sum() and np.sum() are equivalent and fast")
print("- Always use NumPy functions for NumPy arrays")

## Question 7
Show the performance difference between using np.dot() and manual matrix multiplication with loops.

In [None]:
import numpy as np
import time

def manual_matrix_multiply(A, B):
    """Multiply two 2-D matrices with a plain triple loop.

    Raises:
        ValueError: If the column count of ``A`` differs from the row count of ``B``.

    Returns:
        A float array of shape (rows of A, columns of B).
    """
    n_rows, inner = A.shape
    inner_b, n_cols = B.shape

    # Reject shapes whose inner dimensions do not line up
    if inner != inner_b:
        raise ValueError("Matrix dimensions incompatible")

    product = np.zeros((n_rows, n_cols))
    inner_range = range(inner)

    for r in range(n_rows):
        for c in range(n_cols):
            for t in inner_range:
                product[r, c] += A[r, t] * B[t, c]

    return product

# Test with different matrix sizes
sizes = [50, 100, 200]
max_manual_size = 100  # the pure-Python triple loop is skipped above this size

print("Matrix multiplication performance comparison")
print(f"{'Size':<10} {'Manual (s)':<12} {'np.dot (s)':<12} {'@ operator (s)':<15} {'Speedup':<10}")
print("-" * 70)

for size in sizes:
    # Create random matrices
    A = np.random.rand(size, size)
    B = np.random.rand(size, size)
    # Build the whole "NxN" label first so the column keeps a fixed width
    label = f"{size}x{size}"

    # Time manual multiplication (only for smaller sizes)
    if size <= max_manual_size:
        start = time.perf_counter()
        result_manual = manual_matrix_multiply(A, B)
        manual_time = time.perf_counter() - start
    else:
        manual_time = float('inf')
        result_manual = None

    # Time np.dot(); perf_counter() resolves runs that time.time() may report as 0.0
    start = time.perf_counter()
    result_dot = np.dot(A, B)
    dot_time = time.perf_counter() - start

    # Time @ operator
    start = time.perf_counter()
    result_at = A @ B
    at_time = time.perf_counter() - start

    if result_manual is not None:
        # Guard the ratio so a 0.0 np.dot timing cannot raise ZeroDivisionError
        speedup = manual_time / dot_time if dot_time > 0 else float("nan")
        print(f"{label:<10} {manual_time:<12.4f} {dot_time:<12.6f} {at_time:<15.6f} {speedup:.0f}x")

        # Verify results are the same
        assert np.allclose(result_manual, result_dot), "Results don't match!"
        assert np.allclose(result_dot, result_at), "np.dot and @ don't match!"
    else:
        # The manual version was not run, so no speedup figure is reported
        print(f"{label:<10} {'too slow':<12} {dot_time:<12.6f} {at_time:<15.6f} n/a")

# Detailed timing for medium-sized matrices
print(f"\nDetailed timing for 100x100 matrices:")
A = np.random.rand(100, 100)
B = np.random.rand(100, 100)

print(f"\nnp.dot(A, B):")
%timeit np.dot(A, B)

print(f"\nA @ B:")
%timeit A @ B

print(f"\nnp.matmul(A, B):")
%timeit np.matmul(A, B)

# Test with different matrix shapes
print("\nPerformance with different matrix shapes:")
# Each entry is (m, k, n): an (m x k) matrix times a (k x n) matrix
shapes = [(100, 200, 150), (500, 100, 300), (200, 500, 100)]

for m, k, n in shapes:
    A = np.random.rand(m, k)
    B = np.random.rand(k, n)

    # perf_counter() instead of time.time(): these products are fast enough
    # that a coarse wall clock can report them as 0.0000 seconds.
    start = time.perf_counter()
    result = A @ B
    elapsed = time.perf_counter() - start

    print(f"({m}x{k}) @ ({k}x{n}) = ({m}x{n}): {elapsed:.4f} seconds")

# Show why NumPy is faster
print("\nWhy NumPy is faster:")
print("- Uses optimized BLAS (Basic Linear Algebra Subprograms) libraries")
print("- Written in C/Fortran, not Python")
print("- Optimized for cache efficiency and vectorization")
print("- Can use multiple CPU cores (depending on BLAS implementation)")
print("- Avoids Python's interpreter overhead for inner loops")

# Check which BLAS is being used
print("\nBLAS information:")
try:
    print("NumPy build configuration:")
    np.show_config()
except Exception as exc:
    # Best-effort diagnostic only. Catch Exception rather than using a bare
    # ``except`` so KeyboardInterrupt and SystemExit are not swallowed, and
    # report the reason instead of hiding it.
    print(f"Unable to show BLAS configuration ({exc})")

## Question 8
Demonstrate the impact of memory layout (C-order vs Fortran-order) on performance for different operations.

In [None]:
import numpy as np
import time

size = 2000
# Same values held in two memory layouts
arr_c = np.random.rand(size, size)  # row-major (C-order)
arr_f = np.asfortranarray(arr_c)    # column-major (Fortran-order)

print(f"Array shape: {arr_c.shape}")
print(f"C-order array flags: C_CONTIGUOUS={arr_c.flags.c_contiguous}, F_CONTIGUOUS={arr_c.flags.f_contiguous}")
print(f"F-order array flags: C_CONTIGUOUS={arr_f.flags.c_contiguous}, F_CONTIGUOUS={arr_f.flags.f_contiguous}")

def time_operation(arr, name, operation):
    """Return the seconds taken by a single ``operation(arr)`` call.

    Args:
        arr: Value passed to ``operation``.
        name: Label for the measurement; accepted for the caller's
            convenience and not used by the measurement itself.
        operation: Callable taking ``arr`` as its only argument.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can report
    # 0.0 for fast operations on clocks with coarse granularity.
    start = time.perf_counter()
    operation(arr)  # the result is discarded; only the duration matters
    return time.perf_counter() - start

# Test row-wise operations (should be faster for C-order)
print("\nRow-wise operations (should favor C-order):")
print(f"{'Operation':<20} {'C-order':<10} {'F-order':<10} {'Ratio':<10}")
print("-" * 50)

# Row sum
time_c = time_operation(arr_c, "Row sum C", lambda x: x.sum(axis=1))
time_f = time_operation(arr_f, "Row sum F", lambda x: x.sum(axis=1))
# Ratio is F-order time / C-order time. Guard the division: a very fast run
# can be reported as 0.0 seconds, in which case "nan" is shown.
ratio = time_f / time_c if time_c > 0 else float("nan")
print(f"{'Row sum':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Row iteration (accessing consecutive elements)
def row_iteration(arr):
    """Add up the sums of the leading rows of ``arr`` (at most 100 of them)."""
    row_count = min(100, arr.shape[0])  # capped to keep the timing short
    return sum(arr[r, :].sum() for r in range(row_count))

time_c = time_operation(arr_c, "Row iter C", row_iteration)
time_f = time_operation(arr_f, "Row iter F", row_iteration)
# Ratio is F-order time / C-order time. Guard the division: a very fast run
# can be reported as 0.0 seconds, in which case "nan" is shown.
ratio = time_f / time_c if time_c > 0 else float("nan")
print(f"{'Row iteration':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Test column-wise operations (should be faster for F-order)
print("\nColumn-wise operations (should favor F-order):")
print(f"{'Operation':<20} {'C-order':<10} {'F-order':<10} {'Ratio':<10}")
print("-" * 50)

# Column sum
time_c = time_operation(arr_c, "Col sum C", lambda x: x.sum(axis=0))
time_f = time_operation(arr_f, "Col sum F", lambda x: x.sum(axis=0))
# For column-wise work the ratio flips to C-order time / F-order time
ratio = time_c / time_f if time_f > 0 else float("nan")
print(f"{'Column sum':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Column iteration
def col_iteration(arr):
    """Add up the sums of the leading columns of ``arr`` (at most 100 of them)."""
    col_count = min(100, arr.shape[1])  # capped to keep the timing short
    return sum(arr[:, c].sum() for c in range(col_count))

time_c = time_operation(arr_c, "Col iter C", col_iteration)
time_f = time_operation(arr_f, "Col iter F", col_iteration)
# Ratio is C-order time / F-order time. Guard the division: a very fast run
# can be reported as 0.0 seconds, in which case "nan" is shown.
ratio = time_c / time_f if time_f > 0 else float("nan")
print(f"{'Column iteration':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Test operations that don't care about layout
print("\nLayout-agnostic operations:")
print(f"{'Operation':<20} {'C-order':<10} {'F-order':<10} {'Ratio':<10}")
print("-" * 50)

# Element-wise operations
time_c = time_operation(arr_c, "Square C", lambda x: x ** 2)
time_f = time_operation(arr_f, "Square F", lambda x: x ** 2)
# Here the last column is the relative gap between the two timings,
# |F - C| / min(C, F), so values near 0 mean the layouts performed alike.
fastest = min(time_c, time_f)
ratio = abs(time_f - time_c) / fastest if fastest > 0 else float("nan")
print(f"{'Element-wise square':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Matrix multiplication (on a 500x500 corner to keep the run short)
arr_c_small = arr_c[:500, :500]
arr_f_small = arr_f[:500, :500]

time_c = time_operation(arr_c_small, "MatMul C", lambda x: x @ x.T)
time_f = time_operation(arr_f_small, "MatMul F", lambda x: x @ x.T)
fastest = min(time_c, time_f)
ratio = abs(time_f - time_c) / fastest if fastest > 0 else float("nan")
print(f"{'Matrix multiply':<20} {time_c:<10.4f} {time_f:<10.4f} {ratio:<10.2f}")

# Demonstrate cache effects with detailed timing
print(f"\nDetailed cache effects demonstration:")
small_size = 1000
arr_c_small = np.random.rand(small_size, small_size)
arr_f_small = np.asfortranarray(arr_c_small)

print(f"\nRow-wise access (C-order should be faster):")
print(f"C-order array, row access:")
%timeit arr_c_small.sum(axis=1)

print(f"F-order array, row access:")
%timeit arr_f_small.sum(axis=1)

print(f"\nColumn-wise access (F-order should be faster):")
print(f"C-order array, column access:")
%timeit arr_c_small.sum(axis=0)

print(f"F-order array, column access:")
%timeit arr_f_small.sum(axis=0)

print(f"\nKey insights:")
print(f"- Memory layout affects cache performance")
print(f"- C-order (row-major): consecutive elements in rows are adjacent in memory")
print(f"- F-order (column-major): consecutive elements in columns are adjacent in memory")
print(f"- Accessing data in memory order is faster due to cache locality")
print(f"- Most NumPy operations are optimized for both layouts")
print(f"- Choose layout based on your primary access pattern")

## Question 9
Compare the performance of using boolean indexing vs np.where() for conditional operations.

In [None]:
import numpy as np
import time

# Input shared by every method below
size = 1_000_000
arr = np.random.randn(size)  # samples drawn from a standard normal distribution

print(f"Array size: {size:,} elements")
print("Task: Replace negative values with 0, keep positive values")

def method_boolean_indexing(arr):
    """Return a copy of ``arr`` with its negative entries set to 0 via a mask."""
    clipped = arr.copy()
    negative = clipped < 0
    clipped[negative] = 0
    return clipped

def method_where(arr):
    """Return ``arr`` with negatives replaced by 0, selected through np.where."""
    is_negative = arr < 0
    return np.where(is_negative, 0, arr)

def method_maximum(arr):
    """Return the element-wise maximum of ``arr`` and 0."""
    floor = 0
    return np.maximum(arr, floor)

def method_clip(arr):
    """Return ``arr`` clipped to a lower bound of 0 with no upper bound."""
    lower, upper = 0, None
    return np.clip(arr, lower, upper)

# Time each method
methods = [
    ("Boolean indexing", method_boolean_indexing),
    ("np.where", method_where),
    ("np.maximum", method_maximum),
    ("np.clip", method_clip)
]

print("\nPerformance comparison:")
print(f"{'Method':<20} {'Time (s)':<10} {'Relative':<10}")
print("-" * 40)

times = []
results = []

for name, method in methods:
    # perf_counter() is the high-resolution clock intended for short durations
    start = time.perf_counter()
    for _ in range(10):  # Average over multiple runs
        result = method(arr)
    elapsed = (time.perf_counter() - start) / 10

    times.append(elapsed)
    results.append(result)
    # Relative to the first method (boolean indexing), which therefore shows 1.00x
    relative = elapsed / times[0] if times[0] > 0 else 1

    # The "x" suffix is attached directly to the number instead of after the padding
    print(f"{name:<20} {elapsed:<10.6f} {relative:.2f}x")

# Verify all methods give the same result as the first one
print("\nResult verification:")
for (other_name, _), other_result in zip(methods[1:], results[1:]):
    match = np.allclose(results[0], other_result)
    print(f"{other_name} matches boolean indexing: {match}")

# More detailed timing with %timeit
print(f"\nDetailed timing with %timeit:")

# Test with smaller array for %timeit (faster execution)
test_arr = np.random.randn(100000)

print(f"\nBoolean indexing:")
%timeit result = test_arr.copy(); result[result < 0] = 0

print(f"\nnp.where:")
%timeit np.where(test_arr < 0, 0, test_arr)

print(f"\nnp.maximum:")
%timeit np.maximum(test_arr, 0)

print(f"\nnp.clip:")
%timeit np.clip(test_arr, 0, None)

# Test with more complex conditions
print(f"\nComplex condition: replace negative with 0, values > 2 with 2, keep others")

def complex_boolean(arr):
    """Return a copy of ``arr`` limited to [0, 2] using two boolean masks."""
    bounded = arr.copy()
    below = bounded < 0
    bounded[below] = 0
    above = bounded > 2
    bounded[above] = 2
    return bounded

def complex_where(arr):
    """Return ``arr`` limited to [0, 2] using two nested np.where selections."""
    capped = np.where(arr > 2, 2, arr)
    return np.where(arr < 0, 0, capped)

def complex_clip(arr):
    """Return ``arr`` clipped to the closed range [0, 2]."""
    lower, upper = 0, 2
    return np.clip(arr, lower, upper)

# Time complex operations
test_arr = np.random.randn(500000)

# perf_counter() instead of time.time(): each call below runs only once, so the
# measurement needs a clock that can resolve short durations.
start = time.perf_counter()
result_bool = complex_boolean(test_arr)
time_bool = time.perf_counter() - start

start = time.perf_counter()
result_where = complex_where(test_arr)
time_where = time.perf_counter() - start

start = time.perf_counter()
result_clip = complex_clip(test_arr)
time_clip = time.perf_counter() - start

print("\nComplex condition timing:")
print(f"Boolean indexing: {time_bool:.6f} seconds")
print(f"np.where: {time_where:.6f} seconds")
print(f"np.clip: {time_clip:.6f} seconds")

print(f"\nResults match: {np.allclose(result_bool, result_where) and np.allclose(result_where, result_clip)}")

print("\nKey insights:")
print("- np.maximum and np.clip are often fastest for simple range operations")
print("- Boolean indexing is intuitive but may require copying")
print("- np.where is flexible but can be slower for simple cases")
print("- Choose based on readability and specific use case")
print("- Specialized functions (clip, maximum) are often optimized")

## Question 10
Demonstrate performance optimization by pre-allocating arrays vs growing them dynamically.

In [None]:
import numpy as np
import time

def grow_array_append(n):
    """Build the values 0..n-1 by calling np.append once per element (very slow)."""
    grown = np.array([])
    for value in range(n):
        grown = np.append(grown, value)
    return grown

def grow_array_concatenate(n):
    """Build the values 0..n-1 by concatenating one element at a time (slow).

    Args:
        n: Number of elements to produce.

    Returns:
        Integer array ``[0, 1, ..., n-1]``; empty when ``n`` is 0.
    """
    # Start from an empty integer array rather than [0], so that n == 0
    # yields an empty result instead of a one-element array.
    arr = np.array([], dtype=int)
    for i in range(n):
        arr = np.concatenate([arr, [i]])
    return arr

def grow_list_then_convert(n):
    """Collect 0..n-1 in a Python list, then convert once to an array (better)."""
    collected = []
    for value in range(n):
        collected.append(value)
    as_array = np.array(collected)
    return as_array

def preallocate_array(n):
    """Allocate an n-element zero array up front, then fill slot i with i (fastest)."""
    buffer = np.zeros(n)
    for position in range(n):
        buffer[position] = position
    return buffer

def use_arange(n):
    """Return 0..n-1 straight from np.arange (optimal)."""
    sequence = np.arange(n)
    return sequence

# Test with different sizes
sizes = [1000, 5000, 10000]

print("Performance comparison: Array creation strategies")
print(f"{'Size':<8} {'Append':<10} {'List+Convert':<12} {'Pre-allocate':<12} {'np.arange':<10}")
print("-" * 60)

for size in sizes:
    # Test np.append (only for small sizes - it's very slow)
    if size <= 1000:
        # perf_counter() is the high-resolution clock intended for short durations
        start = time.perf_counter()
        result_append = grow_array_append(size)
        time_append = time.perf_counter() - start
    else:
        time_append = None  # not measured at this size

    # Test list then convert
    start = time.perf_counter()
    result_list = grow_list_then_convert(size)
    time_list = time.perf_counter() - start

    # Test pre-allocation
    start = time.perf_counter()
    result_prealloc = preallocate_array(size)
    time_prealloc = time.perf_counter() - start

    # Test np.arange
    start = time.perf_counter()
    result_arange = use_arange(size)
    time_arange = time.perf_counter() - start

    append_str = "too slow" if time_append is None else f"{time_append:.4f}"
    print(f"{size:<8} {append_str:<10} {time_list:<12.6f} {time_prealloc:<12.6f} {time_arange:<10.6f}")

# More realistic example: building array with computed values
def compute_slow_append(n):
    """Compute sin(i)*cos(i) for i in 0..n-1, growing the array with np.append (slow)."""
    grown = np.array([])
    for i in range(n):
        grown = np.append(grown, np.sin(i) * np.cos(i))
    return grown

def compute_list_convert(n):
    """Compute sin(i)*cos(i) for i in 0..n-1 into a list, then convert once (better)."""
    collected = []
    for i in range(n):
        collected.append(np.sin(i) * np.cos(i))
    return np.array(collected)

def compute_preallocate(n):
    """Compute sin(i)*cos(i) for i in 0..n-1 into a pre-sized array (fast)."""
    buffer = np.zeros(n)
    for position in range(n):
        sine, cosine = np.sin(position), np.cos(position)
        buffer[position] = sine * cosine
    return buffer

def compute_vectorized(n):
    """Compute sin(i)*cos(i) for i in 0..n-1 with whole-array operations (fastest)."""
    x = np.arange(n)
    sines = np.sin(x)
    cosines = np.cos(x)
    return sines * cosines

print("\nRealistic example: Computing sin(x)*cos(x) for x in range(n)")

n = 10000
# compute_slow_append is left out of this list; only these three are timed
methods = [
    ("List + Convert", compute_list_convert),
    ("Pre-allocate", compute_preallocate),
    ("Vectorized", compute_vectorized)
]

print(f"\nTiming for n={n:,}:")
times = []

for name, method in methods:
    # perf_counter() instead of time.time(): the vectorized run in particular
    # is short enough that a coarse wall clock can report it as 0.0.
    start = time.perf_counter()
    result = method(n)
    elapsed = time.perf_counter() - start
    times.append(elapsed)

    # Speedup is relative to the first method, which therefore shows 1.0x
    speedup = times[0] / elapsed if elapsed > 0 else 1
    print(f"{name:<15}: {elapsed:.6f} seconds ({speedup:.1f}x speedup)")

# Demonstrate memory allocation overhead
print("\nMemory allocation overhead demonstration:")

def show_growth_cost(max_size):
    """Print how long it takes to build arrays of increasing size with np.append.

    One table row is printed per measured size, showing the total time and
    the time per appended element.

    Args:
        max_size: Largest size to measure; candidate sizes above it are skipped.
    """
    sizes = [100, 500, 1000, 2000]

    print(f"{'Size':<8} {'Append Time':<12} {'Time/Element':<12}")
    print("-" * 35)

    for size in sizes:
        if size > max_size:
            continue

        # perf_counter() is the high-resolution clock intended for short durations
        start = time.perf_counter()
        arr = np.array([])
        for i in range(size):
            arr = np.append(arr, i)
        elapsed = time.perf_counter() - start
        per_element = elapsed / size

        print(f"{size:<8} {elapsed:<12.6f} {per_element:<12.8f}")

show_growth_cost(2000)

print(f"\nKey takeaways:")
print(f"- NEVER use np.append in loops - it's extremely slow")
print(f"- Pre-allocating arrays is much faster than growing them")
print(f"- Python lists + conversion is better than np.append")
print(f"- Vectorized operations are fastest when possible")
print(f"- Each np.append creates a new array and copies all data")
print(f"- Pre-allocation avoids repeated memory allocation/copying")

# Final timing comparison with %timeit
print(f"\nFinal comparison with %timeit (n=5000):")
n = 5000

print(f"\nList + convert:")
%timeit compute_list_convert(n)

print(f"\nPre-allocate:")
%timeit compute_preallocate(n)

print(f"\nVectorized:")
%timeit compute_vectorized(n)