In [None]:
import os
from google.colab import drive
import matplotlib.pyplot as plt
import pandas as pd
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Work from the notebook's Drive folder so that the relative file names used
# in later cells (e.g. "mandelbrot.png") resolve against it.
os.chdir('/content/drive/My Drive/Colab Notebooks/lstsq/Pycuda')

Mounted at /content/drive


# SGD

[Parallelized Stochastic Gradient Descent](https://proceedings.neurips.cc/paper/2010/hash/abea47ba24142ed16b7d8fbf2c740e0d-Abstract.html) 을 참고...

## Definition of SGD
___
Algorithm 1: 
$SGD(
\begin{Bmatrix}
c^1,...,c^m
\end{Bmatrix}
, T, \eta, w_0)$
___
for t=1 to T, do<br>
$\qquad$Draw $j \in \left\{1...m\right\}$ uniformly at random.<br>
$\qquad$$w_t \leftarrow w_{t-1} - \eta\partial_wc^j(w_{t-1}).$<br>
end for<br>
return $w_T$
___
<br>

___
Algorithm 2:
$ParallelSGD(
\begin{Bmatrix}
c^1,...,c^m
\end{Bmatrix}
, T, \eta, w_0, k)$
___
for all $i \in \left\{1,...,k\right\}$ parallel do<br>
$\qquad$$v_i = SGD(
\begin{Bmatrix}
c^1,...,c^m
\end{Bmatrix}
, T, \eta, w_0)$<br>
end for<br>
Aggregate from all computers $v = \frac{1}{k}\sum_{i=1}^kv_i$ and return $v$
___
<br>

___
Algorithm 3: $SimuParallelSGD(Examples
\begin{Bmatrix}
c^1,...,c^m
\end{Bmatrix}
,Learning\ Rate\ \eta, Machines\ k)$
___
Define $T = \lfloor \frac{m}{k} \rfloor$<br>
Randomly partition the examples, giving $T$ examples to each machine.<br>
for all $i \in \left\{1,...,k\right\}$ parallel do<br>
$\qquad$Randomly shuffle the data on machine $i$.<br>
$\qquad$Initialize $w_{i,0} = 0$.<br>
$\qquad$for all $t \in \left\{1,...,T\right\}$: do<br>
$\qquad\qquad$Get the $t$th example on the $i$th machine (this machine), $c^{i,t}$<br>
$\qquad\qquad$$w_{i,t} \leftarrow w_{i,t-1} - \eta\partial_wc^{i,t}(w_{i,t-1})$<br>
$\qquad$end for<br>
end for<br>
Aggregate from all computers $v = \frac{1}{k}\sum_{i=1}^k w_{i,T}$ and return $v$.
___

## Brief strategy to prove Algorithm 2

* When performing stochastic gradient descent with a fixed (and sufficiently small) learning rate $\eta$, the distribution of the parameter vector is asymptotically normal. Since all computers draw from the same data distribution, they all converge to the same limit.

이 밑의 내용은 PYCUDA 책을 읽으면서 정리한 내용입니다.

# Mandelbrot set

For a given complex number, c, we define a recursive sequence for $n \ge 0$, with $z_0 = 0$ and $z_n = z_{n-1}^2 + c$ for $n \ge 1$.<br>
If $\left\vert z_n \right\vert$ remains bounded by 2 as $n$ increases to infinity, then we will say that $c$ is a member of the __Mandelbrot set__.

```python
import numpy as np
from time import time
import matplotlib
import matplotlib.pyplot as plt

def simple_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters):
    """Rasterise Mandelbrot-set membership on a rectangular grid.

    The real axis is sampled at ``width`` evenly spaced points in
    ``[real_low, real_high]`` and the imaginary axis at ``height`` points in
    ``[imag_low, imag_high]``.  For each grid point ``c`` the sequence
    ``z <- z**2 + c`` (starting from 0) is iterated at most ``max_iters``
    times in single-precision complex arithmetic.

    Returns:
        A ``(height, width)`` float32 array indexed ``[row, column]`` =
        ``[imaginary index, real index]``; 1 where ``|z|`` never exceeded 2
        within ``max_iters`` steps, 0 otherwise.
    """

    def _stays_bounded(c):
        # Iterate the recurrence and report whether |z| stayed within 2.
        z = np.complex64(0)
        for _ in range(max_iters):
            z = z**2 + c
            if np.abs(z) > 2:
                return False
        return True

    re_axis = np.linspace(real_low, real_high, width)
    im_axis = np.linspace(imag_low, imag_high, height)

    # Every point starts out as a member (1); escaped points are cleared to 0.
    mandelbrot_graph = np.ones((height, width), dtype=np.float32)

    for col, re_part in enumerate(re_axis):
        for row, im_part in enumerate(im_axis):
            point = np.complex64(re_part + im_part * 1j)
            if not _stays_bounded(point):
                mandelbrot_graph[row, col] = 0

    return mandelbrot_graph

# Select the Agg backend so that no figure window pops up; the plot is
# written to a file instead.
matplotlib.use("Agg")

if __name__ == "__main__":
    # Time the Mandelbrot computation on its own.
    t1 = time()
    mandel = simple_mandelbrot(512,512,-2,2,-2,2,256)
    t2 = time()
    mandel_time = t2 - t1

    # Time rendering the array and dumping it to mandelbrot.png.
    t1 = time()
    fig = plt.figure(1)
    plt.imshow(mandel, extent=(-2,2,-2,2))
    # The image must actually be written: otherwise "dump_time" does not
    # include the dump and there is no mandelbrot.png to display afterwards.
    plt.savefig("mandelbrot.png", dpi=fig.dpi)
    t2 = time()
    dump_time = t2 - t1
    print(f"It took {mandel_time} seconds to calculate the Mandelbrot graph.")
    print(f"It took {dump_time} seconds to dump the image.")

```

In [None]:
# Load the rendered Mandelbrot image and display it inline.
# "png" (without a leading dot) is the format name matplotlib recognises.
img = plt.imread("mandelbrot.png", format="png")
plt.imshow(img)
plt.show()

In [None]:
# Print the first 15 lines of the saved profile.  The context manager closes
# the file even if reading or decoding fails part-way through.
with open("mandelbrot_profile.txt", "r", encoding="utf-16") as time_result:
    for _ in range(15):
        line = time_result.readline()
        print(line)

It took 18.088903188705444 seconds to calculate the Mandelbrot graph.

It took 0.060090065002441406 seconds to dump the image.

         570352 function calls (562114 primitive calls) in 18.673 seconds



   Ordered by: cumulative time



   ncalls  tottime  percall  cumtime  percall filename:lineno(function)

    604/1    0.001    0.000   18.673   18.673 {built-in method builtins.exec}

        1    0.000    0.000   18.673   18.673 mandelbrot0.py:1(<module>)

        1   18.089   18.089   18.089   18.089 mandelbrot0.py:6(simple_mandelbrot)

       30    0.001    0.000    0.650    0.022 __init__.py:1(<module>)

    361/4    0.002    0.000    0.524    0.131 <frozen importlib._bootstrap>:1002(_find_and_load)

    355/3    0.001    0.000    0.523    0.174 <frozen importlib._bootstrap>:967(_find_and_load_unlocked)

    335/3    0.001    0.000    0.522    0.174 <frozen importlib._bootstrap>:659(_load_unlocked)

    283/3    0.001    0.000    0.522    0.174 <frozen importlib._bootstrap_exter

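Nearly all of the measured run time is spent inside `simple_mandelbrot`, so we take this program to be 99% parallelizable!<br>
What sort of speedup can we potentially get? Amdahl's Law gives $Speedup = \frac{1}{(1-p)+p/N}$, where $p$ is the parallelizable fraction of the program and $N$ is the number of cores. A Tesla K80 GPU has 4992 cores, so our $N$ will be 4992 when we use the formula.<br>
We calculate the speedup as follows:<br>
$\qquad Speedup = \frac{1}{.01+.99/4992} \approx 98.1$<br>
But remember that Amdahl's Law only gives a very rough estimate!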

# lstsq

```python
import numpy as np
import matplotlib.pyplot as plt
from time import time 

class LeastSquare():
    """Mini-batch gradient-descent solver for the least-squares problem
    ``min_x ||A x - b||``.

    After ``do()`` has run, ``x_list`` and ``error_list`` hold one entry per
    epoch: a snapshot of the iterate and the residual norm on the full data.
    """

    def __init__(self, A, b):
        """Store the problem data and draw a random starting point.

        Args:
            A: 2-D array; its rows are sampled to form mini-batches and
                ``A.shape[1]`` is the number of unknowns.
            b: Right-hand side, indexed with the same row indices as ``A``.
        """
        self.A = A
        self.b = b
        # Current iterate, initialised uniformly at random in [0, 1).
        self.x = np.random.rand(self.A.shape[1])
        # Fixed step size, scaled down by the number of unknowns.
        self.lr = 1e-3/A.shape[1]
        # Per-epoch history filled in by do().
        self.x_list = []
        self.error_list = []

    def do(self, epochs=100, batch_size=1000, inner_steps=100):
        """Run mini-batch gradient descent and return the final iterate.

        Each epoch draws ``batch_size`` row indices at random (with
        replacement) and takes ``inner_steps`` gradient steps on that batch.
        The step size ``self.lr`` is fixed in ``__init__`` and is not
        rescaled when ``batch_size`` is changed.

        Args:
            epochs: Number of mini-batches to draw.
            batch_size: Number of rows sampled per mini-batch.
            inner_steps: Gradient steps taken on each mini-batch.

        Returns:
            ``self.x``, the iterate after the last step (the live array, not
            a copy).
        """
        n_rows = self.A.shape[0]
        for _ in range(epochs):
            # Draw a fresh mini-batch of rows.
            index = np.random.choice(n_rows, batch_size)
            A = self.A[index]
            b = self.b[index]

            # Gradient of ||A x - b||^2 on the batch is 2 A^T (A x - b).
            for _ in range(inner_steps):
                residual = np.dot(A, self.x) - b
                grad = 2 * np.dot(A.T, residual)
                self.x -= grad * self.lr

            # self.x is updated in place, so a copy must be stored; appending
            # self.x itself would make every history entry alias the final
            # iterate.
            self.x_list.append(self.x.copy())
            self.error_list.append(self.check())

        return self.x

    def check(self):
        """Return the Euclidean norm of the residual ``b - A x`` on the full data."""
        b_ = self.A @ self.x
        error = np.linalg.norm(self.b - b_)

        return error

## execute
if __name__ == "__main__":
    # Random dense problem: 10000 equations, 1000 unknowns.
    A = np.random.rand(10000,1000)
    b = np.random.rand(10000)

    # Time the solver set-up.
    t1 = time()
    lstsq = LeastSquare(A,b)
    t2 = time()
    dump_time1 = t2 - t1

    # Time the optimisation itself plus the final error evaluation.
    t1 = time()
    theta = lstsq.do()
    error = lstsq.check()
    t2 = time()
    calculation_time = t2 - t1

    # Time writing the result file.  The context manager guarantees the file
    # is closed even if a write fails, and the explicit encoding matches the
    # one used when the file is read back.
    t1 = time()
    with open("lstsq_result_cpu.txt", "w", encoding="utf-8") as result:
        result.write(f"error: {error}")
        result.write("\n")
        result.write(f"optimal x: {theta}")
    t2 = time()
    dump_time2 = t2 - t1

    # Time plotting and saving the per-epoch error curve.
    t1 = time()
    fig = plt.figure(figsize=(8,8))
    plt.plot(lstsq.error_list)
    plt.xlabel("epoches")
    plt.ylabel("error")
    plt.savefig("lstsq_error.png", dpi=fig.dpi)
    t2 = time()
    dump_time3 = t2 - t1

    # Everything that is not the optimisation counts as "something else".
    dump_time = dump_time1 + dump_time2 + dump_time3

    print(f"It took {calculation_time} seconds to calculate the least square probelm.")
    print(f"It took {dump_time} seconds to something else.")
```

In [None]:
# Read the saved error curve back from disk and display it inline.
img = plt.imread(fname="lstsq_error.png", format="png")
plt.imshow(X=img)
plt.show()

In [None]:
# Print the first 10 lines of the solver output.  The context manager closes
# the file even if reading or decoding fails part-way through.
with open("lstsq_result_cpu.txt", "r", encoding="utf-8") as lstsq_result:
    for _ in range(10):
        line = lstsq_result.readline()
        print(line)

error: 62.05344030560634

optimal x: [-3.93896743e-02  2.45951057e-02  7.18019150e-02 -1.98371896e-02

 -9.87640627e-02  6.44443898e-02  1.85098354e-02  6.78026736e-02

  5.11517740e-02 -1.12579341e-01 -1.09601801e-01  2.02053859e-02

 -5.47842429e-02  3.49777597e-02  1.01547900e-02  2.07681079e-02

 -6.57411882e-02 -1.07413917e-02  7.28141689e-02 -2.81213375e-02

 -1.47827205e-01 -4.29647457e-02 -9.29501827e-02 -1.18995402e-01

 -6.14744260e-02 -4.93697000e-02  2.29943144e-02  6.90976171e-02

  6.16123092e-02 -1.29615396e-01  2.52842754e-02 -5.11789968e-02

  7.84546627e-02 -8.70759457e-02 -8.22437879e-02 -1.54061959e-02



In [None]:
# Print the first 10 lines of the saved profile.  The context manager closes
# the file even if reading or decoding fails part-way through.
with open("lstsq_cpu_profile.txt", "r", encoding="utf-16") as lstsq_result:
    for _ in range(10):
        line = lstsq_result.readline()
        print(line)

It took 4.0358405113220215 seconds to calculate the least square probelm.

It took 0.2634012699127197 seconds to something else.

         742875 function calls (732302 primitive calls) in 4.935 seconds



   Ordered by: cumulative time



   ncalls  tottime  percall  cumtime  percall filename:lineno(function)

    612/1    0.001    0.000    4.936    4.936 {built-in method builtins.exec}

        1    0.001    0.001    4.936    4.936 lstsq_cpu.py:1(<module>)

        1    0.683    0.683    4.034    4.034 lstsq_cpu.py:14(do)



The least-squares problem solved with SGD is about 93.8% parallelizable!<br>
For a Tesla K80 GPU, the speedup is calculated as:<br>
$\qquad Speedup = \frac{1}{.062 + 0.938/4992} \approx 16.1$