# Linear Regression

Given a set of two-dimensional points, the goal of linear regression is to find the line:

$y=\alpha + \beta X$

that minimizes the distance to all points.

In [None]:
import pycompss.interactive as ipycompss

In [None]:
# Start the PyCOMPSs runtime for this notebook session, with task-graph
# generation (graph=True) and tracing (trace=True) enabled.
ipycompss.start(graph=True, trace=True)

In [None]:
from pycompss.api.task import task
from pycompss.api.parameter import *
import math
from numpy import arange
from numpy.random import randint
import types
import time

## Task definitions

In [None]:
@task(returns=int)
def _add(x):
    return sum(x)

@task(returns=int)
def reduce_add(x, y):
    """Combine two partial results by adding them.

    :param x: left operand
    :param y: right operand
    :return: ``x + y``
    """
    combined = x + y
    return combined

@task(returns=float)
def _mean(X, n):
    return sum(X)/float(n)

@task(returns=list)
def _norm(X, m):
    return [x-m for x in X]

@task(returns=list)
def _pow(X, p=2):
    return [pow(x, 2) for x in X]

@task(returns=float)
def _mul(x, y):
    return x*y

@task(returns=float)
def op_task(sum_x, sum_y, suma):
    """Divide ``suma`` by the square root of ``sum_x * sum_y``.

    :param sum_x: first factor under the square root
    :param sum_y: second factor under the square root
    :param suma: numerator
    :return: ``suma / sqrt(sum_x * sum_y)``
    :raises ZeroDivisionError: if ``sum_x * sum_y`` is zero
    """
    denominator = float(math.sqrt(sum_x * sum_y))
    return suma / denominator

@task(returns=float)
def multFrag(a, b):
    """Return the sum of the element-wise products of two fragments.

    Elements are paired positionally with ``zip``, so any surplus elements of
    the longer fragment are ignored.

    :param a: first iterable of numbers
    :param b: second iterable of numbers
    :return: sum of ``a[i] * b[i]``, ``0`` when no pair exists
    """
    accumulated = 0
    for left, right in zip(a, b):
        accumulated += left * right
    return accumulated

@task(returns=(float, float))
def computeLine(r, stdy, stdx, my, mx):
    """Compute the two coefficients of the fitted line.

    :param r: correlation factor multiplying the ratio of the square roots
    :param stdy: spread value for Y; its square root is taken here
    :param stdx: spread value for X; its square root is taken here
    :param my: mean of Y
    :param mx: mean of X
    :return: tuple ``(b, A)`` with ``b = r * sqrt(stdy) / sqrt(stdx)`` and
             ``A = my - b * mx``
    """
    root_y = math.sqrt(stdy)
    root_x = math.sqrt(stdx)
    slope = r * (root_y / root_x)
    intercept = my - slope * mx
    return slope, intercept

@task(returns=list)
def genFragment(pointsPerFrag):
    """Generate one fragment of random integer values.

    :param pointsPerFrag: number of values to generate
    :return: list of ``pointsPerFrag`` values drawn with ``randint(0, 100)``
    """
    samples = randint(0, 100, size=pointsPerFrag)
    return list(samples)

## Functions

In [None]:
def std(X, m, n):
    """Return the mean squared deviation of the fragmented data from ``m``.

    Despite the name, no square root is applied here: the result is the sum
    over all fragments of ``sum((x - m) ** 2) / n``.

    :param X: list of fragments
    :param m: value subtracted from every element
    :param n: divisor applied to every fragment's sum of squares
    :return: reduced value produced by ``mergeReduce``
    """
    centred = [_norm(fragment, m) for fragment in X]
    squared = [_pow(fragment, 2) for fragment in centred]
    partial_means = [_mean(fragment, n) for fragment in squared]
    return mergeReduce(reduce_add, partial_means)

In [None]:
def pearson(X, Y, mx, my):
    """Compute the correlation coefficient of two fragmented series.

    :param X: list of fragments with the first coordinate
    :param Y: list of fragments with the second coordinate
    :param mx: mean of X
    :param my: mean of Y
    :return: sum of products of the centred values divided by the square root
             of the product of both sums of squares (via ``op_task``)
    """
    centred_x = [_norm(fragment, mx) for fragment in X]
    centred_y = [_norm(fragment, my) for fragment in Y]
    squared_x = [_pow(fragment, 2) for fragment in centred_x]
    squared_y = [_pow(fragment, 2) for fragment in centred_y]

    # Numerator: sum of the products of the centred coordinates.
    cross_terms = [multFrag(fx, fy) for fx, fy in zip(centred_x, centred_y)]
    covariance_sum = mergeReduce(reduce_add, cross_terms)

    # Denominator terms: sums of squared deviations of each coordinate.
    total_sq_x = mergeReduce(reduce_add, [_add(fragment) for fragment in squared_x])
    total_sq_y = mergeReduce(reduce_add, [_add(fragment) for fragment in squared_y])

    return op_task(total_sq_x, total_sq_y, covariance_sum)

In [None]:
def mergeReduce(function, data):
    """ Apply function cumulatively to the items of data,
        from left to right in binary tree structure, so as to
        reduce the data to a single value.

    The two oldest pending items are combined and the result is queued behind
    the remaining ones, until a single value is left. The input is never
    modified.

    :param function: function of two arguments used to combine two items
    :param data: iterable of items to be reduced
    :return: result of reducing the data to a single value, or None when
             data is empty
    """
    from collections import deque

    # Queue the values themselves (not indices) so the caller's container is
    # left untouched and any iterable is accepted.
    pending = deque(data)
    if not pending:
        return None
    while len(pending) > 1:
        left = pending.popleft()
        right = pending.popleft()
        pending.append(function(left, right))
    return pending[0]

In [None]:
def initData(pointsPerFrag, fragments, dim):
    """Build the random data set, one list of fragments per dimension.

    :param pointsPerFrag: number of values in each fragment
    :param fragments: number of fragments per dimension
    :param dim: number of dimensions
    :return: list of ``dim`` lists, each holding ``fragments`` results of
             ``genFragment(pointsPerFrag)``
    """
    dataset = []
    for _ in range(dim):
        dimension = [genFragment(pointsPerFrag) for _ in range(fragments)]
        dataset.append(dimension)
    return dataset

In [None]:
def mean(X, n):
    """Return the mean of fragmented data.

    Each fragment contributes ``sum(fragment) / n``; the contributions are
    then added together with ``mergeReduce``.

    :param X: list of fragments
    :param n: total number of values across all fragments
    :return: reduced value produced by ``mergeReduce``
    """
    contributions = [_mean(fragment, n) for fragment in X]
    return mergeReduce(reduce_add, contributions)

In [None]:
def fit(X, Y, n):
    """Fit a straight line to the fragmented points ``(X, Y)``.

    :param X: list of fragments with the x coordinates
    :param Y: list of fragments with the y coordinates
    :param n: total number of points across all fragments
    :return: function mapping ``x`` to the fitted value ``line[0] * x + line[1]``
    """
    from pycompss.api.api import compss_wait_on
    st = time.time()
    mx = mean(X, n)
    my = mean(Y, n)
    r = pearson(X, Y, mx, my)
    stdx = std(X, mx, n)
    # The spread of Y must be measured around the mean of Y (my), not around
    # the mean of X.
    stdy = std(Y, my, n)

    line = computeLine(r, stdy, stdx, my, mx)

    # Wait for the coefficients before timing and building the predictor.
    line = compss_wait_on(line)
    print("Elapsed time {}".format(time.time() - st))
    slope, intercept = line[0], line[1]
    return lambda x: slope * x + intercept

## MAIN 

Parameters (that can be configured in the following cell):
* numPoints: Number of points (default: 1000)
* dim: Number of dimensions (default: 2 (plotting considers only 2 dimensions))
* fragments: Number of fragments to consider (default: 5)

In [None]:
from pycompss.api.api import compss_wait_on

# Configurable parameters of the experiment.
numPoints = 1000
dim = 2
fragments = 5

# Integer division: when numPoints is not a multiple of fragments, fewer than
# numPoints points are actually generated.
pointsPerFrag = numPoints // fragments
data = initData(pointsPerFrag, fragments, dim)
# Pass the number of points that really exist so the means and variances are
# divided by the right count even when the division above is not exact.
line = fit(data[0], data[1], pointsPerFrag * fragments)

In [None]:
# Plot Result
%matplotlib inline
from pylab import scatter, show, plot, savefig
# Retrieve the generated fragments before flattening them for plotting.
data = compss_wait_on(data)
datax = [item for sublist in data[0] for item in sublist]
datay = [item for sublist in data[1] for item in sublist]
scatter(datax, datay, marker='x')
# Sample the x axis once and draw the fitted line as (x, line(x)), matching
# the (x, y) orientation used by the scatter plot.
xs = arange(0.0, 100.0, 0.1)
plot(xs, [line(x) for x in xs])
show()
# savefig('lrd.png')

In [None]:
# Stop the PyCOMPSs runtime that was started at the top of the notebook.
ipycompss.stop()