### First we create our Spark context

In [1]:
from pyspark import SparkContext
# getOrCreate() reuses the context that is already running, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

### Let's import what we need from MLlib

In [2]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

### Import the data

In [3]:
# Load the data: each element of `data` is one line of the text file
data = sc.textFile("./data/sample_svm_data.txt")
# Peek at the first five raw lines
data.take(5)

[u'1 0 2.52078447201548 0 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0',
 u'0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0',
 u'0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0',
 u'1 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0',
 u'1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 2.055002875864414 0 0 0 0']

In [18]:
# Number of lines (one example per line) in the file
data.count()

322

We have to transform each line into a labeled point: the first value on the line is the label and the remaining values are the features.

In [4]:
# Parse the data
def parsePoint(line):
    """Convert one whitespace-separated text line into a LabeledPoint.

    The first field is the label and the remaining fields are the features.
    Splitting on any run of whitespace (instead of on a single space) keeps
    the parser from failing on repeated spaces, tabs or a trailing newline.

    Raises:
        ValueError: if the line is empty or a field is not a valid float.
    """
    values = [float(x) for x in line.split()]
    if not values:
        # Keep the exception type callers would have seen for a blank line.
        raise ValueError("empty line: no label or features to parse")
    return LabeledPoint(values[0], values[1:])

# Turn every raw line into a LabeledPoint and inspect the first five
parsedData = data.map(parsePoint)
parsedData.take(5)

[LabeledPoint(1.0, [0.0,2.52078447201548,0.0,0.0,0.0,2.004684436494304,2.000347299268466,0.0,2.228387042742021,2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(0.0, [2.857738033247042,0.0,0.0,2.619965104088255,0.0,2.004684436494304,2.000347299268466,0.0,2.228387042742021,2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(0.0, [2.857738033247042,0.0,2.061393766919624,0.0,0.0,2.004684436494304,0.0,0.0,2.228387042742021,2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,2.061393766919624,2.619965104088255,0.0,2.004684436494304,2.000347299268466,0.0,0.0,0.0,0.0,2.055002875864414,0.0,0.0,0.0,0.0]),
 LabeledPoint(1.0, [2.857738033247042,0.0,2.061393766919624,2.619965104088255,0.0,2.004684436494304,0.0,0.0,0.0,0.0,0.0,2.055002875864414,0.0,0.0,0.0,0.0])]

Now we build the model

In [6]:
# Build the model: logistic regression trained with SGD. No extra arguments
# are passed, so the library's default training settings are used.
model = LogisticRegressionWithSGD.train(parsedData)

And we evaluate its predictions on the training data

In [8]:
# Evaluating the model on training data
# Pair each true label with the model's prediction for the same features.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
print(labelsAndPreds.take(5))
# Index into the (label, prediction) pair instead of unpacking it in the lambda
# signature: tuple parameters only exist in Python 2 (removed by PEP 3113).
# Training error = share of examples whose prediction differs from the label.
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

[(1.0, 1), (0.0, 1), (0.0, 0), (1.0, 1), (1.0, 0)]
Training Error = 0.363354037267


## Your turn: 
Let's do the same with simulated data and a linear regression.

In [36]:
import numpy as np

# Fix the seed so the simulated data set is identical on every run
np.random.seed(42)

# Sample size and number of features
N = 200
dim = 6

# True coefficients of the noise-free linear model y = X . beta
beta = np.array([-2, -1, 1, 2, 3, 1])

# Random feature matrix, then the exact (noise-free) response
X = np.random.random((N, dim))
y = np.dot(X, beta)

In [53]:
# First column is the response y, the remaining columns are the features X
data_all = np.column_stack((y, X))

In [59]:
# Save the table as comma-separated text so it can be read back with Spark
np.savetxt('data/data_reg.txt', data_all, delimiter=',')

Now we use Spark to fit a linear regression

In [61]:
# Read the simulated data set back as lines of text and peek at the first five
data_reg = sc.textFile("./data/data_reg.txt")
data_reg.take(5)

[u'8.535668077643493445e-01,3.745401188473624909e-01,9.507143064099161656e-01,7.319939418114050911e-01,5.986584841970366000e-01,1.560186404424365181e-01,1.559945203362026467e-01',
 u'2.066580132273367454e+00,5.808361216819946105e-02,8.661761457749351800e-01,6.011150117432088047e-01,7.080725777960454881e-01,2.058449429580244683e-02,9.699098521619943236e-01',
 u'1.088927551456997378e-01,8.324426408004217404e-01,2.123391106782761550e-01,1.818249672071006184e-01,1.834045098534338170e-01,3.042422429595377231e-01,5.247564316322378408e-01',
 u'9.785172274435358641e-01,4.319450186421157634e-01,2.912291401980419137e-01,6.118528947223794701e-01,1.394938606520418345e-01,2.921446485352181544e-01,3.663618432936917024e-01',
 u'1.354520848464622507e+00,4.560699842170359286e-01,7.851759613930135995e-01,1.996737821583597361e-01,5.142344384136116053e-01,5.924145688620424677e-01,4.645041271999772459e-02']

In [63]:
# Parse the data
def parsePoint(line):
    """Build a LabeledPoint from one comma-separated line.

    The first field becomes the label; all remaining fields are the features.
    """
    fields = line.split(',')
    numbers = [float(field) for field in fields]
    label = numbers[0]
    features = numbers[1:]
    return LabeledPoint(label, features)

# NOTE: this rebinds parsedData, which earlier held the classification points;
# from here on it refers to the regression data set.
parsedData = data_reg.map(parsePoint)
parsedData.take(5)

[LabeledPoint(0.853566807764, [0.3745401188473625,0.9507143064099162,0.7319939418114051,0.5986584841970366,0.15601864044243652,0.15599452033620265]),
 LabeledPoint(2.06658013227, [0.05808361216819946,0.8661761457749352,0.6011150117432088,0.7080725777960455,0.020584494295802447,0.9699098521619943]),
 LabeledPoint(0.108892755146, [0.8324426408004217,0.21233911067827616,0.18182496720710062,0.18340450985343382,0.3042422429595377,0.5247564316322378]),
 LabeledPoint(0.978517227444, [0.43194501864211576,0.2912291401980419,0.6118528947223795,0.13949386065204183,0.29214464853521815,0.3663618432936917]),
 LabeledPoint(1.35452084846, [0.45606998421703593,0.7851759613930136,0.19967378215835974,0.5142344384136116,0.5924145688620425,0.046450412719997725])]

In [65]:
from pyspark.mllib.regression import LinearRegressionWithSGD
# Fit a linear regression with SGD. No extra arguments are passed, so the
# library's default training settings are used.
model_reg = LinearRegressionWithSGD.train(parsedData)

In [69]:
# Fitted coefficients; compare with the true beta = [-2, -1, 1, 2, 3, 1]
# that was used to simulate y.
model_reg.weights

DenseVector([-1.4407, -0.7387, 0.8546, 1.7761, 2.6242, 0.8614])