### First we create our Spark context

In [None]:
from pyspark import SparkContext

# Reuse the running SparkContext if this cell is executed again: calling
# SparkContext() a second time in the same kernel raises a ValueError because
# only one context may be active at a time.
sc = SparkContext.getOrCreate()

### Let's import the libraries in mllib

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

### Import the data

In [None]:
# Load the sample data set as an RDD with one element per line of the text file
data = sc.textFile("./data/sample_svm_data.txt")
# Peek at the first five raw lines to check the format before parsing
data.take(5)

In [None]:
# Number of lines (records) read from the file
data.count()

We have to transform each line of the data into a labeled point (a label plus a feature vector).

In [None]:
# Parse the data
def parsePoint(line):
    """Convert one whitespace-separated text record into a LabeledPoint.

    The first number on the line is used as the label; the remaining numbers
    are used as the feature values.

    Args:
        line: A string of numbers separated by whitespace.

    Returns:
        A LabeledPoint built from the label and the list of feature values.

    Raises:
        ValueError: If the line is blank or a field is not a valid float.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so a trailing space or a double space no longer
    # produces an empty field that float() rejects.
    values = [float(x) for x in line.split()]
    if not values:
        raise ValueError("cannot parse an empty line into a LabeledPoint")
    return LabeledPoint(values[0], values[1:])

# Apply parsePoint to every line of the raw RDD
parsedData = data.map(parsePoint)
# Inspect the first five parsed records
parsedData.take(5)

Now we are building the model

In [None]:
# Build the model: train a logistic regression on the parsed points.
# Only the data is passed, so every training parameter keeps its default value.
model = LogisticRegressionWithSGD.train(parsedData)

And evaluate the predictions on the training data

In [None]:
# Evaluating the model on training data
# Pair every true label with the model's prediction for the same features.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
print(labelsAndPreds.take(5))
# Index into the (label, prediction) pair instead of unpacking it in the lambda
# signature: tuple parameters were removed in Python 3 (PEP 3113), so
# `lambda (v, p): ...` is a SyntaxError there. Indexing works on Python 2 and 3.
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

## Your turn: 
Let's do the same with simulated data and a linear regression.

In [None]:
import numpy as np

# Seed NumPy's global generator so the simulated data is the same on every run.
np.random.seed(42)

# N observations with dim features each.
N = 200
dim = 6
# Coefficients used to generate the response.
beta = np.array([-2, -1, 1, 2, 3, 1])
# Feature matrix of shape (N, dim) drawn from NumPy's global generator.
X = np.random.random_sample((N, dim))
# Noise-free linear response: one value per row of X.
y = np.dot(X, beta)

In [None]:
# Put the response y in the first column, followed by the columns of X.
data_all = np.column_stack((y, X))

In [None]:
# Write the matrix as comma-separated text (one row per line) so it can be read back from disk
np.savetxt('data/data_reg.txt', data_all, delimiter=',')

Now we use Spark to fit a linear regression

In [None]:
# Load the simulated regression data back as an RDD with one element per line
data_reg = sc.textFile("./data/data_reg.txt")
# Peek at the first five raw comma-separated lines
data_reg.take(5)

In [None]:
# Parse the data
def parsePoint(line):
    """Turn one comma-separated record into a LabeledPoint.

    The first field becomes the label and the remaining fields become the
    feature values.
    """
    fields = line.split(',')
    numbers = list(map(float, fields))
    label = numbers[0]
    features = numbers[1:]
    return LabeledPoint(label, features)

# Apply the comma-based parsePoint to every line of the regression file.
# NOTE: this rebinds parsedData, which earlier held the classification points.
parsedData = data_reg.map(parsePoint)
# Inspect the first five parsed records
parsedData.take(5)

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD
# Train a linear regression on the simulated points; only the data is passed,
# so every training parameter keeps its default value.
model_reg = LinearRegressionWithSGD.train(parsedData)

In [None]:
# Display the fitted coefficients; since y was generated as X.dot(beta) without
# noise, they can be compared with beta = [-2, -1, 1, 2, 3, 1].
model_reg.weights