In [None]:
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
import matplotlib.pyplot as plt
from numpy.random import normal

In [None]:
# Ground-truth line that the regression should recover:
# a weight (slope) w of 2 and an offset (intercept) of -1.
def f(x):
    """Evaluate the target linear function ``2 * x - 1`` at ``x``."""
    slope, offset = 2, 1
    return slope * x - offset

In [None]:
# Quick check of the noise source: called without arguments, `normal()` draws a
# single random number from the standard normal distribution.
normal()

## Generate data

In [None]:
n_points = 10000

# Evenly spaced inputs 0, 1/n, ..., (n-1)/n, i.e. values in [0, 1).
# NOTE(review): `sc` is not defined in this notebook -- presumably the SparkContext
# injected by the PySpark kernel/shell; confirm for your environment.
x = sc.parallelize(range(n_points)).map(lambda i: float(i) / n_points)

# Let's add some noise to y.
# The noise is drawn lazily inside the map, so without caching every action on `y`
# (stats, collect, zip, each training pass) would re-run the map and see a
# *different* noise sample. cache() asks Spark to keep the partitions computed by
# the first action so later actions reuse the same noisy values (best effort:
# partitions evicted from memory would still be recomputed).
y = x.map(f).map(lambda a: a + .3 * normal()).cache()

In [None]:
# Summarise the noisy targets. Since x covers [0, 1) evenly and f(x) = 2 * x - 1,
# the noise-free values span [-1, 1), so the reported mean should be close to 0.
y.stats()

## Plot the data

In [None]:
# Bring both RDDs back to the driver as plain Python lists for plotting.
xc, yc = x.collect(), y.collect()

In [None]:
%matplotlib inline
plt.scatter(xc, yc, label='c1', alpha=.2)
plt.legend()
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.show()

## Convert into LabeledPoints dataset

In [None]:
# Combine targets and inputs into a labelled dataset.
def to_labeled_point(pair):
    """Turn a ``(label, feature)`` pair into a LabeledPoint with one feature."""
    label, feature = pair
    return LabeledPoint(label, [feature])


# zip() pairs the i-th element of y with the i-th element of x, label first.
ds = y.zip(x).map(to_labeled_point)

# Peek at the first few records.
ds.take(5)

## Create and train linear model

In [None]:
# Fit the linear model with stochastic gradient descent. `train` is invoked on the
# class itself; intercept=True requests a fitted offset in addition to the weight.
model = LinearRegressionWithSGD.train(
    ds,
    iterations=2000,
    step=1.,
    intercept=True,
    convergenceTol=0.00001,
)
model

In [None]:
# TODO: plot the fitted line (this cell is still a placeholder).

## Using scikit-learn instead

In [None]:
# Cross-check: fit the same collected samples with scikit-learn's least-squares
# LinearRegression and print "intercept, slope".
from sklearn.linear_model import LinearRegression
import numpy as np

m = LinearRegression()

# scikit-learn expects a 2-D (n_samples, n_features) input; the target is reshaped
# to a single column as well.
xx = np.array(xc).reshape([len(xc), 1])
yy = np.array(yc).reshape([len(yc), 1])

m.fit(xx, yy)

# With a 2-D target the fitted attributes are size-1 arrays rather than scalars.
# Extract the single values explicitly instead of relying on implicit
# array-to-float conversion inside the '%f' formatting.
intercept = float(np.ravel(m.intercept_)[0])
slope = float(np.ravel(m.coef_)[0])

# Parenthesised form: valid as a print statement on Python 2 and as a function
# call on Python 3 (the bare `print '...'` statement is a SyntaxError on Python 3).
print('%f, %f' % (intercept, slope))