In [None]:
%matplotlib inline

## Regression

In [None]:
import os 
import sklearn
import numpy as np
import pandas as pd

from os import path as filepath
#from pyspark import SparkConf, SparkContext
#from pyspark.sql import SQLContext

In [None]:
# --- Data locations -------------------------------------------------------
# HDFS variant (disabled): build the root from the HDFS environment variable.
# HDFS = "hdfs://{}".format(os.environ["HDFS"])
# DATA = filepath.join(HDFS, "user", "ec2-user")

# Local variant: the "data" directory that sits next to the current
# working directory, resolved to an absolute path.
DATA = filepath.abspath(filepath.join("..", "data"))

# Wine-quality dataset directory and its two CSV files.
WINE = filepath.join(DATA, "winequality")
REDS, WHITES = (
    filepath.join(WINE, "winequality-red.csv"),
    filepath.join(WINE, "winequality-white.csv"),
)

In [None]:
def parse(line, sep=';'):
    """
    Parse one delimited text line into a list of floats.

    Parameters
    ----------
    line : str
        A single raw line of text.
    sep : str, optional
        Field separator; defaults to ';'.

    Returns
    -------
    list of float or None
        The parsed values, or None when any field cannot be converted
        to a float (for example the header line).
    """
    fields = line.strip().split(sep)
    try:
        # Convert eagerly. On Python 3 `map` is lazy, so wrapping it in
        # try/except would never see the ValueError and a non-numeric line
        # would be returned as a map object instead of None. A real list
        # also supports the indexing/slicing callers perform on a row.
        return [float(field) for field in fields]
    except ValueError:
        return None  # Header line will result in None

# Choose which wines to regress on (reds, whites, or all) by
# un-commenting exactly one of the assignments below.
# data = WINE
data = REDS
# data = WHITES

# Read the raw text, run every line through parse(), and keep only the
# rows for which parse() did not return None (the first line is expected
# to be dropped this way).
wine = (
    sc.textFile(data)
      .map(parse)
      .filter(lambda row: row is not None)
)

In [None]:
# Column names, in order, used when converting the parsed rows
# into a DataFrame.
columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality",
]

# Build the DataFrame and preview its first rows.
wineDF = wine.toDF(columns)
wineDF.show()

## Extract Features 

In [None]:
from pyspark.ml.linalg import Vectors

# Turn each row into a (label, features) pair: the last value of the row
# is the label, and every value before it goes into a dense vector.
vecs = wine.map(
    lambda row: (row[-1], Vectors.dense(row[:-1]))
).toDF(["label", "features"])

# Preview the resulting two-column DataFrame.
vecs.show()

## Models

In [None]:
# Import the model family
from pyspark.ml.regression import LinearRegression

In [None]:
# Instantiate the model form 
# Configure the linear regression estimator.
reg = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the (label, features) DataFrame.
model = reg.fit(vecs)

# Learned parameters.
print("Coefficients: {}".format(model.coefficients))
print("Intercept: {}".format(model.intercept))

# Metrics below all come from the model's summary over the training set.
training_summary = model.summary

# Optimisation trace.
print("Iterations: {}".format(training_summary.totalIterations))
print("Objective History: {}".format(training_summary.objectiveHistory))

# Residuals followed by the fit scores.
training_summary.residuals.show()
print("RMSE: {:0.3f}".format(training_summary.rootMeanSquaredError))
print("r2: {:0.3f}".format(training_summary.r2))