In [1]:
## Spark Application - execute with spark-submit

## Import the packages
import csv
from pyspark.mllib.regression import LabeledPoint
from StringIO import StringIO
from collections import namedtuple
from pyspark import SparkConf, SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


## Module Constants
APP_NAME = "Wine Quality Analysis"
fields   = ('fixed_acidity','volatile_acidity','citric_acid','residual_sugar','chlorides','free_sulfur_dioxide','total_sulfur_dioxide','density','pH','sulphates','alcohol','quality')
sc = SparkContext(appName="Classification")

# Closures
def split(line):
    """
    Operator function for splitting a line with csv module
    """
    reader = csv.reader(StringIO(line))
    return reader.next()

def parse(row):
    """
    Parses a row and returns a named tuple.
    """
    row[0]  = float(row[0])
    row[1]  = float(row[1])
    row[2]  = float(row[2])
    row[3]  = float(row[3])
    row[4]  = float(row[4])
    row[5]  = float(row[5])
    row[6]  = float(row[6])
    row[7]  = float(row[8])
    row[8]  = float(row[9])
    row[9]  = float(row[10])
    row[10] = row[11]
    
    return LabeledPoint(row[10], row[0:9])

wines = sc.textFile("C:\\Users\\anirudhbedre\\Desktop\\Wine\\wine_quality_pyspark_classification.csv").map(split).map(parse)

(trainingData, testData) = wines.randomSplit([0.7, 0.3])

# Train the model.

model =LogisticRegressionWithLBFGS.train(trainingData, iterations=10,regType=None  )


# Evaluating the model on training data
training_labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = training_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingData.count())
print("Training Score = " + str(1-trainErr))

test_labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = test_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print("Testing Score = " + str(1-testErr))

sc.stop()

Training Score = 0.780381944444
Testing Score = 0.784327323162
