In [1]:
import pyspark 
from pyspark.sql import SparkSession
# Create the notebook's Spark session (or pick up an already-running one),
# with Hive support switched on.
spark = (
    SparkSession.builder
    .appName("Wine Quality Prediction")
    .enableHiveSupport()
    .getOrCreate()
)

In [40]:
# Load the training set: semicolon-delimited CSV with a header row; column
# types are inferred by Spark rather than declared.
df = (
    spark.read
    .options(header='true', inferSchema='true', sep=';')
    .csv('TrainingDataset.csv')
)

In [42]:
# Remove every doubled double-quote sequence from the training column names.
new_column_name_list = [name.replace("\"\"", "") for name in df.columns]

# Apply the cleaned names positionally, then rename the `quality"` column
# (one trailing quote left over) to plain `quality`.
df = df.toDF(*new_column_name_list).withColumnRenamed("quality\"", "quality")

In [43]:
def isTasty(quality):
    """Map a wine quality score to a binary "tasty" label.

    Args:
        quality: Numeric quality score, or None for a missing value.

    Returns:
        1 if ``quality`` is at least 7, 0 if it is lower, and None when
        ``quality`` is None.
    """
    # Spark passes SQL NULLs to Python UDFs as None; comparing None with an
    # int raises TypeError, so a missing score is passed through unchanged.
    if quality is None:
        return None
    return 1 if quality >= 7 else 0

In [44]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Wrap isTasty as a Spark UDF whose result column is an integer.
tasty_udf_int = udf(f=isTasty, returnType=IntegerType())

In [46]:
# Add the binary "tasty" label, computed from the "quality" column.
df_tasty = df.withColumn(
    colName="tasty",
    col=tasty_udf_int('quality'),
)

In [47]:
# Training-set columns that are assembled into the model's feature vector.
featureColumns = [
    "alcohol",
    "volatile acidity",
    "sulphates",
    "citric acid",
    "total sulfur dioxide",
    "density",
]

In [None]:
import os

# Tracking server address. Credentials should not be committed with the
# notebook, so the value is read from the TRACKING_URI environment variable
# when it is set; the placeholder below is only a template showing the
# expected URL shape and is kept as the fallback.
TRACKING_URI = os.environ.get(
    'TRACKING_URI',
    'http://<user_name>:<password>@<ec2_public_address>/',
)

In [48]:
from pyspark.ml.feature import VectorAssembler
# Assembler that packs the selected training columns into a single
# "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=featureColumns)

In [49]:
# Build the feature vector and keep only the model input and the label.
trainingData = (
    assembler
    .transform(df_tasty)
    .select('features', 'tasty')
)

In [50]:
# Load the validation set: semicolon-delimited CSV with a header row; column
# types are inferred by Spark rather than declared.
df_test = (
    spark.read
    .options(header='true', inferSchema='true', sep=';')
    .csv('ValidationDataset.csv')
)

In [51]:
# Strip doubled double-quotes from the validation column names. The names must
# come from df_test itself: borrowing df.columns would relabel the validation
# columns by position with the training frame's names, which silently mislabels
# data if the two files ever differ in column order or count.
new_column_name_list = [name.replace("\"\"", "") for name in df_test.columns]

df_test = df_test.toDF(*new_column_name_list)

# Rename the `quality"` column (one trailing quote left over) to `quality`;
# withColumnRenamed is a no-op when no such column exists.
df_test = df_test.withColumnRenamed("quality\"", "quality")

In [52]:
# Add the binary "tasty" label to the validation frame, computed from "quality".
df_test_tasty = df_test.withColumn(
    colName="tasty",
    col=tasty_udf_int('quality'),
)

In [53]:
# Validation-set columns that are assembled into the feature vector.
featureColumns_test = [
    "alcohol",
    "volatile acidity",
    "sulphates",
    "citric acid",
    "total sulfur dioxide",
    "density",
]

In [54]:
# Assembler that packs the selected validation columns into a single
# "features" vector column.
assembler_test = VectorAssembler(outputCol="features", inputCols=featureColumns_test)

In [55]:
# Build the validation feature vector and keep only the model input and the label.
testData = (
    assembler_test
    .transform(df_test_tasty)
    .select('features', 'tasty')
)

In [56]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression over the "features" vector against the "tasty" label,
# limited to 25 iterations.
lr = LogisticRegression(
    maxIter=25,
    labelCol='tasty',
    featuresCol='features',
)

In [57]:
# Fit the logistic regression on the assembled training data.
lrModel = lr.fit(dataset=trainingData)

In [58]:
# Apply the fitted model to the assembled validation data.
predictions = lrModel.transform(dataset=testData)

In [60]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the predictions with the F1 metric, comparing the "prediction" column
# against the "tasty" label, and report the result.
lr_evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol='tasty',
)
f1 = lr_evaluator.evaluate(predictions)
print("f-score on test data = %g" % (f1,))

f-score on test data = 0.870752
