In [1]:
import pyspark 
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Wine Quality Prediction")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Read the training CSV: semicolon-separated, first row is the header,
# column types inferred by Spark.
df = spark.read.csv(
    path='TrainingDataset.csv',
    sep=';',
    header='true',
    inferSchema='true',
)

In [3]:
# Remove every doubled double-quote (`""`) from each column name.
new_column_name_list = [name.replace('""', '') for name in df.columns]

# Apply the cleaned names, then rename the column spelled `quality"`
# (with a trailing quote character) to plain `quality`.
df = (
    df.toDF(*new_column_name_list)
    .withColumnRenamed('quality"', 'quality')
)

In [4]:
def isTasty(quality):
    """Binarise a quality score.

    Returns 1 when ``quality`` is greater than or equal to 7, otherwise 0.
    """
    return 1 if quality >= 7 else 0

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Expose isTasty as a Spark UDF whose declared return type is IntegerType.
tasty_udf_int = udf(f=isTasty, returnType=IntegerType())

In [6]:
# Add a "tasty" column produced by running the UDF over the "quality" column.
df_tasty = df.withColumn(colName="tasty", col=tasty_udf_int("quality"))

In [7]:
# Names of the columns fed to the VectorAssembler as inputs.
featureColumns = [
    "alcohol",
    "volatile acidity",
    "sulphates",
    "citric acid",
    "total sulfur dioxide",
    "density",
]

In [8]:
from pyspark.ml.feature import VectorAssembler
# Packs the columns listed in featureColumns into a single "features" column.
assembler = VectorAssembler(outputCol="features", inputCols=featureColumns)

In [9]:
# Run the assembler over the labelled training frame and keep only the
# assembled "features" column and the "tasty" label.
trainingData = (
    assembler
    .transform(df_tasty)
    .select("features", "tasty")
)

In [10]:
# Read the validation CSV with the same options as the training file:
# semicolon-separated, header row present, types inferred.
df_test = spark.read.csv(
    path='ValidationDataset.csv',
    sep=';',
    header='true',
    inferSchema='true',
)

In [11]:
# Build the cleaned names from the validation frame's OWN columns.
# The previous version read `df.columns` (the training frame), which only
# worked when both files had identical column order and count; otherwise the
# validation columns were silently mislabelled or toDF() failed.
new_column_name_list = [name.replace("\"\"", "") for name in df_test.columns]

# Apply the cleaned names to the validation frame.
df_test = df_test.toDF(*new_column_name_list)

# Rename the column spelled `quality"` (trailing quote) to plain `quality`.
df_test = df_test.withColumnRenamed("quality\"", "quality")

In [12]:
# Add the "tasty" column to the validation frame by running the UDF over
# its "quality" column.
df_test_tasty = df_test.withColumn(colName="tasty", col=tasty_udf_int("quality"))

In [13]:
# Input column names for the validation-side VectorAssembler.
# NOTE(review): this repeats the featureColumns list literal; the two lists
# presumably need to stay identical (same names, same order) — confirm.
featureColumns_test = [
    "alcohol",
    "volatile acidity",
    "sulphates",
    "citric acid",
    "total sulfur dioxide",
    "density",
]

In [14]:
# Packs the columns listed in featureColumns_test into a "features" column.
assembler_test = VectorAssembler(outputCol="features", inputCols=featureColumns_test)

In [15]:
# Run the validation assembler over the labelled validation frame and keep
# only the assembled "features" column and the "tasty" label.
testData = (
    assembler_test
    .transform(df_test_tasty)
    .select("features", "tasty")
)

In [16]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator: reads the "features" column, learns the
# "tasty" label, and is capped at 25 iterations.
lr = LogisticRegression(
    maxIter=25,
    labelCol="tasty",
    featuresCol="features",
)

In [17]:
# Fit the logistic regression estimator on the training data.
lrModel = lr.fit(dataset=trainingData)

In [18]:
# Apply the fitted model to the validation data.
predictions = lrModel.transform(dataset=testData)

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator configured for the "f1" metric, comparing the "prediction" column
# against the "tasty" label.
# NOTE(review): Spark's "f1" here is presumably the weighted F-measure across
# classes rather than the positive-class F1 — confirm against the API docs.
lr_evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="tasty",
)

# Score the validation predictions and report the result.
f1 = lr_evaluator.evaluate(dataset=predictions)
print("f-score on test data = %g" % f1)

f-score on test data = 0.870752
