### Importing relevant libraries

In [None]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [None]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session, registered under the application
# name 'ML Pipeline'.
spark = (
    SparkSession.builder
    .appName('ML Pipeline')
    .getOrCreate()
)

In [None]:
# Load the review data from CSV. The first row of the file is used as the
# column names, and column types are inferred from the values.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('Data/process_yelp.csv')
)
# Peek at the first ten rows.
data.take(10)

In [None]:
# Print the column names and types of the loaded DataFrame, to inspect the
# schema that was inferred when the CSV was read.
data.printSchema()

In [None]:
import pyspark.sql.types

# Replace the 'stars' column with an integer-typed version of itself
# (withColumn with an existing name overwrites that column).
data = data.withColumn(
    'stars',
    data['stars'].cast(pyspark.sql.types.IntegerType()),
)

In [None]:
# Print the schema again to check the type of 'stars' after the cast.
data.printSchema()

In [None]:
# Show the first ten rows of the DataFrame as a list of Row objects.
data.take(10)

In [None]:
# Build the three pipeline stages:
#   1. Tokenizer     : 'text'  -> 'words'
#   2. HashingTF     : 'words' -> 'features'
#   3. LogisticRegression with at most 10 iterations and regParam 0.001
tokenizer = Tokenizer(inputCol='text', outputCol='words')
hashingTF = HashingTF(inputCol='words', outputCol='features')
# NOTE(review): no labelCol is configured, so the estimator uses its default
# label column. If the target is meant to be 'stars', labelCol='stars' must be
# set -- confirm against the columns of the CSV.
lr = LogisticRegression().setMaxIter(10).setRegParam(0.001)

# Chain the stages in order and fit the whole pipeline on the loaded data.
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])
model = pipeline.fit(data)

In [None]:
# Run the fitted pipeline over `data` (the same DataFrame it was fitted on)
# and print one line per row with the rating, review text, class
# probabilities and predicted class.
prediction = model.transform(data)
selected = prediction.select("stars", "text", "probability", "prediction")

# Each collected row holds exactly the four selected columns, in select()
# order, so unpack all four. The loop variable is deliberately not called
# `prediction`, which would clobber the DataFrame bound above.
for stars, text, prob, predicted_label in selected.collect():
    # The % operator and its argument tuple must sit OUTSIDE the string
    # literal; otherwise the raw template is printed instead of the values.
    # `stars` is formatted with %s so a null value prints as None rather than
    # raising a TypeError under %d.
    print("(%s, %s) ---> prob = %s, Prediction = %f"
          % (stars, text, str(prob), predicted_label))