In [5]:
import os
import sys

SPARK_HOME = "/usr/hdp/current/spark2-client"
PYSPARK_PYTHON = "/opt/conda/envs/dsenv/bin/python"
os.environ["PYSPARK_PYTHON"]= PYSPARK_PYTHON
os.environ["SPARK_HOME"] = SPARK_HOME

PYSPARK_HOME = os.path.join(SPARK_HOME, "python/lib")
sys.path.insert(0, os.path.join(PYSPARK_HOME, "py4j-0.10.7-src.zip"))
sys.path.insert(0, os.path.join(PYSPARK_HOME, "pyspark.zip"))

In [6]:
import random
SPARK_UI_PORT = random.choice(range(10000, 10200))
print(f"Spark UI port is: {SPARK_UI_PORT}")

Spark UI port is: 10190


In [7]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.ui.port", SPARK_UI_PORT)

spark = SparkSession.builder.config(conf=conf).appName("Spark ML Intro").getOrCreate()

In [126]:
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.ml.feature import *
from pyspark.ml import Estimator, Transformer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [163]:
small_train_path = '/datasets/amazon/all_reviews_5_core_train_small.json'

In [164]:
schema = StructType([
    StructField("asin", StringType()),
    StructField("id", LongType()),
    StructField("overall", DoubleType()),
    StructField("reviewText", StringType()),
    StructField("reviewTime", DateType()),
    StructField("reviewerID", StringType()),
    StructField("reviewerName", StringType()),
    StructField("vote", IntegerType()),
    StructField("summary", StringType()),
    StructField("unixReviewTime", TimestampType()),
    StructField("verified", BooleanType())
])

In [165]:
dataset = spark.read.json(small_train_path, schema=schema,dateFormat='MM dd, yyyy').cache()

In [130]:
dataset.rdd.getNumPartitions()

7

In [131]:
dataset.show(2, vertical=True)

-RECORD 0------------------------------
 asin           | B00005MDZ8           
 id             | 6500                 
 overall        | 5.0                  
 reviewText     | quick shipping, g... 
 reviewTime     | 2014-10-23           
 reviewerID     | AEZ4DZCUL021H        
 reviewerName   | Stephen              
 vote           | null                 
 summary        | great product        
 unixReviewTime | 2014-10-23 00:00:00  
 verified       | true                 
-RECORD 1------------------------------
 asin           | B000DZE0XK           
 id             | 42580                
 overall        | 5.0                  
 reviewText     | Most delicious Ever! 
 reviewTime     | 2016-02-13           
 reviewerID     | A3UPMJ5WQFHGLN       
 reviewerName   | Pelipen              
 vote           | null                 
 summary        | Five Stars           
 unixReviewTime | 2016-02-13 00:00:00  
 verified       | true                 
only showing top 2 rows



In [132]:
dataset = dataset.dropna(subset='reviewText')

In [166]:
droper = SQLTransformer(statement="SELECT * FROM __THIS__ WHERE reviewText is not null")

In [159]:
dataset2 = sqlTrans.transform(dataset)

In [160]:
dataset2.select([count(when(isnan(c), c)).alias(c) for c in ['reviewText']]).show()

+----------+
|reviewText|
+----------+
|         0|
+----------+



In [167]:
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
#dataset2 = tokenizer.transform(dataset)

In [168]:
hasher = HashingTF(numFeatures=100, binary=True, inputCol=tokenizer.getOutputCol(), outputCol="word_vector")
#dataset2 = hasher.transform(dataset2)

In [169]:
lr = LinearRegression(featuresCol=hasher.getOutputCol(), labelCol="overall", maxIter=15)

In [170]:
pipeline = Pipeline(stages=[
    droper,
    tokenizer,
    hasher,
    lr
])

In [171]:
train,test = dataset.randomSplit([0.7, 0.3],42)

In [172]:
pipeline_model = pipeline.fit(train)

In [173]:
predictions = pipeline_model.transform(test)

In [174]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="overall",metricName="rmse")

In [175]:
print("Root Mean Squared Error (RMSE) on test data = %g" % lr_evaluator.evaluate(predictions))

Root Mean Squared Error (RMSE) on test data = 1.13703


In [144]:
predictions

DataFrame[asin: string, id: bigint, overall: double, reviewText: string, reviewTime: date, reviewerID: string, reviewerName: string, vote: int, summary: string, unixReviewTime: timestamp, verified: boolean, words: array<string>, word_vector: vector, prediction: double]

In [146]:
Tokenizer.explainParams() 

TypeError: explainParams() missing 1 required positional argument: 'self'

In [None]:
spark.stop()