In [1]:
# Set Environment Variables
# Raw strings keep the Windows backslashes literal; in a normal string,
# sequences such as "\P", "\J" or "\s" are invalid escapes that trigger a
# warning on current Python versions and are slated to become errors.
import os
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_241"
os.environ["SPARK_HOME"] = r"D:\spark-2.4.5-bin-hadoop2.7"

# Make the PySpark libraries importable in this kernel.
# findspark.init() does not start a SparkSession itself; the session is
# created later with SparkSession.builder. It relies on the SPARK_HOME value
# set above to locate the Spark installation.
import findspark
findspark.init()

In [2]:
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator


In [3]:
# Configure the session step by step, then create it (or reuse a running one).
session_builder = SparkSession.builder.master("local[*]")  # local mode, all cores
session_builder = session_builder.config("spark.driver.memory", "14g")
session_builder = session_builder.appName("CloudETLProject")
spark = session_builder.getOrCreate()

In [4]:
# Load yelp_review.csv from the local dataset folder into a DataFrame.
# The path is a raw string so the Windows backslashes are taken literally
# (sequences like "\P" and "\y" are invalid escapes in a normal string).
url = r"D:\Projects\yelp-dataset\yelp_review.csv"
spark.sparkContext.addFile(url)

# Explicit schema: only `stars` is parsed as a number. `date` and the three
# vote counters (useful / funny / cool) are deliberately read as plain strings.
yelpSchema = StructType([
    StructField("review_id", StringType(), True),
    StructField("user_id", StringType(), True),
    StructField("business_id", StringType(), True),
    StructField("stars", IntegerType(), True),
    StructField("date", StringType(), True),
    StructField("text", StringType(), True),
    StructField("useful", StringType(), True),
    StructField("funny", StringType(), True),
    StructField("cool", StringType(), True)
])

# No timestampFormat option is passed: the schema contains no timestamp
# column, so such an option would have no effect on the result.
df = (
    spark.read
    .option('header', 'true')
    .csv(SparkFiles.get("yelp_review.csv"), schema=yelpSchema, sep=',')
)
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- cool: string (nullable = true)



In [5]:
# Keep only the rows that carry both a review text and a `useful` value,
# then preview the result.
has_text = col("text").isNotNull()
has_useful = col("useful").isNotNull()
df = df.where(has_text & has_useful)

df.show()

+--------------------+--------------------+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|           review_id|             user_id|         business_id|stars|      date|                text|              useful|               funny|                cool|
+--------------------+--------------------+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|HRPm3vEZ_F-33TYVT...|_4iMDXbXZ1p1ONG29...|8QWPlVQ6D-OExqXoa...|    5|2014-09-24|Cycle Pub Las Veg...|                   1|                   0|                   0|
|Ia-w-nR1FrlzsiuEi...|u0LXt3Uea_GidxRW1...|Eox_Qq74oaFZ-Yjth...|    3|2011-07-18|Service is really...|                   1|                   1|                   1|
|hkyyWaX-EMiIkvyu1...|nsOf58RZjMTn8V94E...|djyIZW8gVNWby8wau...|    5|2017-08-23|Suzanne was able ...|                   0|                   0|                   0|
|BF0

In [6]:
# Add the character length of each review text as a new column.
text_length = length(col("text"))
df = df.withColumn("review_length", text_length)

In [7]:
# Keep only rows whose rating lies on the 1-5 star scale, then list the
# distinct ratings that remain as a sanity check.
valid_star_range = "stars >= 1 and stars <= 5"
df = df.where(valid_star_range)
df.select("stars").distinct().show()

+-----+
|stars|
+-----+
|    1|
|    3|
|    5|
|    4|
|    2|
+-----+



In [8]:
# Number of rows left after the null-value and star-range filters above.
df.count()

3200677

In [9]:
# Work on a reproducible 1% random sample of the filtered reviews
# (fixed seed), and report how many rows the sample contains.
sample_fraction = 0.01
sample_seed = 42
sample_df = df.sample(fraction=sample_fraction, seed=sample_seed)
sample_df.count()

32170

## Transform DataFrame to fit review_rating table

In [10]:
# Narrow the sample down to the four columns used from here on.
review_columns = ("text", "stars", "date", "review_length")
review_df = sample_df.select(*review_columns)
review_df.show()

+--------------------+-----+----------+-------------+
|                text|stars|      date|review_length|
+--------------------+-----+----------+-------------+
|I bought a groupo...|    1|2014-02-03|          221|
|Came here with a ...|    4|2016-11-26|          570|
|Have been here a ...|    4|2017-07-14|          320|
|Baked ricotta and...|    5|2016-01-31|          231|
|Dr. Anjum is an e...|    5|2017-08-15|          610|
|This location is ...|    3|2012-09-19|          274|
|This was such an ...|    5|2014-04-14|          337|
|If you are out dr...|    5|2007-09-11|          157|
|For over 10 years...|    5|2017-07-29|          861|
|Food here is cons...|    3|2011-07-01|          117|
|Love DPA, the sta...|    5|2012-07-21|          166|
|fried chicken, ye...|    4|2014-10-06|          530|
|C'est ma biblioth...|    5|2017-06-10|          319|
|Their vegetarian ...|    5|2014-03-09|          244|
|"Stars: Proper ra...|    3|2016-01-14|          214|
|Wow, what an incr...|    5|

In [11]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
split_weights = [0.7, 0.3]
training, testing = review_df.randomSplit(split_weights, seed=42)

In [15]:
def build_trigrams(inputCol=("text", "stars"), n=3):
    """Build an n-gram TF-IDF + logistic-regression pipeline (unfitted).

    Stages, in order:
      1. Tokenizer: text column -> "words"
      2. NGram: one stage per order i in 1..n -> "<i>_grams"
      3. CountVectorizer: one per order, vocabulary capped at 2**14 -> "<i>_tf"
      4. IDF: one per order, minDocFreq=5 -> "<i>_tfidf"
      5. VectorAssembler: concatenates every "<i>_tfidf" into "features"
      6. StringIndexer: label column -> "label"
      7. LogisticRegression (maxIter=100), which reads the "features" and
         "label" columns through its default column names.

    No feature-selection stage is included; the assembled TF-IDF vector is
    fed to the classifier directly.

    Args:
        inputCol: two-item sequence ``(text_column, label_column)``. The
            default matches the ``text`` / ``stars`` columns of the reviews.
        n: highest n-gram order to generate (1 = unigrams only, 3 = up to
            trigrams). Must be at least 1.

    Returns:
        A ``pyspark.ml.Pipeline`` ready to be fitted.

    Raises:
        ValueError: if ``n`` is smaller than 1, or if ``inputCol`` does not
            contain exactly two items.
    """
    if n < 1:
        raise ValueError("n must be >= 1, got {0}".format(n))

    # Honour the caller-supplied column names instead of hard-coding them.
    text_col, label_col = inputCol
    orders = range(1, n + 1)

    tokenizer = [Tokenizer(inputCol=text_col, outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in orders
    ]

    cv = [
        CountVectorizer(vocabSize=2**14, inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in orders
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5)
        for i in orders
    ]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in orders],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol=label_col, outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_stringIdx + lr)

In [16]:
# Fit the pipeline on the training split, then apply the fitted model to the
# held-out test split to obtain predictions.
trigram_pipeline = build_trigrams()
trigram_pipelineFit = trigram_pipeline.fit(training)
test_results = trigram_pipelineFit.transform(testing)

In [17]:
# Score the test-set predictions: keep only the label/prediction pair and
# compute plain multiclass accuracy.
label_and_prediction = [col("label").cast("Float"), col("prediction")]
predictions = test_results.select(*label_and_prediction)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy = %g" % accuracy)

Model Accuracy = 0.594468
