In [1]:
-from pyspark.sql.functions import lit

# Load the three different product categories: Video Games, Books, and Home and Kitchen
# Add a category column for all three data sets

df1 = spark.sql("select * from default.video_games_5")
df1 = df1.withColumn("category", lit("video_games"))
                     
df2 = spark.sql("select * from default.books_5_small")
df2 = df2.withColumn("category", lit("books"))
                     
df3 = spark.sql("select * from default.home_and_kitchen_5_small")
df3 = df3.withColumn("category", lit("home_kitchen"))

# Combine the three data sets into one data set for data cleaning

df = df1.union(df2).union(df3)

# There are 3,487,331 observations and 12 original features and 1 added feature                     

print((df.count(), len(df.columns)))

In [2]:
# Quick summary statistics of Amazon Reviews data set

df.describe().show()

In [3]:
# Display raw amazon review data

display(df)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,label,category
0,5.0,0,True,"10 17, 2015",A1HP7NVNPFMA4N,0700026657,Ambrosia075,"This game is a bit hard to get the hang of, but when you do it's great.",but when you do it's great.,1445040000,0,video_games
1,4.0,0,False,"07 27, 2015",A1JGAP0185YJI6,0700026657,travis,"I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon.","But in spite of that it was fun, I liked it",1437955200,0,video_games
2,3.0,0,True,"02 23, 2015",A1YJWEXHQBWK2B,0700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,0,video_games
3,2.0,0,True,"02 20, 2015",A2204E1TH211HT,0700026657,Grandma KR,"found the game a bit too complicated, not what I expected after having played 1602, 1503, and 1701",Two Stars,1424390400,0,video_games
4,5.0,0,True,"12 25, 2014",A2RF5B5H74JLPE,0700026657,jon,"great game, I love it and have played it since its arrived",love this game,1419465600,0,video_games
5,4.0,0,True,"11 13, 2014",A11V6ZJ2FVQY1D,0700026657,IBRAHIM ALBADI,i liked a lot some time that i haven't play a wonderfull game very simply and funny game verry good game.,Anno 2070,1415836800,0,video_games
6,1.0,0,False,"08 2, 2014",A1KXJ1ELZIU05C,0700026657,Creation27,"I'm an avid gamer, but Anno 2070 is an INSULT to gaming. It is so buggy and half-finished that the first campaign doesn't even work properly and the DRM is INCREDIBLY frustrating to deal with. Once you manage to work your way past the massive amounts of bugs and get through the DRM, HOURS later you finally figure out that the game has no real tutorial, so you stuck just clicking around randomly. Sad, sad, sad, example of a game that could have been great but FTW.",Avoid This Game - Filled with Bugs,1406937600,0,video_games
7,5.0,0,True,"03 3, 2014",A1WK5I4874S3O2,0700026657,WhiteSkull,"I bought this game thinking it would be pretty cool and that i might play it for a week or two and be done. Boy was I wrong! From the moment I finally got the gamed Fired up (the other commentors on this are right, it takes forever and u are forced to create an account) I watched as it booted up I could tell right off the bat that ALOT of thought went into making this game. If you have ever played Sim city, then this game is a must try as you will easily navigate thru it and its multi layers. I have been playing htis now for a month straight, and I am STILL discovering layers of complexity in the game. There are a few things in the game that could used tweaked, but all in all this is a 5 star game.",A very good game balance of skill with depth of choices,1393804800,0,video_games
8,5.0,0,True,"02 21, 2014",AV969NA4CBP10,0700026657,Travis B. Moore,I have played the old anno 1701 AND 1503. this game looks great but is more complex than the previous versions of the game. I found a lot of things lacking such as the sources of power and an inability to store energy with batteries or regenertive fuel cells as buildings in the game need power. Trade is about the same. My main beef with this it requires an internet connection. Other than that it has wonderful artistry and graphics. It is the same as anno 1701 but set in a future world where global warmming as flood the land and resource scarcity has sent human kind to look to the deep ocean for valuable minerals. I recoment the deep ocean expansion or complete if you get this. I found the ai instructor a little corny but other than that the game has some real polish. I wrote my 2 cents worth on suggestions on anno 2070 wiki and you can read 3 pages on that for game ideas I had.,Anno 2070 more like anno 1701,1392940800,0,video_games
9,4.0,0,True,"06 27, 2013",A1EO9BFUHTGWKZ,0700026657,johnnyz3,"I liked it and had fun with it, played for a while and got my money's worth. You can certainly go further than I did but I got frustrated with the fact that here we are in this new start and still taking from the earth rather than living with it. Better than simcity in that respect and maybe the best we could hope for.",Pretty fun,1372291200,0,video_games


In [4]:
# Drop duplicates using a subset of features reviewerID and asin
# Number of duplicated observations removed is approximately 8.3 % (288,157 = 3,487,331 - 3,199,174)

print("Before duplication removal: ", df.count())
df_distinct = df.dropDuplicates(["reviewerID", "asin"])
print("After duplication removal: ", df_distinct.count())

In [5]:
# Convert Unix timestamp to readable date

from pyspark.sql.functions import from_unixtime, to_date

df_date = df_distinct.withColumn("reviewTime", to_date(from_unixtime(df_distinct.unixReviewTime))).drop("unixReviewTime")

In [6]:
# Fill in the empty vote column with 0, and convert it to numeric type

from pyspark.sql.types import *

df_fill_vote = df_date.withColumn("vote", df_date.vote.cast(IntegerType())).fillna(0, subset = ["vote"]) 

In [7]:
# Install nltk

! pip install nltk

# Install Spark NLP

! pip install --ignore-installed spark-nlp==2.4.5

In [8]:
# Adapted from https://github.com/maobedkova/TopicModelling_PySpark_SparkNLP/blob/master/Topic_Modelling_with_PySpark_and_Spark_NLP.ipynb
# Converts reviewText data into Spark NLP annotation format

from sparknlp.base import DocumentAssembler

documentAssembler = DocumentAssembler() \
                    .setInputCol("reviewText") \
                    .setOutputCol("reviewDocument")

In [9]:
# Tokenize data using Tokenizer

from sparknlp.annotator import Tokenizer

tokenizer = Tokenizer() \
            .setInputCols(["reviewDocument"]) \
            .setOutputCol("reviewTokenized")

In [10]:
# Download stop words from nltk package

import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

eng_stopwords = stopwords.words("english")

In [11]:
# Remove stop words 

from sparknlp.annotator import StopWordsCleaner

stopwords_cleaner = StopWordsCleaner() \
                    .setInputCols(["reviewTokenized"]) \
                    .setOutputCol("reviewSWRemoved") \
                    .setStopWords(eng_stopwords)

In [12]:
# Normalize data to lowercase

from sparknlp.annotator import Normalizer

normalizer = Normalizer() \
             .setInputCols(["reviewSWRemoved"]) \
             .setOutputCol("reviewNormalized") \
             .setLowercase(True)

In [13]:
# Lemmatize data

from sparknlp.annotator import LemmatizerModel

lemmatizer = LemmatizerModel.pretrained() \
             .setInputCols(["reviewNormalized"]) \
             .setOutputCol("reviewUnigrams")

In [14]:
# Create n-grams

from sparknlp.annotator import NGramGenerator

ngrammer = NGramGenerator() \
           .setInputCols(["reviewUnigrams"]) \
           .setOutputCol("reviewNgrams") \
           .setN(2) \
           .setEnableCumulative(True) \
           .setDelimiter('_')

In [15]:
# Use Part-of-Speech (POS) tagger

from sparknlp.annotator import PerceptronModel

pos_tagger = PerceptronModel.pretrained("pos_anc") \
             .setInputCols(["reviewDocument", "reviewUnigrams"]) \
             .setOutputCol("reviewPOS")

In [16]:
# Transform data with Finisher

from sparknlp.base import Finisher

finisher = Finisher() \
           .setInputCols(["reviewUnigrams", "reviewNgrams", "reviewPOS"])

In [17]:
# Create a Preprocessing Pipeline

from pyspark.ml import Pipeline

pipeline = Pipeline() \
           .setStages([documentAssembler,
                      tokenizer,
                      stopwords_cleaner,
                      normalizer,
                      lemmatizer,
                      ngrammer,
                      pos_tagger,
                      finisher])

In [18]:
# Fit the pipeline to training data

pipeline_fit = pipeline.fit(df_fill_vote)
df_transformed = pipeline_fit.transform(df_fill_vote)
df_transformed.show(5)

In [19]:
display(df_transformed)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,finished_reviewUnigrams,finished_reviewNgrams,finished_reviewPOS
8590184176,1.0,0,True,2014-02-07,A0380485C177Q6QQNJIX,B003N5ZYOG,Franklin Tineo,bad very bad the conect drums is broken,bad very bad the conect drums is broken,0,video_games,"List(bad, bad, conect, drum, break)","List(bad, bad, conect, drum, break, bad_bad, bad_conect, conect_drum, drum_break)","List(JJ, JJ, NN, NN, NN)"
1594188,2.0,0,True,2017-02-18,A0457350OAOAFTV3LPDA,B001PKU1YO,Lauren,"This dresser has been a disaster almost since we put it together. Due to our college-kid budget, we purchased this product hoping for the best (our old dresser had seen its last days). The customer service has helped manage the poor quality of the drawers, but each time they send (free of charge, and from wonderful representatives) new drawer bottoms, they're the wrong way thickness and don't fit. If you purchase this, know you'll have a lot of extra work down the road. There's a lot of rigging you'll have to do yourself. The exterior is very nice. The only reason this isn't a 1-star is because of the excellent customer service we have received over our 3-4 phone calls in.","Not worth the money, but the customer service was excellent",0,home_kitchen,"List(dresser, disaster, almost, since, put, together, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, there, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call)","List(dresser, disaster, almost, since, put, together, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, there, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call, dresser_disaster, disaster_almost, almost_since, since_put, put_together, together_due, due_collegekid, collegekid_budget, budget_purchase, purchase_product, product_hope, hope_good, good_old, old_dresser, dresser_see, see_last, last_day, day_customer, customer_service, service_help, help_manage, manage_poor, poor_quality, quality_drawer, drawer_time, time_send, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, thickness_fit, fit_purchase, purchase_know, know_lot, lot_extra, extra_work, work_road, road_there, there_lot, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, service_receive, receive_phone, phone_call)","List(NN, NN, RB, IN, VBD, RB, JJ, NN, NN, NN, NN, VBP, JJ, JJ, NN, VB, JJ, NN, NN, NN, VB, VB, JJ, NN, NN, NN, VB, JJ, NN, JJ, NN, JJ, NN, NN, NN, JJ, NN, NN, VB, NN, VB, NN, JJ, NN, NN, EX, NN, NN, JJ, JJ, NN, NN, JJ, NN, NN, VB, NN, NN)"
1961125,5.0,0,True,2016-03-07,A0741133YX9L0KH921L1,B003G2ZWDO,Lenora B .,"Beautiful ! I love it .. Got the curtains that match also here on Amazon , fit perfect !","Great quality , very happy",0,home_kitchen,"List(beautiful, love, get, curtain, match, also, amazon, fit, perfect)","List(beautiful, love, get, curtain, match, also, amazon, fit, perfect, beautiful_love, love_get, get_curtain, curtain_match, match_also, also_amazon, amazon_fit, fit_perfect)","List(JJ, NN, VB, NN, VB, RB, NN, NN, JJ)"
766017,5.0,0,False,1999-12-26,A100YHBWL4TR4D,0060739428,Dana Huff,"This book is one of my favorite books of all time. Karen Cushman perfectly captures life for a young teenage girl in the Middle Ages. From mundane, everyday problems like fleas to major concerns like familial expectations and marriage, Cushman covers it all! This is one of the few books that is so accurate that readers can really get a feel for the times as they truly were - not the way they are romanticized. I say kudos to Cushman, and I hope she keeps them coming!",One of the best books ever written for young adults,0,books,"List(book, one, favorite, book, time, karen, cushman, perfectly, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, like, flea, major, concern, like, familial, expectation, marriage, cushman, cover, one, book, accurate, reader, really, get, feel, time, truly, way, romanticize, say, kudo, cushman, hope, keep, come)","List(book, one, favorite, book, time, karen, cushman, perfectly, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, like, flea, major, concern, like, familial, expectation, marriage, cushman, cover, one, book, accurate, reader, really, get, feel, time, truly, way, romanticize, say, kudo, cushman, hope, keep, come, book_one, one_favorite, favorite_book, book_time, time_karen, karen_cushman, cushman_perfectly, perfectly_capture, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, problem_like, like_flea, flea_major, major_concern, concern_like, like_familial, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, cover_one, one_book, book_accurate, accurate_reader, reader_really, really_get, get_feel, feel_time, time_truly, truly_way, way_romanticize, romanticize_say, say_kudo, kudo_cushman, cushman_hope, hope_keep, keep_come)","List(NN, CD, JJ, NN, NN, NN, NN, RB, JJ, NN, JJ, NN, NN, JJ, NN, NN, JJ, NN, IN, NN, JJ, NN, IN, JJ, NN, NN, NN, NN, CD, NN, JJ, NN, RB, VB, JJ, NN, RB, NN, VB, VBP, NN, NN, NN, VB, NN)"
307286,5.0,0,True,2015-12-31,A1011IS5D1SR35,B0002L1056,Chris Bloom,"Not sure how practical it could be, but its for decoration in our house for my wife. Seems great so far.",Seems great so far,0,home_kitchen,"List(sure, practical, could, decoration, house, wife, seem, great, far)","List(sure, practical, could, decoration, house, wife, seem, great, far, sure_practical, practical_could, could_decoration, decoration_house, house_wife, wife_seem, seem_great, great_far)","List(JJ, JJ, MD, NN, NN, NN, VBP, JJ, RB)"
431996,5.0,0,True,2018-05-07,A101RVZV3RBT8O,B000BRU78C,James Rizer,"heavy duty, great thermos.",great thermos.,0,home_kitchen,"List(heavy, duty, great, thermos)","List(heavy, duty, great, thermos, heavy_duty, duty_great, great_thermos)","List(JJ, NN, JJ, NN)"
57008,5.0,0,True,2013-10-21,A101SYFLM1I1HR,B00004W4UQ,Rustie C. Dimitriadis,"This product is simple, but does the job perfectly. No need to try to cover our plates with plastic wrap, which is unhealthy and a waste of money, and can be messy and time-consuming. Just stick this cover over your plate, bowl, or microwaveable container and you're good to go. It stores right in the microwave when not in use and is a cinch to clean. Either give it a rinse right after use when the food will easily rinse right off or put it in the dishwasher. I like the hole for release of steam because it makes the lid easy to grab with one hand. This lid makes the food heat better and keeps your microwave very clean. Fits over any dish. I have used it with small plates, where it will just rest on the turntable of my microwave or on larger dinner plates, where the lid rests on the rim of the plate. I also use on bowls of various heights. It will either rest right on the turntable or on the bowl/dish itself. It seems to work equally well with all different sizes of dishes. I have purchased these for gifts as well and would recommend to anyone.",Just what I was looking for!,0,home_kitchen,"List(product, simple, job, perfectly, need, try, cover, plate, plastic, wrap, unhealthy, waste, money, messy, timeconsuming, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, either, give, rinse, right, use, food, easily, rinse, right, put, dishwasher, like, hole, release, steam, make, lid, easy, grab, one, hand, lid, make, food, heat, well, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, also, use, bowl, various, height, either, rest, right, turntable, bowldish, seem, work, equally, well, different, size, dish, purchase, gift, well, would, recommend, anyone)","List(product, simple, job, perfectly, need, try, cover, plate, plastic, wrap, unhealthy, waste, money, messy, timeconsuming, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, either, give, rinse, right, use, food, easily, rinse, right, put, dishwasher, like, hole, release, steam, make, lid, easy, grab, one, hand, lid, make, food, heat, well, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, also, use, bowl, various, height, either, rest, right, turntable, bowldish, seem, work, equally, well, different, size, dish, purchase, gift, well, would, recommend, anyone, product_simple, simple_job, job_perfectly, perfectly_need, need_try, try_cover, cover_plate, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, messy_timeconsuming, timeconsuming_stick, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, cinch_clean, clean_either, either_give, give_rinse, rinse_right, right_use, use_food, food_easily, easily_rinse, rinse_right, right_put, put_dishwasher, dishwasher_like, like_hole, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, grab_one, one_hand, hand_lid, lid_make, make_food, food_heat, heat_well, well_keep, keep_microwave, microwave_clean, clean_fit, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, plate_also, also_use, use_bowl, bowl_various, various_height, height_either, either_rest, rest_right, right_turntable, turntable_bowldish, bowldish_seem, seem_work, work_equally, equally_well, well_different, different_size, size_dish, dish_purchase, purchase_gift, gift_well, well_would, would_recommend, recommend_anyone)","List(NN, JJ, NN, RB, VBP, VB, RB, NN, NN, NN, JJ, NN, NN, NN, VBG, NN, NN, NN, NN, JJ, NN, JJ, NN, NN, NN, NN, NN, NN, VB, CC, VB, JJ, JJ, NN, NN, RB, NN, NN, VBD, NN, IN, NN, NN, NN, NN, NN, JJ, NN, CD, NN, NN, NN, NN, NN, RB, VB, NN, VB, VB, JJ, NN, JJ, NN, NN, JJ, NN, JJ, NN, NN, JJ, NN, NN, NN, RB, NN, NN, JJ, NN, CC, NN, NN, JJ, NN, VBP, NN, RB, RB, JJ, NN, NN, NN, NN, RB, MD, VB, NN)"
78628,4.0,0,True,2015-01-10,A102MU6ZC9H1N6,B000EGELP0,Teresa Halbert,My daughter still pulls this out from time to time just to see if she can still keep up with the puzzles. In great shape and purchase and price was good.,Brain Age: Train Your Brain in Minutes a Day,0,video_games,"List(daughter, still, pull, time, time, see, still, keep, puzzle, great, shape, purchase, price, good)","List(daughter, still, pull, time, time, see, still, keep, puzzle, great, shape, purchase, price, good, daughter_still, still_pull, pull_time, time_time, time_see, see_still, still_keep, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)","List(NN, RB, JJ, NN, NN, VB, RB, VB, NN, JJ, NN, NN, NN, JJ)"
186322,4.0,0,True,2013-03-26,A102MU6ZC9H1N6,B001TOQ8JS,Teresa Halbert,"My son is Beatles crazy and when he saw this game, he was so excited to get it ordered and play it with his friends. The game was in good shape and the price was great.",Xbox 360 The Beatles: rock Band,0,video_games,"List(son, beatles, crazy, see, game, excited, get, order, play, friend, game, good, shape, price, great)","List(son, beatles, crazy, see, game, excited, get, order, play, friend, game, good, shape, price, great, son_beatles, beatles_crazy, crazy_see, see_game, game_excited, excited_get, get_order, order_play, play_friend, friend_game, game_good, good_shape, shape_price, price_great)","List(NN, NNS, JJ, VB, NN, VBN, VB, NN, VB, NN, NN, JJ, NN, NN, JJ)"
1394884,5.0,0,True,2015-12-01,A102MV1GZ0TH3A,B001BDUHJE,A. Zaniesienko,"Sweetest, but VERY fragile and difficult to fix! A friend gave me one of these and it promptly broke. I loved it so much that I had to replace it. I pretend it's the original. It's perched in a safe spot and I hope it never falls.",I loved it so much that I had to replace it,0,home_kitchen,"List(sweet, fragile, difficult, fix, friend, give, one, promptly, break, love, much, replace, pretend, original, perch, safe, spot, hope, never, fall)","List(sweet, fragile, difficult, fix, friend, give, one, promptly, break, love, much, replace, pretend, original, perch, safe, spot, hope, never, fall, sweet_fragile, fragile_difficult, difficult_fix, fix_friend, friend_give, give_one, one_promptly, promptly_break, break_love, love_much, much_replace, replace_pretend, pretend_original, original_perch, perch_safe, safe_spot, spot_hope, hope_never, never_fall)","List(JJ, JJ, JJ, NN, NN, VB, CD, RB, NN, NN, JJ, VB, VB, JJ, NN, JJ, NN, VBP, RB, NN)"


In [20]:
# Join POS Tags

from pyspark.sql import types as T
from pyspark.sql import functions as F

udf_join_arr = F.udf(lambda x: " ".join(x), T.StringType())
df_transformed = df_transformed.withColumn("finished_reviewPOS", udf_join_arr(F.col("finished_reviewPOS")))

In [21]:
# Converts POS Tags into Spark NLP annotation format 

pos_documentAssembler = DocumentAssembler() \
                        .setInputCol("finished_reviewPOS") \
                        .setOutputCol("posDocument")

In [22]:
# Tokenize POS tags using Tokenizer

pos_tokenizer = Tokenizer() \
                .setInputCols(["posDocument"]) \
                .setOutputCol("posTokenized")

In [23]:
# Generate N-grams for POS Tags

pos_ngrammer = NGramGenerator() \
               .setInputCols(["posTokenized"]) \
               .setOutputCol("posNgrams") \
               .setN(2) \
               .setEnableCumulative(True) \
               .setDelimiter("_")

In [24]:
# Transform POS Tags with Finisher

pos_finisher = Finisher() \
               .setInputCols(["posTokenized", "posNgrams"])

In [25]:
# Create a Preprocessing Pipeline

pos_pipeline = Pipeline() \
               .setStages([pos_documentAssembler,
                          pos_tokenizer,
                          pos_ngrammer,
                          pos_finisher])

In [26]:
# Fit the Pipeline to Training Data

df_processed = pos_pipeline.fit(df_transformed).transform(df_transformed)
df_processed.show(5)

In [27]:
df_processed.columns

In [28]:
df_processed.select("finished_reviewUnigrams", "finished_posTokenized").limit(5).show()

In [29]:
df_processed.select("finished_reviewNgrams", "finished_posNgrams").limit(5).show()

In [30]:
# Create Function to Filter Out POS Tags

def filter_pos(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) 
            if pos in ["JJ", "NN", "NNS", "VB", "VBP"]]

udf_filter_pos = F.udf(filter_pos, T.ArrayType(T.StringType()))

In [31]:
df_processed = df_processed.withColumn("filtered_unigrams",
                                               udf_filter_pos(F.col("finished_reviewUnigrams"), 
                                                              F.col("finished_posTokenized")))

In [32]:
df_processed.select("filtered_unigrams").limit(5).show(truncate = 90)

In [33]:
def filter_pos_combs(words, pos_tags):
  return [word for word, pos in zip(words, pos_tags)
         if (len(pos.split("_")) == 2 and \
            pos.split("_")[0] in ["JJ", "NN", "NNS", "VB", "VBP"] and \
            pos.split("_")[1] in ["JJ", "NN", "NNS"])]

udf_filter_pos_combs = F.udf(filter_pos_combs, T.ArrayType(T.StringType()))

In [34]:
df_processed = df_processed.withColumn("filtered_ngrams",
                                      udf_filter_pos_combs(F.col("finished_reviewNgrams"), F.col("finished_posNgrams")))

In [35]:
df_processed.select("filtered_ngrams").limit(5).show(truncate = 90)

In [36]:
#Combine Unigram and Ngrams

from pyspark.sql.functions import concat

df_processed = df_processed.withColumn("reviewFinal",
                                          concat(F.col("filtered_unigrams"),
                                                 F.col("filtered_ngrams")))

In [37]:
# Dropping temporary columns, and cache results (note that cache is also a lazy operation)

df_cleaned = df_processed.drop("finished_reviewUnigrams",
 "finished_reviewNgrams",
 "finished_reviewPOS",
 "finished_posTokenized",
 "finished_posNgrams",
 ).cache()

display(df_cleaned)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,filtered_unigrams,filtered_ngrams,reviewFinal
8590184176,1.0,0,True,2014-02-07,A0380485C177Q6QQNJIX,B003N5ZYOG,Franklin Tineo,bad very bad the conect drums is broken,bad very bad the conect drums is broken,0,video_games,"List(bad, bad, conect, drum, break)","List(bad_bad, bad_conect, conect_drum, drum_break)","List(bad, bad, conect, drum, break, bad_bad, bad_conect, conect_drum, drum_break)"
1594188,2.0,0,True,2017-02-18,A0457350OAOAFTV3LPDA,B001PKU1YO,Lauren,"This dresser has been a disaster almost since we put it together. Due to our college-kid budget, we purchased this product hoping for the best (our old dresser had seen its last days). The customer service has helped manage the poor quality of the drawers, but each time they send (free of charge, and from wonderful representatives) new drawer bottoms, they're the wrong way thickness and don't fit. If you purchase this, know you'll have a lot of extra work down the road. There's a lot of rigging you'll have to do yourself. The exterior is very nice. The only reason this isn't a 1-star is because of the excellent customer service we have received over our 3-4 phone calls in.","Not worth the money, but the customer service was excellent",0,home_kitchen,"List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call)","List(dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)","List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call, dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)"
1961125,5.0,0,True,2016-03-07,A0741133YX9L0KH921L1,B003G2ZWDO,Lenora B .,"Beautiful ! I love it .. Got the curtains that match also here on Amazon , fit perfect !","Great quality , very happy",0,home_kitchen,"List(beautiful, love, get, curtain, match, amazon, fit, perfect)","List(beautiful_love, get_curtain, amazon_fit, fit_perfect)","List(beautiful, love, get, curtain, match, amazon, fit, perfect, beautiful_love, get_curtain, amazon_fit, fit_perfect)"
766017,5.0,0,False,1999-12-26,A100YHBWL4TR4D,0060739428,Dana Huff,"This book is one of my favorite books of all time. Karen Cushman perfectly captures life for a young teenage girl in the Middle Ages. From mundane, everyday problems like fleas to major concerns like familial expectations and marriage, Cushman covers it all! This is one of the few books that is so accurate that readers can really get a feel for the times as they truly were - not the way they are romanticized. I say kudos to Cushman, and I hope she keeps them coming!",One of the best books ever written for young adults,0,books,"List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come)","List(favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)","List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come, favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)"
307286,5.0,0,True,2015-12-31,A1011IS5D1SR35,B0002L1056,Chris Bloom,"Not sure how practical it could be, but its for decoration in our house for my wife. Seems great so far.",Seems great so far,0,home_kitchen,"List(sure, practical, decoration, house, wife, seem, great)","List(sure_practical, decoration_house, house_wife, seem_great)","List(sure, practical, decoration, house, wife, seem, great, sure_practical, decoration_house, house_wife, seem_great)"
431996,5.0,0,True,2018-05-07,A101RVZV3RBT8O,B000BRU78C,James Rizer,"heavy duty, great thermos.",great thermos.,0,home_kitchen,"List(heavy, duty, great, thermos)","List(heavy_duty, duty_great, great_thermos)","List(heavy, duty, great, thermos, heavy_duty, duty_great, great_thermos)"
57008,5.0,0,True,2013-10-21,A101SYFLM1I1HR,B00004W4UQ,Rustie C. Dimitriadis,"This product is simple, but does the job perfectly. No need to try to cover our plates with plastic wrap, which is unhealthy and a waste of money, and can be messy and time-consuming. Just stick this cover over your plate, bowl, or microwaveable container and you're good to go. It stores right in the microwave when not in use and is a cinch to clean. Either give it a rinse right after use when the food will easily rinse right off or put it in the dishwasher. I like the hole for release of steam because it makes the lid easy to grab with one hand. This lid makes the food heat better and keeps your microwave very clean. Fits over any dish. I have used it with small plates, where it will just rest on the turntable of my microwave or on larger dinner plates, where the lid rests on the rim of the plate. I also use on bowls of various heights. It will either rest right on the turntable or on the bowl/dish itself. It seems to work equally well with all different sizes of dishes. I have purchased these for gifts as well and would recommend to anyone.",Just what I was looking for!,0,home_kitchen,"List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone)","List(product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)","List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone, product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)"
78628,4.0,0,True,2015-01-10,A102MU6ZC9H1N6,B000EGELP0,Teresa Halbert,My daughter still pulls this out from time to time just to see if she can still keep up with the puzzles. In great shape and purchase and price was good.,Brain Age: Train Your Brain in Minutes a Day,0,video_games,"List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good)","List(pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)","List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good, pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)"
186322,4.0,0,True,2013-03-26,A102MU6ZC9H1N6,B001TOQ8JS,Teresa Halbert,"My son is Beatles crazy and when he saw this game, he was so excited to get it ordered and play it with his friends. The game was in good shape and the price was great.",Xbox 360 The Beatles: rock Band,0,video_games,"List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great)","List(son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)","List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great, son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)"
1394884,5.0,0,True,2015-12-01,A102MV1GZ0TH3A,B001BDUHJE,A. Zaniesienko,"Sweetest, but VERY fragile and difficult to fix! A friend gave me one of these and it promptly broke. I loved it so much that I had to replace it. I pretend it's the original. It's perched in a safe spot and I hope it never falls.",I loved it so much that I had to replace it,0,home_kitchen,"List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall)","List(sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)","List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall, sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)"


In [38]:
df_cleaned.printSchema()

In [39]:
#Convert filter_unigrams array to string for sentiment analysis
df_cleaned = df_cleaned.withColumn("reviewString", F.concat_ws(" ", "filtered_unigrams"))

In [40]:
display(df_cleaned)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,filtered_unigrams,filtered_ngrams,reviewFinal,reviewString
8590184176,1.0,0,True,2014-02-07,A0380485C177Q6QQNJIX,B003N5ZYOG,Franklin Tineo,bad very bad the conect drums is broken,bad very bad the conect drums is broken,0,video_games,"List(bad, bad, conect, drum, break)","List(bad_bad, bad_conect, conect_drum, drum_break)","List(bad, bad, conect, drum, break, bad_bad, bad_conect, conect_drum, drum_break)",bad bad conect drum break
1594188,2.0,0,True,2017-02-18,A0457350OAOAFTV3LPDA,B001PKU1YO,Lauren,"This dresser has been a disaster almost since we put it together. Due to our college-kid budget, we purchased this product hoping for the best (our old dresser had seen its last days). The customer service has helped manage the poor quality of the drawers, but each time they send (free of charge, and from wonderful representatives) new drawer bottoms, they're the wrong way thickness and don't fit. If you purchase this, know you'll have a lot of extra work down the road. There's a lot of rigging you'll have to do yourself. The exterior is very nice. The only reason this isn't a 1-star is because of the excellent customer service we have received over our 3-4 phone calls in.","Not worth the money, but the customer service was excellent",0,home_kitchen,"List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call)","List(dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)","List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call, dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)",dresser disaster due collegekid budget purchase product hope good old dresser see last day customer service help manage poor quality drawer time send free charge wonderful representative new drawer bottom theyre wrong way thickness fit purchase know lot extra work road lot rig exterior nice reason star excellent customer service receive phone call
1961125,5.0,0,True,2016-03-07,A0741133YX9L0KH921L1,B003G2ZWDO,Lenora B .,"Beautiful ! I love it .. Got the curtains that match also here on Amazon , fit perfect !","Great quality , very happy",0,home_kitchen,"List(beautiful, love, get, curtain, match, amazon, fit, perfect)","List(beautiful_love, get_curtain, amazon_fit, fit_perfect)","List(beautiful, love, get, curtain, match, amazon, fit, perfect, beautiful_love, get_curtain, amazon_fit, fit_perfect)",beautiful love get curtain match amazon fit perfect
766017,5.0,0,False,1999-12-26,A100YHBWL4TR4D,0060739428,Dana Huff,"This book is one of my favorite books of all time. Karen Cushman perfectly captures life for a young teenage girl in the Middle Ages. From mundane, everyday problems like fleas to major concerns like familial expectations and marriage, Cushman covers it all! This is one of the few books that is so accurate that readers can really get a feel for the times as they truly were - not the way they are romanticized. I say kudos to Cushman, and I hope she keeps them coming!",One of the best books ever written for young adults,0,books,"List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come)","List(favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)","List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come, favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)",book favorite book time karen cushman capture life young teenage girl middle age mundane everyday problem flea major concern familial expectation marriage cushman cover book accurate reader get feel time way romanticize say kudo cushman hope keep come
307286,5.0,0,True,2015-12-31,A1011IS5D1SR35,B0002L1056,Chris Bloom,"Not sure how practical it could be, but its for decoration in our house for my wife. Seems great so far.",Seems great so far,0,home_kitchen,"List(sure, practical, decoration, house, wife, seem, great)","List(sure_practical, decoration_house, house_wife, seem_great)","List(sure, practical, decoration, house, wife, seem, great, sure_practical, decoration_house, house_wife, seem_great)",sure practical decoration house wife seem great
431996,5.0,0,True,2018-05-07,A101RVZV3RBT8O,B000BRU78C,James Rizer,"heavy duty, great thermos.",great thermos.,0,home_kitchen,"List(heavy, duty, great, thermos)","List(heavy_duty, duty_great, great_thermos)","List(heavy, duty, great, thermos, heavy_duty, duty_great, great_thermos)",heavy duty great thermos
57008,5.0,0,True,2013-10-21,A101SYFLM1I1HR,B00004W4UQ,Rustie C. Dimitriadis,"This product is simple, but does the job perfectly. No need to try to cover our plates with plastic wrap, which is unhealthy and a waste of money, and can be messy and time-consuming. Just stick this cover over your plate, bowl, or microwaveable container and you're good to go. It stores right in the microwave when not in use and is a cinch to clean. Either give it a rinse right after use when the food will easily rinse right off or put it in the dishwasher. I like the hole for release of steam because it makes the lid easy to grab with one hand. This lid makes the food heat better and keeps your microwave very clean. Fits over any dish. I have used it with small plates, where it will just rest on the turntable of my microwave or on larger dinner plates, where the lid rests on the rim of the plate. I also use on bowls of various heights. It will either rest right on the turntable or on the bowl/dish itself. It seems to work equally well with all different sizes of dishes. I have purchased these for gifts as well and would recommend to anyone.",Just what I was looking for!,0,home_kitchen,"List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone)","List(product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)","List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone, product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)",product simple job need try plate plastic wrap unhealthy waste money messy stick cover plate bowl microwaveable container good go store right microwave use cinch clean give rinse right use food rinse right dishwasher hole release steam make lid easy grab hand lid make food heat keep microwave clean fit dish use small plate rest turntable microwave large dinner plate lid rest rim plate use bowl various height rest right turntable bowldish seem work different size dish purchase gift recommend anyone
78628,4.0,0,True,2015-01-10,A102MU6ZC9H1N6,B000EGELP0,Teresa Halbert,My daughter still pulls this out from time to time just to see if she can still keep up with the puzzles. In great shape and purchase and price was good.,Brain Age: Train Your Brain in Minutes a Day,0,video_games,"List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good)","List(pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)","List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good, pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)",daughter pull time time see keep puzzle great shape purchase price good
186322,4.0,0,True,2013-03-26,A102MU6ZC9H1N6,B001TOQ8JS,Teresa Halbert,"My son is Beatles crazy and when he saw this game, he was so excited to get it ordered and play it with his friends. The game was in good shape and the price was great.",Xbox 360 The Beatles: rock Band,0,video_games,"List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great)","List(son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)","List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great, son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)",son beatles crazy see game get order play friend game good shape price great
1394884,5.0,0,True,2015-12-01,A102MV1GZ0TH3A,B001BDUHJE,A. Zaniesienko,"Sweetest, but VERY fragile and difficult to fix! A friend gave me one of these and it promptly broke. I loved it so much that I had to replace it. I pretend it's the original. It's perched in a safe spot and I hope it never falls.",I loved it so much that I had to replace it,0,home_kitchen,"List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall)","List(sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)","List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall, sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)",sweet fragile difficult fix friend give break love much replace pretend original perch safe spot hope fall


In [41]:
from pyspark.sql.types import FloatType

from textblob import TextBlob

def sentiment_analysis(text):
  return TextBlob(text).sentiment.polarity

sentiment_analysis_udf = udf(sentiment_analysis, FloatType())

In [42]:
df_cleaned = df_cleaned.withColumn("sentiment_score", sentiment_analysis_udf(df_cleaned["reviewString"]))
df_cleaned.show(5, True)

In [43]:
# Review Length
df_cleaned = df_cleaned.withColumn("reviewLength", F.length("reviewString"))

In [44]:
# Review Word Count
df_cleaned = df_cleaned.withColumn("reviewWordcount", F.size(F.split(F.col("reviewString"), " ")))

In [45]:
display(df_cleaned)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,filtered_unigrams,filtered_ngrams,reviewFinal,reviewString,sentiment_score,reviewLength,reviewWordcount
8590184176,1.0,0,True,2014-02-07,A0380485C177Q6QQNJIX,B003N5ZYOG,Franklin Tineo,bad very bad the conect drums is broken,bad very bad the conect drums is broken,0,video_games,"List(bad, bad, conect, drum, break)","List(bad_bad, bad_conect, conect_drum, drum_break)","List(bad, bad, conect, drum, break, bad_bad, bad_conect, conect_drum, drum_break)",bad bad conect drum break,-0.7,25,5
1594188,2.0,0,True,2017-02-18,A0457350OAOAFTV3LPDA,B001PKU1YO,Lauren,"This dresser has been a disaster almost since we put it together. Due to our college-kid budget, we purchased this product hoping for the best (our old dresser had seen its last days). The customer service has helped manage the poor quality of the drawers, but each time they send (free of charge, and from wonderful representatives) new drawer bottoms, they're the wrong way thickness and don't fit. If you purchase this, know you'll have a lot of extra work down the road. There's a lot of rigging you'll have to do yourself. The exterior is very nice. The only reason this isn't a 1-star is because of the excellent customer service we have received over our 3-4 phone calls in.","Not worth the money, but the customer service was excellent",0,home_kitchen,"List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call)","List(dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)","List(dresser, disaster, due, collegekid, budget, purchase, product, hope, good, old, dresser, see, last, day, customer, service, help, manage, poor, quality, drawer, time, send, free, charge, wonderful, representative, new, drawer, bottom, theyre, wrong, way, thickness, fit, purchase, know, lot, extra, work, road, lot, rig, exterior, nice, reason, star, excellent, customer, service, receive, phone, call, dresser_disaster, due_collegekid, collegekid_budget, budget_purchase, purchase_product, hope_good, good_old, old_dresser, see_last, last_day, day_customer, customer_service, manage_poor, poor_quality, quality_drawer, drawer_time, send_free, free_charge, charge_wonderful, wonderful_representative, representative_new, new_drawer, drawer_bottom, bottom_theyre, theyre_wrong, wrong_way, way_thickness, fit_purchase, know_lot, lot_extra, extra_work, work_road, lot_rig, rig_exterior, exterior_nice, nice_reason, reason_star, star_excellent, excellent_customer, customer_service, receive_phone, phone_call)",dresser disaster due collegekid budget purchase product hope good old dresser see last day customer service help manage poor quality drawer time send free charge wonderful representative new drawer bottom theyre wrong way thickness fit purchase know lot extra work road lot rig exterior nice reason star excellent customer service receive phone call,0.25472027,349,53
1961125,5.0,0,True,2016-03-07,A0741133YX9L0KH921L1,B003G2ZWDO,Lenora B .,"Beautiful ! I love it .. Got the curtains that match also here on Amazon , fit perfect !","Great quality , very happy",0,home_kitchen,"List(beautiful, love, get, curtain, match, amazon, fit, perfect)","List(beautiful_love, get_curtain, amazon_fit, fit_perfect)","List(beautiful, love, get, curtain, match, amazon, fit, perfect, beautiful_love, get_curtain, amazon_fit, fit_perfect)",beautiful love get curtain match amazon fit perfect,0.6875,51,8
766017,5.0,0,False,1999-12-26,A100YHBWL4TR4D,0060739428,Dana Huff,"This book is one of my favorite books of all time. Karen Cushman perfectly captures life for a young teenage girl in the Middle Ages. From mundane, everyday problems like fleas to major concerns like familial expectations and marriage, Cushman covers it all! This is one of the few books that is so accurate that readers can really get a feel for the times as they truly were - not the way they are romanticized. I say kudos to Cushman, and I hope she keeps them coming!",One of the best books ever written for young adults,0,books,"List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come)","List(favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)","List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come, favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)",book favorite book time karen cushman capture life young teenage girl middle age mundane everyday problem flea major concern familial expectation marriage cushman cover book accurate reader get feel time way romanticize say kudo cushman hope keep come,0.086979166,251,38
307286,5.0,0,True,2015-12-31,A1011IS5D1SR35,B0002L1056,Chris Bloom,"Not sure how practical it could be, but its for decoration in our house for my wife. Seems great so far.",Seems great so far,0,home_kitchen,"List(sure, practical, decoration, house, wife, seem, great)","List(sure_practical, decoration_house, house_wife, seem_great)","List(sure, practical, decoration, house, wife, seem, great, sure_practical, decoration_house, house_wife, seem_great)",sure practical decoration house wife seem great,0.65,47,7
431996,5.0,0,True,2018-05-07,A101RVZV3RBT8O,B000BRU78C,James Rizer,"heavy duty, great thermos.",great thermos.,0,home_kitchen,"List(heavy, duty, great, thermos)","List(heavy_duty, duty_great, great_thermos)","List(heavy, duty, great, thermos, heavy_duty, duty_great, great_thermos)",heavy duty great thermos,0.3,24,4
57008,5.0,0,True,2013-10-21,A101SYFLM1I1HR,B00004W4UQ,Rustie C. Dimitriadis,"This product is simple, but does the job perfectly. No need to try to cover our plates with plastic wrap, which is unhealthy and a waste of money, and can be messy and time-consuming. Just stick this cover over your plate, bowl, or microwaveable container and you're good to go. It stores right in the microwave when not in use and is a cinch to clean. Either give it a rinse right after use when the food will easily rinse right off or put it in the dishwasher. I like the hole for release of steam because it makes the lid easy to grab with one hand. This lid makes the food heat better and keeps your microwave very clean. Fits over any dish. I have used it with small plates, where it will just rest on the turntable of my microwave or on larger dinner plates, where the lid rests on the rim of the plate. I also use on bowls of various heights. It will either rest right on the turntable or on the bowl/dish itself. It seems to work equally well with all different sizes of dishes. I have purchased these for gifts as well and would recommend to anyone.",Just what I was looking for!,0,home_kitchen,"List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone)","List(product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)","List(product, simple, job, need, try, plate, plastic, wrap, unhealthy, waste, money, messy, stick, cover, plate, bowl, microwaveable, container, good, go, store, right, microwave, use, cinch, clean, give, rinse, right, use, food, rinse, right, dishwasher, hole, release, steam, make, lid, easy, grab, hand, lid, make, food, heat, keep, microwave, clean, fit, dish, use, small, plate, rest, turntable, microwave, large, dinner, plate, lid, rest, rim, plate, use, bowl, various, height, rest, right, turntable, bowldish, seem, work, different, size, dish, purchase, gift, recommend, anyone, product_simple, simple_job, plate_plastic, plastic_wrap, wrap_unhealthy, unhealthy_waste, waste_money, money_messy, stick_cover, cover_plate, plate_bowl, bowl_microwaveable, microwaveable_container, container_good, good_go, go_store, store_right, right_microwave, microwave_use, use_cinch, give_rinse, rinse_right, right_use, use_food, rinse_right, hole_release, release_steam, steam_make, make_lid, lid_easy, easy_grab, hand_lid, lid_make, make_food, food_heat, keep_microwave, fit_dish, dish_use, use_small, small_plate, plate_rest, rest_turntable, turntable_microwave, microwave_large, large_dinner, dinner_plate, plate_lid, lid_rest, rest_rim, rim_plate, use_bowl, bowl_various, various_height, rest_right, right_turntable, turntable_bowldish, seem_work, different_size, size_dish, dish_purchase, purchase_gift, recommend_anyone)",product simple job need try plate plastic wrap unhealthy waste money messy stick cover plate bowl microwaveable container good go store right microwave use cinch clean give rinse right use food rinse right dishwasher hole release steam make lid easy grab hand lid make food heat keep microwave clean fit dish use small plate rest turntable microwave large dinner plate lid rest rim plate use bowl various height rest right turntable bowldish seem work different size dish purchase gift recommend anyone,0.15140057,502,81
78628,4.0,0,True,2015-01-10,A102MU6ZC9H1N6,B000EGELP0,Teresa Halbert,My daughter still pulls this out from time to time just to see if she can still keep up with the puzzles. In great shape and purchase and price was good.,Brain Age: Train Your Brain in Minutes a Day,0,video_games,"List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good)","List(pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)","List(daughter, pull, time, time, see, keep, puzzle, great, shape, purchase, price, good, pull_time, time_time, keep_puzzle, puzzle_great, great_shape, shape_purchase, purchase_price, price_good)",daughter pull time time see keep puzzle great shape purchase price good,0.75,71,12
186322,4.0,0,True,2013-03-26,A102MU6ZC9H1N6,B001TOQ8JS,Teresa Halbert,"My son is Beatles crazy and when he saw this game, he was so excited to get it ordered and play it with his friends. The game was in good shape and the price was great.",Xbox 360 The Beatles: rock Band,0,video_games,"List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great)","List(son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)","List(son, beatles, crazy, see, game, get, order, play, friend, game, good, shape, price, great, son_beatles, beatles_crazy, see_game, get_order, play_friend, friend_game, game_good, good_shape, shape_price, price_great)",son beatles crazy see game get order play friend game good shape price great,0.02,76,14
1394884,5.0,0,True,2015-12-01,A102MV1GZ0TH3A,B001BDUHJE,A. Zaniesienko,"Sweetest, but VERY fragile and difficult to fix! A friend gave me one of these and it promptly broke. I loved it so much that I had to replace it. I pretend it's the original. It's perched in a safe spot and I hope it never falls.",I loved it so much that I had to replace it,0,home_kitchen,"List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall)","List(sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)","List(sweet, fragile, difficult, fix, friend, give, break, love, much, replace, pretend, original, perch, safe, spot, hope, fall, sweet_fragile, fragile_difficult, difficult_fix, fix_friend, break_love, love_much, pretend_original, original_perch, perch_safe, safe_spot)",sweet fragile difficult fix friend give break love much replace pretend original perch safe spot hope fall,0.20357142,106,17


In [46]:
df_cleaned.columns

In [47]:
#Convert df_cleaned dataframe to sql table
df_cleaned.createOrReplaceTempView("df_cleaned_table")

In [48]:
%sql
create table mma2021w_islington.df_cleaned as
select reviewID,
 overall,
 vote,
 verified,
 reviewTime,
 reviewerID,
 asin,
 label,
 category,
 reviewString,
 sentiment_score,
 reviewLength,
 reviewWordcount from df_cleaned_table

In [49]:
# Create a cleaned dataframe for books with label = 1
df_books_cleaned_1 = df_cleaned.filter((F.col("category") == "books") & (F.col("label") == 1))

In [50]:
display(df_books_cleaned_1).show(5)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,filtered_unigrams,filtered_ngrams,reviewFinal
299104,2.0,4,True,2014-07-27,A10EAPY41X3OS8,0007444117,Book Lover D,"I was very disappointed with this book, and not for the reasons that many people were disappointed. I don't need a happily ever after and honestly wasn't upset by the loss of that specific character. However I also don't think the loss was some important aspect of the story. It didn't add anything to it at all. Actually I feel like it took something away from the first two books. I felt like the author didn't stick with the characters that she created. Tris was fiercely independent and Four was a strong, dependable, and protective character. The feeling that I got from these characters through Divergent and Insurgent did not hold true to the characters that I read in this book. Four became a sniveling weakling and Tris became too wishy-washy. Neither character maintained the personalities that they had to start the series. I also feel like the romance that started in Divergent didn't continue though to this book. It offered no real depth to their romance. Yeah, they kissed, a lot, but that didn't really contribute to how the two characters made me feel in Divergent. I also had a huge problem with the duel perspective. Now first I must say that I usually like books that give be both perspectives. But this was not done well in this book. I had to continually go back to the beginning of the chapter to figure out who was supposed to be speaking. There was no real discernable difference between Tris and Four's voices. Another problem that I had with this book was it really didn't answer any of the questions and it didn't really wrap up any loose ends like it should have. Great, we found out what was on the other side of the fence and it was messed up and an interesting way to look at where the factions came from. But that is all we really got. We got that Tris finally admitted her love for Four, but it was so anticlimactic that it didn't make me happy. The revelation that Four all of the sudden wasn't what we thought he was, was also an affront to those of us that loved Divergent from the start and it messed up the idea that Divergent was based upon. I could go on and on with what I though about every aspect of this book, but it's not worth it. Overall, this book was an affront to Divergent. I loved that book, fell in love with it instantly. I wasn't as happy with Insurgent, but I felt like it added the right amount of turmoil to the story and it was a good next step in the series. But Allegiant was a huge disappointment that has left me unwilling to read any of the stories from Four's POV and to enjoy rereading or listening to Divergent. It was a bad ending for what could have been an amazing series.",Huge Disappointment,1,books,"List(book, reason, many, people, need, loss, specific, character, think, loss, important, aspect, story, add, anything, feel, take, something, first, book, feel, author, stick, character, create, tris, independent, strong, dependable, protective, character, feel, get, character, divergent, insurgent, hold, true, character, read, book, become, tris, become, wishywashy, character, maintain, personality, start, series, feel, romance, start, divergent, continue, book, offer, real, depth, romance, yeah, kiss, lot, contribute, character, make, feel, divergent, huge, problem, duel, perspective, first, say, book, give, perspective, do, book, go, begin, chapter, figure, suppose, speak, real, discernable, difference, tris, fours, voice, problem, book, question, wrap, loose, end, great, find, side, fence, mess, interest, way, look, faction, come, get, get, tris, admit, love, anticlimactic, make, happy, revelation, sudden, think, affront, love, divergent, start, mess, idea, divergent, base, go, aspect, book, worth, overall, book, affront, divergent, love, book, fall, love, happy, insurgent, feel, add, right, amount, turmoil, story, good, next, step, allegiant, huge, disappointment, leave, read, story, fours, pov, enjoy, reread, listen, divergent, bad, end, amazing, series)","List(book_reason, reason_many, many_people, loss_specific, specific_character, think_loss, loss_important, important_aspect, aspect_story, add_anything, take_something, book_feel, author_stick, stick_character, create_tris, strong_dependable, dependable_protective, protective_character, character_feel, get_character, character_divergent, divergent_insurgent, insurgent_hold, hold_true, true_character, character_read, read_book, become_wishywashy, character_maintain, maintain_personality, personality_start, start_series, romance_start, start_divergent, book_offer, offer_real, real_depth, depth_romance, romance_yeah, yeah_kiss, kiss_lot, feel_divergent, huge_problem, problem_duel, duel_perspective, perspective_first, give_perspective, begin_chapter, chapter_figure, speak_real, real_discernable, discernable_difference, difference_tris, tris_fours, fours_voice, problem_book, wrap_loose, loose_end, find_side, side_fence, fence_mess, mess_interest, interest_way, way_look, look_faction, get_tris, admit_love, anticlimactic_make, make_happy, happy_revelation, love_divergent, start_mess, mess_idea, idea_divergent, divergent_base, aspect_book, book_worth, worth_overall, overall_book, book_affront, affront_divergent, divergent_love, love_book, book_fall, fall_love, happy_insurgent, add_right, right_amount, amount_turmoil, turmoil_story, story_good, good_next, next_step, allegiant_huge, huge_disappointment, disappointment_leave, read_story, story_fours, fours_pov, pov_enjoy, enjoy_reread, listen_divergent, divergent_bad, bad_end, amazing_series)","List(book, reason, many, people, need, loss, specific, character, think, loss, important, aspect, story, add, anything, feel, take, something, first, book, feel, author, stick, character, create, tris, independent, strong, dependable, protective, character, feel, get, character, divergent, insurgent, hold, true, character, read, book, become, tris, become, wishywashy, character, maintain, personality, start, series, feel, romance, start, divergent, continue, book, offer, real, depth, romance, yeah, kiss, lot, contribute, character, make, feel, divergent, huge, problem, duel, perspective, first, say, book, give, perspective, do, book, go, begin, chapter, figure, suppose, speak, real, discernable, difference, tris, fours, voice, problem, book, question, wrap, loose, end, great, find, side, fence, mess, interest, way, look, faction, come, get, get, tris, admit, love, anticlimactic, make, happy, revelation, sudden, think, affront, love, divergent, start, mess, idea, divergent, base, go, aspect, book, worth, overall, book, affront, divergent, love, book, fall, love, happy, insurgent, feel, add, right, amount, turmoil, story, good, next, step, allegiant, huge, disappointment, leave, read, story, fours, pov, enjoy, reread, listen, divergent, bad, end, amazing, series, book_reason, reason_many, many_people, loss_specific, specific_character, think_loss, loss_important, important_aspect, aspect_story, add_anything, take_something, book_feel, author_stick, stick_character, create_tris, strong_dependable, dependable_protective, protective_character, character_feel, get_character, character_divergent, divergent_insurgent, insurgent_hold, hold_true, true_character, character_read, read_book, become_wishywashy, character_maintain, maintain_personality, personality_start, start_series, romance_start, start_divergent, book_offer, offer_real, real_depth, depth_romance, romance_yeah, yeah_kiss, kiss_lot, feel_divergent, huge_problem, problem_duel, duel_perspective, perspective_first, give_perspective, begin_chapter, chapter_figure, speak_real, real_discernable, discernable_difference, difference_tris, tris_fours, fours_voice, problem_book, wrap_loose, loose_end, find_side, side_fence, fence_mess, mess_interest, interest_way, way_look, look_faction, get_tris, admit_love, anticlimactic_make, make_happy, happy_revelation, love_divergent, start_mess, mess_idea, idea_divergent, divergent_base, aspect_book, book_worth, worth_overall, overall_book, book_affront, affront_divergent, divergent_love, love_book, book_fall, fall_love, happy_insurgent, add_right, right_amount, amount_turmoil, turmoil_story, story_good, good_next, next_step, allegiant_huge, huge_disappointment, disappointment_leave, read_story, story_fours, fours_pov, pov_enjoy, enjoy_reread, listen_divergent, divergent_bad, bad_end, amazing_series)"
64380,5.0,2,True,2017-03-08,A10O4LYO967IZ,0006140823,Linda R. Wagoner,"Been watching the series on public tv and wanted to read the books. Husband was in hospital, so it was a good time to get reading again. Has paralleled the TV version pretty well I think. So have read it and gotten the next one to keep on my Kindle for the next read.",Enjoy the saga!,1,books,"List(watch, public, tv, want, book, husband, hospital, good, time, get, read, paralleled, tv, version, think, read, get, next, keep, kindle, next, read)","List(public_tv, book_husband, husband_hospital, hospital_good, good_time, read_paralleled, paralleled_tv, tv_version, think_read, get_next, keep_kindle, kindle_next, next_read)","List(watch, public, tv, want, book, husband, hospital, good, time, get, read, paralleled, tv, version, think, read, get, next, keep, kindle, next, read, public_tv, book_husband, husband_hospital, hospital_good, good_time, read_paralleled, paralleled_tv, tv_version, think_read, get_next, keep_kindle, kindle_next, next_read)"
108809,2.0,3,True,2016-12-29,A113UHN3ZINEEK,0007127111,vina,"I kept waiting for the story to pick up and go somewhere, but it was just one fantasy after another, but nothing was really visually described for me. I finally quit reading about two-thirds of the way through.",No Imagination,1,books,"List(keep, wait, story, pick, go, fantasy, nothing, describe, twothirds, way)","List(wait_story, story_pick, pick_go, twothirds_way)","List(keep, wait, story, pick, go, fantasy, nothing, describe, twothirds, way, wait_story, story_pick, pick_go, twothirds_way)"
951728,1.0,14,False,2009-07-17,A1160ZX08NCGRP,0061438456,C. Schaefer,"This book was so poorly written, so cliche, so juvenile that I thought it was either a Young Adult book or was written by a ghost writer. I was so disappointed - and I knew by the second page that this was not a classic, wonderful DBF book. Thank goodness I got it from the library!",Was this supposed to be a Young Adult book??,1,books,"List(book, write, cliche, juvenile, think, young, adult, book, write, ghost, writer, know, second, page, classic, wonderful, dbf, book, thank, goodness, get, library)","List(write_cliche, cliche_juvenile, young_adult, adult_book, write_ghost, ghost_writer, know_second, second_page, page_classic, classic_wonderful, wonderful_dbf, dbf_book, thank_goodness, get_library)","List(book, write, cliche, juvenile, think, young, adult, book, write, ghost, writer, know, second, page, classic, wonderful, dbf, book, thank, goodness, get, library, write_cliche, cliche_juvenile, young_adult, adult_book, write_ghost, ghost_writer, know_second, second_page, page_classic, classic_wonderful, wonderful_dbf, dbf_book, thank_goodness, get_library)"
256695,5.0,3,True,2013-04-23,A118TDZ19KPC7K,0007350899,Richard U Barker,Got this for my grand son for his ninth grade English class. He said it is great. Teacher approved of its use.,required reading,1,books,"List(get, grand, son, ninth, grade, class, say, great, teacher, approve, use)","List(get_grand, grand_son, son_ninth, ninth_grade, say_great, great_teacher, teacher_approve, approve_use)","List(get, grand, son, ninth, grade, class, say, great, teacher, approve, use, get_grand, grand_son, son_ninth, ninth_grade, say_great, great_teacher, teacher_approve, approve_use)"
584054,4.0,3,False,2012-03-30,A11D9DYKYNJPIS,0060173343,Tammany Hall,"William C. Davis is one of the more prolific historians writing today. He seems to come out with at least one book a year. This one is a good overview of the Texas Revolution, with specific focus on William Travis, Jim Bowie, and Davy Crockett. Arguably the most famous participants, along with Sam Houston, they are subject to much mythmaking and legend. To this reader, the value of this book comes from the author and publishers national marketability. State history, even of the type this well known, tends to be a local affair, and it is easy for passions to get aroused in the telling of the Alamo story in Texas. Davis does not seem to have a dog in this fight, so his overview is more general and on the whole, more informative. Instead of endless speculation about how Davy died, or whether Travis drew a line in the sand, we get the full picture of what brought these men to Texas, and what happened to them when they got there. It would be an excellent starting point for someone interested in the Alamo or any aspect of Texas history.",Three Roads to the Alamo,1,books,"List(william, davis, prolific, historian, write, today, seem, book, year, good, overview, revolution, specific, focus, william, travis, jim, bowie, crockett, famous, participant, sam, houston, subject, much, mythmaking, legend, reader, value, book, come, author, national, marketability, state, history, type, know, tend, local, affair, easy, passion, get, arouse, tell, alamo, story, davis, seem, dog, fight, overview, general, whole, informative, endless, speculation, die, travis, draw, line, sand, get, full, picture, bring, man, happen, get, excellent, start, point, someone, interest, alamo, aspect, history)","List(prolific_historian, historian_write, write_today, book_year, good_overview, revolution_specific, specific_focus, focus_william, william_travis, travis_jim, jim_bowie, famous_participant, sam_houston, houston_subject, subject_much, much_mythmaking, mythmaking_legend, legend_reader, reader_value, value_book, come_author, national_marketability, marketability_state, state_history, tend_local, local_affair, affair_easy, easy_passion, get_arouse, tell_alamo, alamo_story, seem_dog, dog_fight, fight_overview, overview_general, general_whole, whole_informative, endless_speculation, travis_draw, draw_line, line_sand, get_full, full_picture, bring_man, start_point, point_someone, someone_interest, interest_alamo, alamo_aspect)","List(william, davis, prolific, historian, write, today, seem, book, year, good, overview, revolution, specific, focus, william, travis, jim, bowie, crockett, famous, participant, sam, houston, subject, much, mythmaking, legend, reader, value, book, come, author, national, marketability, state, history, type, know, tend, local, affair, easy, passion, get, arouse, tell, alamo, story, davis, seem, dog, fight, overview, general, whole, informative, endless, speculation, die, travis, draw, line, sand, get, full, picture, bring, man, happen, get, excellent, start, point, someone, interest, alamo, aspect, history, prolific_historian, historian_write, write_today, book_year, good_overview, revolution_specific, specific_focus, focus_william, william_travis, travis_jim, jim_bowie, famous_participant, sam_houston, houston_subject, subject_much, much_mythmaking, mythmaking_legend, legend_reader, reader_value, value_book, come_author, national_marketability, marketability_state, state_history, tend_local, local_affair, affair_easy, easy_passion, get_arouse, tell_alamo, alamo_story, seem_dog, dog_fight, fight_overview, overview_general, general_whole, whole_informative, endless_speculation, travis_draw, draw_line, line_sand, get_full, full_picture, bring_man, start_point, point_someone, someone_interest, interest_alamo, alamo_aspect)"
427785,2.0,7,True,2017-05-12,A11JN7349CSZNL,0008242763,Ketsybo,"After reading about one-third of this book and realizing I just didn't care about any of the characters, I quit reading it. Very disappointing.",Not engaging,1,books,"List(read, onethird, book, realize, care, character, quit, read, disappointing)","List(read_onethird, onethird_book, book_realize, realize_care, care_character, character_quit, quit_read, read_disappointing)","List(read, onethird, book, realize, care, character, quit, read, disappointing, read_onethird, onethird_book, book_realize, realize_care, care_character, character_quit, quit_read, read_disappointing)"
465840,5.0,2,True,2008-03-25,A11X9HWN09P7MC,002782683X,ThreePeas,"Big Pumpkin is so Big in our house, it's read all the time...Oh and our son loves it too! Once my son (4 years old) discovered and fell in love with Big Pumpkin it didn't matter if it was Halloween or not. The story is nothing short of hilarious, the illustrations could not be better suited and truly compliment this book where so many others fall short. My husband and I get such a kick out of reading this book as well, we practically know it by heart and have been known to recite it in jest around the house. One of my favorite things about this book is that it introduces the so-called ""scary"" characters of Halloween, the Ghost, the Vampire, the Mummy, the Bat, and the Witch in the most loveable manner with such simple and innocent humor that it completely removes any possible thought of ""scary"" for our little ones. Big Pumpkin can make you befriend a Witch, want to help out a Vampire, and just love your ""Mummy""! Don't wait until Halloween, buy it now, it's too cute to pass up.",The Best Halloween book for preschoolers out there!,1,books,"List(big, pumpkin, big, house, read, timeoh, son, love, son, year, old, discover, fall, love, big, pumpkin, matter, halloween, story, nothing, short, hilarious, illustration, suit, compliment, book, many, other, fall, short, husband, get, kick, read, book, know, heart, know, recite, jest, house, favorite, thing, book, introduce, socalled, scary, character, halloween, ghost, vampire, mummy, bat, witch, loveable, manner, simple, innocent, humor, remove, possible, think, scary, big, pumpkin, make, befriend, witch, want, help, vampire, love, mummy, wait, halloween, buy, cute, pass)","List(big_pumpkin, pumpkin_big, big_house, house_read, read_timeoh, timeoh_son, son_love, love_son, son_year, year_old, old_discover, discover_fall, fall_love, love_big, big_pumpkin, pumpkin_matter, matter_halloween, halloween_story, story_nothing, nothing_short, short_hilarious, hilarious_illustration, compliment_book, book_many, many_other, other_fall, fall_short, short_husband, get_kick, kick_read, read_book, know_heart, know_recite, recite_jest, favorite_thing, thing_book, introduce_socalled, socalled_scary, scary_character, character_halloween, halloween_ghost, ghost_vampire, vampire_mummy, mummy_bat, bat_witch, witch_loveable, loveable_manner, manner_simple, simple_innocent, innocent_humor, remove_possible, think_scary, big_pumpkin, pumpkin_make, make_befriend, befriend_witch, help_vampire, vampire_love, love_mummy, mummy_wait, wait_halloween, halloween_buy, buy_cute, cute_pass)","List(big, pumpkin, big, house, read, timeoh, son, love, son, year, old, discover, fall, love, big, pumpkin, matter, halloween, story, nothing, short, hilarious, illustration, suit, compliment, book, many, other, fall, short, husband, get, kick, read, book, know, heart, know, recite, jest, house, favorite, thing, book, introduce, socalled, scary, character, halloween, ghost, vampire, mummy, bat, witch, loveable, manner, simple, innocent, humor, remove, possible, think, scary, big, pumpkin, make, befriend, witch, want, help, vampire, love, mummy, wait, halloween, buy, cute, pass, big_pumpkin, pumpkin_big, big_house, house_read, read_timeoh, timeoh_son, son_love, love_son, son_year, year_old, old_discover, discover_fall, fall_love, love_big, big_pumpkin, pumpkin_matter, matter_halloween, halloween_story, story_nothing, nothing_short, short_hilarious, hilarious_illustration, compliment_book, book_many, many_other, other_fall, fall_short, short_husband, get_kick, kick_read, read_book, know_heart, know_recite, recite_jest, favorite_thing, thing_book, introduce_socalled, socalled_scary, scary_character, character_halloween, halloween_ghost, ghost_vampire, vampire_mummy, mummy_bat, bat_witch, witch_loveable, loveable_manner, manner_simple, simple_innocent, innocent_humor, remove_possible, think_scary, big_pumpkin, pumpkin_make, make_befriend, befriend_witch, help_vampire, vampire_love, love_mummy, mummy_wait, wait_halloween, halloween_buy, buy_cute, cute_pass)"
616368,5.0,4,False,2007-06-26,A120QFSXP48RJO,006027056X,cookie's mom,"Rather brief story, but an excellent introduction for the reader in grades 1-3 who is interested in archaeology. Good coverage of major points for this age range--what archaeology is, types of objects likely to be found, that the work continues into the lab, etc. I just wish there was something this good to introduce the same ideas for upper elementary students--I thought about using this story with an older group because the concepts were introduced so well, but decided they would find the story too juvenile.",Good coverage of major points for this age,1,books,"List(brief, story, excellent, introduction, reader, grade, interest, archaeology, good, coverage, major, point, age, rangewhat, archaeology, type, object, likely, find, work, continue, lab, wish, something, good, introduce, idea, upper, elementary, studentsi, think, use, story, old, group, concept, introduce, decide, find, story, juvenile)","List(brief_story, story_excellent, excellent_introduction, introduction_reader, reader_grade, grade_interest, interest_archaeology, archaeology_good, good_coverage, coverage_major, major_point, point_age, age_rangewhat, rangewhat_archaeology, archaeology_type, object_likely, find_work, continue_lab, wish_something, something_good, good_introduce, introduce_idea, idea_upper, upper_elementary, elementary_studentsi, think_use, use_story, story_old, old_group, group_concept, concept_introduce, find_story, story_juvenile)","List(brief, story, excellent, introduction, reader, grade, interest, archaeology, good, coverage, major, point, age, rangewhat, archaeology, type, object, likely, find, work, continue, lab, wish, something, good, introduce, idea, upper, elementary, studentsi, think, use, story, old, group, concept, introduce, decide, find, story, juvenile, brief_story, story_excellent, excellent_introduction, introduction_reader, reader_grade, grade_interest, interest_archaeology, archaeology_good, good_coverage, coverage_major, major_point, point_age, age_rangewhat, rangewhat_archaeology, archaeology_type, object_likely, find_work, continue_lab, wish_something, something_good, good_introduce, introduce_idea, idea_upper, upper_elementary, elementary_studentsi, think_use, use_story, story_old, old_group, group_concept, concept_introduce, find_story, story_juvenile)"
493938,2.0,7,True,2013-01-21,A126BXBBLR7WKK,0030590434,donald bear,I know it's supposed to be a classic and I tried; I really tried. After about 200 pages I had to stop. It's like Mailer sat at the typewriter and just typed whatever popped into his head. On and on and on and............. Not my cup of tea. Sorry Norman.,Couldn't make it through,1,books,"List(know, suppose, classic, try, try, page, stop, mailer, sit, typewriter, type, pop, head, cup, tea, sorry, norman)","List(suppose_classic, classic_try, try_page, page_stop, sit_typewriter, typewriter_type, pop_head, head_cup, cup_tea, tea_sorry, sorry_norman)","List(know, suppose, classic, try, try, page, stop, mailer, sit, typewriter, type, pop, head, cup, tea, sorry, norman, suppose_classic, classic_try, try_page, page_stop, sit_typewriter, typewriter_type, pop_head, head_cup, cup_tea, tea_sorry, sorry_norman)"


In [51]:
counts_books_1 = df_books_cleaned_1.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [52]:
display(counts_books_1)

col,count
book,563082
read,244129
story,159626
make,128296
time,121399
good,119552
character,116191
get,115650
write,114744
life,104442


In [53]:
counts_books_2grams_1 = df_books_cleaned_1.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [54]:
display(counts_books_2grams_1).show(10)

col,count
read_book,30122
book_read,11345
main_character,9996
good_book,8483
recommend_book,8373
ive_read,8030
year_old,7680
love_book,6920
write_book,6722
enjoy_book,5989


In [55]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_books_cleaned_1)
tf_result = tf_model.transform(df_books_cleaned_1)

In [56]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [57]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [58]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [59]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [60]:
# Create a cleaned dataframe for books with label = 1
df_books_cleaned_0 = df_cleaned.filter((F.col("category") == "books") & (F.col("label") == 0))

In [61]:
display(df_books_cleaned_0).show(5)

reviewID,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,label,category,filtered_unigrams,filtered_ngrams,reviewFinal,reviewString,sentiment_score,reviewLength,reviewWordcount
766017,5.0,0,False,1999-12-26,A100YHBWL4TR4D,0060739428,Dana Huff,"This book is one of my favorite books of all time. Karen Cushman perfectly captures life for a young teenage girl in the Middle Ages. From mundane, everyday problems like fleas to major concerns like familial expectations and marriage, Cushman covers it all! This is one of the few books that is so accurate that readers can really get a feel for the times as they truly were - not the way they are romanticized. I say kudos to Cushman, and I hope she keeps them coming!",One of the best books ever written for young adults,0,books,"List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come)","List(favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)","List(book, favorite, book, time, karen, cushman, capture, life, young, teenage, girl, middle, age, mundane, everyday, problem, flea, major, concern, familial, expectation, marriage, cushman, cover, book, accurate, reader, get, feel, time, way, romanticize, say, kudo, cushman, hope, keep, come, favorite_book, book_time, time_karen, karen_cushman, capture_life, life_young, young_teenage, teenage_girl, girl_middle, middle_age, age_mundane, mundane_everyday, everyday_problem, flea_major, major_concern, familial_expectation, expectation_marriage, marriage_cushman, cushman_cover, book_accurate, accurate_reader, get_feel, feel_time, say_kudo, kudo_cushman, cushman_hope, keep_come)",book favorite book time karen cushman capture life young teenage girl middle age mundane everyday problem flea major concern familial expectation marriage cushman cover book accurate reader get feel time way romanticize say kudo cushman hope keep come,0.086979166,251,38
426048,5.0,0,True,2017-04-24,A102ZA8XDTTROP,0008238073,Rosebud,This one kept me guessing right up til the end. A real page turner with well-developed characters! Highly recommended for anyone who hates to put a book down.,Thriller,0,books,"List(keep, guess, right, til, end, real, page, character, recommend, anyone, hate, book)","List(guess_right, right_til, til_end, end_real, real_page, recommend_anyone, anyone_hate)","List(keep, guess, right, til, end, real, page, character, recommend, anyone, hate, book, guess_right, right_til, til_end, end_real, real_page, recommend_anyone, anyone_hate)",keep guess right til end real page character recommend anyone hate book,-0.104761906,71,12
418310,4.0,0,False,2017-08-07,A104U3MN7LBKQ4,0008179972,Hannah @ A Reading Red Sox,I loved the setting of this book. The tower was a beyond cool idea. But the characters...good lord. I couldn't have cared less about them. There was a bunch of hype surrounding this book and it ultimately just fell flat for me.,I loved the setting of this book,0,books,"List(love, book, tower, cool, idea, charactersgood, lord, care, bunch, hype, surround, book, fall, flat)","List(book_tower, cool_idea, idea_charactersgood, charactersgood_lord, lord_care, bunch_hype, surround_book, fall_flat)","List(love, book, tower, cool, idea, charactersgood, lord, care, bunch, hype, surround, book, fall, flat, book_tower, cool_idea, idea_charactersgood, charactersgood_lord, lord_care, bunch_hype, surround_book, fall_flat)",love book tower cool idea charactersgood lord care bunch hype surround book fall flat,0.275,85,14
925500,5.0,0,False,2015-02-06,A105KC8JA0O0GI,0061238783,HEROSTREET,It's the best book I have never read. ARE YOU NOT ENTERTAINED?,I WILL BUY IT & THERE WILL BE BLOOD!,0,books,"List(good, book, read, entertain)","List(good_book, read_entertain)","List(good, book, read, entertain, good_book, read_entertain)",good book read entertain,0.7,24,4
679714,4.0,0,True,2012-03-02,A105LJFSPJ6NQB,0060529709,SassyDenise,"My second Safran-Foer novel. Started with Incredibly Loud and moved on to Everything Is Illuminated based on a recommendation from an acquaintance. I can say I haven't been disappointed. Jonathan Safran-Foer creates beautifully, detailed and intricate characters. I am simply amazed at his writing. I highly recommend this and other books by him.",Illuminating me,0,books,"List(second, safranfoer, novel, start, loud, move, everything, illuminate, base, recommendation, acquaintance, say, jonathan, safranfoer, create, detail, intricate, character, write, recommend, book)","List(second_safranfoer, safranfoer_novel, novel_start, loud_move, move_everything, everything_illuminate, illuminate_base, base_recommendation, recommendation_acquaintance, jonathan_safranfoer, detail_intricate, intricate_character, recommend_book)","List(second, safranfoer, novel, start, loud, move, everything, illuminate, base, recommendation, acquaintance, say, jonathan, safranfoer, create, detail, intricate, character, write, recommend, book, second_safranfoer, safranfoer_novel, novel_start, loud_move, move_everything, everything_illuminate, illuminate_base, base_recommendation, recommendation_acquaintance, jonathan_safranfoer, detail_intricate, intricate_character, recommend_book)",second safranfoer novel start loud move everything illuminate base recommendation acquaintance say jonathan safranfoer create detail intricate character write recommend book,-0.23333333,173,21
560095,5.0,0,True,2013-10-08,A1072FC8PJMATW,0060175982,Chantal Hinton,I was pleased with the service and quality of the book. It is a great book for anyone who gets divorced or loses a loved one.,Great Book-Great Service,0,books,"List(please, service, quality, book, great, book, anyone, get, divorce, love)","List(please_service, service_quality, quality_book, book_great, great_book, book_anyone, get_divorce)","List(please, service, quality, book, great, book, anyone, get, divorce, love, please_service, service_quality, quality_book, book_great, great_book, book_anyone, get_divorce)",please service quality book great book anyone get divorce love,0.65,62,10
147681,5.0,0,True,2017-06-19,A108J10JWRKT4O,0007173156,NikNik,Just the right gift for any age graduate! Mine was for a high school student - wonderful encouragement to face life's up and downs and to keep on keeping on.,Mine was for a high school student - wonderful encouragement to face life's up and downs and to ...,0,books,"List(right, gift, age, graduate, high, school, student, wonderful, encouragement, face, life, keep, keep)","List(right_gift, gift_age, age_graduate, high_school, school_student, student_wonderful, wonderful_encouragement, encouragement_face, face_life, keep_keep)","List(right, gift, age, graduate, high, school, student, wonderful, encouragement, face, life, keep, keep, right_gift, gift_age, age_graduate, high_school, school_student, student_wonderful, wonderful_encouragement, encouragement_face, face_life, keep_keep)",right gift age graduate high school student wonderful encouragement face life keep keep,0.48190477,87,13
413846,4.0,0,True,2016-02-29,A109SYB276RKQ9,0008168482,redhead,"After Anna's plot had many clues as to who the evil perpetrator was, but I was surprised when I realized how the author had led me logically down one path only to discover who had taken Anna. The characters were believable even in their strengths and weaknesses. Anna's parents probably needed lots of counseling even before Anna's disappearance....and both probably realized the errors of their ways before the conclusion. There was some pretty graphic violence near the conclusion; however, it was totally believable from the survival effort of one character and the demented mind-set of another. I look forward to reading other books by this author.",There was some pretty graphic violence near the conclusion,0,books,"List(anna, plot, many, clue, evil, perpetrator, surprise, realize, author, lead, path, discover, take, anna, character, believable, strength, weakness, anna, parent, need, lot, anna, disappearanceand, realize, error, way, conclusion, graphic, violence, conclusion, believable, survival, effort, character, look, read, book, author)","List(anna_plot, plot_many, many_clue, clue_evil, evil_perpetrator, perpetrator_surprise, surprise_realize, realize_author, author_lead, path_discover, discover_take, take_anna, anna_character, character_believable, strength_weakness, weakness_anna, anna_parent, need_lot, anna_disappearanceand, realize_error, error_way, way_conclusion, graphic_violence, believable_survival, survival_effort, read_book, book_author)","List(anna, plot, many, clue, evil, perpetrator, surprise, realize, author, lead, path, discover, take, anna, character, believable, strength, weakness, anna, parent, need, lot, anna, disappearanceand, realize, error, way, conclusion, graphic, violence, conclusion, believable, survival, effort, character, look, read, book, author, anna_plot, plot_many, many_clue, clue_evil, evil_perpetrator, perpetrator_surprise, surprise_realize, realize_author, author_lead, path_discover, discover_take, take_anna, anna_character, character_believable, strength_weakness, weakness_anna, anna_parent, need_lot, anna_disappearanceand, realize_error, error_way, way_conclusion, graphic_violence, believable_survival, survival_effort, read_book, book_author)",anna plot many clue evil perpetrator surprise realize author lead path discover take anna character believable strength weakness anna parent need lot anna disappearanceand realize error way conclusion graphic violence conclusion believable survival effort character look read book author,0.1,287,39
814332,2.0,0,False,2009-09-18,A10DCS8UQTNN7D,0060876115,Sara B.,"I won't go into how much I disliked the format of this story and how it took place simultaniously with the The Lost Duke of Wyndham, suffice it to say it make reading this book slow and repetitive and not as interesting as it could have been. I actually liked the characters of Thomas and Amelia better than Jack and Grace and felt they deserved a better, more in depth story line all their own, away from all the goings on in the first book. Ultimately it was better than some regency historicals that I've read just b/c it was a Julia Quinn novel, but I felt like I basically read the same novel twice with a bit of a different ending, and it made the ending feel very rushed also. I hate to be disappointed in a favorite author, but I definitely was this time. I have higher hopes for What Happens in London and hope that this was just an whim on the authors part and won't be repeated in future novels.",The only Julia Quinn novel to ever disappoint me!,0,books,"List(go, much, dislike, format, story, take, place, lose, duke, wyndham, suffice, say, make, read, book, slow, repetitive, interest, character, thomas, amelia, grace, feel, deserve, depth, story, line, goings, first, book, regency, historical, ive, read, bc, julia, quinn, novel, feel, read, novel, bit, different, end, make, end, feel, rush, hate, favorite, author, time, high, hope, happen, london, hope, whim, author, part, repeat, future, novel)","List(go_much, much_dislike, dislike_format, format_story, take_place, lose_duke, duke_wyndham, wyndham_suffice, say_make, make_read, read_book, book_slow, slow_repetitive, repetitive_interest, character_thomas, thomas_amelia, feel_deserve, depth_story, story_line, goings_first, first_book, regency_historical, historical_ive, ive_read, read_bc, bc_julia, julia_quinn, quinn_novel, novel_feel, read_novel, bit_different, different_end, end_make, make_end, feel_rush, favorite_author, time_high, high_hope, hope_happen, hope_whim, whim_author, author_part, part_repeat, repeat_future, future_novel)","List(go, much, dislike, format, story, take, place, lose, duke, wyndham, suffice, say, make, read, book, slow, repetitive, interest, character, thomas, amelia, grace, feel, deserve, depth, story, line, goings, first, book, regency, historical, ive, read, bc, julia, quinn, novel, feel, read, novel, bit, different, end, make, end, feel, rush, hate, favorite, author, time, high, hope, happen, london, hope, whim, author, part, repeat, future, novel, go_much, much_dislike, dislike_format, format_story, take_place, lose_duke, duke_wyndham, wyndham_suffice, say_make, make_read, read_book, book_slow, slow_repetitive, repetitive_interest, character_thomas, thomas_amelia, feel_deserve, depth_story, story_line, goings_first, first_book, regency_historical, historical_ive, ive_read, read_bc, bc_julia, julia_quinn, quinn_novel, novel_feel, read_novel, bit_different, different_end, end_make, make_end, feel_rush, favorite_author, time_high, high_hope, hope_happen, hope_whim, whim_author, author_part, part_repeat, repeat_future, future_novel)",go much dislike format story take place lose duke wyndham suffice say make read book slow repetitive interest character thomas amelia grace feel deserve depth story line goings first book regency historical ive read bc julia quinn novel feel read novel bit different end make end feel rush hate favorite author time high hope happen london hope whim author part repeat future novel,-0.024,381,63
814309,5.0,0,True,2015-10-07,A10FWN7ZQ5OHMW,0060855924,Tony Young,Great and zany fun!!,Five Stars,0,books,"List(great, zany, fun)","List(great_zany, zany_fun)","List(great, zany, fun, great_zany, zany_fun)",great zany fun,0.55,14,3


In [62]:
counts_books_0 = df_books_cleaned_0.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [63]:
display(counts_books_0)

col,count
book,858274
read,491062
story,293309
love,221869
good,218727
character,187048
great,177295
time,157269
write,151368
get,148887


In [64]:
counts_books_2grams_0 = df_books_cleaned_0.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [65]:
display(counts_books_2grams_0).show(10)

col,count
read_book,51486
love_book,31288
great_book,27470
good_book,27095
book_read,22528
enjoy_book,19468
good_read,18574
great_read,17178
recommend_book,16742
main_character,16394


In [66]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_books_cleaned_0)
tf_result = tf_model.transform(df_books_cleaned_0)

In [67]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [68]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [69]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [70]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [71]:
# Create a cleaned dataframe for home & kitchen with label = 1
df_home_kitchen_cleaned_1 = df_cleaned.filter((F.col("category") == "home_kitchen") & (F.col("label") == 1))

In [72]:
counts_home_kitchen_1 = df_home_kitchen_cleaned_1.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [73]:
display(counts_home_kitchen_1)

col,count
use,222874
make,140886
get,136183
buy,99123
good,93277
work,91757
time,88411
great,82808
look,80736
clean,66811


In [74]:
counts_home_kitchen_2grams_1 = df_home_kitchen_cleaned_1.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [75]:
display(counts_home_kitchen_2grams_1)

col,count
work_great,11351
stainless_steel,9436
easy_use,8023
make_sure,6968
ive_use,6125
look_great,5773
coffee_maker,5722
first_time,5413
cast_iron,5307
read_review,5250


In [76]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_home_kitchen_cleaned_1)
tf_result = tf_model.transform(df_home_kitchen_cleaned_1)

In [77]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [78]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [79]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [80]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [81]:
# Create a cleaned dataframe for home & kitchen with label = 1
df_home_kitchen_cleaned_0 = df_cleaned.filter((F.col("category") == "home_kitchen") & (F.col("label") == 0))

In [82]:
counts_home_kitchen_0 = df_home_kitchen_cleaned_0.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [83]:
display(counts_home_kitchen_0)

col,count
use,486661
great,391483
work,315597
good,288723
make,282272
love,274388
get,261443
buy,241295
easy,206243
look,204298


In [84]:
counts_home_kitchen_2grams_0 = df_home_kitchen_cleaned_0.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [85]:
display(counts_home_kitchen_2grams_0)

col,count
work_great,71472
easy_use,33814
good_quality,25876
look_great,25420
great_product,24868
great_price,17957
easy_clean,16802
look_nice,15268
perfect_size,14416
work_fine,14286


In [86]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_home_kitchen_cleaned_0)
tf_result = tf_model.transform(df_home_kitchen_cleaned_0)

In [87]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [88]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [89]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [90]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [91]:
# Create a cleaned dataframe for home & kitchen with label = 1
df_video_games_cleaned_1 = df_cleaned.filter((F.col("category") == "video_games") & (F.col("label") == 1))

In [92]:
counts_video_games_1 = df_video_games_cleaned_1.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [93]:
display(counts_video_games_1)

col,count
game,558511
play,169729
get,154221
good,101725
time,98473
make,92805
go,78342
use,72834
great,62990
character,62253


In [94]:
counts_video_games_2grams_1 = df_video_games_cleaned_1.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [95]:
display(counts_video_games_2grams_1)

col,count
play_game,20834
game_play,10626
video_game,8790
good_game,8743
great_game,8411
buy_game,7882
make_game,7740
get_game,6107
single_player,5930
final_fantasy,5727


In [96]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_video_games_cleaned_1)
tf_result = tf_model.transform(df_video_games_cleaned_1)

In [97]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [98]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [99]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [100]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [101]:
# Create a cleaned dataframe for home & kitchen with label = 1
df_video_games_cleaned_0 = df_cleaned.filter((F.col("category") == "video_games") & (F.col("label") == 0))

In [102]:
counts_video_games_0 = df_video_games_cleaned_0.select(F.explode("filtered_unigrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [103]:
display(counts_video_games_0)

col,count
game,601818
play,193263
get,159670
good,147828
great,127033
time,93046
fun,83890
make,82711
use,80718
love,71810


In [104]:
counts_video_games_2grams_0 = df_video_games_cleaned_0.select(F.explode("filtered_ngrams").alias("col")).groupBy("col").count().sort(F.desc("count")).collect()

In [105]:
display(counts_video_games_2grams_0)

col,count
play_game,23249
great_game,22058
good_game,18100
game_play,13982
love_game,12548
fun_game,10576
video_game,9902
work_great,9584
buy_game,8658
game_great,8259


In [106]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol = "reviewFinal", outputCol = "tf_Features")
tf_model = tfizer.fit(df_video_games_cleaned_0)
tf_result = tf_model.transform(df_video_games_cleaned_0)

In [107]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol = "tf_Features", outputCol = "tf_idf_features")
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [108]:
from pyspark.ml.clustering import LDA

num_topics = 8
max_iter = 10

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = "tf_idf_features")
lda_model = lda.fit(tfidf_result)

In [109]:
vocab = tf_model.vocabulary

def get_words(token_list):
  return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [110]:
num_top_words = 8

topics = lda_model.describeTopics(num_top_words).withColumn("topicWords", udf_to_words(F.col("termIndices")))
topics.select("topic", "topicWords").show(truncate = 90)

In [111]:
#Count of each overall rating 

#from pyspark.sql.functions import col
#df_cleaned.groupBy("overall").count().orderBy(col("overall").asc()).show()

In [112]:
#The most common product IDs

#df.groupBy("asin").count().orderBy(col("count").desc()).show(10)