In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from pyspark.sql.types import ArrayType, MapType, StringType
from pyspark.sql.types import StructType, StructField, DoubleType, LongType
from pyspark.sql.types import TimestampType
from pyspark.sql.utils import AnalysisException

# Directory holding the raw Yelp JSON files and directory receiving the parquet
# copies written by the process_* functions below. Each can be overridden with
# an environment variable so the notebook is not tied to one machine's layout;
# the defaults are the original hard-coded locations.
baseInputPath = os.environ.get("YELP_INPUT_PATH", "/Users/hims/Downloads/yelp_dataset/")
baseOutputPath = os.environ.get("YELP_OUTPUT_PATH", "/tmp/test-1")

In [2]:
def init_spark():
    """Build (or reuse) the SparkSession used throughout this notebook.

    The session is named "Project App", receives the executor-memory,
    shuffle-partition and default-parallelism settings listed below, and has
    its log level set to ERROR before being returned.

    Returns:
        The SparkSession obtained from ``getOrCreate()``.
    """
    settings = {
        "spark.executor.memory": "8000mb",
        'spark.sql.shuffle.partitions': 4,
        'spark.default.parallelism': 4,
    }

    builder = SparkSession.builder.appName("Project App")
    # Apply the options one by one, in the order they are declared above.
    for option, value in settings.items():
        builder = builder.config(option, value)

    session = builder.getOrCreate()
    session.sparkContext.setLogLevel("ERROR")
    return session

spark = init_spark()
spark

In [3]:
def process_business_data(spark):
    """Load the Yelp business table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (which is what it raises when the
    path does not exist), the raw JSON file is parsed with an explicit schema,
    the comma-separated ``categories`` string is split into an array, the
    result is written as parquet from a single partition, and the written
    copy is read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        The business DataFrame as read from the parquet copy. The first four
        rows are also printed via ``show``.
    """
    cache_path = f"{baseOutputPath}/business"
    try:
        businessDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss. Any other error
        # (interrupts, lost JVM connection, ...) now propagates instead of
        # silently triggering a rebuild that overwrites the existing copy.
        schema = StructType([
            StructField("address", StringType(), True),
            StructField("attributes", MapType(StringType(), StringType()), True),
            StructField("business_id", StringType(), True),
            StructField("categories", StringType(), True),
            StructField("city", StringType(), True),
            StructField("hours", MapType(StringType(), StringType()), True),
            StructField("is_open", LongType(), True),
            StructField("latitude", DoubleType(), True),
            StructField("longitude", DoubleType(), True),
            StructField("name", StringType(), True),
            StructField("postal_code", StringType(), True),
            StructField("review_count", LongType(), True),
            StructField("stars", DoubleType(), True),
            StructField("state", StringType(), True),
        ])

        businessDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_business.json', schema)
            .withColumn("categories", split(col("categories"), ", "))
        )
        businessDf.coalesce(1).write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        businessDf = spark.read.parquet(cache_path)
    businessDf.show(4)
    return businessDf

business_df = process_business_data(spark)

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{ByAppointmentOnl...|Pns2l4eNsfO8kk83d...|[Doctors, Traditi...|Santa Barbara|                null|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{BusinessAcceptsC...|mpf3x-BjTdTEA3yCZ...|[Shipping Centers...|       Affton|{Monday -> 0:0-0:...|      1| 38.551126|  -90.335695|    

In [4]:
def process_user_data(spark):
    """Load the Yelp user table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (raised, for example, when the
    path does not exist), the raw user JSON is read, the ``friends`` column is
    dropped, ``elite`` is split on ", " into an array, ``yelping_since`` is
    cast to a timestamp, the schema is printed, and the result is written as
    parquet from a single partition and read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        The user DataFrame as read from the parquet copy. The first three rows
        are also printed via ``show``.
    """
    cache_path = f"{baseOutputPath}/user"
    try:
        userDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss; unrelated errors
        # propagate instead of silently rebuilding and overwriting the copy.
        userDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
            .drop("friends")
            .withColumn("elite", split(col("elite"), ", "))
            .withColumn("yelping_since", col("yelping_since").cast("timestamp"))
        )

        userDf.printSchema()
        userDf.coalesce(1).write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        userDf = spark.read.parquet(cache_path)

    userDf.show(3)
    return userDf

user_df = process_user_data(spark)


+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+-----+------+------------+------+--------------------+-------------------+
|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer| cool|               elite|fans|funny|  name|review_count|useful|             user_id|      yelping_since|
+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+-----+------+------------+------+--------------------+-------------------+
|         3.91|            467|             56|             467|           250|   

In [5]:
def process_friends_data(spark):
    """Load the user -> friends table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (raised, for example, when the
    path does not exist), the raw user JSON is read and reduced to ``user_id``
    plus ``friends`` split on ", " into an array; the schema is printed and the
    result is written as parquet from a single partition and read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        A DataFrame with ``user_id`` and ``friends`` columns, as read from the
        parquet copy. The first four rows are also printed via ``show``.
    """
    cache_path = f"{baseOutputPath}/friends"
    try:
        friendsDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss; unrelated errors
        # propagate instead of silently rebuilding and overwriting the copy.
        friendsDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
            .select("user_id", split(col("friends"), ", ").alias("friends"))
        )

        friendsDf.printSchema()
        friendsDf.coalesce(1).write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        friendsDf = spark.read.parquet(cache_path)

    friendsDf.show(4)
    return friendsDf

friends_df = process_friends_data(spark)


+--------------------+--------------------+
|             user_id|             friends|
+--------------------+--------------------+
|qVc8ODYU5SZjKXVBg...|[NSCy54eWehBJyZdG...|
|j14WgRoU_-2ZE1aw1...|[ueRPE0CX75ePGMqO...|
|2WnXYQFK0hXEoTxPt...|[LuO3Bn4f3rlhyHIa...|
|SZDeASXq7o05mMNLs...|[enx1vVPnfdNUdPho...|
+--------------------+--------------------+


In [6]:
def process_checkin_data(spark):
    """Load the Yelp check-in table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (raised, for example, when the
    path does not exist), the raw check-in JSON is read and its ``date``
    string is split on ", " with every piece converted by ``to_timestamp``,
    giving an array of timestamps; the schema is printed and the result is
    written as parquet from a single partition and read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        The check-in DataFrame as read from the parquet copy. The first four
        rows are also printed via ``show``.
    """
    cache_path = f"{baseOutputPath}/checkin"
    try:
        checkinDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss; unrelated errors
        # propagate instead of silently rebuilding and overwriting the copy.
        timestamps = expr("transform(split(date, ', '), d -> to_timestamp(d))")
        checkinDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_checkin.json')
            .withColumn("date", timestamps.cast(ArrayType(TimestampType())))
        )

        checkinDf.printSchema()

        checkinDf.coalesce(1).write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        checkinDf = spark.read.parquet(cache_path)

    checkinDf.show(4)
    return checkinDf

checkin_df = process_checkin_data(spark)

+--------------------+--------------------+
|         business_id|                date|
+--------------------+--------------------+
|---kPU91CF4Lq2-Wl...|[2020-03-13 21:10...|
|--0iUa4sNDFiZFrAd...|[2010-09-13 21:43...|
|--30_8IhuyMHbSOcN...|[2013-06-14 23:29...|
|--7PUidqRWpRSpXeb...|[2011-02-15 17:12...|
+--------------------+--------------------+


In [7]:
def process_tip_data(spark):
    """Load the Yelp tip table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (raised, for example, when the
    path does not exist), the raw tip JSON is read, ``date`` is cast to a
    timestamp, and the result is written as parquet from a single partition
    and read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        The tip DataFrame as read from the parquet copy. The first four rows
        are also printed, untruncated, via ``show``.
    """
    cache_path = f"{baseOutputPath}/tip"
    try:
        tipDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss; unrelated errors
        # propagate instead of silently rebuilding and overwriting the copy.
        tipDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_tip.json')
            .withColumn("date", col("date").cast("timestamp"))
        )

        tipDf.coalesce(1).write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        tipDf = spark.read.parquet(cache_path)

    tipDf.show(4, truncate=False)
    return tipDf

tip_df = process_tip_data(spark)

+----------------------+----------------+-------------------+---------------------------------------------------------+----------------------+
|business_id           |compliment_count|date               |text                                                     |user_id               |
+----------------------+----------------+-------------------+---------------------------------------------------------+----------------------+
|3uLgwr0qeCNMjKenHJwPGQ|0               |2012-05-18 02:17:21|Avengers time with the ladies.                           |AGNUgVwnZUey3gcPCJ76iw|
|QoezRbYQncpRqyrLH6Iqjg|0               |2013-02-05 18:35:10|They have lots of good deserts and tasty cuban sandwiches|NBN4MgHP9D3cw--SnauTkA|
|MYoRNLb5chwjQe3c_k37Gg|0               |2013-08-18 00:56:08|It's open even when you think it isn't                   |-copOvldyKh1qr-vzkDEvw|
|hV-bABTK-glh5wj31ps_Jw|0               |2017-06-27 23:05:38|Very decent fried chicken                                |FjMQVZjSqY8syIO-53KFKw|

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from pyspark.sql.types import StringType, ArrayType, MapType
from pyspark.sql.functions import col, udf

# Fetch every NLTK resource this cell relies on, not just the VADER lexicon:
# stopwords.words() needs the 'stopwords' corpus and word_tokenize() needs the
# punkt tokenizer models ('punkt' on older NLTK releases, 'punkt_tab' on newer
# ones). Downloads are skipped when the package is already up to date.
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Shared by the UDFs defined below.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


@udf(StringType())
def get_sentiment(text):
    """Label a piece of text as "positive", "negative" or "neutral".

    The label is derived from the ``compound`` value returned by
    ``sia.polarity_scores``: at or above 0.05 is positive, at or below -0.05
    is negative, anything in between is neutral.

    Args:
        text: Text to score; may be None for a null column value.

    Returns:
        One of the three labels, or None when ``text`` is None.
    """
    if text is None:
        # A null row yields a null label instead of handing None to the analyzer.
        return None

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

@udf(ArrayType(StringType()))
def tokenize_and_get_top_words(text):
    """Return up to ten of the most frequent non-stop-words in ``text``.

    The text is tokenized with ``word_tokenize``; purely alphabetic tokens are
    lower-cased, tokens found in ``stop_words`` are dropped, and the remaining
    words are ranked with ``FreqDist.most_common(10)``.

    Args:
        text: Text to analyse; may be None for a null column value.

    Returns:
        A list of at most ten words, or None when ``text`` is None.
    """
    if text is None:
        # A null row yields a null array instead of handing None to the tokenizer.
        return None

    words = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(words)
    return [word for word, _ in freq_dist.most_common(10)]

def process_review_data(spark):
    """Load the Yelp review table, creating its parquet copy on first use.

    The parquet copy under ``baseOutputPath`` is read first. If Spark rejects
    that read with an ``AnalysisException`` (raised, for example, when the
    path does not exist), the raw review JSON is read, ``date`` is cast to a
    timestamp, and two columns are derived from ``text`` with the UDFs defined
    in this cell: ``sentiment`` and ``frequent_words``. The schema is printed
    and the result is written as parquet and read back.

    Args:
        spark: SparkSession used for all reads and writes.

    Returns:
        The review DataFrame as read from the parquet copy. The first four
        rows are also printed via ``show``.
    """
    cache_path = f"{baseOutputPath}/review"
    try:
        reviewDf = spark.read.parquet(cache_path)
    except AnalysisException:
        # Only a failed parquet read counts as a cache miss. The rebuild runs
        # both Python UDFs over every review, so unrelated errors must not
        # trigger it (and overwrite the existing copy) by accident.
        reviewDf = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
            .withColumn("date", col("date").cast("timestamp"))
            .withColumn("sentiment",  get_sentiment(col("text")))
            .withColumn("frequent_words", tokenize_and_get_top_words(col("text")))
        )

        reviewDf.printSchema()
        reviewDf.write.mode("overwrite").parquet(cache_path)
        # Re-read so the returned frame is backed by the parquet copy.
        reviewDf = spark.read.parquet(cache_path)

    reviewDf.show(4)
    return reviewDf

review_df = process_review_data(spark)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hims/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- frequent_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+---------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|sentiment|      frequent_words|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+---------+--------------------+
|grpNey31cTGKrhmQQ...|   0|2020-06-28 

In [11]:
# from nltk.sentiment import SentimentIntensityAnalyzer
# 
# import nltk
# 
# nltk.download('vader_lexicon')
# 
# def get_sentiments_df(df):
#     # Initialize the Sentiment Intensity Analyzer
#     sia = SentimentIntensityAnalyzer()
#     # Define a UDF for sentiment analysis
#     def get_sentiment(text):
#         sentiment_score = sia.polarity_scores(text)["compound"]
#         if sentiment_score >= 0.05:
#             return "positive"
#         elif sentiment_score <= -0.05:
#             return "negative"
#         else:
#             return "neutral"
#     
#     sentiment_udf = udf(get_sentiment, StringType())
#     df = review_df.select("user_id", "text", sentiment_udf(col("text")).alias("sentiment"))
#     return df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hims/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [39]:
# # from nltk.tokenize import word_tokenize
# # from nltk.probability import FreqDist
# # from nltk.corpus import stopwords
# # from pyspark.sql.types import StringType, ArrayType, IntegerType, MapType
# # from pyspark.sql.functions import col, udf, concat_ws, collect_list
# 
# stop_words = set(stopwords.words('english'))
# 
# def get_frequent_words():
# 
#     @udf(ArrayType(StringType()))
#     # @udf(MapType(StringType(), IntegerType()))
#     def tokenize_and_get_top_words(text):
# 
#         tokens = word_tokenize(text)
#         tokens = [word.lower() for word in tokens if word.isalpha()]
#         tokens = [word for word in tokens if word not in stop_words]    
#         freq_dist = FreqDist(tokens)
#         top_words = [word  for word, k in freq_dist.most_common(10)]
#         print(top_words)
#         return top_words
# 
#     df = review_df.sample(.0001).select("user_id", "text") \
#         .withColumn("frequent_words", tokenize_and_get_top_words(col("text")))
# 
#     # .groupBy("user_id").agg(concat_ws(" ", collect_list(col("text"))).alias("texts")) \
# 
#     return df
# 
# get_frequent_words().show()

+--------------------+--------------------+--------------------+
|             user_id|                text|      frequent_words|
+--------------------+--------------------+--------------------+
|Hi10sGSZNxQH3NLyW...|Even if you come ...|[even, come, ming...|
|jpx_StWgnkrTwC_eI...|On the Tuesday be...|[holiday, metropo...|
|rZZKSzzMkGawlbY9G...|My favorite Itali...|[favorite, italia...|
|ZPDnkhw4FcLHyNp1V...|Cut my workout sh...|[heather, machine...|
|tMEdZaMYP0kOaCBVG...|Stopped by on my ...|[spicy, def, leve...|
|4hEAu_7w9mbMhwarY...|From the very beg...|[move, loud, woul...|
|5mY1UAGGO2p-4V1Pd...|Wolfie's is great...|[wolfie, great, w...|
|WCk1trU2NGjd_4TGB...|We've been seeing...|[seeing, orris, w...|
|2Wuzmn21bSGNuLURQ...|We don't visit of...|[visit, often, do...|
|rJjABF23pHPvXr8v8...|I love this resta...|[dinner, pork, lu...|
|1CndurKBoAOdIlkZw...|I was in last nig...|[got, well, frien...|
|Qjd7aIiKqnd68IfaR...|Disclaimer: I AM ...|[shop, neighborho...|
|7ziWZULyiZv2TesYN...|so 

In [29]:
# Count the rows in an unseeded random sample of about 0.01% of the reviews;
# the result varies from run to run.
review_df.sample(fraction=0.0001).count()

765

/Users/hims/Library/CloudStorage/GoogleDrive-anjalihimanshuojha@gmail.com/Other computers/My MacBook Air/sjsu/bigdata_tech-228/project/customer_segmentation/code/data_prep


In [7]:
# simple_data = spark.sparkContext.parallelize([[1, "Alice", 50]]).toDF()
# simple_data.count()
# simple_data.first()
# simple_data.show()