In [21]:
%run ../storage/snowflake.ipynb


Loading configs.py


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from pyspark.sql.types import StructType, StructField, DoubleType, LongType, StringType, MapType, ArrayType
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import explode, create_map
from pyspark.sql.functions import size
from pyspark.sql.types import IntegerType, MapType
from pyspark.sql.types import StringType

# Fraction handed to DataFrame.sample() when the user sample is drawn; the
# sampling helpers below treat a value of 1 as "do not sample".
sample = 0.001
# Self-assignment: a runtime no-op that raises NameError when baseInputPath has
# not been defined by an earlier cell. Presumably the name is provided by the
# %run'd notebook above -- TODO confirm.
baseInputPath = baseInputPath
# Root folder for every parquet dataset this notebook writes, keyed by the
# sample fraction. NOTE(review): the trailing "/" here, combined with the
# f"{sampleOutputPath}/..." joins used by the functions below, yields "//" in
# the final paths.
sampleOutputPath = f"{baseOutputPath}/sample={sample}/"

# Same self-assignment pattern for the Spark session (presumably created by
# the %run'd notebook -- TODO confirm).
spark = spark
# Bare last expression so the notebook renders the session object.
spark

# Sampled Users and Businesses

In [5]:
def get_sampled_users_data():
    """Return the sampled user ids plus a flag telling whether sampling is on.

    When the module-level ``sample`` equals 1 no sampling is wanted and
    ``(None, False)`` is returned. Otherwise the result is ``(DataFrame, True)``
    where the DataFrame has a single ``user_id`` column.

    The sample is persisted as parquet under
    ``{sampleOutputPath}/sampled_user_id`` and reused on later calls. If that
    read fails for any reason, the sample is rebuilt from the review JSON,
    written, and read back.

    Note: the draw is a uniform random sample over the distinct ``user_id``
    values found in the review file; it is not weighted by user activity.
    """
    if sample == 1:
        return None, False

    cache_path = f"{sampleOutputPath}/sampled_user_id"
    try:
        sampled_users = spark.read.parquet(cache_path)
    except Exception:
        # Cache unreadable (typically: not created yet) -> rebuild it.
        # Only the distinct ids are needed, so no per-user count and no global
        # sort are computed: a sort placed right before a random sample has no
        # effect on which rows are kept.
        sampled_users = (
            spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
            .select("user_id")
            .distinct()
            # Fixed seed so a rebuild is as repeatable as Spark allows.
            .sample(fraction=sample, seed=42)
        )

        sampled_users.write.mode("overwrite").parquet(cache_path)
        # Read the persisted copy back so every consumer sees the same rows
        # instead of lazily re-evaluating the sampling plan.
        sampled_users = spark.read.parquet(cache_path)

    return sampled_users, True


In [6]:
def get_sampled_business_data():
    """Return the business ids reviewed by the sampled users, plus a flag.

    Returns ``(None, False)`` when the module-level ``sample`` equals 1.
    Otherwise returns ``(DataFrame, True)`` where the DataFrame holds the
    distinct ``business_id`` values of all reviews written by the sampled
    users. The result is stored as parquet under
    ``{sampleOutputPath}/sampled_business_id``; when that read fails it is
    rebuilt from the review JSON, written, and read back.
    """
    if sample == 1:
        return None, False

    cache_path = f"{sampleOutputPath}/sampled_business_id"
    try:
        business_ids = spark.read.parquet(cache_path)
    except Exception:
        # No usable cache: derive the ids from the reviews of the sampled users.
        user_ids, _ = get_sampled_users_data()
        all_reviews = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
        business_ids = (
            all_reviews
            .join(user_ids, on=["user_id"])
            .select("business_id")
            .distinct()
        )

        business_ids.write.mode("overwrite").parquet(cache_path)
        business_ids = spark.read.parquet(cache_path)

    return business_ids, True

# get_sampled_business_data()[0].show()


# Users Data

In [16]:
def process_user_data(spark):
    """Load the processed user table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/user``. When that parquet read
        fails, the table is rebuilt from the user JSON: restricted to the
        sampled users (if sampling is on), ``friends`` dropped, ``elite`` split
        into an array and ``yelping_since`` cast to timestamp; it is then
        written as a single parquet partition and read back.
    """
    target = f"{sampleOutputPath}/user"
    try:
        userDf = spark.read.parquet(target)
    except Exception:
        user_ids, sampling_on = get_sampled_users_data()
        raw_users = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
        if sampling_on:
            raw_users = raw_users.join(user_ids, on=["user_id"])

        # NOTE(review): assumes the elite years are separated by ", " --
        # confirm against the raw JSON.
        elite_as_array = split(col("elite"), ", ")
        since_as_timestamp = col("yelping_since").cast("timestamp")
        shaped_users = (
            raw_users
            .drop("friends")
            .withColumn("elite", elite_as_array)
            .withColumn("yelping_since", since_as_timestamp)
        )

        shaped_users.coalesce(1).write.mode("overwrite").parquet(target)
        userDf = spark.read.parquet(target)

    return userDf

user_df = process_user_data(spark)
# user_df.show(3)


+--------------------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+----+--------------------+----+-----+--------+------------+------+-------------------+
|             user_id|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer|cool|               elite|fans|funny|    name|review_count|useful|      yelping_since|
+--------------------+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+----+--------------------+----+-----+--------+------------+------+-------------------+
|CDjeQhhH7ZoSKBDQ5...|         3.89|             79|              0|           

# Business Data

In [8]:
def process_business_data(spark):
    """Load the processed business table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/business``. When that parquet
        read fails, the business JSON is parsed with an explicit schema
        (``attributes`` and ``hours`` as string-to-string maps), restricted to
        the sampled businesses (if sampling is on), ``categories`` is split
        into an array, and the result is written and read back.
    """
    target = f"{sampleOutputPath}/business"
    try:
        businessDf = spark.read.parquet(target)
    except Exception:
        # (column name, Spark type) pairs, in file order; every field is nullable.
        field_specs = [
            ("address", StringType()),
            ("attributes", MapType(StringType(), StringType())),
            ("business_id", StringType()),
            ("categories", StringType()),
            ("city", StringType()),
            ("hours", MapType(StringType(), StringType())),
            ("is_open", LongType()),
            ("latitude", DoubleType()),
            ("longitude", DoubleType()),
            ("name", StringType()),
            ("postal_code", StringType()),
            ("review_count", LongType()),
            ("stars", DoubleType()),
            ("state", StringType()),
        ]
        schema = StructType(
            [StructField(field_name, field_type, True) for field_name, field_type in field_specs]
        )

        business_ids, sampling_on = get_sampled_business_data()
        raw_business = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_business.json', schema)
        if sampling_on:
            raw_business = raw_business.join(business_ids, on=["business_id"])

        categories_as_array = split(col("categories"), ", ")
        shaped_business = raw_business.withColumn("categories", categories_as_array)

        shaped_business.write.mode("overwrite").parquet(target)
        businessDf = spark.read.parquet(target)

    return businessDf

business_df = process_business_data(spark)
# business_df.show()

In [10]:
def process_friends_data(spark):
    """Load the per-user friends table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/friends``. When that parquet
        read fails, it is rebuilt from the user JSON: restricted to the sampled
        users (if sampling is on) and reduced to ``user_id`` plus ``friends``
        split on ", " into an array. The schema is printed, the result written
        and then read back.
    """
    target = f"{sampleOutputPath}/friends"
    try:
        friendsDf = spark.read.parquet(target)
    except Exception:
        user_ids, sampling_on = get_sampled_users_data()
        raw_users = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
        if sampling_on:
            raw_users = raw_users.join(user_ids, on=["user_id"])

        friends_as_array = split(col("friends"), ", ").alias("friends")
        user_friends = raw_users.select("user_id", friends_as_array)

        user_friends.printSchema()
        user_friends.write.mode("overwrite").parquet(target)
        friendsDf = spark.read.parquet(target)

    return friendsDf

friends_df = process_friends_data(spark)
# friends_df.show(4)


In [11]:
def process_checkin_data(spark):
    """Load the check-in table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/checkin``. When that parquet
        read fails, it is rebuilt from the check-in JSON: restricted to the
        sampled businesses (if sampling is on), with the ``date`` string split
        on ", " and converted to an array of timestamps. The schema is printed,
        the result written and then read back.
    """
    target = f"{sampleOutputPath}/checkin"
    try:
        checkinDf = spark.read.parquet(target)
    except Exception:
        business_ids, sampling_on = get_sampled_business_data()
        raw_checkins = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_checkin.json')
        if sampling_on:
            raw_checkins = raw_checkins.join(business_ids, on=["business_id"])

        # One timestamp per ", "-separated entry of the original string.
        parsed_dates = expr("transform(split(date, ', '), d -> to_timestamp(d))")
        dates_as_timestamps = parsed_dates.cast(ArrayType(TimestampType()))
        shaped_checkins = raw_checkins.withColumn("date", dates_as_timestamps)

        shaped_checkins.printSchema()

        shaped_checkins.write.mode("overwrite").parquet(target)
        checkinDf = spark.read.parquet(target)

    return checkinDf

checkin_df = process_checkin_data(spark)
# checkin_df.show(4)


+--------------------+--------------------+
|         business_id|                date|
+--------------------+--------------------+
|--sXnWH9Xm6_NvIjy...|[2011-06-08 19:26...|
|--sgBOzb76sjOQ-Xh...|[2018-08-01 23:08...|
|-0E7laYjwZxEAQPhF...|[2012-12-17 16:18...|
|-0Ym1Wg3bXd_TDz8J...|[2018-06-09 18:52...|
+--------------------+--------------------+


In [13]:
def process_tip_data(spark):
    """Load the tip table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/tip``. When that parquet read
        fails, it is rebuilt from the tip JSON: restricted to the sampled users
        (if sampling is on), with ``date`` cast to timestamp; the result is
        written and then read back.
    """
    target = f"{sampleOutputPath}/tip"
    try:
        tipDf = spark.read.parquet(target)
    except Exception:
        user_ids, sampling_on = get_sampled_users_data()
        raw_tips = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_tip.json')
        if sampling_on:
            raw_tips = raw_tips.join(user_ids, on=["user_id"])

        date_as_timestamp = col("date").cast("timestamp")
        shaped_tips = raw_tips.withColumn("date", date_as_timestamp)

        shaped_tips.write.mode("overwrite").parquet(target)
        tipDf = spark.read.parquet(target)

    return tipDf

tip_df = process_tip_data(spark)
# tip_df.show(4, truncate=False)


# Review Data

In [14]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from pyspark.sql.types import StringType, ArrayType, MapType
from pyspark.sql.functions import col, udf

# Fetch every NLTK resource this cell relies on, not only the VADER lexicon:
# stopwords.words() needs the "stopwords" corpus and word_tokenize() needs the
# Punkt tokenizer models ("punkt" on older NLTK releases, "punkt_tab" on newer
# ones). Without them a fresh environment fails with LookupError.
# nltk.download() reports packages that are already up to date and moves on,
# so re-running the cell is cheap.
for nltk_resource in ('vader_lexicon', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Shared by the UDFs defined below.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


@udf(StringType())
def get_sentiment(text):
    """Classify a text as "positive", "negative" or "neutral" with VADER.

    Uses the ``compound`` score from ``sia.polarity_scores``: values >= 0.05
    are "positive", values <= -0.05 are "negative", anything in between is
    "neutral".

    Args:
        text: the review text, or None when the column value is null.

    Returns:
        One of the three labels, or None for a null input.
    """
    # Null column values arrive as None; polarity_scores() expects a string
    # and would otherwise fail the whole Spark task.
    if text is None:
        return None

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

@udf(ArrayType(StringType()))
def tokenize_and_get_top_words(text, sample_size=0.0001):
    """Return up to the 10 most frequent non-stopword words of a text.

    The text is tokenized with ``word_tokenize``; only purely alphabetic
    tokens are kept, lower-cased, and filtered against ``stop_words`` before
    counting.

    Args:
        text: the review text, or None when the column value is null.
        sample_size: unused; kept only so the signature stays compatible.

    Returns:
        List of words ordered by descending frequency (at most 10), or None
        for a null input.
    """
    # Null column values arrive as None; word_tokenize() expects a string.
    if text is None:
        return None

    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    counts = FreqDist(word for word in lowered if word not in stop_words)
    return [word for word, _ in counts.most_common(10)]

def process_review_data(spark):
    """Load the enriched review table, building and caching it when needed.

    Args:
        spark: the Spark session used for all reads.

    Returns:
        DataFrame read from ``{sampleOutputPath}/review``. When that parquet
        read fails, it is rebuilt from the review JSON: restricted to the
        sampled users (if sampling is on), with ``date`` cast to timestamp and
        two columns derived from ``text`` -- ``sentiment`` (via
        ``get_sentiment``) and ``frequent_words`` (via
        ``tokenize_and_get_top_words``). The schema is printed, the result
        written and then read back.
    """
    target = f"{sampleOutputPath}/review"
    try:
        reviewDf = spark.read.parquet(target)
    except Exception:
        user_ids, sampling_on = get_sampled_users_data()
        raw_reviews = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
        if sampling_on:
            raw_reviews = raw_reviews.join(user_ids, on=["user_id"])

        review_text = col("text")
        enriched_reviews = (
            raw_reviews
            .withColumn("date", col("date").cast("timestamp"))
            .withColumn("sentiment", get_sentiment(review_text))
            .withColumn("frequent_words", tokenize_and_get_top_words(review_text))
        )

        enriched_reviews.printSchema()
        enriched_reviews.write.mode("overwrite").parquet(target)
        reviewDf = spark.read.parquet(target)

    return reviewDf

review_df = process_review_data(spark)
# review_df.show(4)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hims/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


root
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- frequent_words: array (nullable = true)
 |    |-- element: string (containsNull = true)


In [15]:
# Number of rows in the processed review DataFrame.
review_df.count()

7681

In [12]:
# from nltk.sentiment import SentimentIntensityAnalyzer
# 
# import nltk
# 
# nltk.download('vader_lexicon')
# 
# def get_sentiments_df(df):
#     # Initialize the Sentiment Intensity Analyzer
#     sia = SentimentIntensityAnalyzer()
#     # Define a UDF for sentiment analysis
#     def get_sentiment(text):
#         sentiment_score = sia.polarity_scores(text)["compound"]
#         if sentiment_score >= 0.05:
#             return "positive"
#         elif sentiment_score <= -0.05:
#             return "negative"
#         else:
#             return "neutral"
#     
#     sentiment_udf = udf(get_sentiment, StringType())
#     df = review_df.select("user_id", "text", sentiment_udf(col("text")).alias("sentiment"))
#     return df

In [13]:
# # from nltk.tokenize import word_tokenize
# # from nltk.probability import FreqDist
# # from nltk.corpus import stopwords
# # from pyspark.sql.types import StringType, ArrayType, IntegerType, MapType
# # from pyspark.sql.functions import col, udf, concat_ws, collect_list
# 
# stop_words = set(stopwords.words('english'))
# 
# def get_frequent_words():
# 
#     @udf(ArrayType(StringType()))
#     # @udf(MapType(StringType(), IntegerType()))
#     def tokenize_and_get_top_words(text):
# 
#         tokens = word_tokenize(text)
#         tokens = [word.lower() for word in tokens if word.isalpha()]
#         tokens = [word for word in tokens if word not in stop_words]    
#         freq_dist = FreqDist(tokens)
#         top_words = [word  for word, k in freq_dist.most_common(10)]
#         print(top_words)
#         return top_words
# 
#     df = review_df.sample(.0001).select("user_id", "text") \
#         .withColumn("frequent_words", tokenize_and_get_top_words(col("text")))
# 
#     # .groupBy("user_id").agg(concat_ws(" ", collect_list(col("text"))).alias("texts")) \
# 
#     return df
# 
# get_frequent_words().show()

In [14]:
# review_df.sample(.0001).count()

In [7]:
# simple_data = spark.sparkContext.parallelize([[1, "Alice", 50]]).toDF()
# simple_data.count()
# simple_data.first()
# simple_data.show()