In [3]:
# Execute the Snowflake storage notebook through IPython's %run magic.
# NOTE(review): `spark` is used by the cells below but is never created in them;
# presumably it is defined by the notebook run here -- confirm.
# NOTE(review): this is an absolute, user-specific path.
%run /Users/hims/sjsu/bigdata_tech-228/project/customer_segmentation/code/storage/snowflake.ipynb


Loading configs.py


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from pyspark.sql.types import StructType, StructField, DoubleType, LongType
from pyspark.sql.types import TimestampType

import os

# Directory holding the raw Yelp JSON files (yelp_academic_dataset_*.json).
# Set YELP_INPUT_PATH to point at a different copy; the default is the
# location this notebook originally used.
baseInputPath = os.environ.get("YELP_INPUT_PATH", "/Users/hims/Downloads/yelp_dataset/")

# Directory where the processed parquet copies are written and re-read.
# Set YELP_OUTPUT_PATH to override.
baseOutputPath = os.environ.get("YELP_OUTPUT_PATH", "/tmp/test-1")

In [6]:
def process_business_data(spark):
    """Return the business DataFrame, building the parquet cache if needed.

    First tries to read ``{baseOutputPath}/business`` as parquet. If that
    read raises, the raw ``yelp_academic_dataset_business.json`` file under
    ``baseInputPath`` is parsed with an explicit schema, ``categories`` is
    split on ", " into an array, the result is coalesced to one partition
    and written as parquet (overwrite), and then read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/business``.
    """
    # Imported locally: the notebook only imports these names in a later
    # cell, so on a fresh kernel the fallback branch would otherwise fail
    # with NameError.
    from pyspark.sql.functions import col
    from pyspark.sql.types import MapType, StringType

    try:
        businessDf = spark.read.parquet(f"{baseOutputPath}/business")
    except Exception:
        # Any failure to read the cache is treated as "cache not built yet".
        schema = StructType([
            StructField("address", StringType(), True),
            StructField("attributes", MapType(StringType(), StringType()), True),
            StructField("business_id", StringType(), True),
            StructField("categories", StringType(), True),
            StructField("city", StringType(), True),
            StructField("hours", MapType(StringType(), StringType()), True),
            StructField("is_open", LongType(), True),
            StructField("latitude", DoubleType(), True),
            StructField("longitude", DoubleType(), True),
            StructField("name", StringType(), True),
            StructField("postal_code", StringType(), True),
            StructField("review_count", LongType(), True),
            StructField("stars", DoubleType(), True),
            StructField("state", StringType(), True),
        ])

        businessDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_business.json', schema) \
            .withColumn("categories", split(col("categories"), ", "))
        businessDf.coalesce(1).write.mode("overwrite").parquet(f"{baseOutputPath}/business")
        # Re-read so the returned frame is backed by the parquet files.
        businessDf = spark.read.parquet(f"{baseOutputPath}/business")
    return businessDf

business_df = process_business_data(spark)
# business_df.show()

In [7]:
def process_user_data(spark):
    """Return the user DataFrame, building the parquet cache if needed.

    First tries to read ``{baseOutputPath}/user`` as parquet. If that read
    raises, ``yelp_academic_dataset_user.json`` is loaded from
    ``baseInputPath``, the ``friends`` column is dropped, ``elite`` is split
    on ", " into an array and ``yelping_since`` is cast to timestamp. The
    schema is printed, the frame is coalesced to one partition and written
    as parquet (overwrite), and then read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/user``.
    """
    # Imported locally: `col` is only imported in a later notebook cell, so
    # the fallback branch would otherwise raise NameError on a fresh kernel.
    from pyspark.sql.functions import col

    try:
        userDf = spark.read.parquet(f"{baseOutputPath}/user")
    except Exception:
        # Any failure to read the cache is treated as "cache not built yet".
        userDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json') \
            .drop("friends") \
            .withColumn("elite", split(col("elite"), ", ")) \
            .withColumn("yelping_since", col("yelping_since").cast("timestamp"))

        userDf.printSchema()
        userDf.coalesce(1).write.mode("overwrite").parquet(f"{baseOutputPath}/user")
        # Re-read so the returned frame is backed by the parquet files.
        userDf = spark.read.parquet(f"{baseOutputPath}/user")

    return userDf

user_df = process_user_data(spark)
# user_df.show(3)


In [8]:
def process_friends_data(spark):
    """Return the user-to-friends DataFrame, building the cache if needed.

    First tries to read ``{baseOutputPath}/friends`` as parquet. If that
    read raises, ``yelp_academic_dataset_user.json`` is loaded from
    ``baseInputPath`` and reduced to ``user_id`` plus ``friends``, where
    ``friends`` is the original string split on ", " into an array. The
    schema is printed, the frame is coalesced to one partition and written
    as parquet (overwrite), and then read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/friends``.
    """
    # Imported locally: `col` is only imported in a later notebook cell, so
    # the fallback branch would otherwise raise NameError on a fresh kernel.
    from pyspark.sql.functions import col

    try:
        friendsDf = spark.read.parquet(f"{baseOutputPath}/friends")
    except Exception:
        # Any failure to read the cache is treated as "cache not built yet".
        friendsDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json') \
            .select("user_id", split(col("friends"), ", ").alias("friends"))

        friendsDf.printSchema()
        friendsDf.coalesce(1).write.mode("overwrite").parquet(f"{baseOutputPath}/friends")
        # Re-read so the returned frame is backed by the parquet files.
        friendsDf = spark.read.parquet(f"{baseOutputPath}/friends")

    return friendsDf

friends_df = process_friends_data(spark)
# friends_df.show(4)


In [9]:
def process_checkin_data(spark):
    """Return the check-in DataFrame, building the parquet cache if needed.

    First tries to read ``{baseOutputPath}/checkin`` as parquet. If that
    read raises, ``yelp_academic_dataset_checkin.json`` is loaded from
    ``baseInputPath`` and its ``date`` string is split on ", " with every
    piece converted by ``to_timestamp``, giving an array of timestamps. The
    schema is printed, the frame is coalesced to one partition and written
    as parquet (overwrite), and then read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/checkin``.
    """
    # Imported locally: `ArrayType` is only imported in a later notebook
    # cell, so the fallback branch would otherwise raise NameError on a
    # fresh kernel.
    from pyspark.sql.types import ArrayType

    try:
        checkinDf = spark.read.parquet(f"{baseOutputPath}/checkin")
    except Exception:
        # Any failure to read the cache is treated as "cache not built yet".
        checkinDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_checkin.json') \
            .withColumn("date", expr("transform(split(date, ', '), d -> to_timestamp(d))").cast(ArrayType(TimestampType())))

        checkinDf.printSchema()

        checkinDf.coalesce(1).write.mode("overwrite").parquet(f"{baseOutputPath}/checkin")
        # Re-read so the returned frame is backed by the parquet files.
        checkinDf = spark.read.parquet(f"{baseOutputPath}/checkin")

    return checkinDf

checkin_df = process_checkin_data(spark)
# checkin_df.show(4)


In [10]:
def process_tip_data(spark):
    """Return the tip DataFrame, building the parquet cache if needed.

    First tries to read ``{baseOutputPath}/tip`` as parquet. If that read
    raises, ``yelp_academic_dataset_tip.json`` is loaded from
    ``baseInputPath`` with ``date`` cast to timestamp, coalesced to one
    partition and written as parquet (overwrite), and then read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/tip``.
    """
    # Imported locally: `col` is only imported in a later notebook cell, so
    # the fallback branch would otherwise raise NameError on a fresh kernel.
    from pyspark.sql.functions import col

    try:
        tipDf = spark.read.parquet(f"{baseOutputPath}/tip")
    except Exception:
        # Any failure to read the cache is treated as "cache not built yet".
        tipDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_tip.json') \
            .withColumn("date", col("date").cast("timestamp"))

        tipDf.coalesce(1).write.mode("overwrite").parquet(f"{baseOutputPath}/tip")
        # Re-read so the returned frame is backed by the parquet files.
        tipDf = spark.read.parquet(f"{baseOutputPath}/tip")

    return tipDf

tip_df = process_tip_data(spark)
# tip_df.show(4, truncate=False)


In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from pyspark.sql.types import StringType, ArrayType, MapType
from pyspark.sql.functions import col, udf

# Fetch every NLTK resource this cell relies on, not just the VADER lexicon:
#   vader_lexicon -> SentimentIntensityAnalyzer
#   stopwords     -> stopwords.words('english')
#   punkt         -> word_tokenize
#   punkt_tab     -> tokenizer data that newer NLTK releases look up instead of punkt
# nltk.download() skips packages that are already up to date.
for _nltk_resource in ("vader_lexicon", "stopwords", "punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Shared analyzer and stop-word set used by the UDFs defined below.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


@udf(StringType())
def get_sentiment(text):
    """Label a piece of text as "positive", "negative" or "neutral".

    Uses the ``compound`` score from the module-level
    ``SentimentIntensityAnalyzer`` (``sia``): scores >= 0.05 are
    "positive", scores <= -0.05 are "negative", anything in between is
    "neutral".

    Args:
        text: Text to score; may be None for a null column value.

    Returns:
        One of the three labels, or None when ``text`` is None.
    """
    # Null column values arrive as None; scoring them would raise and fail
    # the whole Spark task, so propagate the null instead.
    if text is None:
        return None

    sentiment_score = sia.polarity_scores(text)["compound"]
    if sentiment_score >= 0.05:
        return "positive"
    if sentiment_score <= -0.05:
        return "negative"
    return "neutral"

@udf(ArrayType(StringType()))
def tokenize_and_get_top_words(text):
    """Return up to ten of the most frequent non-stop-words in ``text``.

    The text is tokenized with ``word_tokenize``; only purely alphabetic
    tokens are kept, lower-cased, and filtered against the module-level
    ``stop_words`` set before counting.

    Args:
        text: Text to analyse; may be None for a null column value.

    Returns:
        A list of at most 10 words ordered by ``FreqDist.most_common``, or
        None when ``text`` is None.
    """
    # Null column values arrive as None; tokenizing them would raise and
    # fail the whole Spark task, so propagate the null instead.
    if text is None:
        return None

    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    kept = [token for token in lowered if token not in stop_words]
    return [word for word, _ in FreqDist(kept).most_common(10)]

def process_review_data(spark):
    """Return the review DataFrame, building the parquet cache if needed.

    The parquet copy under ``{baseOutputPath}/review`` is read first. If
    that read raises, the raw ``yelp_academic_dataset_review.json`` file is
    loaded from ``baseInputPath``, ``date`` is cast to timestamp, and two
    derived columns are added: ``sentiment`` (via ``get_sentiment``) and
    ``frequent_words`` (via ``tokenize_and_get_top_words``). The schema is
    printed, the frame is written as parquet (overwrite) and read back.

    Args:
        spark: SparkSession used for every read.

    Returns:
        The DataFrame read from ``{baseOutputPath}/review``.
    """
    try:
        cached = spark.read.parquet(f"{baseOutputPath}/review")
    except Exception:
        # Cache could not be read: rebuild it from the raw JSON dump.
        raw = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
        dated = raw.withColumn("date", col("date").cast("timestamp"))
        scored = dated.withColumn("sentiment",  get_sentiment(col("text")))
        enriched = scored.withColumn("frequent_words", tokenize_and_get_top_words(col("text")))

        enriched.printSchema()
        enriched.write.mode("overwrite").parquet(f"{baseOutputPath}/review")
        cached = spark.read.parquet(f"{baseOutputPath}/review")

    return cached

review_df = process_review_data(spark)
# review_df.show(4)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hims/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [12]:
# from nltk.sentiment import SentimentIntensityAnalyzer
# 
# import nltk
# 
# nltk.download('vader_lexicon')
# 
# def get_sentiments_df(df):
#     # Initialize the Sentiment Intensity Analyzer
#     sia = SentimentIntensityAnalyzer()
#     # Define a UDF for sentiment analysis
#     def get_sentiment(text):
#         sentiment_score = sia.polarity_scores(text)["compound"]
#         if sentiment_score >= 0.05:
#             return "positive"
#         elif sentiment_score <= -0.05:
#             return "negative"
#         else:
#             return "neutral"
#     
#     sentiment_udf = udf(get_sentiment, StringType())
#     df = review_df.select("user_id", "text", sentiment_udf(col("text")).alias("sentiment"))
#     return df

In [13]:
# # from nltk.tokenize import word_tokenize
# # from nltk.probability import FreqDist
# # from nltk.corpus import stopwords
# # from pyspark.sql.types import StringType, ArrayType, IntegerType, MapType
# # from pyspark.sql.functions import col, udf, concat_ws, collect_list
# 
# stop_words = set(stopwords.words('english'))
# 
# def get_frequent_words():
# 
#     @udf(ArrayType(StringType()))
#     # @udf(MapType(StringType(), IntegerType()))
#     def tokenize_and_get_top_words(text):
# 
#         tokens = word_tokenize(text)
#         tokens = [word.lower() for word in tokens if word.isalpha()]
#         tokens = [word for word in tokens if word not in stop_words]    
#         freq_dist = FreqDist(tokens)
#         top_words = [word  for word, k in freq_dist.most_common(10)]
#         print(top_words)
#         return top_words
# 
#     df = review_df.sample(.0001).select("user_id", "text") \
#         .withColumn("frequent_words", tokenize_and_get_top_words(col("text")))
# 
#     # .groupBy("user_id").agg(concat_ws(" ", collect_list(col("text"))).alias("texts")) \
# 
#     return df
# 
# get_frequent_words().show()

In [14]:
# review_df.sample(.0001).count()

/Users/hims/Library/CloudStorage/GoogleDrive-anjalihimanshuojha@gmail.com/Other computers/My MacBook Air/sjsu/bigdata_tech-228/project/customer_segmentation/code/data_prep


In [7]:
# simple_data = spark.sparkContext.parallelize([[1, "Alice", 50]]).toDF()
# simple_data.count()
# simple_data.first()
# simple_data.show()