In [2]:
# Execute the data-preparation notebook inside this kernel. The cells below use names
# that are not defined in this notebook (e.g. `spark`, `process_business_data`), so they
# presumably come from this run — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve on
# another checkout. Consider a path relative to the project root.
%run /Users/hims/sjsu/bigdata_tech-228/project/customer_segmentation/code/data_prep/data_preperation.ipynb

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{ByAppointmentOnl...|Pns2l4eNsfO8kk83d...|[Doctors, Traditi...|Santa Barbara|                null|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{BusinessAcceptsC...|mpf3x-BjTdTEA3yCZ...|[Shipping Centers...|       Affton|{Monday -> 0:0-0:...|      1| 38.551126|  -90.335695|    

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hims/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+---------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|sentiment|      frequent_words|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+---------+--------------------+
|grpNey31cTGKrhmQQ...|   0|2020-06-28 19:45:32|    0|6fObpwIggOQR1oDap...|  5.0|Had a wonderful, ...|     0|lN-1uUHeV_QyFbczw...| positive|[wonderful, authe...|
|Fay6yoOC6iitEt3QL...|   0|2020-06-07 18:13:33|    0|UgtrUhfuEgUdPay75...|  4.0|Yeah it was defin...|     0|TJ8Hawan8jDIZHS7A...| positive|[definitely, chee...|
|wQq0QBaYXa1KLNw_J...|   0|2018-08-17 18:51:47|    0|MmLxg9oLQmPpcPNqI...|  4.0|One of the last s...|     1|ML10yeoSaW60TwVaI...| positive|[poker, run, one,...|
|xgJMQq0uVY4KB9Efn...|   0|2020-06

In [2]:
# Appears to map each planned per-user feature name to the source column it is derived
# from. NOTE(review): no visible cell reads this dict — confirm it is still needed.
columns = {
    "user_id": "user_id",
    "elites": "elites",
    "days_on_platform": "yelping_since",
    "categories_freq": "categories",  # Map type
    "number_of_unique_business": "business_id",
    "number_of_visits_per_business": "business_id",
    "first_seen": "review_date",
    "last_seen": "review_date",
}

In [3]:
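# `spark` is not created in this notebook; presumably the %run'd data-preparation
# notebook already provides it, which is why this call is disabled — TODO confirm.
# spark = init_spark()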

In [3]:
# Load each prepared dataset through its loader, all sharing the same `spark` handle.
business_df = process_business_data(spark)
user_df = process_user_data(spark)
friends_df = process_friends_data(spark)
checkin_df = process_checkin_data(spark)
tip_df = process_tip_data(spark)
review_df = process_review_data(spark)

# Expose the reviews to Spark SQL under the name "review".
review_df.createOrReplaceTempView("review")

# Reviews enriched with their business (via business_id) and their author (via user_id).
joined_df = (
    review_df
    .join(business_df, on=["business_id"])
    .join(user_df, on=["user_id"])
)
joined_df.createOrReplaceTempView("joined_df")



+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{ByAppointmentOnl...|Pns2l4eNsfO8kk83d...|[Doctors, Traditi...|Santa Barbara|                null|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{BusinessAcceptsC...|mpf3x-BjTdTEA3yCZ...|[Shipping Centers...|       Affton|{Monday -> 0:0-0:...|      1| 38.551126|  -90.335695|    

In [4]:
def get_customer_agg_value(spark):
    """Aggregate the ``review`` view into one row per ``user_id``.

    Output columns: ``user_id``, ``first_seen`` / ``last_seen`` (min / max of
    ``date``), ``date_diff`` (``DATEDIFF`` between the two),
    ``different_business_count`` (distinct ``business_id``), and
    ``avg_rating`` / ``min_stars`` / ``max_stars`` (over ``stars``).

    The query reads from a table or view named ``review``, so that name must
    already be registered on ``spark``.

    Args:
        spark: Object exposing ``sql(query)``; the SparkSession in this notebook.

    Returns:
        Whatever ``spark.sql`` returns for the aggregation query.
    """
    per_user_query = """
        select 
            user_id, 
            min(date) as first_seen, 
            max(date) as last_seen, 
            DATEDIFF(max(date), min(date)) as date_diff,
            count(distinct business_id) as different_business_count,
            avg(stars) as avg_rating,
            min(stars) as min_stars,
            max(stars) as max_stars
        from review
        group by user_id 
    """
    return spark.sql(per_user_query)

In [6]:
from pyspark.sql.functions import explode, col, create_map, collect_list
from pyspark.sql.functions import udf,collect_list
from pyspark.sql.types import MapType, StringType, IntegerType, MapType

def get_customer_category_counts(joined_df):
    """Build, per user, a map of category -> number of rows in that category.

    ``categories`` is exploded so every (user, category) pair can be counted;
    the per-pair counts are then folded into one map column per user.

    Side effect: prints the schema of the result before returning it.

    Args:
        joined_df: DataFrame exposing ``user_id`` and an array column
            ``categories``.

    Returns:
        DataFrame with columns ``user_id`` and ``category_map``
        (declared as ``map<string, int>``).
    """

    # See https://stackoverflow.com/questions/43723864/combine-array-of-maps-into-single-map-in-pyspark-dataframe
    @udf(MapType(StringType(), IntegerType()))
    def merge_maps_array(map_array):
        # Every element is a {category: count} map produced by create_map below;
        # values are summed so a repeated key would accumulate instead of being overwritten.
        result = {}
        for m in map_array:
            for k, v in m.items():
                result[k] = result.get(k, 0) + v
        return result

    # Parenthesised chain: no trailing line-continuation backslashes to dangle.
    df = (
        joined_df.select("user_id", explode("categories").alias("category"))
        .groupBy("user_id", "category").count()
        .withColumn("category_map", create_map(col("category"), col("count")))
        .groupBy("user_id").agg(collect_list(col("category_map")).alias("category_map"))
        .withColumn("category_map", merge_maps_array(col("category_map")))
    )

    df.printSchema()
    return df

# Preview the result: show 5 rows, with column truncation turned off (second argument False).
get_customer_category_counts(joined_df).show(5, False)


root
 |-- user_id: string (nullable = true)
 |-- category_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)
+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id               |category_map                                                                                                                                                                                                                       |
+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|---1lKK3aKOuomHnwAkAow|{Float Spa -> 1, Counseling & Mental Health -> 1, Active L

In [8]:
from pyspark.sql.functions import col, size

def get_friends_count(friends=None):
    """Return the number of friends for each user.

    Args:
        friends: Optional DataFrame with a ``user_id`` column and a
            collection column ``friends``. When omitted, the notebook-level
            ``friends_df`` is used, so the original zero-argument call keeps
            working.

    Returns:
        DataFrame with columns ``user_id`` and ``friends_count``, where
        ``friends_count`` is ``size(friends)``.
    """
    # Fall back to the notebook global only when no frame is passed in, so the
    # helper can also be applied to a sample or a test frame.
    source_df = friends_df if friends is None else friends
    return source_df.select("user_id", size(col("friends")).alias("friends_count"))

In [9]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the Sentiment Intensity Analyzer once at notebook level;
# get_sentiment() reads this shared `sia` instance on every call.
sia = SentimentIntensityAnalyzer()

# Define a UDF for sentiment analysis
def get_sentiment(text):
    """Classify ``text`` as "positive", "negative" or "neutral".

    The label is derived from the ``compound`` score returned by the
    notebook-level ``sia`` analyzer: a score >= 0.05 is positive, a score
    <= -0.05 is negative, and anything in between is neutral.

    Args:
        text: Text to score, or None (a null column value when this function
            is applied as a Spark UDF).

    Returns:
        One of "positive", "negative", "neutral", or None when ``text`` is
        None.
    """
    if text is None:
        # A missing value cannot be scored; propagate it as null rather than
        # letting one null row raise inside the analyzer.
        return None

    sentiment_score = sia.polarity_scores(text)["compound"]
    if sentiment_score >= 0.05:
        return "positive"
    if sentiment_score <= -0.05:
        return "negative"
    return "neutral"

def get_sentiments_count(reviews=None):
    """Label every review row with a sentiment.

    Despite the name, nothing is counted here: the result has one row per
    input row, with ``get_sentiment`` applied to ``text`` as a string UDF.

    Args:
        reviews: Optional DataFrame with ``user_id`` and ``text`` columns.
            When omitted, the notebook-level ``review_df`` is used, so the
            original zero-argument call keeps working.

    Returns:
        DataFrame with columns ``user_id``, ``text`` and ``sentiment``.
    """
    # Fall back to the notebook global only when no frame is passed in.
    source_df = review_df if reviews is None else reviews
    sentiment_udf = udf(get_sentiment, StringType())
    return source_df.select(
        "user_id",
        "text",
        sentiment_udf(col("text")).alias("sentiment"),
    )



In [None]:
def most_frequent_words(reviews=None):
    """Select the columns a word-frequency computation would start from.

    NOTE(review): this currently only projects ``user_id`` and ``text``; no
    word counting is implemented yet — looks like a placeholder.

    Args:
        reviews: Optional DataFrame with ``user_id`` and ``text`` columns.
            When omitted, the notebook-level ``review_df`` is used, so the
            original zero-argument call keeps working.

    Returns:
        The result of ``select("user_id", "text")`` on the chosen frame.
    """
    # Fall back to the notebook global only when no frame is passed in.
    source_df = review_df if reviews is None else reviews
    return source_df.select("user_id", "text")

In [10]:
# Per-user feature tables produced by the helper functions.
user_agg_df = get_customer_agg_value(spark)
user_category_df = get_customer_category_counts(joined_df)
friends_count = get_friends_count()

# Combine everything on user_id. No `how` is given, so these are default (inner) joins:
# a user must appear in every table to be kept.
complete_user_df = (
    user_df
    .join(user_agg_df, on=["user_id"])
    .join(user_category_df, on=["user_id"])
    .join(friends_count, on=["user_id"])
)

complete_user_df.printSchema()
complete_user_df.count()

root
 |-- user_id: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fans: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- yelping_since: timestamp (nullable = true)
 |-- first_seen: timestamp (nullable = true)
 |-- last_seen



CodeCache: size=131072Kb used=32040Kb max_used=32100Kb free=99031Kb
 bounds [0x0000000104a60000, 0x00000001069f0000, 0x000000010ca60000]
 total_blobs=12171 nmethods=11175 adapters=906
 compilation: disabled (not enough contiguous free space left)


                                                                                

1987653