# Data Wrangling with PySpark DataFrames 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

# Get the active Spark session (or create one) under the app name
# "Wrangling Data", then read the JSON file at `path` into a DataFrame.
spark = (
    SparkSession.builder
    .appName("Wrangling Data")
    .getOrCreate()
)
path = "data/sparkify_log_small.json"
user_log = spark.read.json(path)

# Which page did user id "" (empty string) NOT visit?

In [40]:
# ul1: the full log; ul2: only the rows whose userId is the empty string.
ul1 = user_log.alias('ul1')
ul2 = user_log.filter(user_log.userId == "").alias('ul2')

# A left anti join keeps the rows of ul1 whose page has no match in ul2,
# i.e. pages that never occur with the empty-string user id.
# The columns are referenced through the aliases so the two sides of the
# self-join condition are unambiguous.
page_rows = (
    ul1.join(ul2, col('ul1.page') == col('ul2.page'), how='left_anti')
    .select('page')
    .distinct()
    .collect()
)

# Unwrap the collected Row objects into plain page-name strings.
pages = [row['page'] for row in page_rows]

print("Pages not visited by empty string user id: {}".format(pages))

Pages not visited by empty string user id: ['Submit Downgrade', 'Downgrade', 'Logout', 'Save Settings', 'Settings', 'NextSong', 'Upgrade', 'Error', 'Submit Upgrade']


# What type of user does the empty string user id most likely refer to?


In [39]:
# Every distinct page name present in the log, as plain strings.
all_pages = [row['page'] for row in ul1.select('page').distinct().collect()]

# Whatever is missing from `pages` (the pages the empty-string user never
# visited) must be a page that user did visit.
other_user_pages = list(filter(lambda page: page not in pages, all_pages))

print("Pages visited by empty string user id: {}".format(other_user_pages))

Pages visited by empty string user id: ['Home', 'About', 'Login', 'Help']


Since the empty-string user id only visits ['Home', 'About', 'Login', 'Help'], it most likely represents visitors who have not yet registered or logged in.

# How many female users do we have in the data set?

In [38]:
# Count distinct userId values among the rows whose gender is 'F'; a user
# appears on many log rows, so the ids are de-duplicated before counting.
female_users = ul1.where(col("gender") == 'F').select("userId").distinct()
female_no = female_users.count()
print("Number of female users: {}".format(female_no))

Number of female users: 462


# How many songs were played from the most played artist?

In [37]:
# Rows per artist (rows with a null artist are dropped first), sorted so
# the most frequent artist comes first.
artist_counts = (
    ul1.where(col("artist").isNotNull())
    .groupby("artist")
    .count()
    .sort(col("count").desc())
    .collect()
)

# The first row already carries both the artist name and its row count,
# so a second filter-and-count pass over the log is unnecessary.
top_row = artist_counts[0]
top_artist = top_row['artist']
number_of_songs = top_row['count']

print("Number of songs played by top artist {}: {}".format(top_artist,
                                                           number_of_songs))

Number of songs played by top artist Coldplay: 83


# How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.



In [43]:
def function(page):
    """Return a 0/1 integer column flagging rows whose page is 'Home'.

    Args:
        page: Column holding the page name.

    Returns:
        Column that is 1 where `page` equals 'Home' and 0 otherwise
        (including where `page` is null).
    """
    # A built-in column expression replaces the former Python UDF; the
    # resulting values are the same, without a Python round trip per row.
    return when(page == 'Home', 1).otherwise(0)


# Per user, ordered by `ts` descending, the frame runs from the start of
# the partition up to the current row, so summing over it yields a running
# total.
user_window = (
    Window
    .partitionBy('userID')
    .orderBy(desc('ts'))
    .rangeBetween(Window.unboundedPreceding, 0)
)

# Keep only song plays and Home visits. `period` is the running number of
# Home visits within the window, so it changes value at each Home row and
# song rows between two consecutive Home visits share the same value.
cusum = (
    ul1.filter((ul1.page == 'NextSong') | (ul1.page == 'Home'))
    .select('userID', 'page', 'ts')
    .withColumn('homevisit', function(col('page')))
    .withColumn('period', Fsum('homevisit').over(user_window))
)

# Count the songs in each (userID, period) group, then average those
# counts over all groups. The dict-style aggregations keep the output
# column named `avg(count(period))`.
(
    cusum.filter(cusum.page == 'NextSong')
    .groupBy('userID', 'period')
    .agg({'period': 'count'})
    .agg({'count(period)': 'avg'})
    .show()
)

+------------------+
|avg(count(period))|
+------------------+
| 6.898347107438017|
+------------------+

