In [1]:
import findspark

In [2]:
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): this presumably relies on a local Spark installation being discoverable (e.g. via SPARK_HOME) — confirm for your environment.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the notebook's Spark session (or reuse an existing one) under the
# application name 'dag'. Every later cell goes through this `spark` handle.
spark = (
    SparkSession.builder
    .appName('dag')
    .getOrCreate()
)

In [5]:
# Display the address of the Spark web UI for this session.
# Uses the public SparkContext property instead of the private `_jsc` JVM
# handle, so the cell does not depend on PySpark internals.
# NOTE(review): recent PySpark versions return None here when the UI is
# disabled instead of raising — confirm for the installed version.
spark.sparkContext.uiWebUrl

'http://SaadTariq:4040'

In [6]:
# df = spark.read.csv("sales_info.csv",header=True)

In [7]:
# df.show()

## Group By

In [8]:
# company = df.groupby('Company')

In [9]:
# type(df.groupby('Company'))

In [10]:
# type(company)

In [11]:
# company.agg({'Sales':'avg'}).show()

In [12]:
# type(company.agg({'Sales':'avg'}))

In [13]:
# Load the event log from JSON; Spark infers the schema printed in the next cell.
# The path is relative, so the file must sit in the kernel's working directory.
df = spark.read.json(
    path='sparkify_log_small.json',
)

In [19]:
# Print the column names, types and nullability of the loaded log.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [45]:
# Report how many rows have no artist before any cleaning is applied.
print(
    "Number of Null Artists: {}".format(
        df.where('artist is NULL').count()
    )
)

Number of Null Artists: 1653


## Dropping Rows Where the Artist Is Null

In [47]:
# Keep only the rows that have an artist.
# NOTE: this rebinds `df`, so every later cell works on the filtered frame and
# the unfiltered log is only recoverable by re-running the load cell.
df = df.where('artist is not NULL')

In [48]:
# Re-check after the filter: the count of rows without an artist should now be zero.
print(
    "Number of Null Artists: {}".format(
        df.where('artist is NULL').count()
    )
)

Number of Null Artists: 0


## Exploring Gender Column

In [49]:
# Report how many of the remaining rows have no gender value.
print(
    "Number of Gender Missing: {}".format(
        df.where('gender is NULL').count()
    )
)

Number of Gender Missing: 0


## Importing Spark Functions

In [22]:
# NOTE: importing `sum` from pyspark.sql.functions rebinds Python's built-in
# `sum` for every cell executed after this one.
# Of these names, only countDistinct and count are called in the cells visible in this section.
from pyspark.sql.functions import countDistinct,count,avg,sum,stddev

## Counting Number of Rows

In [50]:
# Total number of rows left in `df` (after the null-artist filter above).
print(
    "Number of Rows: {}".format(
        df.count()
    )
)

Number of Rows: 8347


## Counting Number of Distinct Artists

In [51]:
# One-row frame holding the number of different artist values in the log.
distinct_artists = df.select(
    countDistinct('artist').alias('Unique Artists')
)
distinct_artists.show()

+--------------+
|Unique Artists|
+--------------+
|          3617|
+--------------+



## Counting Number of Distinct Songs

In [54]:
# One-row frame holding the number of different song values in the log.
distinct_song = df.select(
    countDistinct('song').alias('Unique Songs')
)
distinct_song.show()

+------------+
|Unique Songs|
+------------+
|        6125|
+------------+



## Counting Which Artist Has the Most Plays (rows per artist, shown as `Albums Count`)

In [60]:
# Number of log rows per artist.
# NOTE: count('*') counts rows, so the column labelled 'Albums Count' holds the
# number of log entries for each artist, not a number of albums.
artist_albums = (
    df.groupBy('artist')
    .agg(count('*').alias('Albums Count'))
)

In [61]:
# Show the ten artists with the highest row counts, largest first.
artist_albums.sort('Albums Count', ascending=False).show(10)

+--------------------+------------+
|              artist|Albums Count|
+--------------------+------------+
|            Coldplay|          83|
|       Kings Of Leon|          69|
|Florence + The Ma...|          52|
|            BjÃÂ¶rk|          46|
|       Dwight Yoakam|          45|
|       Justin Bieber|          43|
|      The Black Keys|          40|
|         OneRepublic|          37|
|        Jack Johnson|          36|
|                Muse|          36|
+--------------------+------------+
only showing top 10 rows



## Counting Which Song Has Been Played the Most

In [58]:
# Number of log rows per song title; the count column is labelled 'Song Played Most'.
max_song = (
    df.groupBy('song')
    .agg(count('*').alias('Song Played Most'))
)

In [59]:
# Show the ten song titles with the highest row counts, largest first.
max_song.sort('Song Played Most', ascending=False).show(10)

+--------------------+----------------+
|                song|Song Played Most|
+--------------------+----------------+
|      You're The One|              46|
|                Undo|              44|
|Horn Concerto No....|              28|
|             Secrets|              25|
|Dog Days Are Over...|              23|
|       Sehr kosmisch|              22|
|             Revelry|              22|
|              Yellow|              21|
|        Use Somebody|              19|
|              Canada|              18|
+--------------------+----------------+
only showing top 10 rows



## Checking Which Gender Has Played More Songs

In [74]:
# Number of log rows per gender value.
gender = (
    df.groupBy('gender')
    .agg(count('*').alias('Songs Played By Gender'))
)

In [75]:
# Show the per-gender row counts, largest first.
gender.sort('Songs Played By Gender', ascending=False).show()

+------+----------------------+
|gender|Songs Played By Gender|
+------+----------------------+
|     M|                  5082|
|     F|                  3265|
+------+----------------------+



In [79]:
# df.groupBy('gender').agg(count('*').alias('Songs Played By Gender')).orderBy('Songs Played By Gender',ascending=False).show()

## Checking Songs that have maximum length

In [66]:
# Register the filtered frame as a temporary view so it can be queried with spark.sql.
# The queries below refer to it as both `Table` and `table`.
# NOTE(review): this relies on Spark resolving identifiers case-insensitively (its default) — confirm the session does not enable spark.sql.caseSensitive.
df.createOrReplaceTempView('Table')

In [82]:
# Five longest entries by `length`. Rows are not de-duplicated, so the same
# song can appear more than once.
spark.sql(
    "SELECT song,length FROM Table ORDER BY length DESC LIMIT 5"
).show()

+--------------------+----------+
|                song|    length|
+--------------------+----------+
|This Dust Makes T...| 1806.8371|
|240 Years Before ...| 1400.2673|
|240 Years Before ...| 1400.2673|
|On The Mountain A...|1278.45832|
|            Two Step|1141.52444|
+--------------------+----------+



In [90]:
# Five longest distinct (song, length) pairs.
# DISTINCT applies to the whole select list, not to `song` alone, so the
# misleading parentheses around `song` are dropped; the result is unchanged.
# The view name is written as `Table`, matching how it was registered.
spark.sql(
    "SELECT DISTINCT song, length FROM Table ORDER BY length DESC LIMIT 5"
).show()

+--------------------+----------+
|                song|    length|
+--------------------+----------+
|This Dust Makes T...| 1806.8371|
|240 Years Before ...| 1400.2673|
|On The Mountain A...|1278.45832|
|            Two Step|1141.52444|
|     Blackwater Park|1139.59138|
+--------------------+----------+



## Checking Average Length of Songs

In [93]:
# Average of the `length` column over all rows of the view.
# The view name is written as `Table`, matching how it was registered, so the
# query does not depend on case-insensitive identifier resolution.
spark.sql("SELECT AVG(length) FROM Table").show()

+-----------------+
|      avg(length)|
+-----------------+
|249.6486587492506|
+-----------------+



## SPARK SQL

**Counting Number of Rows in Data Frame**

In [103]:
# Total number of rows in the view.
# The view name is written as `Table`, matching how it was registered, so the
# query does not depend on case-insensitive identifier resolution.
spark.sql('SELECT COUNT(*) AS Total_Rows FROM Table').show()

+----------+
|Total_Rows|
+----------+
|      8347|
+----------+



**Counting Number of Unique Rows in Data Frame**

In [102]:
# Number of distinct rows in the view.
# De-duplicate first and count afterwards: COUNT(DISTINCT ...) only counts rows
# whose listed expressions are all non-null, so it would silently leave out any
# row containing a NULL in some column. Counting a SELECT DISTINCT subquery
# counts every distinct row, NULLs included.
spark.sql(
    'SELECT COUNT(*) AS Distinct_Rows '
    'FROM (SELECT DISTINCT * FROM Table) AS deduplicated'
).show()

+-------------+
|Distinct_Rows|
+-------------+
|         8347|
+-------------+

