# Test Nested Attributes & Functions in PySpark



In [119]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.window import Window

import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

In [120]:
# Set the random seed for notebook reproducibility.
# np.random.seed has to be *called*: assigning to it (np.random.seed = 23)
# only replaces the function with an int and leaves the generator unseeded.
# Seeding also resets the generator state, so no separate set_state is needed.
rnd_seed = 23
np.random.seed(rnd_seed)

In [121]:
# Display the Spark installation directory taken from the environment
# (raises KeyError if SPARK_HOME is not set).
os.environ['SPARK_HOME']

'D:\\Work\\spark-2.3.0-bin-hadoop2.7'

In [122]:
# Create (or reuse) the Spark session for this notebook on the "local[2]" master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("test_nested_attributes")
    .getOrCreate()
)

In [123]:
# Display the session object as the cell output.
spark

In [124]:
# Keep a handle on the session's SparkContext and display it.
sc = spark.sparkContext
sc

In [125]:
# Build a SQLContext on top of the session's SparkContext and display it.
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x20b01e030f0>

In [126]:
import re

# Utility function to emulate stripMargin in Scala string.
def strip_margin(text):
    """Flatten a '|'-margined multi-line string into a single line.

    Every newline followed by optional spaces/tabs and a '|' is replaced by a
    single space, then every run of whitespace is collapsed to one space.

    Note that a margin on the very first line (one not preceded by a newline)
    is left in place, and leading/trailing whitespace is collapsed to a single
    space rather than removed.

    :param text: the string to flatten.
    :return: the flattened single-line string.
    """
    # Raw strings keep the regex escapes (\|, \s) away from Python's own
    # string-escape processing, which warns about unknown escapes.
    nomargin = re.sub(r'\n[ \t]*\|', ' ', text)
    trimmed = re.sub(r'\s+', ' ', nomargin)
    return trimmed

In [127]:
# Load the songs CSV (header row present, column types inferred) and cache it.
spotify_df = (
    spark.read
    .csv(path='data/spotify-songs.csv', inferSchema=True, header=True)
    .cache()
)

In [128]:
# Preview the loaded rows.
spotify_df.show()

+---+--------------------+----------------+------------+------------+-----------+------+----------------+---+--------+--------+----+-----------+-------+--------------+-------+------+
| id|          song_title|          artist|acousticness|danceability|duration_ms|energy|instrumentalness|key|liveness|loudness|mode|speechiness|  tempo|time_signature|valence|target|
+---+--------------------+----------------+------------+------------+-----------+------+----------------+---+--------+--------+----+-----------+-------+--------------+-------+------+
|  0|            Mask Off|          Future|      0.0102|       0.833|     204600| 0.434|          0.0219|  2|   0.165|  -8.795|   1|      0.431|150.062|             4|  0.286|     1|
|  1|             Redbone|Childish Gambino|       0.199|       0.743|     326933| 0.359|         0.00611|  1|   0.137| -10.401|   1|     0.0794|160.083|             4|  0.588|     1|
|  2|        Xanny Family|          Future|      0.0344|       0.838|     185707| 0.4

In [129]:
# Derive a frame with nested columns and cache it:
#   audience  - array built from the key, mode and target columns
#   qualities - map with the single entry 'acousticness' -> acousticness value
map_df = spotify_df.select(
    'id',
    'song_title',
    'artist',
    F.array('key', 'mode', 'target').alias('audience'),
    F.create_map(F.lit('acousticness'), 'acousticness').alias('qualities'),
).cache()

In [130]:
# Show the nested frame without truncating cell contents.
map_df.show(truncate=False)

+---+------------------------------------------+----------------+----------+-------------------------+
|id |song_title                                |artist          |audience  |qualities                |
+---+------------------------------------------+----------------+----------+-------------------------+
|0  |Mask Off                                  |Future          |[2, 1, 1] |[acousticness -> 0.0102] |
|1  |Redbone                                   |Childish Gambino|[1, 1, 1] |[acousticness -> 0.199]  |
|2  |Xanny Family                              |Future          |[2, 1, 1] |[acousticness -> 0.0344] |
|3  |Master Of None                            |Beach House     |[5, 1, 1] |[acousticness -> 0.604]  |
|4  |Parallel Lines                            |Junior Boys     |[5, 0, 1] |[acousticness -> 0.18]   |
|5  |Sneakin’                                  |Drake           |[8, 1, 1] |[acousticness -> 0.00479]|
|6  |Childs Play                               |Drake           |[1, 1, 1

In [131]:
# Inspect the array and map column types of the nested frame.
map_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- song_title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- audience: array (nullable = false)
 |    |-- element: integer (containsNull = true)
 |-- qualities: map (nullable = false)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)



In [132]:
# Persist the nested frame as JSON.
# mode='overwrite' keeps this cell re-runnable: with the default save mode the
# write fails once 'data/spotify-songs.json' already exists from an earlier run.
map_df.write.json(path='data/spotify-songs.json', mode='overwrite')

In [133]:
# Explicit schema used to read the JSON back with typed array and map columns.
# The third positional argument of StructField is the nullable flag.
# NOTE(review): the printSchema() output after the read reports every field as
# nullable = true, so these flags do not appear to be enforced by the reader.
schema3 = StructType([
    StructField('id', IntegerType(), False),
    StructField('song_title', StringType(), False),
    StructField('artist', StringType(), False),
    StructField('audience', ArrayType(IntegerType()), False),
    StructField(
        'qualities',
        MapType(StringType(), DoubleType(), valueContainsNull=False),
        True,
    ),
])

In [135]:
# Read the JSON back using the explicit schema and cache the result.
spotify_df2 = (
    spark.read
    .json(path='data/spotify-songs.json', schema=schema3)
    .cache()
)

In [136]:
# Preview the frame read back from JSON.
spotify_df2.show()

+---+--------------------+----------------+----------+--------------------+
| id|          song_title|          artist|  audience|           qualities|
+---+--------------------+----------------+----------+--------------------+
|  0|            Mask Off|          Future| [2, 1, 1]|[acousticness -> ...|
|  1|             Redbone|Childish Gambino| [1, 1, 1]|[acousticness -> ...|
|  2|        Xanny Family|          Future| [2, 1, 1]|[acousticness -> ...|
|  3|      Master Of None|     Beach House| [5, 1, 1]|[acousticness -> ...|
|  4|      Parallel Lines|     Junior Boys| [5, 0, 1]|[acousticness -> ...|
|  5|            Sneakin’|           Drake| [8, 1, 1]|[acousticness -> ...|
|  6|         Childs Play|           Drake| [1, 1, 1]|[acousticness -> ...|
|  7|     Gyöngyhajú lány|           Omega|[10, 0, 1]|[acousticness -> ...|
|  8|   I've Seen Footage|     Death Grips|[11, 0, 1]|[acousticness -> ...|
|  9|      Digital Animal|     Honey Claws| [7, 1, 1]|[acousticness -> ...|
| 10|Subways

In [137]:
# Check the schema of the frame read back from JSON.
spotify_df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- song_title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- audience: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- qualities: map (nullable = true)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)



In [138]:
# Pick array elements by index: position 0 was built from key, position 1 from mode.
spotify_df2.select(
    spotify_df2.audience.getItem(0).alias('key'),
    spotify_df2.audience.getItem(1).alias('mode'),
).show()

+---+----+
|key|mode|
+---+----+
|  2|   1|
|  1|   1|
|  2|   1|
|  5|   1|
|  5|   0|
|  8|   1|
|  1|   1|
| 10|   0|
| 11|   0|
|  7|   1|
|  5|   0|
|  0|   1|
|  0|   1|
|  9|   0|
|  6|   1|
|  1|   1|
|  4|   0|
|  1|   0|
|  1|   0|
| 10|   0|
+---+----+
only showing top 20 rows



In [139]:
# Look up a single map entry by its key.
spotify_df2.select(
    spotify_df2.qualities.getItem('acousticness').alias('acousticness')
).show()

+------------+
|acousticness|
+------------+
|      0.0102|
|       0.199|
|      0.0344|
|       0.604|
|        0.18|
|     0.00479|
|      0.0145|
|      0.0202|
|      0.0481|
|     0.00208|
|      0.0572|
|       0.253|
|       0.366|
|        0.44|
|       0.019|
|      0.0239|
|       0.233|
|       0.314|
|      0.0242|
|     7.02E-4|
+------------+
only showing top 20 rows



In [140]:
# Explode the audience array into one row per element, with its position
# (pos) and value (col).
spotify_df2.select(F.posexplode(spotify_df2.audience)).show()

+---+---+
|pos|col|
+---+---+
|  0|  2|
|  1|  1|
|  2|  1|
|  0|  1|
|  1|  1|
|  2|  1|
|  0|  2|
|  1|  1|
|  2|  1|
|  0|  5|
|  1|  1|
|  2|  1|
|  0|  5|
|  1|  0|
|  2|  1|
|  0|  8|
|  1|  1|
|  2|  1|
|  0|  1|
|  1|  1|
+---+---+
only showing top 20 rows



In [141]:
# Explode the qualities map into one row per entry; the two aliases name the
# key column and the value column respectively.
spotify_df2.select(
    F.explode(spotify_df2.qualities).alias("qualities", "value")
).show()

+------------+-------+
|   qualities|  value|
+------------+-------+
|acousticness| 0.0102|
|acousticness|  0.199|
|acousticness| 0.0344|
|acousticness|  0.604|
|acousticness|   0.18|
|acousticness|0.00479|
|acousticness| 0.0145|
|acousticness| 0.0202|
|acousticness| 0.0481|
|acousticness|0.00208|
|acousticness| 0.0572|
|acousticness|  0.253|
|acousticness|  0.366|
|acousticness|   0.44|
|acousticness|  0.019|
|acousticness| 0.0239|
|acousticness|  0.233|
|acousticness|  0.314|
|acousticness| 0.0242|
|acousticness|7.02E-4|
+------------+-------+
only showing top 20 rows



In [142]:
# Shut down the Spark session at the end of the notebook.
spark.stop()