# BIG TATA

In [10]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col , sum as sum_func , when
from IPython.display import display


In [11]:
import findspark
findspark.init()
findspark.find()

'C:\\Program Files\\Spark\\spark-3.5.1-bin-hadoop3'

In [12]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("My Spark App") \
    .getOrCreate()

In [13]:
input_df = spark.read.csv("spotify_songs.csv", header=True, inferSchema=True)


In [14]:
# 3. Display schema of DataFrame. 
input_df.printSchema()


root
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_artist: string (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- track_album_id: string (nullable = true)
 |-- track_album_name: string (nullable = true)
 |-- track_album_release_date: string (nullable = true)
 |-- playlist_name: string (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- playlist_genre: string (nullable = true)
 |-- playlist_subgenre: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: double (nullable = true)


In [15]:
# clean data

# remove unnecessary columns (track_album_id , track_album_name , track_album_release_date , playlist_name , playlist_id , track_popularity)

cleaned_df = input_df.drop(
    "track_album_id",
    "track_album_name",
    "track_album_release_date",
    "playlist_name",
    "playlist_id",
    "track_popularity",
    "track_name",
    "track_artist",
    "playlist_subgenre",
    "track_id",
)

#rename playlist_genre to genre

cleaned_df = cleaned_df.withColumnRenamed("playlist_genre", "genre")

# print schema
cleaned_df.printSchema()

# delete all records with null values


# show records count

print(cleaned_df.count())

root
 |-- genre: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- language: string (nullable = true)
 |-- lyrics: string (nullable = true)

18454


In [7]:
# remove records with null values

cleaned_df = cleaned_df.na.drop()

# remove records with lyrics = NA
cleaned_df = cleaned_df.filter(cleaned_df.lyrics != "NA")
features = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# make sure that all this features have numeric values

for feature in features:
    cleaned_df = cleaned_df.filter(
        col(feature).cast("float").isNotNull() | col(feature).cast("int").isNotNull()
    )


# show records count
print(cleaned_df.count())

15395 en
1705 es
241 de
176 pt
133 it
76 pl
72 nl
65 fr
63 tl
40 da
34 id
29 ko
24 vi
16 ro
16 sw
16 so
12 cy
10 no
10 hi
8 ja
7 et
5 tr
5 sv
5 af
4 ru
3 sq
3 hr
2 cs
2 ar
2 ca
2 el
1 hu
1 sk
1 fi
18184


In [16]:
# show genre with more than 100 songs
result = cleaned_df.groupBy("genre").count()
filtered_result = result.filter(col("count") > 100)
display(filtered_result.toPandas())


# remove all records that doesn't belong to filtered_result

cleaned_df = cleaned_df.join(filtered_result.select("genre"), on="genre", how="left")
print(cleaned_df.count())


Unnamed: 0,genre,count
0,r&b,3317
1,pop,3992
2,edm,2045
3,rap,3391
4,rock,3521
5,latin,2178


18454


In [17]:
#print first 5 rows
cleaned_df.show(5)

#export to csv
pandas_df = cleaned_df.toPandas()
pandas_df.to_csv("cleaned_songs.csv", index=False , header=True)

+-----+------------+------+---+--------+----+-------------------+--------------------+----------------+-------------------+------------------+-------+-----------+--------+--------------------+
|genre|danceability|energy|key|loudness|mode|        speechiness|        acousticness|instrumentalness|           liveness|           valence|  tempo|duration_ms|language|              lyrics|
+-----+------------+------+---+--------+----+-------------------+--------------------+----------------+-------------------+------------------+-------+-----------+--------+--------------------+
| rock|       0.682| 0.401|2.0| -10.068| 1.0|             0.0236|               0.279|          0.0117|             0.0887|0.5660000000000001| 97.091|   235440.0|      tl|Minsan pa Nang ak...|
| rock|       0.303|  0.88|9.0|  -4.739| 1.0|             0.0442|              0.0117|         0.00994|0.34700000000000003|             0.404|135.225|   373512.0|      en|The trees, are si...|
|  r&b|       0.845| 0.652|6.0|  -7