# BIG TATA

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col , sum as sum_func , when
from IPython.display import display


In [2]:
import findspark
# Initialise findspark so the local Spark installation is used by this kernel.
# NOTE(review): pyspark is already imported in the first cell, before init()
# runs — confirm whether this call should move ahead of the pyspark imports.
findspark.init()
# Last expression of the cell: echoes the Spark installation path that was found.
findspark.find()

'C:\\Program Files\\Spark\\spark-3.5.1-bin-hadoop3'

In [3]:
# Obtain the Spark session for this notebook, registered under the
# application name "My Spark App" (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("My Spark App")
    .getOrCreate()
)

In [4]:
# Load the raw songs CSV: the first row is the header and column types are inferred.
input_df = spark.read.options(header=True, inferSchema=True).csv("spotify_songs2.csv")


In [5]:
# 3. Display the schema of the raw DataFrame (column names, inferred types, nullability).
input_df.printSchema()


root
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_artist: string (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- track_album_id: string (nullable = true)
 |-- track_album_name: string (nullable = true)
 |-- track_album_release_date: string (nullable = true)
 |-- playlist_name: string (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- playlist_genre: string (nullable = true)
 |-- playlist_subgenre: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: double (nullable = true)


In [6]:
# Clean data: keep only the genre label and the audio-feature columns.

# Track, album and playlist identifier/name columns, the release date, the
# popularity score and the sub-genre are not needed downstream.
columns_to_drop = [
    "track_album_id",
    "track_album_name",
    "track_album_release_date",
    "playlist_name",
    "playlist_id",
    "track_popularity",
    "track_name",
    "track_artist",
    "playlist_subgenre",
    "track_id",
]

# Drop those columns and rename playlist_genre to genre in one chain.
cleaned_df = (
    input_df
    .drop(*columns_to_drop)
    .withColumnRenamed("playlist_genre", "genre")
)

# Print the schema of the reduced DataFrame.
cleaned_df.printSchema()

# Print the record count. No rows have been removed in this cell, so records
# containing null values are still included here.
print(cleaned_df.count())

root
 |-- genre: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: double (nullable = true)

32833


In [8]:
# Audio-feature columns that are expected to hold numeric values.
features = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# Remove records that contain a null in any column.
cleaned_df = cleaned_df.na.drop()

# Keep only records in which every feature value can be cast to float or int.
# The per-column checks are AND-ed into one predicate and applied in a single
# filter. The columns are only validated here, not converted: string-typed
# columns keep their string type.
numeric_check = None
for column_name in features:
    castable = (
        col(column_name).cast("float").isNotNull()
        | col(column_name).cast("int").isNotNull()
    )
    numeric_check = castable if numeric_check is None else numeric_check & castable

cleaned_df = cleaned_df.filter(numeric_check)

# Print the record count after cleaning.
print(cleaned_df.count())

32821


In [9]:
# A genre is kept only if it has more than this many songs.
MIN_SONGS_PER_GENRE = 100

# Count songs per genre and show the genres that exceed the threshold.
result = cleaned_df.groupBy("genre").count()
filtered_result = result.filter(col("count") > MIN_SONGS_PER_GENRE)
display(filtered_result.toPandas())


# Remove all records whose genre is not in filtered_result.
# A left-semi join keeps only the rows of cleaned_df that have a matching genre
# and adds no columns from the right side. (A plain "left" join would keep
# every row of cleaned_df and therefore filter nothing.)
cleaned_df = cleaned_df.join(
    filtered_result.select("genre"), on="genre", how="left_semi"
)
print(cleaned_df.count())


Unnamed: 0,genre,count
0,pop,5506
1,rap,5746
2,rock,4951
3,latin,5155
4,r&b,5421
5,edm,6042


32821


In [10]:
# Preview the first five rows of the cleaned data.
cleaned_df.show(n=5)

# Export to CSV: collect the Spark DataFrame into a pandas DataFrame, then
# write it with a header row and without the pandas index column.
pandas_df = cleaned_df.toPandas()
pandas_df.to_csv(path_or_buf="cleaned_songs.csv", header=True, index=False)

+-----+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+
|genre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|
+-----+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+
|  pop|       0.748| 0.916|  6|  -2.634|   1|     0.0583|       0.102|             0.0|  0.0653|  0.518|122.036|   194754.0|
|  pop|       0.726| 0.815| 11|  -4.969|   1|     0.0373|      0.0724|         0.00421|   0.357|  0.693| 99.972|   162600.0|
|  pop|       0.675| 0.931|  1|  -3.432|   0|     0.0742|      0.0794|         2.33E-5|    0.11|  0.613|124.008|   176616.0|
|  pop|       0.718|  0.93|  7|  -3.778|   1|      0.102|      0.0287|         9.43E-6|   0.204|  0.277|121.956|   169093.0|
|  pop|        0.65| 0.833|  1|  -4.672|   1|     0.0359|      0.0803|             0.0|  0.0833|  0.725|123.976|   189052.0|
