# Prepare the environment and data

In [1]:
# findspark.init() is run before pyspark is imported in the next cell; per the
# findspark docs it makes the local Spark installation importable from Python.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session named "Final Project" for this notebook.
spark = (
    SparkSession.builder
    .appName("Final Project")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and show it as the cell output.
sc = spark.sparkContext
sc

In [3]:
# Load the audio-feature table; the first CSV row holds the column names.
# No schema is supplied or inferred, so every column is read as a string.
features = spark.read.csv(
    "s3://anly502final/fma_dataset/fma_dataset.csv",
    header=True,
)

In [4]:
# Load the track metadata table; the first CSV row holds the column names.
# No schema is supplied or inferred, so every column is read as a string.
tracks = spark.read.csv(
    "s3://anly502final/fma_dataset/tracks.csv",
    header=True,
)

In [5]:
# Combine the two CSV tables: match each feature row (keyed by its "feature"
# column) to the track with the same track_id, bringing over only the
# track_id and type columns from the tracks table.
Join_table = features.join(
    tracks.select("track_id", "type"),
    on=features["feature"] == tracks["track_id"],
    how="inner",
)

In [6]:
# Convert track_id from string to integer.
from pyspark.sql.types import *

track_id_as_int = Join_table.track_id.cast(IntegerType())
Join_table = Join_table.withColumn("track_id", track_id_as_int)

In [7]:
# Sort the rows by track_id in ascending order.
from pyspark.sql.functions import col

Join_table = Join_table.sort(col("track_id"))

In [8]:
# Drop the features-side join key: after the join on feature == track_id it
# only duplicates the track_id column.
Join_table = Join_table.drop('feature')

In [9]:
# Preview the first four joined rows as a list of Row objects.
Join_table.head(4)

[Row(chroma_cens1='7.1806526184e+00', chroma_cens2='5.2303090096e+00', chroma_cens3='2.4932080507e-01', chroma_cens4='1.3476201296e+00', chroma_cens5='1.4824777842e+00', chroma_cens6='5.3137123585e-01', chroma_cens7='1.4815930128e+00', chroma_cens8='2.6914546490e+00', chroma_cens9='8.6686819792e-01', chroma_cens10='1.3412306309e+00', chroma_cens11='1.3477915525e+00', chroma_cens12='1.2376583815e+00', chroma_cens13='6.9249993563e-01', chroma_cens14='5.6934404373e-01', chroma_cens15='5.9704089165e-01', chroma_cens16='6.2586373091e-01', chroma_cens17='5.6732958555e-01', chroma_cens18='4.4394925237e-01', chroma_cens19='4.8797628284e-01', chroma_cens20='4.9732723832e-01', chroma_cens21='5.7443547249e-01', chroma_cens22='5.7924067974e-01', chroma_cens23='6.2010246515e-01', chroma_cens24='5.8694541454e-01', chroma_cens25='4.7430026531e-01', chroma_cens26='3.6981594563e-01', chroma_cens27='2.3611885309e-01', chroma_cens28='2.2806788981e-01', chroma_cens29='2.2282999754e-01', chroma_cens30='2.2

### .printSchema()

There are 520 columns in the following DataFrame.
The first 518 columns are features extracted by LibROSA, a Python package for music and audio analysis.
The last two columns are the identifier column "track_id" and the target variable "type", respectively.

In [10]:
# Print each column's name and type. Columns loaded from the CSVs are strings
# (no schema was supplied or inferred on read); track_id was cast to integer above.
Join_table.printSchema()

root
 |-- chroma_cens1: string (nullable = true)
 |-- chroma_cens2: string (nullable = true)
 |-- chroma_cens3: string (nullable = true)
 |-- chroma_cens4: string (nullable = true)
 |-- chroma_cens5: string (nullable = true)
 |-- chroma_cens6: string (nullable = true)
 |-- chroma_cens7: string (nullable = true)
 |-- chroma_cens8: string (nullable = true)
 |-- chroma_cens9: string (nullable = true)
 |-- chroma_cens10: string (nullable = true)
 |-- chroma_cens11: string (nullable = true)
 |-- chroma_cens12: string (nullable = true)
 |-- chroma_cens13: string (nullable = true)
 |-- chroma_cens14: string (nullable = true)
 |-- chroma_cens15: string (nullable = true)
 |-- chroma_cens16: string (nullable = true)
 |-- chroma_cens17: string (nullable = true)
 |-- chroma_cens18: string (nullable = true)
 |-- chroma_cens19: string (nullable = true)
 |-- chroma_cens20: string (nullable = true)
 |-- chroma_cens21: string (nullable = true)
 |-- chroma_cens22: string (nullable = true)
 |-- chroma_ce

Create a temp view and keep only the rows whose target variable `type` is one of the categories of interest:

This leaves four categories: album, live performance, single tracks, and radio program.

In [11]:
# Register the joined DataFrame as the SQL view "Join_table" so that it can be
# queried with spark.sql.
Join_table.createOrReplaceTempView("Join_table")

In [12]:
# Keep only the rows whose type is one of the four categories of interest.
type_filter_query = "select * from Join_table where type in ('Album', 'Live Performance', 'Single Tracks', 'Radio Program') "
cleaned_df = spark.sql(type_filter_query)

In [13]:
# Number of rows remaining after restricting to the four track types.
cleaned_df.count()

41309

### Exploratory data analysis

### Modelling

In [None]:
# Select your features and cast them from string to a numeric type (e.g. double, since the feature values are floating point) for modelling: