In [2]:
import matplotlib.pyplot as plt
from collections import OrderedDict
import seaborn as sns

import pandas as pd

import plotly.graph_objects as go
import numpy as np

In [3]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession 
from pyspark.conf import SparkConf
from pyspark.sql.functions import *
from pyspark.mllib.stat import Statistics
from pyspark.sql.types import StringType, IntegerType, StructType, StructField

In [4]:
# Create (or reuse) the Spark session for this notebook. Besides the app name
# and master URL, it explicitly sets the driver result-size limit, the network
# timeout and the executor heartbeat interval.
# NOTE(review): the network timeout is given without a unit suffix while the
# heartbeat interval carries an explicit "s" -- confirm both are meant in seconds.
spark = (
    SparkSession.builder
    .appName("regression")
    .master("local[4]")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.network.timeout", "3600")
    .config("spark.executor.heartbeatInterval", "3000s")
    .getOrCreate()
)

# TREND ANALYSIS

In [31]:
# Load the cleaned dataset from JSON, then preview the first three records
# in vertical layout (one field per line) without truncating long values.
df = spark.read.json('../data/cleaned_dataset')
df.show(n=3, truncate=False, vertical=True)


-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------
 acousticness          | 0.658                                                                                                                              
 age                   | 41.821917808219176                                                                                                                 
 avg_artist_followers  | 5403.5                                                                                                                             
 avg_artist_popularity | 40.0                                                                                                                               
 danceability          | 0.602                                                                                                                              
 duration_ms           | 156067                           

In [34]:

# Columns that are normalised and then aggregated per year further down.
# The order of this list determines the column order of the aggregated output.
technical_columns = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

In [33]:
from pyspark.sql.window import Window

# Derive `year` by casting `age` to an integer and subtracting it from 2021.
# NOTE(review): 2021 is hard-coded and `age` loses its fractional part before
# the subtraction -- confirm this matches the year in `release_date`.
age_as_integer = df.age.cast(IntegerType())
df = df.withColumn('year', 2021 - age_as_integer)

# Attach to every row the total `popularity_track` of all rows in the same year.
# `sum` and `col` are the pyspark.sql.functions versions from the star import,
# not the Python builtins.
windowSpec = Window.partitionBy("year")
popularity_total_for_year = sum(col("popularity_track")).over(windowSpec)
df = df.withColumn("popularity_norma", popularity_total_for_year)

df.select('year', 'popularity_norma').show()



+----+----------------+
|year|popularity_norma|
+----+----------------+
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
|1959|           59595|
+----+----------------+
only showing top 20 rows



In [35]:
from functools import reduce

In [67]:


# For every technical column add a `<column>_n` companion holding that track's
# contribution to the popularity-weighted yearly mean:
#
#     value * popularity_track / popularity_norma
#
# where popularity_norma is the per-year sum of popularity_track. Summing the
# `_n` column within a year therefore gives sum(w_i * x_i) / sum(w_i).
# The previous version divided by popularity_norma without multiplying by the
# track's own popularity, so the yearly sum was sum(x_i) / sum(w_i) and not a
# weighted average.
#
# All columns are added in a single select() rather than chained withColumn()
# calls, which keeps the query plan small.
weighted_contributions = [
    (col(name) * col('popularity_track') / col('popularity_norma')).alias(name + '_n')
    for name in technical_columns
]
dfn = df.select('*', *weighted_contributions)

dfn

DataFrame[acousticness: double, age: double, avg_artist_followers: double, avg_artist_popularity: double, danceability: double, duration_ms: bigint, energy: double, genres: array<string>, id_track: string, instrumentalness: double, key: bigint, liveness: double, loudness: double, mode: bigint, popularity_track: bigint, release_date: string, speechiness: double, sum_artist_followers: bigint, sum_artist_popularity: bigint, tempo: double, time_signature: bigint, valence: double, year: int, popularity_norma: bigint, acousticness_n: double, danceability_n: double, duration_ms_n: double, energy_n: double, instrumentalness_n: double, key_n: double, liveness_n: double, loudness_n: double, mode_n: double, speechiness_n: double, tempo_n: double, time_signature_n: double, valence_n: double]

In [79]:
# Scratch helper: print, for every technical column, the min/max/avg
# aggregation expressions followed by the sum over its `_n` companion.
# Each printed line ends with ",\" so the text can be pasted into an .agg(...) call.
for column in technical_columns:
    snippet_lines = [
        f"F.{aggr}('{column}').alias('{aggr}_{column}'),\\"
        for aggr in ('min', 'max', 'avg')
    ]
    snippet_lines.append(f"F.sum('{column}_n').alias('wavg_{column}'),\\")
    print('\n'.join(snippet_lines))
    

F.min('acousticness').alias('min_acousticness'),\
F.max('acousticness').alias('max_acousticness'),\
F.avg('acousticness').alias('avg_acousticness'),\
F.sum('acousticness_n').alias('wavg_acousticness'),\
F.min('danceability').alias('min_danceability'),\
F.max('danceability').alias('max_danceability'),\
F.avg('danceability').alias('avg_danceability'),\
F.sum('danceability_n').alias('wavg_danceability'),\
F.min('duration_ms').alias('min_duration_ms'),\
F.max('duration_ms').alias('max_duration_ms'),\
F.avg('duration_ms').alias('avg_duration_ms'),\
F.sum('duration_ms_n').alias('wavg_duration_ms'),\
F.min('energy').alias('min_energy'),\
F.max('energy').alias('max_energy'),\
F.avg('energy').alias('avg_energy'),\
F.sum('energy_n').alias('wavg_energy'),\
F.min('instrumentalness').alias('min_instrumentalness'),\
F.max('instrumentalness').alias('max_instrumentalness'),\
F.avg('instrumentalness').alias('avg_instrumentalness'),\
F.sum('instrumentalness_n').alias('wavg_instrumentalness'),\
F.min('ke

In [74]:
import pyspark.sql.functions as F

In [82]:

# Per-year summary of every technical column: min / max / avg of the raw value
# plus the sum of its `<column>_n` companion, exported as `wavg_<column>`.
# The expressions are generated from technical_columns, so the aggregation
# cannot drift out of sync with the column list. Aliases and their order are
# the same as in the hand-written version.
summary_functions = (('min', F.min), ('max', F.max), ('avg', F.avg))

yearly_aggregations = []
for column in technical_columns:
    yearly_aggregations.extend(
        aggregate(column).alias(f'{label}_{column}')
        for label, aggregate in summary_functions
    )
    yearly_aggregations.append(F.sum(f'{column}_n').alias(f'wavg_{column}'))

dfg = dfn.groupBy('year').agg(*yearly_aggregations)

In [83]:
# Print the schema of the per-year aggregate to check its column names and types.
dfg.printSchema()

root
 |-- year: integer (nullable = true)
 |-- min_acousticness: double (nullable = true)
 |-- max_acousticness: double (nullable = true)
 |-- avg_acousticness: double (nullable = true)
 |-- wavg_acousticness: double (nullable = true)
 |-- min_danceability: double (nullable = true)
 |-- max_danceability: double (nullable = true)
 |-- avg_danceability: double (nullable = true)
 |-- wavg_danceability: double (nullable = true)
 |-- min_duration_ms: long (nullable = true)
 |-- max_duration_ms: long (nullable = true)
 |-- avg_duration_ms: double (nullable = true)
 |-- wavg_duration_ms: double (nullable = true)
 |-- min_energy: double (nullable = true)
 |-- max_energy: double (nullable = true)
 |-- avg_energy: double (nullable = true)
 |-- wavg_energy: double (nullable = true)
 |-- min_instrumentalness: double (nullable = true)
 |-- max_instrumentalness: double (nullable = true)
 |-- avg_instrumentalness: double (nullable = true)
 |-- wavg_instrumentalness: double (nullable = true)
 |-- min_

In [85]:
# Row count of the aggregate; dfg is grouped by `year`, so this is the number of distinct years.
dfg.count()

101

In [86]:
# Collect the per-year aggregate to the driver as a pandas DataFrame.
# groupBy gives no guarantee about row order, so sort by year first to make
# the resulting time series chronological and reproducible across runs.
dfp = dfg.orderBy('year').toPandas()

In [88]:
# Persist the per-year aggregate as CSV next to the input data.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies -- confirm the extra index column in the file is wanted.
dfp.to_csv('../data/timeseries.csv')