# Music Popularity Analysis

<hr style="border:2px solid black">

## Exploratory Data Analysis
## Notebook 02 - Preprocessing

---

### Import libraries

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [19]:
def num_uniques(ser):
    """Return the number of distinct values in ``ser``.

    Parameters
    ----------
    ser : object exposing a ``unique()`` method (e.g. a pandas Series)
        The values to count.

    Returns
    -------
    int or str
        ``len(ser.unique())`` when that succeeds; otherwise the string
        ``"Not unique check-able"``.

    Notes
    -----
    This is a best-effort helper for summaries: any ordinary exception raised
    by the uniqueness check is converted into the fallback string. Only
    ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and an interrupted kernel actually stops.
    """
    try:
        return len(ser.unique())
    except Exception:
        # Deliberate best-effort: report that the column could not be checked
        # instead of aborting the whole summary.
        return "Not unique check-able"


def summarize_df(df):
    """Print a quick overview of the DataFrame ``df``.

    In order, this emits: the row/column counts, the column dtypes, a rich
    display of ``df.head()``, the ``df.describe()`` table, and one line per
    column giving its number of unique values (via ``num_uniques``).

    Nothing is returned; all output goes to stdout / the notebook display.
    """
    print("======DATA SUMMARY======")
    row_count, col_count = df.shape[0], df.shape[1]
    print("{} rows by {} columns".format(row_count, col_count))

    print("\n======COLUMNS======")
    print(df.dtypes)

    print("\n======PREVIEW======")
    display(df.head())

    print("\n======NUMERICAL COL SUMMARY======")
    print(df.describe())
    print("\n")

    # One line per column with its distinct-value count.
    uniques_line = "{}: {} unique values"
    for column_name in df.columns:
        column = df[column_name]
        print(uniques_line.format(column_name, num_uniques(column)))

---
### Import Data

In [20]:
# Load the song dataset from the parquet file in the sibling 02_DataPrep
# directory (path is relative to the working directory) and print an overview.
df = pd.read_parquet('../02_DataPrep/song_data.parquet')
summarize_df(df)

26940 rows by 24 columns

isrc                     object
sp_track_uri             object
mb_release_gid           object
mb_track_name            object
matched                  object
score                     int64
play_count                int64
sp_danceability         float64
sp_energy               float64
sp_key                    int64
sp_loudness             float64
sp_mode                   int64
sp_speechiness          float64
sp_acousticness         float64
sp_instrumentalness     float64
sp_liveness             float64
sp_valence              float64
sp_tempo                float64
sp_duration_ms            int64
sp_time_signature         int64
sp_artist_uri            object
sp_artist_popularity      int64
sp_genres                object
sp_artist_followers       int64
dtype: object



Unnamed: 0,isrc,sp_track_uri,mb_release_gid,mb_track_name,matched,score,play_count,sp_danceability,sp_energy,sp_key,...,sp_instrumentalness,sp_liveness,sp_valence,sp_tempo,sp_duration_ms,sp_time_signature,sp_artist_uri,sp_artist_popularity,sp_genres,sp_artist_followers
0,AEA0D1937329,19JIYDxouJVup6Vju4XbHl,10fd2bac-ab04-4eb8-8428-f2068923bdf7,My Wasteland,My Wasteland,100,5,0.533,0.804,4,...,0.00069,0.127,0.277,145.023,256073,4,5TIid55c3FutqRL4fzbnSF,0,[],52
1,AEA0D2084900,6DQKbMorKoc8I5dm72bZJi,4ac930af-fd16-4397-ba4c-f3727331d2c4,Hakuna Matata,Hakuna Matata,100,45,0.362,0.326,0,...,0.794,0.104,0.191,179.88,164792,4,29GcsJ0cD5bdbHpOicXLtF,23,['disney piano'],26
2,AEA0Q1967586,6k2GJRe3u6wHqaXL9XCt74,0df23deb-d8b5-4ef8-ae16-6865f6facc22,Farcry,Farcry,100,2,0.76,0.557,10,...,0.822,0.0938,0.719,144.994,202893,4,0hvlHMDFpZBDFPEVgcAX6s,0,[],2027
3,AEA0Q2041540,6xCie6yrUQh0McImPanmGq,88532401-a81e-4af7-bcfa-397ce6c4f6cc,Self Isolation,Self Isolation,100,1631,0.635,0.323,2,...,0.535,0.0974,0.209,179.996,156000,4,0XFgyr4jwM0MGeZZW0VzA5,74,[],184864
4,RUA1H2153459,1hSu17dJJcu40C7JwXnzTm,5611c030-2d68-4c99-9633-af0c59e296e6,Die With a Smile on Your Face,Die with a Smile on Your Face,100,2875,0.664,0.44,1,...,0.181,0.0536,0.256,74.735,217600,4,0XFgyr4jwM0MGeZZW0VzA5,74,[],184864



              score    play_count  sp_danceability     sp_energy  \
count  26940.000000  2.694000e+04     26940.000000  26940.000000   
mean      98.614031  1.969753e+05         0.529161      0.614964   
std        3.651687  9.166851e+05         0.185528      0.263511   
min       80.000000  0.000000e+00         0.000000      0.000000   
25%      100.000000  1.840000e+02         0.403000      0.431000   
50%      100.000000  2.852500e+03         0.540000      0.655000   
75%      100.000000  3.638050e+04         0.666000      0.838000   
max      100.000000  2.399470e+07         0.977000      1.000000   

             sp_key   sp_loudness       sp_mode  sp_speechiness  \
count  26940.000000  26940.000000  26940.000000    26940.000000   
mean       5.215071     -9.120079      0.623125        0.087248   
std        3.584864      5.439261      0.484612        0.103623   
min        0.000000    -60.000000      0.000000        0.000000   
25%        2.000000    -11.055250      0.000000    

In [21]:
# Numerical columns kept for the processed dataset.
num_vars = [
    # per-track features
    'sp_danceability',
    'sp_energy',
    'sp_loudness',
    'sp_speechiness',
    'sp_acousticness',
    'sp_instrumentalness',
    'sp_liveness',
    'sp_valence',
    'sp_tempo',
    'sp_duration_ms',
    # per-artist features
    'sp_artist_popularity',
    'sp_artist_followers',
    # play count
    'play_count',
]

# Integer-coded categorical columns kept for the processed dataset.
cat_vars = [
    'sp_key',
    'sp_mode',
    'sp_time_signature',
]

In [22]:
# Keep only the selected numerical/categorical columns plus the `isrc`
# identifier, and renumber the rows 0..n-1.
df = df[num_vars + cat_vars + ['isrc']].reset_index(drop=True)

# Record which `isrc` sits at each row index before the identifier is removed.
index_key = {row: isrc for row, isrc in zip(df.index, df['isrc'])}

# The identifier is not a feature, so take it out of the working frame.
df = df.drop(columns='isrc')

In [23]:
# Save the row-index -> isrc lookup as a CSV with columns 'index' and 'isrc'
# in the current working directory.
# NOTE(review): to_csv is called with its defaults, so the frame's own index is
# presumably written as an extra unnamed leading column — confirm readers expect that.
pd.DataFrame(index_key.items(), columns=['index', 'isrc']).to_csv('index_key.csv')

---
### One-hot Encode Categorical Variables

In [24]:
# One-hot encode the multi-valued categorical columns. Every category value
# found in a column becomes its own indicator column named "<column>_<value>".
# `sp_mode` is left untouched by this cell.
onehot_cols = ['sp_time_signature', 'sp_key']

for cat in onehot_cols:
    # The encoder expects a 2-D array of shape (n_rows, 1).
    X = np.array(df[cat]).reshape(-1, 1)
    enc = OneHotEncoder()
    # Fit and transform in a single pass; the dense result is reused below
    # rather than running the transform a second time.
    arr = enc.fit_transform(X).toarray()
    encdf = pd.DataFrame(
        arr,
        index=df.index,  # share df's index so the join lines rows up exactly
        columns=[cat+'_'+str(i) for i in enc.categories_[0]],
    )
    print(encdf.shape)
    df = df.join(encdf)

# The original integer-coded columns are now redundant with their indicators.
df = df.drop(onehot_cols, axis=1)
summarize_df(df)

(26940, 5)
(26940, 12)
26940 rows by 31 columns

sp_danceability         float64
sp_energy               float64
sp_loudness             float64
sp_speechiness          float64
sp_acousticness         float64
sp_instrumentalness     float64
sp_liveness             float64
sp_valence              float64
sp_tempo                float64
sp_duration_ms            int64
sp_artist_popularity      int64
sp_artist_followers       int64
play_count                int64
sp_mode                   int64
sp_time_signature_0     float64
sp_time_signature_1     float64
sp_time_signature_3     float64
sp_time_signature_4     float64
sp_time_signature_5     float64
sp_key_0                float64
sp_key_1                float64
sp_key_2                float64
sp_key_3                float64
sp_key_4                float64
sp_key_5                float64
sp_key_6                float64
sp_key_7                float64
sp_key_8                float64
sp_key_9                float64
sp_key_10              

Unnamed: 0,sp_danceability,sp_energy,sp_loudness,sp_speechiness,sp_acousticness,sp_instrumentalness,sp_liveness,sp_valence,sp_tempo,sp_duration_ms,...,sp_key_2,sp_key_3,sp_key_4,sp_key_5,sp_key_6,sp_key_7,sp_key_8,sp_key_9,sp_key_10,sp_key_11
0,0.533,0.804,-7.516,0.0743,0.0211,0.00069,0.127,0.277,145.023,256073,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.362,0.326,-15.748,0.0423,0.739,0.794,0.104,0.191,179.88,164792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.76,0.557,-13.295,0.0625,0.0213,0.822,0.0938,0.719,144.994,202893,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.635,0.323,-12.979,0.0338,0.142,0.535,0.0974,0.209,179.996,156000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.664,0.44,-5.156,0.036,0.0531,0.181,0.0536,0.256,74.735,217600,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



       sp_danceability     sp_energy   sp_loudness  sp_speechiness  \
count     26940.000000  26940.000000  26940.000000    26940.000000   
mean          0.529161      0.614964     -9.120079        0.087248   
std           0.185528      0.263511      5.439261        0.103623   
min           0.000000      0.000000    -60.000000        0.000000   
25%           0.403000      0.431000    -11.055250        0.035800   
50%           0.540000      0.655000     -7.679000        0.048000   
75%           0.666000      0.838000     -5.578000        0.085500   
max           0.977000      1.000000      4.106000        0.965000   

       sp_acousticness  sp_instrumentalness   sp_liveness    sp_valence  \
count     26940.000000         26940.000000  26940.000000  26940.000000   
mean          0.298166             0.239801      0.207013      0.419168   
std           0.337638             0.358885      0.184652      0.255309   
min           0.000000             0.000000      0.000000      0.000

---
### Write out processed file

In [25]:
# Write the processed frame to 'song_data_processed.csv' in the current working directory.
# NOTE(review): to_csv defaults are used, so the row index is presumably written as the
# first column; that index is what index_key.csv maps back to isrc — confirm with readers.
df.to_csv('song_data_processed.csv')