In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

### Data Selection


In [3]:
# Read the preprocessed dataset and discard the 'Unnamed: 0.1' / 'Unnamed: 0'
# columns (presumably index columns left over from earlier CSV round-trips).
df = (
    pd.read_csv('./data/processed_data.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

df.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,name,danceability,...,type,id,uri,track_href,analysis_url,duration_ms_y,time_signature,artist_pop,genres,track_pop
0,0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks,0.904,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
1,73,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,w o r k o u t,0.904,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
2,14,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,party playlist,0.904,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
3,42,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Dance mix,0.904,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
4,1,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,spin,0.904,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69


In [4]:
# Spell out the abbreviated popularity column names.
df = df.rename(columns={
    'artist_pop': 'artist_popularity',
    'track_pop': 'track_popularity',
})

In [5]:
# Columns kept for feature generation: identifiers, audio features,
# popularity scores and the genre string.
relevant_columns = [
    'artist_name', 'id', 'track_name', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'artist_popularity', 'genres',
    'track_popularity',
]

# Built as a single chain so every step returns a fresh frame. Assigning
# `df['genres'] = ...` on a column-subset of another frame is the pattern that
# triggers pandas' SettingWithCopyWarning; `.assign` avoids it.
df = (
    df
    # drop duplicated songs (one row is kept per track_uri)
    .drop_duplicates(subset='track_uri')
    # select relevant columns
    .loc[:, relevant_columns]
    # convert the space-separated genres string back into a list
    .assign(genres=lambda frame: frame['genres'].apply(lambda x: x.split(' ')))
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,artist_name,id,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,genres,track_popularity
0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,74,"[dance_pop, hip_hop, hip_pop, pop, pop_rap, r&...",69
1,Britney Spears,6I9VzXrHxO9rA9A5euc8Ak,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,84,"[dance_pop, pop, post-teen_pop]",83
2,Beyoncé,0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,86,"[dance_pop, pop, r&b]",25
3,Justin Timberlake,1AWQoqb9bSvzTjaLralEkT,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,82,"[dance_pop, pop]",79
4,Shaggy,1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,75,"[pop_rap, reggae_fusion]",2


### Feature Generation


In [6]:
def get_subjectivity(text):
    '''
    Return the TextBlob subjectivity score of `text`.

    Subjectivity reflects the amount of personal versus factual
    information contained in the text.
    '''
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity


def get_polarity(text):
    '''
    Return the TextBlob polarity score of `text`.

    Polarity is the degree of strong or clearly defined sentiment,
    accounting for negation.
    '''
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


def get_analysis(score, task):
    '''
    Categorize a polarity or subjectivity score into a coarse label.

    Parameters
    ----------
    score : float
        Numeric score to bucket.
    task : str
        Either 'subjectivity' or 'polarity'; selects the bucketing scheme.

    Returns
    -------
    str
        For 'subjectivity': 'low' when score < 1/3, 'high' when score > 2/3,
        'medium' otherwise.
        For 'polarity': 'negative' when score < 0, 'positive' when score > 0,
        'neutral' otherwise.

    Raises
    ------
    ValueError
        If `task` is neither 'subjectivity' nor 'polarity'.
    '''
    if task == 'subjectivity':
        if score < 1/3:
            return 'low'

        # The upper cut-off is 2/3. Comparing against 1/3 here as well would
        # make 'medium' reachable only for a score of exactly 1/3.
        if score > 2/3:
            return 'high'

        return 'medium'

    if task == 'polarity':
        if score < 0:
            return 'negative'

        if score > 0:
            return 'positive'

        return 'neutral'

    # Fail loudly instead of silently returning None for an unknown task.
    raise ValueError(
        "task must be 'subjectivity' or 'polarity', got {!r}".format(task))


def sentiment_analyze(df, column):
    '''
    Add categorical 'subjectivity' and 'polarity' columns derived from
    the text in `df[column]`.

    Each text is scored with get_subjectivity / get_polarity and the score
    is then bucketed by get_analysis. The columns are written onto the
    frame that was passed in, and that same frame is returned.
    '''
    scorers = {
        'subjectivity': get_subjectivity,
        'polarity': get_polarity,
    }

    for task, scorer in scorers.items():
        raw_scores = df[column].apply(scorer)
        df[task] = raw_scores.apply(
            lambda score, task=task: get_analysis(score, task))

    return df

In [7]:
def one_hot_encode(df):
    '''
    One-hot encode every column of `df`.

    Each column is expanded into integer indicator columns, prefixed with
    the name of the column it came from.
    '''
    column_names = list(df.columns)

    return pd.get_dummies(
        df, columns=column_names, prefix=column_names, dtype=int)

In [8]:
def normalize(df):
    '''
    Scale every column of `df` with a freshly fitted MinMaxScaler.

    Returns a new DataFrame with the same column labels. The input index is
    not passed on, so the result carries a default index.
    '''
    scaled_values = MinMaxScaler().fit_transform(df)

    return pd.DataFrame(scaled_values, columns=df.columns)

In [9]:
# Derive the 'subjectivity' and 'polarity' labels from each track's title.
df = sentiment_analyze(df, column='track_name')

In [10]:
# Weights applied to each feature group before the groups are concatenated.
SUBJECTIVITY_WEIGHT = 0.3
CATEGORICAL_WEIGHT = 0.5  # polarity, key and mode indicator columns
NUMERIC_WEIGHT = 0.2  # min-max scaled numeric columns

# TF-IDF over the genre tags of each track.
# NOTE(review): TfidfVectorizer runs with its default tokenizer, which appears
# to split genre tags on characters such as '&' and '-' (the resulting columns
# include names like 'genre__hip_hop' and 'genre__roll'). Confirm whether a
# whitespace-only token_pattern is wanted before changing it.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['genres'].apply(lambda x: ' '.join(x)))

genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre_' + i for i in tfidf.get_feature_names_out()]
# DataFrame.drop returns a new frame, so the result must be assigned back;
# without the assignment the call had no effect and 'genre_unknown' stayed in
# the feature set. errors='ignore' keeps the cell working when no track
# carries that placeholder genre.
genre_df = genre_df.drop(columns='genre_unknown', errors='ignore')
# genre_df is built from a plain array, so it already has a default index.


# one-hot encoding
subjectivity = one_hot_encode(df[['subjectivity']]) * SUBJECTIVITY_WEIGHT
polarity = one_hot_encode(df[['polarity']]) * CATEGORICAL_WEIGHT
key = one_hot_encode(df[['key']]) * CATEGORICAL_WEIGHT
mode = one_hot_encode(df[['mode']]) * CATEGORICAL_WEIGHT
onehot_df = pd.concat(
    [subjectivity, polarity, key, mode], axis=1).reset_index(drop=True)

# normalisation
float_columns = df.dtypes[df.dtypes == 'float64'].index.values
columns_to_normalize = np.concatenate(
    (float_columns, ['artist_popularity', 'track_popularity']))
# normalize() returns a frame with a default index, so no further reset is
# needed after scaling.
normal_df = normalize(
    df[columns_to_normalize].reset_index(drop=True)) * NUMERIC_WEIGHT


feature_set = pd.concat([genre_df, onehot_df, normal_df], axis=1)
# Attach ids by position: every part above keeps df's row order, and this
# avoids depending on df's index labels matching feature_set's.
feature_set['id'] = df['id'].to_numpy()

feature_set.head()

Unnamed: 0,genre_21st_century_classical,genre_432hz,genre__hip_hop,genre__roll,genre_a_cappella,genre_abstract_beats,genre_abstract_hip_hop,genre_accordion,genre_acid_jazz,genre_acid_rock,...,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_popularity,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168547,0.025156,0.006245,0.001401,0.00942,0.162325,0.114421,0.148,0.142268,0UaMYEvWZi0ZqiDOoHU3YI
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.178715,0.023701,0.005,0.005025,0.0484,0.18517,0.130453,0.168,0.171134,6I9VzXrHxO9rA9A5euc8Ak
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17021,0.043659,0.000478,0.0,0.01196,0.140481,0.090525,0.172,0.051546,0WqIKmW4BTrj3eJFmnCKMv
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.171892,0.029314,0.040361,4.7e-05,0.01042,0.163727,0.092087,0.164,0.162887,1AWQoqb9bSvzTjaLralEkT
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176541,0.014823,0.011265,0.0,0.0626,0.131062,0.086421,0.15,0.004124,1lzr43nnXAijIGYnCT8M8H


In [11]:
# Preview the first five rows of the track table.
df.head(n=5)

Unnamed: 0,artist_name,id,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,genres,track_popularity,subjectivity,polarity
0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,74,"[dance_pop, hip_hop, hip_pop, pop, pop_rap, r&...",69,low,neutral
1,Britney Spears,6I9VzXrHxO9rA9A5euc8Ak,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,84,"[dance_pop, pop, post-teen_pop]",83,low,neutral
2,Beyoncé,0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,86,"[dance_pop, pop, r&b]",25,high,negative
3,Justin Timberlake,1AWQoqb9bSvzTjaLralEkT,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,82,"[dance_pop, pop]",79,low,neutral
4,Shaggy,1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,75,"[pop_rap, reggae_fusion]",2,low,neutral


In [10]:
# Open both CSV files in write mode, which creates them when missing and
# empties them otherwise.
# NOTE(review): nothing in this cell writes to test_playlist.csv afterwards,
# so it is left empty here.
for csv_path in ('./data/tracks.csv', './data/test_playlist.csv'):
    with open(csv_path, 'w') as file:
        pass

# Persist the track table and the engineered feature set without the index.
df.to_csv('data/tracks.csv', index=False)

feature_set.to_csv('data/feature_set.csv', index=False)