In [24]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re

### Data Selection


In [25]:
# Load the processed dataset and discard its two 'Unnamed' columns
# (presumably index columns left over from earlier CSV exports -- confirm).
df = (
    pd.read_csv('./data/processed_data.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

In [26]:
# Spell out the abbreviated popularity column names in full.
df = df.rename(columns={
    'artist_pop': 'artist_popularity',
    'track_pop': 'track_popularity',
})

In [28]:
# Build the working frame in one chain:
#   1. keep a single row per track_uri,
#   2. retain only the identifying, audio-feature, genre and popularity columns,
#   3. turn the space-separated genres string back into a list of genre names.
df = (
    df.drop_duplicates(subset='track_uri')[[
        'artist_name', 'id', 'track_name',
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo',
        'artist_popularity', 'genres', 'track_popularity',
    ]]
    .assign(genres=lambda frame: frame['genres'].apply(
        lambda joined: joined.split(' ')))
)
# Preview the first rows of the result.
df.head()

Unnamed: 0,artist_name,id,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,genres,track_popularity
0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,74,"[dance_pop, hip_hop, hip_pop, pop, pop_rap, r&...",69
6,Britney Spears,6I9VzXrHxO9rA9A5euc8Ak,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,84,"[dance_pop, pop, post-teen_pop]",83
19,Beyoncé,0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,86,"[dance_pop, pop, r&b]",25
46,Justin Timberlake,1AWQoqb9bSvzTjaLralEkT,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,82,"[dance_pop, pop]",79
55,Shaggy,1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,75,"[pop_rap, reggae_fusion]",2


### Feature Generation


In [None]:
def get_subjectivity(text):
    '''
    Return the subjectivity score TextBlob assigns to `text`.

    The text is wrapped in a TextBlob and the `subjectivity` attribute of
    its `sentiment` result is returned unchanged.
    '''
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity


def get_polarity(text):
    '''
    Return the polarity score TextBlob assigns to `text`.

    The text is wrapped in a TextBlob and the `polarity` attribute of its
    `sentiment` result is returned unchanged.
    '''
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


def get_analysis(score, task='polarity'):
    '''
    Categorize a polarity or subjectivity score into a text label.

    Parameters
    ----------
    score : number
        The sentiment score to categorize.
    task : str, default 'polarity'
        Which scale `score` belongs to:

        * 'subjectivity' -- 'low' when score < 1/3, 'high' when score > 2/3,
          and 'medium' for everything in between (both cutoffs included).
        * 'polarity' -- 'Negative' when score < 0, 'Positive' when score > 0,
          and 'Neutral' when score is exactly 0.

    Returns
    -------
    str
        The category label for the given score and task.
    '''
    if task == 'subjectivity':
        if score < 1/3:
            return 'low'

        # The upper cutoff is 2/3, not 1/3: with both comparisons against 1/3
        # the 'medium' band collapsed to the single value score == 1/3.
        if score > 2/3:
            return 'high'

        return 'medium'

    if task == 'polarity':
        if score < 0:
            return 'Negative'

        if score > 0:
            return 'Positive'

        return 'Neutral'