In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re

### Data Selection


In [None]:
# Load the processed dataset and discard the two 'Unnamed' columns
# (presumably index columns written out by earlier CSV exports - confirm).
df = pd.read_csv('./data/processed_data.csv').drop(
    columns=['Unnamed: 0.1', 'Unnamed: 0'])

df.head()

In [None]:
# Spell out the abbreviated popularity column names.
column_renames = {
    'artist_pop': 'artist_popularity',
    'track_pop': 'track_popularity',
}
df = df.rename(columns=column_renames)

In [None]:
# drop duplicated songs (rows sharing a track_uri; the first occurrence is kept)
df = df.drop_duplicates(subset='track_uri')

# select relevant columns; the explicit .copy() makes the selection an
# independent frame, so the column assignment below does not trigger
# SettingWithCopyWarning
df = df[['artist_name', 'id', 'track_name', 'danceability', 'energy', 'key', 'loudness', 'mode',
         'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 'genres', 'track_popularity']].copy()

# convert genres columns back into a list (stored as one space-separated string)
df['genres'] = df['genres'].apply(lambda x: x.split(' '))

# renumber rows 0..n-1 after the duplicate removal
df = df.reset_index(drop=True)

df.head()

### Feature Generation


In [None]:
def get_subjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    # subjectivity - how much of the text is personal opinion rather than
    # factual information
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity


def get_polarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    # polarity - how strongly negative or positive the sentiment of the text is
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


def get_analysis(score, task):
    '''
    Categorize a polarity or subjectivity score into a text label.

    Parameters
    ----------
    score : float
        The raw score to categorize.
    task : str
        Either 'subjectivity' or 'polarity'; selects the labelling scheme.

    Returns
    -------
    str or None
        For 'subjectivity': 'low' (score < 1/3), 'high' (score > 2/3),
        otherwise 'medium'.
        For 'polarity': 'negative' (score < 0), 'positive' (score > 0),
        otherwise 'neutral'.
        None when ``task`` is neither of the two supported values.
    '''
    if task == 'subjectivity':
        # Three equal-width bands, assuming the score lies in [0, 1] as
        # TextBlob reports subjectivity. The upper cut-off must be 2/3:
        # with both cut-offs at 1/3, 'medium' was only reachable when the
        # score was exactly 1/3.
        if score < 1/3:
            return 'low'

        if score > 2/3:
            return 'high'

        return 'medium'

    if task == 'polarity':
        # The sign of the score decides the label; exactly 0 is neutral.
        if score < 0:
            return 'negative'

        if score > 0:
            return 'positive'

        return 'neutral'

    # Unsupported task: keep the historical behavior of returning None.
    return None


def sentiment_analyze(df, column):
    """Add categorical 'subjectivity' and 'polarity' columns derived from ``df[column]``.

    Each text value is scored with the matching scorer and the score is then
    mapped to its label by ``get_analysis``. ``df`` is modified in place and
    also returned.
    """
    scorers = {'subjectivity': get_subjectivity, 'polarity': get_polarity}

    # dict order is insertion order, so 'subjectivity' is added before 'polarity'
    for task, scorer in scorers.items():
        raw_scores = df[column].apply(scorer)
        df[task] = raw_scores.apply(
            lambda score, task=task: get_analysis(score, task))

    return df

In [None]:
def one_hot_encode(df):
    """One-hot encode every column of ``df``.

    Each column is expanded into integer indicator columns named
    ``<column>_<value>``; the original columns are not kept.
    """
    column_labels = df.columns
    return pd.get_dummies(
        df, columns=column_labels, prefix=column_labels, dtype=int)

In [None]:
def normalize(df):
    """Rescale every column of ``df`` with a freshly fitted ``MinMaxScaler``.

    Returns a new DataFrame with the same column labels. No index is passed
    when the result is built, so it does not reuse ``df``'s index.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
# Label each track with the subjectivity and polarity category of its title
df = sentiment_analyze(df, column='track_name')

In [None]:
# TF-IDF over genre tokens; each track's genre list is re-joined into a single
# space-separated string before being passed to the vectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['genres'].apply(lambda x: ' '.join(x)))

genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre_' + i for i in tfidf.get_feature_names_out()]
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# previously it was discarded and 'genre_unknown' stayed in the features.
# errors='ignore' keeps the cell runnable when no 'unknown' token is present.
genre_df = genre_df.drop(columns='genre_unknown', errors='ignore')
genre_df.reset_index(drop=True, inplace=True)


# one-hot encoding; each categorical group is scaled by its own fixed factor
subjectivity = one_hot_encode(df[['subjectivity']]) * 0.3
polarity = one_hot_encode(df[['polarity']]) * 0.5
key = one_hot_encode(df[['key']]) * 0.5
mode = one_hot_encode(df[['mode']]) * 0.5
onehot_df = pd.concat([subjectivity, polarity, key, mode], axis=1)
onehot_df.reset_index(drop=True, inplace=True)

# normalisation of all float64 columns plus the two popularity columns
float_columns = df.dtypes[df.dtypes == 'float64'].index.values
columns_to_normalize = np.concatenate(
    (float_columns, ['artist_popularity', 'track_popularity']))
normal_df = normalize(df[columns_to_normalize].reset_index(drop=True)) * 0.2
normal_df.reset_index(drop=True, inplace=True)


feature_set = pd.concat([genre_df, onehot_df, normal_df], axis=1)
# Attach ids by position: the feature frames carry a fresh 0..n-1 index, so
# label-based alignment against df's index could silently yield NaN ids if
# df's index were ever not 0..n-1.
feature_set['id'] = df['id'].to_numpy()

feature_set.head()