## Import Libraries

In [1]:
!python3 -m pip install "dask[complete]"

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
!python3 -m pip install dask-ml

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score

In [4]:
np.__version__

'1.24.3'

## Data Cleaning for Spotify Dataset

In [5]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score

# import the dataset
spotify_tracks = pd.read_csv('archive/tracks.csv')

# drop duplicates with the same name and artist
spotify_tracks = spotify_tracks.drop_duplicates(
  subset = ['name', 'artists'],
  keep = 'last').reset_index(drop = True)

# convert release_time to appropriate time date format
spotify_tracks['release_date']= pd.to_datetime(spotify_tracks['release_date'], format='mixed')

# remove songs older than 1990
spotify_tracks = spotify_tracks[spotify_tracks['release_date'].dt.year >= 1990]

# change duration from ms to minutes
spotify_tracks['duration_ms'] = spotify_tracks['duration_ms']/60000

# rearrange columns
spotify_tracks = spotify_tracks[['id',
        'name',
        'artists',
 'id_artists',
 'release_date',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'popularity',
]]

# reset index
spotify_tracks = spotify_tracks.reset_index(drop=True)

# identify IQR for duration and remove outliers
Q1 = np.percentile(spotify_tracks['duration_ms'], 25,
                   method = 'midpoint')
Q3 = np.percentile(spotify_tracks['duration_ms'], 75,
                   method = 'midpoint')
IQR = Q3 - Q1
upper = Q3 + 1.5*IQR
lower = Q1 - 1.5*IQR
upper_array=np.where(spotify_tracks['duration_ms']>=upper)
lower_array=np.where(spotify_tracks['duration_ms']<=lower)

spotify_tracks.drop(upper_array[0],inplace=True)
spotify_tracks.drop(lower_array[0],inplace=True)

# remove songs with time signature = 0, 1
spotify_tracks = spotify_tracks[(spotify_tracks['time_signature'] != 0) & 
                                (spotify_tracks['time_signature'] !=1)]

# remove songs with high speechiness like talk shows, audio books, poetry
spotify_tracks = spotify_tracks[spotify_tracks['speechiness']<0.8]

# remove songs with live audiences
spotify_tracks = spotify_tracks[spotify_tracks['liveness']<0.9]

# drop the artist_id, since we have the artist name
spotify_tracks.drop(columns = ['id', 'id_artists'], inplace=True)

# drop all null values
spotify_tracks = spotify_tracks.dropna()

# separate releasedate to month and year and drop releasedate
spotify_tracks['month'] = pd.DatetimeIndex(spotify_tracks['release_date']).month
spotify_tracks['year'] = pd.DatetimeIndex(spotify_tracks['release_date']).year
spotify_tracks.drop(columns = ['release_date'], axis = 1, inplace=True)

# it seems like energy/loudness, as well as loudness/acousticness are correlated, and energy/acousticness; decide to remove acousticness and loudness
spotify_tracks.drop(columns = ['loudness', 'acousticness'], inplace=True)

# ensure that song name and artist name is a string
spotify_tracks['name'] = spotify_tracks['name'].astype(str)
spotify_tracks['artists'] = spotify_tracks['artists'].astype(str)

# remove all non alphanumeric characters in song name and artists
spotify_tracks['name'] = spotify_tracks['name'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)

# remove extra spaces in song name and artists
spotify_tracks['name'] = spotify_tracks['name'].replace(r'\s\s+', ' ', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'\s\s+', ' ', regex=True)

# remove all special characters, including punctuation
spotify_tracks['name'] = spotify_tracks['name'].replace(r'[^\w\s]|_', '', regex=True)
spotify_tracks['artists'] = spotify_tracks['artists'].replace(r'[^\w\s]|_', '', regex=True)

# make all characters in song name and artist lowercase
spotify_tracks['name'] = spotify_tracks.name.apply(lambda x: x.lower())
spotify_tracks['artists'] = spotify_tracks.artists.apply(lambda x: x.lower())

# length of spotify_tracks
len(spotify_tracks)

265669

## Data Cleaning for Spotify Dataset

In [6]:
# import the dataset
billboard_tracks = pd.read_csv('archive/charts.csv')

# remove all fields other than song, rank, and artist
billboard_tracks.drop(columns = ['date', 'last-week', 'peak-rank', 'weeks-on-board'], inplace=True)

# ensure that song name and artist is a string
billboard_tracks['song'] = billboard_tracks['song'].astype(str)
billboard_tracks['artist'] = billboard_tracks['artist'].astype(str)

# remove all non alphanumeric characters in song name and artist
billboard_tracks['song'] = billboard_tracks['song'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'[^A-Za-z0-9\s]+', '', regex=True)

# remove extra spaces in song name and artist
billboard_tracks['song'] = billboard_tracks['song'].replace(r'\s\s+', ' ', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'\s\s+', ' ', regex=True)

# remove all special characters, including punctuation
billboard_tracks['song'] = billboard_tracks['song'].replace(r'[^\w\s]|_', '', regex=True)
billboard_tracks['artist'] = billboard_tracks['artist'].replace(r'[^\w\s]|_', '', regex=True)

# make all characters in song name lowercase
billboard_tracks['song'] = billboard_tracks.song.apply(lambda x: x.lower())
billboard_tracks['artist'] = billboard_tracks.artist.apply(lambda x: x.lower())

# length of billboard_tracks
len(billboard_tracks)

330087

## Joining the two datasets

In [7]:
# ensure that columns we join on are the same
spotify_tracks.rename(columns={'artists': 'artist', 'name': 'song'}, inplace=True)

# perform left join
combined_tracks = spotify_tracks.merge(billboard_tracks, how = 'left', on = ['song', 'artist'])

# replace nan values with zero, if there is no matches from the merge
combined_tracks['rank'] = combined_tracks['rank'].replace(np.nan, 0)

# convert the rank into binary variable (1 if popular, 0 otherwise)
combined_tracks['billboard_popularity'] = np.where(combined_tracks['rank'] > 0, 1, 0)

# drop the billboard rank, since we don't want it infuencing our prediction
combined_tracks.drop(columns = ['rank'], inplace=True)

# drop the artist column, since it was only used for joining
combined_tracks.drop(columns=['artist'], inplace=True)

## Create separate dataset with song names vectorized

In [8]:
import dask.dataframe as dd

# perform count vectorizer (goal is to see if song name has impact on popularity)

count_vect = TfidfVectorizer(binary=False, min_df=150)
#print("init TF idf vectorizer")
name_vectorized = count_vect.fit_transform(combined_tracks['song'])

#print("created vectorized song names ")

# drop the song name column and add the new vectorized song name
combined_vectsongs = combined_tracks.drop('song', axis = 1)

#print(" dropped song name column")

count_vect_df = pd.DataFrame(name_vectorized.todense(), columns = count_vect.get_feature_names_out())

#print("created dataframe from vectorized song names")

# reset index for combined_vectsongs, count_vect_df
#print("vectsongs shape: ")
#print(combined_vectsongs.shape)
#print(" count vect df shape: ")
#print(count_vect_df.shape)

#combined_vectsongs = combined_vectsongs.reset_index().drop('index', axis = 1)
# alternative way of dropping index column
combined_vectsongs.reset_index(drop=True, inplace=True)
print("shape of combined_vectsongs: ")
print(combined_vectsongs.shape)

#count_vect_df = count_vect_df.reset_index().drop('index', axis = 1)
count_vect_df.reset_index(drop=True, inplace=True)
print("shape of count_vect_df: ")
print(count_vect_df.shape)

#print("combined_vectsongs and count_vect_df reset index")

#combined_vectsongs = pd.concat([combined_vectsongs.reset_index().drop('index', axis = 1), count_vect_df.reset_index().drop('index', axis = 1)], axis = 1)

#combined_vectsongs = pd.concat([combined_vectsongs, count_vect_df], axis = 1, ignore_index=True)

#combined_vectsongs = combined_vectsongs.join(count_vect_df, lsuffix="_left", rsuffix="_right")


#combined_vectsongs = dd.from_pandas(combined_vectsongs, npartitions=8)
#count_vect_df = dd.from_pandas(count_vect_df, npartitions=8)

combined_vectsongs = pd.concat([combined_vectsongs, count_vect_df], axis = 1)

#print("concatenated the two dataframes")
# drop the song name from combined dataset as well
combined_tracks.drop(columns = ['song'], inplace=True)

shape of combined_vectsongs: 
(364000, 16)
shape of count_vect_df: 
(364000, 753)


## Creating two datasets, two where Spotify popularity is used as target, and the other two where Billboard popularity is used as target

In [9]:
# create dataset where Spotify popularity is target
combined_spotify = combined_tracks.drop(columns = ['billboard_popularity'])
combined_spotify_vectsongs = combined_vectsongs.drop(columns = ['billboard_popularity'])

print("Number of songs in dataset: ", len(combined_spotify))

# create dataset where Billboard popularity is target
combined_billboard = combined_tracks.drop(columns = ['popularity'])
combined_billboard_vectsongs = combined_vectsongs.drop(columns = ['popularity'])

Number of songs in dataset:  364000


## Baseline Model 1 - Linear Regression with Spotify Popularity without song name

In [10]:
from sklearn.preprocessing import StandardScaler
# standardize split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_spotify, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('popularity', axis = 1).to_numpy()
df_train_y = df_train['popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('popularity', axis = 1).to_numpy()
df_test_y = df_test['popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the linear regression model
LinReg = LinearRegression()
LinReg.fit(df_train_x, df_train_y)

# get score on test-set
test_score = LinReg.score(df_test_x, df_test_y)

# print the score
print(f"R2 score for test set is {test_score}")

R2 score for test set is 0.13918034208167884


## Baseline Model 2 - Linear Regression with Spotify Popularity with vectorized song name

In [11]:
# split the data into training and test sets
import dask_ml.model_selection as dcv
import dask_ml.linear_model as dlm
import dask_ml.preprocessing as dpp

#print("memory usage of combined_spotify_vectsongs:")
#print(combined_spotify_vectsongs.memory_usage(deep=True).sum())
#combined_spotify_vectsongs = combined_spotify_vectsongs.compute() 
#print("shape of combined_spotify_vectsongs: ")
#print(combined_spotify_vectsongs.shape)
#print("type of combined_spotify_vectsongs: ")
#print(type(combined_spotify_vectsongs))
scaler = StandardScaler()
df_train, df_test = train_test_split(combined_spotify_vectsongs, test_size = 0.2)
print("train test split done")

# create the features and target dataframes
#df_train_x = df_train.drop('popularity', axis = 1).to_numpy()
df_train_x = df_train.drop('popularity', axis = 1).values
print("df_train_x created")
df_train_y = df_train['popularity'].values
print("df_train_y created:")
print(df_train_y)
df_train_x = scaler.fit_transform(df_train_x)
print("df_train_x scaled")

#df_test_x = df_test.drop('popularity', axis = 1).to_numpy()
df_test_x = df_test.drop('popularity', axis = 1).values
print("df_test_x created")
df_test_y = df_test['popularity'].values
print("df_test_y created")
df_test_x = scaler.fit_transform(df_test_x)

# fit the linear regression model
LinReg = LinearRegression(n_jobs=8)
print("Linear regression model created")
#print("type of df_train_x: ", type(df_train_x))
print("type of df_train_y: ", type(df_train_y))
print("df_train_x shape: ", df_train_x.shape)
print("df_train_y shape: ", df_train_y.shape)
LinReg.fit(df_train_x, df_train_y)
print("Linear regression model fitted")

# get score on test-set
test_score = LinReg.score(df_test_x, df_test_y)

# print the score
print(f"R2 score for test set is {test_score}")

train test split done
df_train_x created
df_train_y created:
[22 31 33 ... 24 41 47]
df_train_x scaled
df_test_x created
df_test_y created
Linear regression model created
type of df_train_y:  <class 'numpy.ndarray'>
df_train_x shape:  (291200, 767)
df_train_y shape:  (291200,)
Linear regression model fitted
R2 score for test set is 0.2754341521766862


## Baseline Model 3 - Logistic Regression with Billboard Popularity without song name

In [12]:
# split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_billboard, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('billboard_popularity', axis = 1).to_numpy()
df_train_y = df_train['billboard_popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('billboard_popularity', axis = 1).to_numpy()
df_test_y = df_test['billboard_popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the logistic regression model with no regularization terms
LogReg = LogisticRegression(multi_class='ovr', penalty='none', max_iter = 10000)
LogReg.fit(df_train_x, df_train_y)

# calculate F1 score
f1_train = f1_score(df_train_y, LogReg.predict(df_train_x), average = 'micro')
f1_test = f1_score(df_test_y, LogReg.predict(df_test_x), average = 'micro')

# print F1 values out
print(f"Training set with no regularization terms F1-Score is {f1_train}")
print(f"Test set with no regularization terms F1-Score is {f1_test}")



Training set with no regularization terms F1-Score is 0.7117032967032967
Test set with no regularization terms F1-Score is 0.714478021978022


## Baseline Model 4 - Logistic Regression with Billboard Popularity with vectorized song name

In [13]:
# split the data into training and test sets

scaler = StandardScaler()

df_train, df_test = train_test_split(combined_billboard_vectsongs, test_size = 0.2)

# create the features and target dataframes
df_train_x = df_train.drop('billboard_popularity', axis = 1).to_numpy()
df_train_y = df_train['billboard_popularity'].values
df_train_x = scaler.fit_transform(df_train_x)

df_test_x = df_test.drop('billboard_popularity', axis = 1).to_numpy()
df_test_y = df_test['billboard_popularity'].values
df_test_x = scaler.fit_transform(df_test_x)

# fit the logistic regression model with no regularization terms
LogReg = LogisticRegression(multi_class='ovr', penalty='none', max_iter = 10000)
LogReg.fit(df_train_x, df_train_y)

# calculate F1 score
f1_train = f1_score(df_train_y, LogReg.predict(df_train_x), average = 'micro')
f1_test = f1_score(df_test_y, LogReg.predict(df_test_x), average = 'micro')

# print F1 values out
print(f"Training set with no regularization terms F1-Score is {f1_train}")
print(f"Test set with no regularization terms F1-Score is {f1_test}")



Training set with no regularization terms F1-Score is 0.8296050824175825
Test set with no regularization terms F1-Score is 0.8265934065934066
