In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)

import nltk
from nltk.corpus import stopwords

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carnellzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
wasabi_songs_raw = pd.read_csv('data/wasabi/wasabi_songs.csv',  sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
mxm_songs_raw = pd.read_csv('data/mxm/mxm_songs.csv', sep='<SEP>')

  return func(*args, **kwargs)


In [5]:
artists_21_raw = pd.read_csv('data/21-artist/21_artists_merged.csv')

In [6]:
emotion_tags_dict = pd.read_pickle("data/wasabi/lastfm_id_to_emotion_tags.pickle")
social_tags_dict = pd.read_pickle("data/wasabi/lastfm_id_to_social_tags.pickle")

In [7]:
# remove unwanted columns
wasabi_songs = wasabi_songs_raw[['artist', 'title', 'albumTitle', 'album_genre', 'lastfm_id', 'urlDeezer', 'urlSpotify']]
mxm_songs = mxm_songs_raw[['tid', 'artist_name', 'title', 'mxm tid']]
artists_21 = artists_21_raw[['Artist', 'Title', 'Album', 'Lyric']]

In [8]:
# rename columns to standard names
wasabi_songs = wasabi_songs.rename(columns={'albumTitle': 'album_title', 'urlSpotify': 'spotify_url', 'urlDeezer': 'deezer_url', 'lastfm_id': 'msd_tid'})
mxm_songs = mxm_songs.rename(columns={'tid': 'msd_tid', 'artist_name': 'artist', 'mxm tid': 'mxm_tid'})
artists_21 = artists_21.rename(columns={'Artist': 'artist', 'Title': 'title', 'Album': 'album', 'Lyric': 'lyrics'})

In [9]:
# remove arist names from song titles with multiple artists
wasabi_songs.loc[:, ('title')] = wasabi_songs.loc[:, ('title')].str.replace('^[^:]+:\s*','', regex=True)

In [10]:
REMOVE_STOPWORDS = False

mxm_songs_to_lyrics = dict()

# adapted from https://github.com/areevesman/lyric-embeddings/blob/master/train_example.ipynb
with open('data/mxm/mxm_dataset_train.txt') as f:
    lines = f.readlines()
    words = lines[17].replace('%','').split(',')
    
    song_rows = lines[18:]
    for i, row in enumerate(song_rows):
        song_info = row.split(',')
        msd_tid = song_info[0]
        mxm_tid = int(song_info[1])
        song_bow = [x.split(':') for x in song_info[2:]]
        song_dict = {}
        
        for word, word_count in song_bow:
            song_dict[int(word)] = int(word_count.replace('\n',''))
            
        word_lists = [[words[word-1]]*song_dict[word] for word in song_dict.keys()]
        
        song = [word for word_list in word_lists for word in word_list]
        if REMOVE_STOPWORDS:
            song = [w for w in song if w not in stopwords]
        mxm_songs_to_lyrics[(str(msd_tid), mxm_tid)] = ' '.join(song).replace('\n','')

In [11]:
df = {
    'msd_tid': list([msd_tid for (msd_tid, _) in mxm_songs_to_lyrics.keys()]),
    'mxm_tid': list([mxm_tid for (_, mxm_tid) in mxm_songs_to_lyrics.keys()]),
    'mxm_lyrics': [mxm_songs_to_lyrics[k] for k in mxm_songs_to_lyrics.keys()]
}

mxm_lyrics = pd.DataFrame.from_dict(df)

mxm_lyrics.head()

Unnamed: 0,msd_tid,mxm_tid,mxm_lyrics
0,TRAAAAV128F421A322,4623710,i i i i i i the the the the you you to to and ...
1,TRAAABD128F429CF47,6477168,i i i i i i i i i i you you you you you you yo...
2,TRAAAED128E0783FAB,2516445,i i i i i i i i i i i i i i i i i i i i i i i ...
3,TRAAAEF128F4273421,3759847,i i i i i the the the the you you you to to an...
4,TRAAAEW128F42930C0,3783760,i i i i to to to to to and and and and and and...


In [12]:
# merge mxm songs with mxm lyrics
mxm_songs_with_lyrics = mxm_songs.merge(mxm_lyrics, on=['msd_tid', 'mxm_tid'])

In [13]:
# convert emotion and social tag dictionaries to data frames

emotion = {
    "msd_tid": emotion_tags_dict.keys(),
    "emotion_tags": list(emotion_tags_dict[msd_tid] for msd_tid in emotion_tags_dict.keys())
}

emotion_df = pd.DataFrame.from_dict(emotion)

social = {
    "msd_tid": social_tags_dict.keys(),
    "social_tags": list(social_tags_dict[msd_tid] for msd_tid in social_tags_dict.keys())
}

social_df = pd.DataFrame.from_dict(social)

In [14]:
# left join wasabi with emotion and social tags

emotion_social_df = emotion_df.merge(social_df, on=['msd_tid'], how='outer')

wasabi_with_emotion_social_tags = wasabi_songs.merge(emotion_social_df, on=['msd_tid'], how='left')

In [15]:
# inner join wasabi and mxm based on msd (million song dataset) tid's
wasabi_merge_mxm = wasabi_with_emotion_social_tags.merge(mxm_songs_with_lyrics, on=['msd_tid'], suffixes=('', '_y'))

# remove duplicate columns
wasabi_merge_mxm.drop(wasabi_merge_mxm.filter(regex='_y$').columns, axis=1, inplace=True)

In [16]:
# standardize title and artist name for inner join

def standardize(df):
    return df.str.lower().str.strip() \
        .str.normalize('NFKD') \
        .str.replace('[^a-zA-Z\d:]','', regex=True) \
        .str.encode('ascii', errors='ignore') \
        .str.decode('utf-8')

wasabi_merge_mxm['title_standardized'] = standardize(wasabi_merge_mxm['title'])
wasabi_merge_mxm['artist_standardized'] = standardize(wasabi_merge_mxm['artist'])

wasabi_songs['title_standardized'] = standardize(wasabi_songs['title'])
wasabi_songs['artist_standardized'] = standardize(wasabi_songs['artist'])

artists_21['title_standardized'] = standardize(artists_21['title'])
artists_21['artist_standardized'] = standardize(artists_21['artist'])

In [17]:
# left join wasabi and 21 artists on the standardized song titles and artists
artist_21_merge_wasabi = artists_21.merge(wasabi_songs, on=['title_standardized', 'artist_standardized'], how="left", suffixes=('', '_y'))
# remove duplicate columns
artist_21_merge_wasabi.drop(artist_21_merge_wasabi.filter(regex='_y$').columns, axis=1, inplace=True)


In [18]:
# outer join (wasabi + 21 artists) and (wasabi + mxm) on standardized song titles and artists

final_df = artist_21_merge_wasabi.merge(wasabi_merge_mxm, on=['title_standardized', 'artist_standardized'], how="outer", suffixes=('', '_y'))

# remove duplicate columns
final_df.drop(final_df.filter(regex='_y$').columns, axis=1, inplace=True)


In [19]:
final_df.shape[0]

174808

In [20]:
final_df.head()

Unnamed: 0,artist,title,album,lyrics,title_standardized,artist_standardized,album_title,album_genre,msd_tid,deezer_url,spotify_url,emotion_tags,social_tags,mxm_tid,mxm_lyrics
0,Beyoncé,Drunk in Love,BEYONCÉ,beyoncé i've been drinkin' i've been drinkin' ...,drunkinlove,beyonce,Beyoncé,R&amp;B,,http://www.deezer.com/track/73724283,https://play.spotify.com/track/5EmCpD8tUj78VW3...,,,,
1,Beyoncé,Formation,Lemonade,messy mya what happened at the new wil'ins bit...,formation,beyonce,Other Songs,,,,,,,,
2,Beyoncé,Partition,BEYONCÉ,part yoncé let me hear you say hey ms carte...,partition,beyonce,Beyoncé,R&amp;B,,http://www.deezer.com/track/75955393,https://play.spotify.com/track/2vPTtiR7x7T6Lr1...,,,,
3,Beyoncé,Mine,BEYONCÉ,beyoncé i've been watching for the signs took ...,mine,beyonce,Beyoncé,R&amp;B,,http://www.deezer.com/track/80455986,https://play.spotify.com/track/40owR7p9BuUhtj5...,,,,
4,Beyoncé,Hold Up,Lemonade,hold up they don't love you like i love you sl...,holdup,beyonce,,,,,,,,,


In [21]:
final_df[final_df['social_tags'].notna() | final_df['emotion_tags'].notna()].shape[0]

150311

In [22]:
final_df['social_tags']

0                                                       NaN
1                                                       NaN
2                                                       NaN
3                                                       NaN
4                                                       NaN
                                ...                        
174803    [(hip-hop, 100), (rap, 90), (gangsta rap, 55),...
174804                                                  NaN
174805    [(rap, 100), (hip-hop, 58), (soundtrack, 37), ...
174806    [(rap, 100), (50 cent, 50), (g-unit, 50), (gan...
174807    [(rap, 100), (hip-hop, 28), (g-unit, 28), (gan...
Name: social_tags, Length: 174808, dtype: object

In [24]:
pd.final_df.to_csv('final_df.csv')

AttributeError: module 'pandas' has no attribute 'final_df_to_csv'