# Part 2: Data Pre-Processing

In [1]:
import pandas as pd
import re
from afinn import Afinn
from nrclex import NRCLex

Mengimpor file-file yang sudah kita kerjakan sebelumnya

In [54]:
lyrics = pd.read_csv('datasets/lyrics.csv')
audio = pd.read_csv('datasets/audio_features.csv')
artists = pd.read_csv('datasets/artists.csv')
audio.shape, lyrics.shape

((50, 29), (50, 6))

Mengambil data genre lagu dari genres artis

In [55]:
df = lyrics.merge(artists[['artist', 'genres']], on='artist')

In [56]:
df['genres'].value_counts()

genres
['indonesian pop']                                                             16
[]                                                                              5
['boy band', 'pop', 'post-teen pop', 'talent show']                             2
['permanent wave', 'pop']                                                       2
['indonesian pop', 'malaysian pop']                                             2
['indonesian jazz', 'indonesian pop', 'indonesian singer-songwriter']           2
['k-pop', 'k-pop girl group', 'pop']                                            1
['emo', 'modern rock', 'pop punk', 'pov: indie', 'rock']                        1
['canadian contemporary r&b', 'canadian pop', 'pop']                            1
['k-pop']                                                                       1
['indonesian folk']                                                             1
['garage rock', 'modern rock', 'permanent wave', 'rock', 'sheffield indie']     1
['indones

In [57]:
df['genre'] = df['genres'].apply(lambda x: 'pop' if 'pop' in x else ('rock' if 'rock' in x else ('jazz' if 'jazz' in x else ('hip-hop' if 'hip' in x else ('indie' if 'indie' in x else ('r&b' if 'r&b' in x else 'other'))))))

In [58]:
df['genre'].value_counts()

genre
pop        35
other       7
indie       3
r&b         2
rock        2
hip-hop     1
Name: count, dtype: int64

## Fitur Baru
<ul>
    <li>Membuat fitur jumlah kata dalam lirik</li>
    <li>Membuat fitur berdasarkan lexicon</li>
</ul>

In [59]:
df['lyrics'] = [re.sub(r'^.*?Lyrics', '', str(lyric)) for lyric in df['lyrics']]
df['cleaned_lyrics'] = [str(lyric).replace('\n',' ') for lyric in df['lyrics']]
df['cleaned_lyrics'] = [re.sub("\[.*?\]","",lyric) for lyric in df['cleaned_lyrics']]

df['title_length'] = [len(title) for title in df['track']]
df['lines'] = [str(lyric).count('\n') for lyric in df['lyrics']]
df['sections'] = [str(lyric).count('\n\n') for lyric in df['lyrics']]
df['verse_count'] = [str(lyric).count('[Verse') for lyric in df['lyrics']]
df['chorus_count'] = [str(lyric).count('[Chorus') for lyric in df['lyrics']]
df['dash_count'] = [str(lyric).count('" -') for lyric in df['lyrics']]
df['words'] = [len(str(lyric).split()) for lyric in df['cleaned_lyrics']]
df['unique_words'] = [len(set(str(lyric).split())) for lyric in df['cleaned_lyrics']]

df.head(3)

Unnamed: 0.1,Unnamed: 0,id,track,artist,lyrics,language,genres,genre,cleaned_lyrics,title_length,lines,sections,verse_count,chorus_count,dash_count,words,unique_words
0,0,26cvTWJq2E1QqN4jyH2OTU,Tak Segampang Itu,Anggi Marito,"Waktu demi waktu, hari demi hari \n Sadar ku t...",id,['indonesian pop'],pop,"Waktu demi waktu, hari demi hari Sadar ku t'...",17,26,0,0,0,0,148,83
1,1,2AaaE0qvFWtyT8srKNfRhH,Komang,Raim Laode,"Dari kejauhan, tergambar cerita tentang kita \...",id,[],other,"Dari kejauhan, tergambar cerita tentang kita ...",6,27,0,0,0,0,112,60
2,2,6dXiWwFrcGieqnoLYzPNp5,Sial,Mahalini,Sampai saat ini tak terpikir olehku \n Aku per...,id,['indonesian pop'],pop,Sampai saat ini tak terpikir olehku Aku pern...,4,29,0,0,0,0,166,71


## Menerjemahkan Lagu
NRCLex hanya bisa memproses dengan baik teks-teks berbahasa Inggris. Lirik yang sudah diproses di atas selanjutnya melalui proses penerjemahan. Sayangnya, belum ada modul yang dapat menerjemahkan lirik di atas sebaik Google Translate. Maka dari itu, saya menggunakan salah satu fitur yang tersedia di Google Sheets, yakni fungsi =GOOGLETRANSLATE.

In [25]:
df.to_csv('datasets/lyrics-count.csv')

In [27]:
df = pd.read_csv('datasets/lyrics-translated.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,track,artist,lyrics,lyrics_translated,language,genres,genre,cleaned_lyrics,cleaned_lyrics_translated,title_length,lines,sections,verse_count,chorus_count,dash_count,words,unique_words
0,0,0,26cvTWJq2E1QqN4jyH2OTU,Tak Segampang Itu,Anggi Marito,"Waktu demi waktu, hari demi hari \n Sadar ku t...","Time after time, day after day\n I realized my...",id,['indonesian pop'],pop,"Waktu demi waktu, hari demi hari Sadar ku t'...","Time after time, day after day I realize mysel...",17,26,0,0,0,0,148,83
1,1,1,2AaaE0qvFWtyT8srKNfRhH,Komang,Raim Laode,"Dari kejauhan, tergambar cerita tentang kita \...","From a distance, a story about us\n Separate d...",id,[],other,"Dari kejauhan, tergambar cerita tentang kita ...","From a distance, illustrated the story about u...",6,27,0,0,0,0,112,60
2,2,2,6dXiWwFrcGieqnoLYzPNp5,Sial,Mahalini,Sampai saat ini tak terpikir olehku \n Aku per...,Until now I didn't think of me\n I once gave a...,id,['indonesian pop'],pop,Sampai saat ini tak terpikir olehku Aku pern...,Until now I have not thought of me I have give...,4,29,0,0,0,0,166,71
3,3,22,6Iq3sgLVrqqZfRitLaeHkn,Sisa Rasa,Mahalini,"Ha-ah-ah-ah \n Melihatmu bahagia, satu hal yan...","Ha-ah-ah-ah\n See you happy, one of the most b...",id,['indonesian pop'],pop,"Ha-ah-ah-ah Melihatmu bahagia, satu hal yang...","Ha-ah-ah-ah see you happy, one of the most bea...",9,37,0,0,0,0,171,93
4,4,44,6fX8WwxAQ6rCPIC7lMgztu,Kisah Sempurna,Mahalini,"Ha-ah, ha-ah-ah \n Tenggelam, jiwaku dalam ang...","Ha-ah, ha-ah-ah\n Sinking, my soul in dreams\n...",id,['indonesian pop'],pop,"Ha-ah, ha-ah-ah Tenggelam, jiwaku dalam anga...","Ha-ah, ha-ah-ah sink, my soul in the dreams lo...",14,36,0,0,0,0,175,70


In [29]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alifdwt/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [30]:
df['emot_positive'] = [NRCLex(lyric).affect_frequencies.get('positive') for lyric in df['cleaned_lyrics_translated']]
df['emot_negative'] = [NRCLex(lyric).affect_frequencies.get('negative') for lyric in df['cleaned_lyrics_translated']]
df['emot_anger'] = [NRCLex(lyric).affect_frequencies.get('anger') for lyric in df['cleaned_lyrics_translated']]
df['emot_trust'] = [NRCLex(lyric).affect_frequencies.get('trust') for lyric in df['cleaned_lyrics_translated']]
df['emot_disgust'] = [NRCLex(lyric).affect_frequencies.get('disgust') for lyric in df['cleaned_lyrics_translated']]
df['emot_fear'] = [NRCLex(lyric).affect_frequencies.get('fear') for lyric in df['cleaned_lyrics_translated']]
df['emot_joy'] = [NRCLex(lyric).affect_frequencies.get('joy') for lyric in df['cleaned_lyrics_translated']]
df['emot_surprise'] = [NRCLex(lyric).affect_frequencies.get('surprise') for lyric in df['cleaned_lyrics_translated']]
df['emot_anticp'] = [NRCLex(lyric).affect_frequencies.get('anticip') for lyric in df['cleaned_lyrics_translated']]

In [31]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,track,artist,lyrics,lyrics_translated,language,genres,genre,...,unique_words,emot_positive,emot_negative,emot_anger,emot_trust,emot_disgust,emot_fear,emot_joy,emot_surprise,emot_anticp
0,0,0,26cvTWJq2E1QqN4jyH2OTU,Tak Segampang Itu,Anggi Marito,"Waktu demi waktu, hari demi hari \n Sadar ku t...","Time after time, day after day\n I realized my...",id,['indonesian pop'],pop,...,83,0.296296,0.111111,0.0,0.037037,0.0,0.037037,0.259259,0.074074,0.0
1,1,1,2AaaE0qvFWtyT8srKNfRhH,Komang,Raim Laode,"Dari kejauhan, tergambar cerita tentang kita \...","From a distance, a story about us\n Separate d...",id,[],other,...,60,0.388889,0.037037,0.018519,0.111111,0.0,0.0,0.277778,0.055556,0.0
2,2,2,6dXiWwFrcGieqnoLYzPNp5,Sial,Mahalini,Sampai saat ini tak terpikir olehku \n Aku per...,Until now I didn't think of me\n I once gave a...,id,['indonesian pop'],pop,...,71,0.22449,0.071429,0.071429,0.091837,0.040816,0.102041,0.183673,0.05102,0.0
3,3,22,6Iq3sgLVrqqZfRitLaeHkn,Sisa Rasa,Mahalini,"Ha-ah-ah-ah \n Melihatmu bahagia, satu hal yan...","Ha-ah-ah-ah\n See you happy, one of the most b...",id,['indonesian pop'],pop,...,93,0.316667,0.066667,0.0,0.066667,0.0,0.05,0.116667,0.033333,0.0
4,4,44,6fX8WwxAQ6rCPIC7lMgztu,Kisah Sempurna,Mahalini,"Ha-ah, ha-ah-ah \n Tenggelam, jiwaku dalam ang...","Ha-ah, ha-ah-ah\n Sinking, my soul in dreams\n...",id,['indonesian pop'],pop,...,70,0.185484,0.129032,0.080645,0.104839,0.048387,0.048387,0.177419,0.032258,0.0


In [33]:
df.to_csv('datasets/data_lyrics.csv')

In [53]:
audio = audio.merge(df[['id','genre']], on='id')
audio.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,track,artist_id,artist,album_id,album,release_date,link,...,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature,genre_x,image,genre_y
0,0,0,26cvTWJq2E1QqN4jyH2OTU,Tak Segampang Itu,1JvbNeV9zG9Sew1JyaWsyx,Anggi Marito,6gO5mDyNTqiIqHItRil4OG,Tak Segampang Itu,2022-12-02,https://api.spotify.com/v1/tracks/26cvTWJq2E1Q...,...,129.538,audio_features,spotify:track:26cvTWJq2E1QqN4jyH2OTU,https://api.spotify.com/v1/tracks/26cvTWJq2E1Q...,https://api.spotify.com/v1/audio-analysis/26cv...,231100,4,pop,https://i.scdn.co/image/ab6761610000e5eb604493...,pop
1,1,1,2AaaE0qvFWtyT8srKNfRhH,Komang,5LcNPa8f8bRmZqELgoRFkN,Raim Laode,2N8JUijzZsT2IJnplY4vAE,Komang,2022-08-17,https://api.spotify.com/v1/tracks/2AaaE0qvFWty...,...,133.913,audio_features,spotify:track:2AaaE0qvFWtyT8srKNfRhH,https://api.spotify.com/v1/tracks/2AaaE0qvFWty...,https://api.spotify.com/v1/audio-analysis/2Aaa...,222707,4,other,https://i.scdn.co/image/ab6761610000e5eb8f5fe3...,other
2,2,2,6dXiWwFrcGieqnoLYzPNp5,Sial,3wOsYKZM0zcKNasi3I7fP4,Mahalini,6iBh7T1cUR8MPrtly5xugU,fábula,2023-01-23,https://api.spotify.com/v1/tracks/6dXiWwFrcGie...,...,120.014,audio_features,spotify:track:6dXiWwFrcGieqnoLYzPNp5,https://api.spotify.com/v1/tracks/6dXiWwFrcGie...,https://api.spotify.com/v1/audio-analysis/6dXi...,243725,4,pop,https://i.scdn.co/image/ab6761610000e5ebb83334...,pop
3,3,22,6Iq3sgLVrqqZfRitLaeHkn,Sisa Rasa,3wOsYKZM0zcKNasi3I7fP4,Mahalini,6iBh7T1cUR8MPrtly5xugU,fábula,2023-01-23,https://api.spotify.com/v1/tracks/6Iq3sgLVrqqZ...,...,122.049,audio_features,spotify:track:6Iq3sgLVrqqZfRitLaeHkn,https://api.spotify.com/v1/tracks/6Iq3sgLVrqqZ...,https://api.spotify.com/v1/audio-analysis/6Iq3...,254863,4,pop,https://i.scdn.co/image/ab6761610000e5ebb83334...,pop
4,4,44,6fX8WwxAQ6rCPIC7lMgztu,Kisah Sempurna,3wOsYKZM0zcKNasi3I7fP4,Mahalini,6iBh7T1cUR8MPrtly5xugU,fábula,2023-01-23,https://api.spotify.com/v1/tracks/6fX8WwxAQ6rC...,...,122.099,audio_features,spotify:track:6fX8WwxAQ6rCPIC7lMgztu,https://api.spotify.com/v1/tracks/6fX8WwxAQ6rC...,https://api.spotify.com/v1/audio-analysis/6fX8...,276019,4,pop,https://i.scdn.co/image/ab6761610000e5ebb83334...,pop


In [49]:
artists.head()

Unnamed: 0.1,Unnamed: 0,artist_id,artist,image,genres,popularity,link,followers
0,0,7Ln5yumFjHCkeZ8bAzHUcp,Yovie Widianto,https://i.scdn.co/image/ab6761610000e5eb5146b8...,['indonesian pop'],64,https://api.spotify.com/v1/artists/7Ln5yumFjHC...,198778
1,1,6FTLayBxjkQeanFdUusk1I,Fabio Asher,https://i.scdn.co/image/ab6761610000e5ebd7384b...,['indonesian pop'],65,https://api.spotify.com/v1/artists/6FTLayBxjkQ...,736372
2,2,4AK6F7OLvEQ5QYCBNiQWHq,One Direction,https://i.scdn.co/image/5bb443424a1ad71603c43d...,"['boy band', 'pop', 'post-teen pop', 'talent s...",85,https://api.spotify.com/v1/artists/4AK6F7OLvEQ...,31158728
3,3,0Pk4JEXgC64RBmovnQDZ27,Jogja Hip Hop Foundation,https://i.scdn.co/image/ab67616d0000b273de748f...,"['indonesian hip hop', 'lagu jawa']",62,https://api.spotify.com/v1/artists/0Pk4JEXgC64...,623252
4,4,3wOsYKZM0zcKNasi3I7fP4,Mahalini,https://i.scdn.co/image/ab6761610000e5ebb83334...,['indonesian pop'],75,https://api.spotify.com/v1/artists/3wOsYKZM0zc...,5636815


In [50]:
audio = audio.merge(artists[['artist_id', 'image']], on='artist_id')

In [51]:
audio = audio.rename(columns={0:'num_charts'}, inplace=True)
audio.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [45]:
audio.to_csv('datasets/audio_features.csv')