Preprossesing includes 2 Parts: Data Selection and Data Preprocessing

---

Data Selection: has the selection of english songs and three different elimination parameters from the original data to identify songs to retrieve lyrics for.

Data Preprocessing: has lyrical data cleaning including stop word removals, lemmatization, tokenization and the creation of two different class labels: period and popularity per period.

#Mount Device#

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#Imports and Installations#

In [3]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l[K     |▍                               | 10 kB 32.7 MB/s eta 0:00:01[K     |▊                               | 20 kB 32.3 MB/s eta 0:00:01[K     |█                               | 30 kB 35.7 MB/s eta 0:00:01[K     |█▍                              | 40 kB 23.5 MB/s eta 0:00:01[K     |█▊                              | 51 kB 16.0 MB/s eta 0:00:01[K     |██                              | 61 kB 16.8 MB/s eta 0:00:01[K     |██▍                             | 71 kB 11.6 MB/s eta 0:00:01[K     |██▊                             | 81 kB 12.6 MB/s eta 0:00:01[K     |███                             | 92 kB 13.7 MB/s eta 0:00:01[K     |███▍                            | 102 kB 13.1 MB/s eta 0:00:01[K     |███▊                            | 112 kB 13.1 MB/s eta 0:00:01[K     |████                            | 122 kB 13.1 MB/s eta 0:00:01[K     |████▍                           | 133 kB 13.1 MB/s eta 0:00:

In [4]:
import pandas as pd
import os
import csv
import numpy as np

import nltk 
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from langdetect import detect

In [5]:
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#Common Function Declarations#

In [6]:
# add period column
def label_period (row):
   return (( row['year'] // 10) % 10) * 10

In [7]:
# amount of songs in each period
def songs_per_period(df):
  value_count = df['period'].value_counts()
  print(value_count)
  print("Total sum: ", value_count.sum())

In [8]:
# total score of popular songs in each period
def get_popular_count(rows):
  return rows['popularity'].sum()

def pop_songs_per_period(df):
  for i in range(0,90,10):
    count = get_popular_count(df[df['period'] == i])
    print(i, ": ", count)

In [9]:
# detect language of songs based on name, artist using ML algo
def label_language(row):
  try:
    lang = detect(row['name'])
  except:
    #look for english artist name in unidentified song names
    try:
      lang = detect(row['artists'])
    except:
      lang = "error"
      print("This row throws and error:", row['name'], "Artist: ", row['artists'],"Date: ", row['year'])
  return lang
  
# detect language of a word
def get_lang(word):
  try:
    lang = detect(word)
  except:
    lang = "error"
  return lang

In [10]:
#Drop periods: 20s,30s,40s, and 50s
def drop_periods(df):
  for i in range(20, 51, 10):
    df = df[df['period'] != i]
  return df

In [11]:
def divide_by_period(df):
  periods = [60, 70, 80, 90, 0, 10]
  dataframes = []
  for i in range(6):
    df_period = pd.DataFrame()
    df_period = df[df["period"] == periods[i]]
    print(df_period)
    dataframes.append(df_period)
  return dataframes

#Data Selection#

In [None]:
# data.csv is the untouched Spotify data from Kaggle
root = '/content/gdrive/My Drive/machine learning/preprocessing'
csv_path = os.path.join(root, 'data.csv')
df = pd.read_csv(csv_path)

In [None]:
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [None]:
df.shape[0] #row count

169909

In [None]:
df['period'] = df.apply (lambda row: label_period(row), axis=1)
df['period'].unique()

array([20, 30, 40, 50, 60, 70, 80, 90,  0, 10])

In [None]:
#delete further unnecessary columns
df = df.drop(columns=['acousticness','danceability','duration_ms', 'energy', 
                      'explicit', 'key', 'liveness', 'loudness', 'mode', 'release_date',
                      'tempo', 'valence', 'year'])

In [None]:
# Three different tracks to select songs in the data
#TRACK 1: speechiness > 0.33      -> track = 2
#TRACK 2: instrumentalness < 0.5  -> track = 3 -> FOUND TO BE THE OPTIMAL SELECTION
#TRACK 3: instrumentalness < 0.5 AND speechiness > 0.33 -> track = 6

track = 3

In [None]:
# delete songs with not enough lyrics 
if track % 2 == 0:
  df_track = df[df['speechiness'] > 0.33]
  df_track = df_track[df_track['instrumentalness'] != 0.0]
  
if track % 3 == 0:
  df_track = df[df['instrumentalness'] < 0.5]

df_track.head()

Unnamed: 0,artists,id,instrumentalness,name,popularity,speechiness,period
2,['Seweryn Goszczyński'],6L63VW0PibdM1HDSBoqnoM,0.0,Chapter 1.18 - Zamek kaniowski,0,0.929,20
6,"['Franz Liszt', 'Vladimir Horowitz']",6O0puPuyrxPjDTHDUgsWI7,0.435,"Valse oubliée No. 1 in F-Sharp Major, S. 215/1",0,0.04,20
8,"['Francisco Canaro', 'Charlo']",6OaJ8Bh7lsBeYoBmwmo2nh,0.206,Moneda Corriente - Remasterizado,0,0.127,20
9,['Seweryn Goszczyński'],6PrZexNb16cabXR8Q418Xc,0.0,Chapter 1.3 - Zamek kaniowski,0,0.954,20
15,"['George Butterworth', 'John Cameron']",6Sdpmree8xpGWaedACPMlP,6.3e-05,A Shropshire Lad: Is My Team Ploughing?,0,0.051,20


In [None]:
# amount of songs in each period
print("IN DF:")
songs_per_period(df)
print("IN DF Track:")
songs_per_period(df_track)

IN DF:
90    20000
80    20000
70    20000
60    20000
0     20000
50    19950
10    19900
40    14968
30     8889
20     6202
Name: period, dtype: int64
Total sum:  169909
IN DF Track:
10    18499
0     18446
90    17962
70    17847
80    17692
60    16738
50    14783
40     9083
30     6176
20     4140
Name: period, dtype: int64
Total sum:  141366


In [None]:
# USE IN CASE OF NEED -> TIME CONSUMING ALGORITHM
df_track['language'] = df_track.apply (lambda row: label_language(row), axis=1)
df_track.to_csv('df_lang.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# READ from already languaged data
df_track = pd.read_csv('df_lang.csv', index_col=0)

In [None]:
# Information about the languages in the current data
value_count = df_track['language'].value_counts()
print(value_count)

en       76545
de        9722
es        7072
it        5019
pt        3990
fr        3935
id        3421
tl        3129
nl        3006
af        2263
ru        1996
so        1894
no        1792
ca        1683
sw        1677
ro        1498
da        1422
cy        1314
fi        1296
et        1167
tr        1044
pl         974
sv         829
ko         623
sl         554
lt         532
el         513
hu         392
hr         388
sk         367
vi         242
sq         241
cs         235
zh-cn      184
zh-tw       99
lv          98
ja          74
he          43
ar          41
uk          35
bg           6
mk           5
fa           3
th           3
Name: language, dtype: int64


In [None]:
# Only get EN songs
df_track = df_track[df_track['language'] == 'en']

print("IN DF Track:")
songs_per_period(df_track)

IN DF Track:
70    11816
80    10982
60    10832
0     10177
10    10136
90     9769
50     7046
40     2428
30     1917
20     1442
Name: period, dtype: int64
Total sum:  76545


In [None]:
# Drop column no longer needed
df_track = df_track.drop(columns=['language', 'instrumentalness', 'speechiness'])

In [None]:
#Drop periods: 20s,30s,40s, and 50s
df_track = drop_periods(df_track)

In [None]:
df_track.head()

Unnamed: 0,artists,id,name,popularity,period
2055,"['John Hughes', 'Frank Asper', 'Mormon Taberna...",08nFFgiYbTGRxBaSYoXV2V,"Guide Us, O Thou Great Jehovah - Voice",13,60
2056,['Frankie Avalon'],0CUsZyJMSDjhdoQZxHhTtx,A Boy Without a Girl,13,60
2057,['The Everly Brothers'],0YBFwnejhmHiukKks1U3xu,Memories Are Made of This - Remastered Version,14,60
2058,"['John Bacchus Dykes', 'Mormon Tabernacle Choi...",0uMVXukzzOiWfSb1Lkq6k8,"Holy, Holy, Holy - Voice",13,60
2059,['The Everly Brothers'],0w5LsmxCPcWsHATkKyudLz,That's Just Too Much,17,60


In [None]:
# WRITE to CSV
df_track.to_csv('df_eng.csv')

#Data Preprocessing#

In [12]:
#READ lyrical data
csv_path = '/content/gdrive/MyDrive/machine learning/preprocessing/lyrics.csv'
lyrical_df = pd.read_csv(csv_path, index_col=0)
lyrical_df.head()


Unnamed: 0,artists,id,name,popularity,period,lyrics
0,['Frankie Avalon'],0CUsZyJMSDjhdoQZxHhTtx,A Boy Without a Girl,13.0,60.0,A boy without a girl\nIs a song without a tune...
1,['The Everly Brothers'],0YBFwnejhmHiukKks1U3xu,Memories Are Made of This - Remastered Version,14.0,60.0,The sweet sweet memories you gave-a me\nYou ca...
2,['Annette Funicello'],1kOKcxYSK4FID1apuNV8qe,How Will I Know My Love,11.0,60.0,How will I know my love?\nHow will I know my d...
3,['Chuck Berry'],26VeLcnNOv7iOtzb7VuHZF,Confessin' The Blues,20.0,60.0,"Baby, here I stand before you\nWith my heart i..."
4,['Elmore James'],2cr2rjdV2JOz1J0prTiPOG,Shake Your Money Maker,14.0,60.0,Shake your moneymaker\nShake your moneymaker\n...


In [13]:
# remove any nan lyrics from the data
songs_per_period(lyrical_df)

lyrical_df.replace('', np.nan, inplace = True)
lyrical_df = lyrical_df.dropna()

songs_per_period(lyrical_df)

70.0    7978
80.0    7298
90.0    5740
0.0     5137
60.0    4738
10.0    4426
Name: period, dtype: int64
Total sum:  35317
70.0    7380
80.0    6777
90.0    5280
0.0     4811
60.0    4152
10.0    4126
Name: period, dtype: int64
Total sum:  32526


In [14]:
# check if lyrics contain floats
df_num = lyrical_df.select_dtypes(include=[np.float])
print(df_num)

      popularity  period
0           13.0    60.0
1           14.0    60.0
2           11.0    60.0
3           20.0    60.0
4           14.0    60.0
...          ...     ...
4030        59.0    10.0
4031        59.0    10.0
4032        58.0    10.0
4033        61.0    10.0
4034        67.0    10.0

[32526 rows x 2 columns]


**TOKENIZATION OF LYRICAL DATA**

In [15]:
def remove_punc(df):
  #remove punctuations and turn case to lower
  df['lyrics'] = df.apply (lambda row: " ".join(re.findall("[\w][\w]+", row['lyrics'])).lower(), axis=1)
  #reduce multiple space to single space
  df['lyrics'] = df.apply (lambda row: re.sub('[\s]+', ' ', row['lyrics']), axis=1)
  return df

lyrical_df = remove_punc(lyrical_df)
lyrical_df.head()

Unnamed: 0,artists,id,name,popularity,period,lyrics
0,['Frankie Avalon'],0CUsZyJMSDjhdoQZxHhTtx,A Boy Without a Girl,13.0,60.0,boy without girl is song without tune is year ...
1,['The Everly Brothers'],0YBFwnejhmHiukKks1U3xu,Memories Are Made of This - Remastered Version,14.0,60.0,the sweet sweet memories you gave me you can b...
2,['Annette Funicello'],1kOKcxYSK4FID1apuNV8qe,How Will I Know My Love,11.0,60.0,how will know my love how will know my darlin ...
3,['Chuck Berry'],26VeLcnNOv7iOtzb7VuHZF,Confessin' The Blues,20.0,60.0,baby here stand before you with my heart in my...
4,['Elmore James'],2cr2rjdV2JOz1J0prTiPOG,Shake Your Money Maker,14.0,60.0,shake your moneymaker shake your moneymaker sh...


In [16]:
#NEW stopwords list for song lyrics
def get_stopwords(print=False):
  lyrical_stopwords = list()
  lyrical_stopwords = stopwords.words("english")
  lyrical_stopwords.extend(["verse", "intro", "chorus"])
  return lyrical_stopwords

#global list to be used following functions
lyrical_stopwords = get_stopwords()
stopwords_set = set()

In [17]:
#Remove stops words & lemmatize
def clean_data(str, get_remove_words=False, eng_word=False):
  lemmatizer = WordNetLemmatizer()
  tokens = nltk.word_tokenize(str)

  remove_stopwords = list(filter(lambda token: token not in lyrical_stopwords,tokens))
  lemmatized_words = [lemmatizer.lemmatize(word) for word in remove_stopwords]
  if eng_word:
    lemmatized_words = [word for word in lemmatized_words if get_lang(word) == 'en']

  if get_remove_words:
    #collecting all removed words in a set
    removed_words = list(filter(lambda token: token in lyrical_stopwords,tokens))
    global stopwords_set
    for word in removed_words:
      stopwords_set.add(word)
  return " ".join(lemmatized_words)

lyrical_df['lyrics'] = lyrical_df.apply(lambda row: clean_data(row['lyrics']), axis=1)
lyrical_df.head()

Unnamed: 0,artists,id,name,popularity,period,lyrics
0,['Frankie Avalon'],0CUsZyJMSDjhdoQZxHhTtx,A Boy Without a Girl,13.0,60.0,boy without girl song without tune year withou...
1,['The Everly Brothers'],0YBFwnejhmHiukKks1U3xu,Memories Are Made of This - Remastered Version,14.0,60.0,sweet sweet memory gave beat memory gave take ...
2,['Annette Funicello'],1kOKcxYSK4FID1apuNV8qe,How Will I Know My Love,11.0,60.0,know love know darlin whippoorwill give sign k...
3,['Chuck Berry'],26VeLcnNOv7iOtzb7VuHZF,Confessin' The Blues,20.0,60.0,baby stand heart hand put mama hoping understa...
4,['Elmore James'],2cr2rjdV2JOz1J0prTiPOG,Shake Your Money Maker,14.0,60.0,shake moneymaker shake moneymaker shake moneym...


In [None]:
# clean from nan values again
lyrical_df.replace('', np.nan, inplace = True)
lyrical_df = lyrical_df.dropna()
print(lyrical_df.shape[0])

32507


In [None]:
# WRITE processed lyrical data
lyrical_df.to_csv('/content/gdrive/MyDrive/machine learning/preprocessing/LYRICS_PROCESSED.csv', index=False)

#Transforming Processed Data For Models

In [18]:
csv_path = '/content/gdrive/MyDrive/machine learning/preprocessing/LYRICS_PROCESSED.csv'
lyrical_df = pd.read_csv(csv_path, index_col=0)
lyrical_df.head()

Unnamed: 0_level_0,id,name,popularity,period,lyrics
artists,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
['Frankie Avalon'],0CUsZyJMSDjhdoQZxHhTtx,A Boy Without a Girl,13.0,60.0,boy without girl song without tune year withou...
['The Everly Brothers'],0YBFwnejhmHiukKks1U3xu,Memories Are Made of This - Remastered Version,14.0,60.0,sweet sweet memory gave beat memory gave take ...
['Annette Funicello'],1kOKcxYSK4FID1apuNV8qe,How Will I Know My Love,11.0,60.0,know love know darlin whippoorwill give sign k...
['Chuck Berry'],26VeLcnNOv7iOtzb7VuHZF,Confessin' The Blues,20.0,60.0,baby stand heart hand put mama hoping understa...
['Elmore James'],2cr2rjdV2JOz1J0prTiPOG,Shake Your Money Maker,14.0,60.0,shake moneymaker shake moneymaker shake moneym...


In [19]:
dataframes = divide_by_period(lyrical_df)

                                             id  ...                                             lyrics
artists                                          ...                                                   
['Frankie Avalon']       0CUsZyJMSDjhdoQZxHhTtx  ...  boy without girl song without tune year withou...
['The Everly Brothers']  0YBFwnejhmHiukKks1U3xu  ...  sweet sweet memory gave beat memory gave take ...
['Annette Funicello']    1kOKcxYSK4FID1apuNV8qe  ...  know love know darlin whippoorwill give sign k...
['Chuck Berry']          26VeLcnNOv7iOtzb7VuHZF  ...  baby stand heart hand put mama hoping understa...
['Elmore James']         2cr2rjdV2JOz1J0prTiPOG  ...  shake moneymaker shake moneymaker shake moneym...
...                                         ...  ...                                                ...
['Percy Sledge']         3yjuIBV7rnHvoPlMRjgis1  ...  living living living ohhhh nobody else nobody ...
['Frank Sinatra']        13qzHYbEqzcq34eZzYsKIs  ...  town lonel

In [20]:
# mean of popular songs in each period
def label_popularity(row, mean):
    return 1 if row["popularity"] > mean else 0

In [21]:
MAX_SONGS_PER_PERIOD = 4200
balanced = list()
for dataframe in dataframes:
  if dataframe.shape[0] > MAX_SONGS_PER_PERIOD:
    df = dataframe.iloc[:MAX_SONGS_PER_PERIOD]
  else:
    df = dataframe
  
  mean = df['popularity'].mean()
  print(df['period'].iloc[0], ": ", mean)
  df["pop_class"] = df.apply(lambda row: label_popularity(row, mean), axis=1)
  balanced.append(df)

balanced

60.0 :  28.33341363526861
70.0 :  32.342619047619046
80.0 :  35.22952380952381
90.0 :  42.499761904761904
0.0 :  48.12880952380952
10.0 :  58.44013572467281


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


[                                             id  ... pop_class
 artists                                          ...          
 ['Frankie Avalon']       0CUsZyJMSDjhdoQZxHhTtx  ...         0
 ['The Everly Brothers']  0YBFwnejhmHiukKks1U3xu  ...         0
 ['Annette Funicello']    1kOKcxYSK4FID1apuNV8qe  ...         0
 ['Chuck Berry']          26VeLcnNOv7iOtzb7VuHZF  ...         0
 ['Elmore James']         2cr2rjdV2JOz1J0prTiPOG  ...         0
 ...                                         ...  ...       ...
 ['Percy Sledge']         3yjuIBV7rnHvoPlMRjgis1  ...         1
 ['Frank Sinatra']        13qzHYbEqzcq34eZzYsKIs  ...         1
 ['Jefferson Airplane']   1xTyfejmnulYD3fO390sz9  ...         1
 ['The Band']             7aTCCNpHJD2Jgv0LLVJb8Y  ...         0
 ['The Woolies']          615QoDLpyIODGVvHEPP4pt  ...         1
 
 [4151 rows x 6 columns],
                                                          id  ... pop_class
 artists                                                      ..

In [22]:
# Write data per period
for df in balanced:
  period_name = int(df.iloc[0]["period"])
  filename = '/content/gdrive/My Drive/machine learning/DATA/per_period/period_' + str(period_name) + '.csv'
  df.to_csv(filename, index=False)

In [23]:
balanced = pd.concat(balanced)

In [24]:
# Write all data
balanced.to_csv('/content/gdrive/My Drive/machine learning/DATA/BALANCED.csv', index=False)

In [None]:
# EXTRA INFO
dataframes = divide_by_period(balanced)
for dataframe in dataframes:
    print(int(dataframe.iloc[0]["period"]))
    min = 100000
    max = 0
    total = 0
    index_max = 0
    index_min = 0
    count = 0
    for i in range(dataframe.shape[0]):
      lyrics = dataframe.iloc[i]["lyrics"]
      if(lyrics == "instrumental"):
        count += 1
      words = lyrics.split()
      number_of_words = len(words)
      if number_of_words <= min:
        min = number_of_words
        index_min = i
      if number_of_words >= max:
        max = number_of_words
        index_max = i
      total += number_of_words
    print("Max: ", max)
    print("Min: ", min)
    average = total / dataframe.shape[0]
    print("Avg: ", average)
    print("instrumental count: ", count)


                                             id  ... pop_class
artists                                          ...          
['Frankie Avalon']       0CUsZyJMSDjhdoQZxHhTtx  ...         0
['The Everly Brothers']  0YBFwnejhmHiukKks1U3xu  ...         0
['Annette Funicello']    1kOKcxYSK4FID1apuNV8qe  ...         0
['Chuck Berry']          26VeLcnNOv7iOtzb7VuHZF  ...         0
['Elmore James']         2cr2rjdV2JOz1J0prTiPOG  ...         0
...                                         ...  ...       ...
['Percy Sledge']         3yjuIBV7rnHvoPlMRjgis1  ...         1
['Frank Sinatra']        13qzHYbEqzcq34eZzYsKIs  ...         1
['Jefferson Airplane']   1xTyfejmnulYD3fO390sz9  ...         1
['The Band']             7aTCCNpHJD2Jgv0LLVJb8Y  ...         0
['The Woolies']          615QoDLpyIODGVvHEPP4pt  ...         1

[4151 rows x 6 columns]
                                                         id  ... pop_class
artists                                                      ...          
['Elto