### Read the files from Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so `drive` can be used to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch the training CSV by its Drive file id and store it locally as train.csv.
downloaded = drive.CreateFile({'id':"13H3uAYY6DpKO9zJkniPAckyUwzxGubL-"})
downloaded.GetContentFile('train.csv')

# Same for the test CSV; `downloaded` is rebound to the second file.
downloaded = drive.CreateFile({'id':"1i5OOpKZPEwXxw4UB2oryMdhhwXNfAfwJ"})
downloaded.GetContentFile('test.csv')

### Reading dataset

In [None]:
import pandas as pd
# Load the downloaded training CSV and display its column names.
df_train = pd.read_csv('train.csv')
df_train.columns

Index(['Artist', 'Song', 'Genre', 'Language', 'Lyrics'], dtype='object')

### 2.1- EDA

In [None]:
# Non-null count of every column per genre, ordered by number of lyrics (largest first).
df_train.groupby('Genre').count().sort_values(by='Lyrics', ascending=False)

Unnamed: 0_level_0,Artist,Song,Language,Lyrics
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rock,121404,121403,121403,121392
Pop,108714,108714,108712,108695
Metal,20291,20291,20290,20287
Jazz,13545,13545,13545,13545
Folk,8644,8644,8644,8644
Indie,8449,8449,8449,8449
R&B,2793,2793,2793,2793
Hip-Hop,2240,2240,2240,2240
Electronic,2213,2213,2213,2213
Country,1890,1890,1890,1890


In [None]:
# Non-null count of every column per language, ordered by number of lyrics (largest first).
df_train.groupby('Language').count().sort_values(by='Lyrics', ascending=False)

Unnamed: 0_level_0,Artist,Song,Genre,Lyrics
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
en,250197,250196,250197,250197
pt,30102,30102,30102,30102
es,3892,3892,3892,3892
ro,1184,1184,1184,1184
it,808,808,808,808
id,737,737,737,737
fr,644,644,644,644
de,478,478,478,478
sw,304,304,304,304
so,229,229,229,229


In [None]:
def clean_train(df):
  """Select the training rows and columns used by the rest of the notebook.

  A row is kept when its Lyrics, Genre and Artist are all non-null, its
  Language is 'en', 'es' or 'pt', and its Genre is not Country, Electronic,
  Hip-Hop or R&B.

  Args:
    df: raw training frame with at least the columns Artist, Language,
      Lyrics and Genre.

  Returns:
    A new frame restricted to the columns Artist, Language, Lyrics, Genre
    (in that order); the original row index labels are preserved.
  """
  required = ['Lyrics', 'Genre', 'Artist']
  kept_languages = ['en', 'es', 'pt']
  dropped_genres = ['Country', 'Electronic', 'Hip-Hop', 'R&B']

  # One combined mask instead of a chain of successive filters.
  keep = (
      df[required].notnull().all(axis=1)
      & df['Language'].isin(kept_languages)
      & ~df['Genre'].isin(dropped_genres)
  )
  return df.loc[keep, ['Artist', 'Language', 'Lyrics', 'Genre']]

In [None]:
# Apply the row/column filter to the training data and check the resulting shape.
df_train = clean_train(df_train)
df_train.shape

(275102, 4)

In [None]:
! pip install stopwordsiso

Collecting stopwordsiso
[?25l  Downloading https://files.pythonhosted.org/packages/3e/03/4c5f24b654bb9459f81aa5c1b60b094b804286b99dca9f2e116c9eb01ac8/stopwordsiso-0.6.1-py3-none-any.whl (73kB)
[K     |████▌                           | 10kB 12.5MB/s eta 0:00:01[K     |█████████                       | 20kB 14.2MB/s eta 0:00:01[K     |█████████████▍                  | 30kB 10.8MB/s eta 0:00:01[K     |█████████████████▉              | 40kB 8.7MB/s eta 0:00:01[K     |██████████████████████▎         | 51kB 7.6MB/s eta 0:00:01[K     |██████████████████████████▊     | 61kB 8.8MB/s eta 0:00:01[K     |███████████████████████████████▏| 71kB 9.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.6MB/s 
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1


In [None]:
from stopwordsiso import stopwords

# Stop words for the three languages kept by clean_train (English, Spanish,
# Portuguese); consulted by the text-cleaning helpers further down.
STOPWORDS = stopwords(['en', 'es', 'pt'])

In [None]:
import re
import numpy as np

def preprocess_text(txt):
  """Normalise one lyric into lower-case, stop-word-free text.

  Steps, in order: stringify and lower-case the input; replace every
  character outside a-z/A-Z with a space (this includes digits, punctuation
  and accented letters); drop single letters that have whitespace on both
  sides; remove tokens found in STOPWORDS; collapse whitespace runs.

  Args:
    txt: the raw lyric; any object is accepted and passed through ``str``.

  Returns:
    The remaining tokens joined by single spaces.
  """
  lowered = str(txt).lower()
  letters_only = re.sub('[^a-zA-Z]', ' ', lowered)
  without_single_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', letters_only)
  kept_tokens = [
      token for token in without_single_letters.split()
      if token not in STOPWORDS
  ]
  return re.sub(r'\s+', ' ', ' '.join(kept_tokens))

def getXs(df):
  """Return the cleaned form of every entry in ``df['Lyrics']``.

  Args:
    df: frame with a 'Lyrics' column.

  Returns:
    A list with one ``preprocess_text`` result per row, in row order.
  """
  return [preprocess_text(lyric) for lyric in df['Lyrics'].values]

def getYs(df):
  """Encode ``df['Genre']`` as integer class ids.

  Args:
    df: frame with a 'Genre' column containing only the six genres in the
      mapping below.

  Returns:
    A NumPy array with one id per row, in row order.

  Raises:
    KeyError: if a genre outside the mapping is encountered.
  """
  genre_ids = {
      'Indie': 0,
      'Folk': 1,
      'Jazz': 2,
      'Metal': 3,
      'Pop': 4,
      'Rock': 5,
  }
  return np.array([genre_ids[genre] for genre in df['Genre']])

In [None]:
# Overwrite the raw lyrics with their cleaned form and add the integer genre label.
df_train['Lyrics'] = getXs(df_train)
df_train['GenreY'] = getYs(df_train)

In [None]:
def clean_df(df, min_words=25, max_words=700):
  """Add a word-count column and keep only lyrics of a reasonable length.

  Args:
    df: frame with a 'Lyrics' column of whitespace-separated text.
    min_words: exclusive lower bound on the word count (default 25).
    max_words: exclusive upper bound on the word count (default 700).

  Returns:
    A new frame containing every column of ``df`` plus 'n_words', restricted
    to rows with ``min_words < n_words < max_words``. The input frame is not
    modified.
  """
  # .str.len() yields NaN for a missing lyric (the row is then dropped by the
  # comparisons below) instead of raising like len() on a float would.
  n_words = df['Lyrics'].str.split().str.len()
  # assign() works on a copy, so the caller's frame does not silently gain
  # a column (and no SettingWithCopyWarning is triggered on sliced frames).
  counted = df.assign(n_words=n_words)
  in_range = (counted['n_words'] > min_words) & (counted['n_words'] < max_words)
  return counted[in_range]

In [None]:
# Drop lyrics that are too short or too long (by word count) and check the shape.
df_train = clean_df(df_train)
df_train.shape

(255346, 6)

In [None]:
# Per-genre counts after cleaning, smallest class first.
df_train.groupby('Genre').count().sort_values('Lyrics')

Unnamed: 0_level_0,Artist,Language,Lyrics,GenreY,n_words
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Indie,7341,7341,7341,7341,7341
Folk,7994,7994,7994,7994,7994
Jazz,11419,11419,11419,11419,11419
Metal,18348,18348,18348,18348,18348
Pop,101207,101207,101207,101207,101207
Rock,109037,109037,109037,109037,109037


### Balancing the training dataset

In [None]:
from sklearn.utils import shuffle
from imblearn.under_sampling import RandomUnderSampler

def sample_df(df, random_state=42):
  """Balance the classes of ``df`` by random under-sampling, then shuffle.

  Args:
    df: frame containing the integer label column 'GenreY' plus any other
      columns, which are carried along unchanged.
    random_state: seed passed to both the under-sampler and the shuffle so
      the balanced set is reproducible across runs. Pass None for a
      different random draw on every call.

  Returns:
    A new frame with the original columns ('GenreY' last) in shuffled row
    order. The input frame is not modified.
  """
  # Read the label without pop(): pop() would strip 'GenreY' from the
  # caller's frame, so re-running the calling cell would fail.
  y = df['GenreY'].values
  features = df.drop(columns='GenreY')

  undersample = RandomUnderSampler(sampling_strategy='auto',
                                   random_state=random_state)
  X, y = undersample.fit_resample(features.values, y)

  balanced = pd.DataFrame(X, columns=features.columns)
  balanced['GenreY'] = y
  return shuffle(balanced, random_state=random_state)

In [None]:
# Balance the training set across genres and check the resulting shape.
df_train = sample_df(df_train)
df_train.shape



(44046, 6)

In [None]:
# Per-genre counts after under-sampling, to confirm the classes are balanced.
df_train.groupby('Genre').count().sort_values('Lyrics')

Unnamed: 0_level_0,Artist,Language,Lyrics,n_words,GenreY
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Folk,7341,7341,7341,7341,7341
Indie,7341,7341,7341,7341,7341
Jazz,7341,7341,7341,7341,7341
Metal,7341,7341,7341,7341,7341
Pop,7341,7341,7341,7341,7341
Rock,7341,7341,7341,7341,7341


### Save Train

In [None]:
# Write the prepared training set to disk; the row index is not written.
df_train.to_csv('train_new.csv', index=False)

In [None]:
from google.colab import files
# Hand the prepared training CSV to the Colab file-download helper.
files.download('train_new.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Prepare Test

In [None]:
# Load the downloaded test CSV.
df_test = pd.read_csv('test.csv')

In [None]:
# Display the test columns (there is no 'Language' column, unlike the training file).
df_test.columns

Index(['Song', 'Song year', 'Artist', 'Genre', 'Lyrics', 'Track_id'], dtype='object')

In [None]:
def clean_test(df):
  """Select the test rows and columns used by the rest of the notebook.

  A row is kept when its Lyrics, Genre and Artist are all non-null and its
  Genre is not Country, Electronic, Hip-Hop or R&B. Unlike ``clean_train``
  there is no language filter here.

  Args:
    df: raw test frame with at least the columns Artist, Lyrics and Genre.

  Returns:
    A new frame restricted to the columns Artist, Lyrics, Genre (in that
    order); the original row index labels are preserved.
  """
  required = ['Lyrics', 'Genre', 'Artist']
  dropped_genres = ['Country', 'Electronic', 'Hip-Hop', 'R&B']

  # One combined mask instead of a chain of successive filters.
  keep = df[required].notnull().all(axis=1) & ~df['Genre'].isin(dropped_genres)
  return df.loc[keep, ['Artist', 'Lyrics', 'Genre']]

In [None]:
# Apply the row/column filter to the test data and check the resulting shape.
df_test = clean_test(df_test)
df_test.shape

(4995, 3)

In [None]:
import re
import numpy as np

def preprocess_textTest(txt):
  """Normalise one lyric without removing stop words.

  Performs the same character-level clean-up as ``preprocess_text`` but
  leaves stop words in place: stringify and lower-case, replace every
  character outside a-z/A-Z with a space, drop single letters that have
  whitespace on both sides, collapse whitespace runs.

  Args:
    txt: the raw lyric; any object is accepted and passed through ``str``.

  Returns:
    The cleaned text. Leading/trailing whitespace is collapsed to a single
    space, not stripped.
  """
  cleaned = str(txt).lower()
  # Order matters: non-letters first, then isolated letters, then whitespace runs.
  for pattern in ('[^a-zA-Z]', r"\s+[a-zA-Z]\s+", r'\s+'):
    cleaned = re.sub(pattern, ' ', cleaned)
  return cleaned

def getXsTest(df):
  """Return the ``preprocess_textTest`` result for every entry in ``df['Lyrics']``.

  Args:
    df: frame with a 'Lyrics' column.

  Returns:
    A list with one cleaned string per row, in row order.
  """
  return [preprocess_textTest(lyric) for lyric in df['Lyrics'].values]

In [None]:
# Overwrite the raw test lyrics with their cleaned form (stop words still
# present), add the integer genre label, and check the shape.
df_test['Lyrics'] = getXsTest(df_test)
df_test['GenreY'] = getYs(df_test)
df_test.shape

(4995, 4)

In [None]:
def preprocess_text2(txt):
  """Remove stop words from an already-normalised lyric.

  The text is split on whitespace, tokens found in STOPWORDS are dropped,
  and the rest is re-joined with single spaces. No lower-casing or
  character filtering happens here.

  Args:
    txt: the lyric text; any object is accepted and passed through ``str``.

  Returns:
    The remaining tokens joined by single spaces.
  """
  kept_tokens = [token for token in str(txt).split() if token not in STOPWORDS]
  return re.sub(r'\s+', ' ', ' '.join(kept_tokens))

def cleanLyric(df):
  """Return the ``preprocess_text2`` result for every entry in ``df['Lyrics']``.

  Args:
    df: frame with a 'Lyrics' column.

  Returns:
    A list with one stop-word-free string per row, in row order.
  """
  return [preprocess_text2(lyric) for lyric in df['Lyrics'].values]

In [None]:
# Strip stop words from the test lyrics, then apply the same word-count
# filter used for the training set and check the shape.
df_test['Lyrics'] = cleanLyric(df_test)
df_test = clean_df(df_test)
df_test.shape

(4636, 5)

In [None]:
# Inspect the five longest remaining test lyrics.
df_test.sort_values('n_words', ascending=False).head()

Unnamed: 0,Artist,Lyrics,Genre,GenreY,n_words
4812,celine-dion,eddy marnay michel legrand homage michel legra...,Pop,4,537
1010,frank-zappa,frank zappa guitar synclavier steve guitar ray...,Rock,5,445
866,do-or-die,told hang sellin haller remember runnin cain h...,Metal,3,384
599,bal-sagoth,kor avul thaa finest jewel crown realm sublime...,Metal,3,315
3384,genius,intro method johnny blaze special technique fu...,Metal,3,314


In [None]:
# Per-genre counts of the test set after length filtering.
df_test.groupby('Genre').count()

Unnamed: 0_level_0,Artist,Lyrics,GenreY,n_words
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Folk,471,471,471,471
Indie,462,462,462,462
Jazz,584,584,584,584
Metal,775,775,775,775
Pop,1053,1053,1053,1053
Rock,1291,1291,1291,1291


In [None]:
import time
from textblob import TextBlob

def getLanguage2(df, delay=1):
  """Detect the language of every entry in ``df['Lyrics']`` with TextBlob.

  Args:
    df: frame with a 'Lyrics' column of text.
    delay: seconds to pause after each row (default 1, the original fixed
      pause). Presumably this throttles requests to whatever backend
      ``detect_language`` uses -- confirm before lowering it.

  Returns:
    A list with one language code per row, in row order. Rows for which
    detection raised an exception are labelled "other".
  """
  # NOTE(review): TextBlob.detect_language is reported as deprecated in newer
  # TextBlob releases -- confirm the installed version still provides it,
  # otherwise every row silently falls back to "other".
  lans = []
  for lyric in df['Lyrics'].values:
    try:
      lan = TextBlob(lyric).detect_language()
    except Exception:
      # Best effort: an undetectable lyric must not abort the whole run.
      # `Exception` (rather than a bare except) lets KeyboardInterrupt and
      # SystemExit through, so this long loop can still be stopped.
      lan = "other"
    lans.append(lan)
    time.sleep(delay)
  return lans

In [None]:
# Slow cell: getLanguage2 pauses one second per row, so with the 4636 rows
# left at this point it needs well over an hour to finish.
df_test['Language'] = getLanguage2(df_test)

In [None]:
# Per-language counts of the detected test-set languages.
df_test.groupby('Language').count()

Unnamed: 0_level_0,Artist,Lyrics,Genre,GenreY,n_words
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bn,1,1,1,1,1
bs,1,1,1,1,1
cy,1,1,1,1,1
de,4,4,4,4,4
en,4567,4567,4567,4567,4567
es,5,5,5,5,5
fr,7,7,7,7,7
ga,13,13,13,13,13
gd,7,7,7,7,7
gu,1,1,1,1,1


In [None]:
# Keep only the languages that the training set was restricted to.
df_test = df_test[ df_test['Language'].isin(['en', 'es', 'pt']) ]
df_test.shape

(4574, 6)

In [None]:
# Put the test columns in the same order as the saved training frame.
df_test = df_test[['Artist','Language','Lyrics','Genre','n_words','GenreY']]

In [None]:
# Preview the first rows of the prepared test set.
df_test.head()

Unnamed: 0,Artist,Language,Lyrics,Genre,n_words,GenreY
1,the-elwins,en,cold hands sharpen axe criminal town gonna hur...,Indie,49,0
2,bullet-for-my-valentine,en,ready time war break fucking doors smash windo...,Metal,79,3
3,dream-street,en,change color hair yeah pairs shoes wear lot ch...,Pop,69,4
8,craig-cardiff,en,lost lost flood broke exposed love sleepy head...,Indie,27,0
10,carnal-forge,en,broken hopes lies blind penetrates eyes escape...,Metal,85,3


In [None]:
# Final per-genre counts of the test set (it is not re-balanced like the training set).
df_test.groupby('Genre').count()

Unnamed: 0_level_0,Artist,Language,Lyrics,n_words,GenreY
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Folk,443,443,443,443,443
Indie,459,459,459,459,459
Jazz,581,581,581,581,581
Metal,771,771,771,771,771
Pop,1033,1033,1033,1033,1033
Rock,1287,1287,1287,1287,1287


### Save Test

In [None]:
# Write the prepared test set to disk; the row index is not written.
df_test.to_csv('test_new.csv', index=False)

In [None]:
from google.colab import files
# Hand the prepared test CSV to the Colab file-download helper.
files.download('test_new.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>