In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the current Colab user (NOTE(review): presumably interactive — confirm).
auth.authenticate_user()
# Build a PyDrive client backed by the application-default Google credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch each Drive file by id and store its content under the given local name.
for file_id, local_name in (("13H3uAYY6DpKO9zJkniPAckyUwzxGubL-", 'train.csv'),
                            ("1i5OOpKZPEwXxw4UB2oryMdhhwXNfAfwJ", 'test.csv')):
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)

In [None]:
import pandas as pd

# Read the two downloaded CSV files and show their (rows, columns) shapes.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
df_train.shape, df_test.shape

((290183, 5), (7935, 6))

### Get Artists

In [None]:
# Stack train on top of test row-wise and renumber the rows 0..n-1.
all_data = pd.concat([df_train, df_test], ignore_index=True)
all_data.shape

(298118, 7)

In [None]:
# value_counts() already returns counts in descending order, so the first
# ten index labels are the ten most frequent artists; no extra sort needed.
top10 = all_data['Artist'].value_counts().head(10).index
top10

Index(['elvis presley', 'chris brown', 'elvis costello', 'ella fitzgerald',
       'the rolling stones', 'bee gees', 'glee', 'beyonce', 'bad religion',
       'elton john'],
      dtype='object')

In [None]:
# Keep only the rows whose artist is one of the ten most frequent.
df_top10 = all_data.loc[all_data['Artist'].isin(top10)]
df_top10.shape

(9275, 7)

In [None]:
# Non-null value count of every column, per artist.
df_top10.groupby('Artist').count()

Unnamed: 0_level_0,Song,Genre,Language,Lyrics,Song year,Track_id
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bad religion,752,752,752,751,0,0
bee gees,811,811,811,811,0,0
beyonce,752,752,750,752,2,2
chris brown,1239,1239,1239,1239,0,0
ella fitzgerald,874,874,874,874,0,0
elton john,728,728,728,728,0,0
elvis costello,923,923,923,923,0,0
elvis presley,1611,1611,1611,1611,0,0
glee,765,765,765,765,0,0
the rolling stones,820,820,820,820,0,0


In [None]:
# Keep only the artist label and the lyrics text. .copy() detaches the result
# from all_data, so columns assigned onto it later do not raise
# SettingWithCopyWarning or write into a temporary slice.
df_top10 = df_top10[['Artist', 'Lyrics']].copy()
df_top10.shape

(9275, 2)

### Clean Data

In [None]:
! pip install stopwordsiso

In [None]:
from stopwordsiso import stopwords

# English stopwords; preprocess_text drops every word found in this collection.
STOPWORDS = stopwords(['en'])

In [None]:
import re
import numpy as np

def preprocess_text(txt, stopwords=None):
  """Normalize raw lyrics into a space-separated string of lowercase words.

  Args:
    txt: Raw text. ``None`` and float NaN (a missing cell) are treated as
      empty text instead of being stringified to "none"/"nan".
    stopwords: Optional collection of words to drop. When omitted, the
      notebook-level ``STOPWORDS`` is used.

  Returns:
    The cleaned text: only ASCII letters, lowercased, with single-letter
    tokens and stopwords removed and words joined by exactly one space.
    An empty string when nothing survives.
  """
  # A missing value must not leak into the corpus as the literal word "nan".
  if txt is None or (isinstance(txt, float) and np.isnan(txt)):
    return ''
  if stopwords is None:
    stopwords = STOPWORDS
  # Everything that is not a letter becomes a separator.
  text = re.sub('[^a-zA-Z]', ' ', str(txt).lower())
  # Filtering on token length removes every single-letter token, including
  # ones at the string boundaries or in consecutive runs ("a b c").
  return ' '.join(
      word for word in text.split()
      if len(word) > 1 and word not in stopwords)

def getXs(df):
  """Return the preprocessed text of every df['Lyrics'] entry, in row order.

  Each value is passed through preprocess_text; the result is a plain list
  with one cleaned string per row.
  """
  return [preprocess_text(lyrics) for lyrics in df['Lyrics'].values]

def getYs(df):
  """Encode df['Artist'] as integer class labels.

  The label of an artist is its position in the fixed list below. Returns a
  NumPy array with one label per row; an artist outside the list raises
  KeyError.
  """
  artist_order = ('elvis presley', 'chris brown', 'elvis costello',
                  'ella fitzgerald', 'the rolling stones', 'bee gees',
                  'glee', 'beyonce', 'bad religion', 'elton john')
  label_of = {name: idx for idx, name in enumerate(artist_order)}
  return np.array([label_of[name] for name in df['Artist']])

In [None]:
# Replace the raw lyrics with their cleaned form and add the integer artist
# label. assign() returns a new frame, so nothing is written onto a slice of
# all_data (no SettingWithCopyWarning).
df_top10 = df_top10.assign(
    Lyrics=getXs(df_top10),
    ArtistY=getYs(df_top10),
)

In [None]:
# Show the first rows to sanity-check the cleaned lyrics and the labels.
df_top10.head()

Unnamed: 0,Artist,Lyrics,ArtistY
3791,bad religion,global citizen cuz blessed nationality growing...,8
3792,bad religion,infected hope break exist persist talk talk ta...,8
3793,bad religion,walk dinner gonna hands gonna mind tie haggard...,8
3794,bad religion,father hear curse day born sorrow hurting grou...,8
3795,bad religion,happy beautiful fuckin candy canes planes brig...,8


In [None]:
def clean_df(df, min_words=25, max_words=700):
  """Return the rows of ``df`` whose lyrics have an acceptable word count.

  Args:
    df: DataFrame with a string column 'Lyrics'.
    min_words: Exclusive lower bound on the whitespace-separated word count.
    max_words: Exclusive upper bound on the word count.

  Returns:
    A new DataFrame with an added 'n_words' column, restricted to rows where
    min_words < n_words < max_words. The input frame is left unmodified.
  """
  # .str.len() yields NaN for missing lyrics instead of raising like
  # apply(len); NaN fails both comparisons, so such rows are dropped.
  n_words = df['Lyrics'].str.split().str.len()
  in_range = (n_words > min_words) & (n_words < max_words)
  # assign() builds a new frame rather than adding a column to the caller's.
  return df.assign(n_words=n_words).loc[in_range]

In [None]:
# Keep only songs with more than 25 and fewer than 700 words; the result
# also carries the n_words column computed by clean_df.
df_top10 = clean_df(df_top10)
df_top10.shape

(8564, 4)

In [None]:
# Non-null counts per artist after filtering, ordered from fewest to most lyrics.
df_top10.groupby('Artist').count().sort_values('Lyrics')

Unnamed: 0_level_0,Lyrics,ArtistY,n_words
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
elton john,674,674,674
bad religion,713,713,713
beyonce,726,726,726
ella fitzgerald,729,729,729
glee,731,731,731
bee gees,751,751,751
the rolling stones,777,777,777
elvis costello,888,888,888
chris brown,1224,1224,1224
elvis presley,1351,1351,1351


### Save

In [None]:
# Write the cleaned table to top10.csv, leaving out the row index.
df_top10.to_csv('top10.csv', index=False)

In [None]:
from google.colab import files
# Hand the exported file to the Colab download helper (NOTE(review): presumably a browser download — confirm).
files.download('top10.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>