# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw lyrics dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('lyrics_dataset.csv')

In [4]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [5]:
# Keep only the columns used downstream, then discard every row that is
# missing a value in any of them.
genre_lyrics = data.loc[:, ['genre', 'lyrics', 'song', 'artist']].dropna()
genre_lyrics.head()

Unnamed: 0,genre,lyrics,song,artist
0,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",ego-remix,beyonce-knowles
1,Pop,"playin' everything so easy,\nit's like you see...",then-tell-me,beyonce-knowles
2,Pop,If you search\nFor tenderness\nIt isn't hard t...,honesty,beyonce-knowles
3,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",you-are-my-rock,beyonce-knowles
4,Pop,"Party the people, the people the party it's po...",black-culture,beyonce-knowles


### Removing rows that have no genre ('Not Available') or whose genre is 'Other'

In [6]:
# Drop the placeholder genres in a single pass. The explicit .copy() makes
# the filtered frame independent of its parent, so assigning to its columns
# later on does not raise SettingWithCopyWarning.
genre_lyrics = genre_lyrics.loc[
    ~genre_lyrics['genre'].isin(['Not Available', 'Other'])
].copy()
genre_lyrics['genre'].value_counts()

Rock          109235
Pop            40466
Hip-Hop        24850
Metal          23759
Country        14387
Jazz            7970
Electronic      7966
R&B             3401
Indie           3149
Folk            2243
Name: genre, dtype: int64

### Many of the lyrics contained unnecessary characters that would reduce the effectiveness of our bag-of-words model when performing Naive Bayes or Logistic Regression. As a result, we removed characters/words such as '\n', [Verse 1], Instrumental, Chorus, etc.

In [7]:
# Put each lyric on a single line by turning newlines into spaces; str()
# coerces any non-string entry before the replacement.
genre_lyrics['lyrics'] = genre_lyrics['lyrics'].map(
    lambda lyric: str(lyric).replace('\n', ' ')
)
genre_lyrics.head()

Unnamed: 0,genre,lyrics,song,artist
0,Pop,"Oh baby, how you doing? You know I'm gonna cut...",ego-remix,beyonce-knowles
1,Pop,"playin' everything so easy, it's like you seem...",then-tell-me,beyonce-knowles
2,Pop,If you search For tenderness It isn't hard to ...,honesty,beyonce-knowles
3,Pop,"Oh oh oh I, oh oh oh I [Verse 1:] If I wrote a...",you-are-my-rock,beyonce-knowles
4,Pop,"Party the people, the people the party it's po...",black-culture,beyonce-knowles


In [8]:
def remove_brackets(s):
    """Return ``s`` with every square-bracketed tag (e.g. ``[Verse 1:]``) removed.

    The string is stripped of leading/trailing whitespace first, then scanned
    once from left to right. A tag runs from a ``[`` to the next ``]``; if
    several ``[`` appear before a ``]``, the tag starts at the most recent one.
    Brackets without a partner are kept as ordinary text. Whitespace that
    surrounded a removed tag is left in place.

    Parameters
    ----------
    s : str
        Lyrics text that may contain bracketed section tags.

    Returns
    -------
    str
        The stripped text with all bracketed tags deleted.
    """
    s = s.strip()
    kept = []          # characters of the result built so far
    open_at = None     # position in `kept` of the latest unmatched '['
    for ch in s:
        if ch == '[':
            open_at = len(kept)
            kept.append(ch)
        elif ch == ']' and open_at is not None:
            # Closing a tag: discard everything from its '[' onwards. Working
            # on the output buffer means positions never go stale, unlike
            # slicing the original string after earlier removals.
            del kept[open_at:]
            open_at = None
        else:
            kept.append(ch)
    return ''.join(kept)

In [9]:
# Strip bracketed section tags from every lyric; the helper is passed
# directly, no wrapping lambda needed.
genre_lyrics['lyrics'] = genre_lyrics['lyrics'].map(remove_brackets)
genre_lyrics.head()

Unnamed: 0,genre,lyrics,song,artist
0,Pop,"Oh baby, how you doing? You know I'm gonna cut...",ego-remix,beyonce-knowles
1,Pop,"playin' everything so easy, it's like you seem...",then-tell-me,beyonce-knowles
2,Pop,If you search For tenderness It isn't hard to ...,honesty,beyonce-knowles
3,Pop,"Oh oh oh I, oh oh oh I If I wrote a book abou...",you-are-my-rock,beyonce-knowles
4,Pop,"Party the people, the people the party it's po...",black-culture,beyonce-knowles


In [10]:
# Remove the remaining literal section markers in one pass over the column.
# The replacements run in the same order as before and are case-sensitive.
genre_lyrics['lyrics'] = genre_lyrics['lyrics'].map(
    lambda text: text.replace('INSTRUMENTAL', '')
                     .replace('(Chorus)', '')
                     .replace('Chorus:', '')
)
genre_lyrics.head()

Unnamed: 0,genre,lyrics,song,artist
0,Pop,"Oh baby, how you doing? You know I'm gonna cut...",ego-remix,beyonce-knowles
1,Pop,"playin' everything so easy, it's like you seem...",then-tell-me,beyonce-knowles
2,Pop,If you search For tenderness It isn't hard to ...,honesty,beyonce-knowles
3,Pop,"Oh oh oh I, oh oh oh I If I wrote a book abou...",you-are-my-rock,beyonce-knowles
4,Pop,"Party the people, the people the party it's po...",black-culture,beyonce-knowles


In [11]:
# Re-check the per-genre row counts after the text cleaning steps above.
genre_lyrics['genre'].value_counts()

Rock          109235
Pop            40466
Hip-Hop        24850
Metal          23759
Country        14387
Jazz            7970
Electronic      7966
R&B             3401
Indie           3149
Folk            2243
Name: genre, dtype: int64

### At this point, we removed the genres which have fewer than 8000 training examples to ensure we have a large enough sample size of the lyrics. Then we sampled 2000 rows from each of the following four classes: Rock, Pop, Hip-Hop, and Metal, to ensure there is no class imbalance.

In [12]:
# Balanced sampling: draw the same number of rows from each target genre.
# SAMPLE_SEED pins the random draw so that Restart & Run All reproduces
# exactly the same rows (and therefore the same exported CSV).
SAMPLES_PER_GENRE = 2000
SAMPLE_SEED = 42

rock, pop, hip_hop, metal = (
    genre_lyrics.loc[genre_lyrics['genre'] == genre].sample(
        n=SAMPLES_PER_GENRE, random_state=SAMPLE_SEED
    )
    for genre in ('Rock', 'Pop', 'Hip-Hop', 'Metal')
)

In [14]:
# Stack the four per-genre samples into one frame. ignore_index is not set,
# so each row keeps the index label it had in genre_lyrics.
final = pd.concat([rock, pop, hip_hop, metal])

In [15]:
# Sanity check: every genre should contribute the same number of rows.
final['genre'].value_counts()

Pop        2000
Hip-Hop    2000
Rock       2000
Metal      2000
Name: genre, dtype: int64

### Converting to CSV which will be used for the lyrics training and testing, and also for determining additional features using the Spotify API.

In [16]:
# Write the balanced sample to disk. With the default to_csv settings the
# DataFrame index is written as the first (unnamed) column.
final.to_csv("cleaned_nn.csv")