In [9]:
import pandas as pd
from porter_stemmer import PorterStemmer

In [32]:
stop_words = set([x.strip() for x in open("stopwords.english", encoding="ISO-8859-1").readlines()])
stemmer = PorterStemmer()

# from previous assignments
def protect_meta_characters (text):
    return text.replace(",", "COMMA").replace("=","EQUALS")

def makefeat(attribute, value):
    return attribute+"="+protect_meta_characters(value)

# custom lyrics tokenization for Genius formatting
def tokenize(lyrics):
    tokens = []
    # return tokenized lyrics
    lines = lyrics.splitlines()
    for line in lines:
        if line:
            # ignore verse and intro markers etc
            if line[0] == '[' and line[-1] == ']':
                continue
            tokens.extend([protect_meta_characters(stemmer.stem_token(word.lower())) for word in line.split(' ') if word.lower() not in stop_words])
    return tokens

# takes a song with lyrics and genre and creates a feature set
def generate_features(lyrics, genre):
    features = []
    tokens = tokenize(lyrics)

    # unigrams
    for token in tokens():
        features.append(makefeat('unigram', 'token'))
    
    # write features to out file
    # with open('./out/train.data', 'w') as f:




In [2]:
# load data
hiphop_df = pd.read_pickle('./hiphop_df')
pop_df = pd.read_pickle('./pop_df')

In [8]:
# create features on training dataset and train model
hiphop_df.apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'hiphop'))
pop_df.apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'pop'))


HIPHOP [Chorus: Drake & Lil
HIPHOP [Intro: Drake]
Yeah

HIPHOP [Intro]
(Wake up, F1
HIPHOP [Intro]
Got my steez
HIPHOP [Intro]
(CashMoneyAP
HIPHOP [Intro]
Yeah
FreshDu
HIPHOP [Intro]
Th-th-th-thi
HIPHOP [Intro]
Uh, uh, uh, 
HIPHOP [Intro]
No way, Hoza
HIPHOP [Intro]
Shotta
Turn 
HIPHOP [Intro]
You know wha
HIPHOP [Intro]
It ain't wha
HIPHOP [Intro]
(Pipe that s
HIPHOP [Intro]
(CashMoneyAP
HIPHOP [Intro]
Eh-eh-eh-eh,
HIPHOP [Intro]
CashMoneyAP

HIPHOP [Intro]
(Pipe that s
HIPHOP [Intro]
Ayy, mmm

[C
HIPHOP [Intro]
Only love co
HIPHOP [Verse 1]
Here we go
HIPHOP [Intro]
Yeah

[Verse
HIPHOP [Intro]
All you ever
HIPHOP [Verse 1]
We did som
HIPHOP [Intro]
They call me
HIPHOP [Intro]
(Damn, Kai, 
HIPHOP [Intro]
Baby, this t
HIPHOP [Verse 1]
Gone in on
HIPHOP [Verse 1]
All them m
HIPHOP [Intro]
Top priority
HIPHOP [Verse 1]
Chrome Hea
HIPHOP [Verse 1]
Yeah, they
HIPHOP [Verse 1]
Wonderin' 
HIPHOP [Verse 1]
Know my gr
HIPHOP [Intro]
(He's on—, h
HIPHOP [Intro]
(I know you'
HIPHOP [Refrain]
You

3     None
11    None
12    None
0     None
3     None
      ... 
6     None
7     None
9     None
10    None
11    None
Length: 108, dtype: object

In [33]:
# evaluate songs 
print(tokenize(pop_df.iloc[0]['lyrics']))

['embers', 'stayed', 'breeze', 'feel', 'elements', 'remind', 'beauty', 'bleak', 'stuck', 'long', 'lights', 'breathe?', 'ohCOMMA', 'loveCOMMA', 'feel', 'times', 'jumped', 'real', 'scars', 'healCOMMA', 'waves', 'break', 'boat', 'waves', 'brеak', 'boat', 'stones', 'crash', 'boardwalk', 'thе', 'wind', 'rush', 'trees', 'eyes', 'peeled', 'memories', 'fall', 'short', "could've", 'left', 'long', 'call', 'need?', 'ohCOMMA', 'loveCOMMA', 'feel', 'times', 'jumped', 'real', 'scars', 'healCOMMA', 'waves', 'break', 'boat', 'waves', 'break', 'boat', 'waves', 'break', 'boat', 'waves', 'break', 'boat', 'loveCOMMA', 'feel', 'times', 'jumped', 'real', 'scars', 'healCOMMA', 'waves', 'break', 'boat']
