In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import nltk
from nltk import word_tokenize, tokenize
from nltk.corpus import stopwords
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ailee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ailee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [130]:
data = pd.read_csv('data144_final_project_cleaned_data.csv')
data = data.sample(frac = 1).reset_index()

In [131]:
Y = data['genre']
data = data.drop(columns = ['Unnamed: 0', 'song', 'artist', 'index'], axis = 1)
X = data[['genre', 'lyrics']]

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 42)
X_train = X_train.reset_index().drop(columns = ['index'], axis = 1)
X_test = X_test.reset_index().drop(columns = ['index'], axis = 1)
y_train = y_train.reset_index().drop(columns = ['index'], axis = 1)
y_test = y_test.reset_index().drop(columns = ['index'], axis = 1)

In [133]:
X_train['lyrics'] = X_train['lyrics'].astype(str)
X_train['lyrics'] = X_train['lyrics'].map(lambda x: x.lower())

X_test['lyrics'] = X_test['lyrics'].astype(str)
X_test['lyrics'] = X_test['lyrics'].map(lambda x: x.lower())

In [134]:
from string import punctuation
def remove_punctuation(document):
    no_punct = ''.join([character for character in document if character not in punctuation])
    return no_punct

X_train['lyrics'] = X_train['lyrics'].apply(remove_punctuation)
X_test['lyrics'] = X_test['lyrics'].apply(remove_punctuation)

In [135]:
X_train['lyrics']

0        i never knew what this could do seeing another...
1        i wonder if im alive breathe slowly open your ...
2        craig david dim chris ft rosette bitsn pieces ...
3        tale as old as time true as it can be barely e...
4        hampstead girl on parliament hill drinks in th...
                               ...                        
63556    so long weve been married last burdens weve ca...
63557     alright go this comes from the e chapter 8 ve...
63558    it had taken her a long time suddenly back on ...
63559    hey hey hey hey where my party people oh oh oh...
63560    and heres a letter from bill macy of gainesvil...
Name: lyrics, Length: 63561, dtype: object

In [136]:
def remove_digit(document): 
    no_digit = ''.join([character for character in document if not character.isdigit()])      
    return no_digit

X_train['lyrics'] = X_train['lyrics'].apply(remove_digit)
X_test['lyrics'] = X_test['lyrics'].apply(remove_digit)

In [137]:
X_train['lyrics']

0        i never knew what this could do seeing another...
1        i wonder if im alive breathe slowly open your ...
2        craig david dim chris ft rosette bitsn pieces ...
3        tale as old as time true as it can be barely e...
4        hampstead girl on parliament hill drinks in th...
                               ...                        
63556    so long weve been married last burdens weve ca...
63557     alright go this comes from the e chapter  ver...
63558    it had taken her a long time suddenly back on ...
63559    hey hey hey hey where my party people oh oh oh...
63560    and heres a letter from bill macy of gainesvil...
Name: lyrics, Length: 63561, dtype: object

In [138]:
X_train['lyrics'] = X_train['lyrics'].apply(word_tokenize)

X_test['lyrics'] = X_test['lyrics'].apply(word_tokenize)

In [139]:
X_train['lyrics']

0        [i, never, knew, what, this, could, do, seeing...
1        [i, wonder, if, im, alive, breathe, slowly, op...
2        [craig, david, dim, chris, ft, rosette, bitsn,...
3        [tale, as, old, as, time, true, as, it, can, b...
4        [hampstead, girl, on, parliament, hill, drinks...
                               ...                        
63556    [so, long, weve, been, married, last, burdens,...
63557    [alright, go, this, comes, from, the, e, chapt...
63558    [it, had, taken, her, a, long, time, suddenly,...
63559    [hey, hey, hey, hey, where, my, party, people,...
63560    [and, heres, a, letter, from, bill, macy, of, ...
Name: lyrics, Length: 63561, dtype: object

In [140]:
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

def remove_stopwords(text):
    words = [word for word in text if not word in stop_words] 
    return words
X_train['lyrics'] = X_train['lyrics'].apply(remove_stopwords)
X_test['lyrics'] = X_test['lyrics'].apply(remove_stopwords)

In [141]:
# additional_stoplist = ['.', '!', '\'s', '?', ';', 'n\'t', '\'ll', 'would', '--', 'ta', 'wan', 'ai',
#                     'na', 'ya', 'could', 'It', 'am', '\'m', ',', '\'', '\'re', 'u', '``', '\'\'',
#                               'wa', 'ca', '\'em', '...', ':', 'em', 'wit', 'wo', 'ya', 'gon', 'y\'all', 
# #                                '\'ve', 'im', '\'cause', 'cause', '\'d', '-' 'ha', 'un']

In [142]:
X_train['lyrics']

0        [never, knew, could, seeing, another, woman, c...
1        [wonder, im, alive, breathe, slowly, open, eye...
2        [craig, david, dim, chris, ft, rosette, bitsn,...
3        [tale, old, time, true, barely, even, friends,...
4        [hampstead, girl, parliament, hill, drinks, si...
                               ...                        
63556    [long, weve, married, last, burdens, weve, car...
63557    [alright, go, comes, e, chapter, verse, reads,...
63558    [taken, long, time, suddenly, back, sit, frien...
63559    [hey, hey, hey, hey, party, people, oh, oh, oh...
63560    [heres, letter, bill, macy, gainesville, flori...
Name: lyrics, Length: 63561, dtype: object

In [143]:
def stemmer(text):
    stemmed_document = [porter.stem(word) for word in text]
    return stemmed_document
X_train['lyrics'] = X_train['lyrics'].apply(stemmer)
X_test['lyrics'] = X_test['lyrics'].apply(stemmer)

In [144]:
X_train_copy = X_train

X_test_copy = X_test

In [145]:
X_train_copy_metal = X_train_copy[X_train_copy['genre'] == 'Metal']
X_train_copy_pop = X_train_copy[X_train_copy['genre'] == 'Pop']
X_train_copy_hiphop = X_train_copy[X_train_copy['genre'] == 'Hip-Hop']
X_train_copy_country = X_train_copy[X_train_copy['genre'] == 'Country']

In [146]:
## run for each genre
def codee(df):
    df_lyrics = df.drop(columns = ['genre'])
    lyrics_detokenize = df_lyrics['lyrics'].apply(TreebankWordDetokenizer().detokenize)
    docs = lyrics_detokenize.str.cat(sep = ' ')
    tfidf_vectorizer = TfidfVectorizer(use_idf=True) 
    tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform([docs]) 
    df = pd.DataFrame(tfidf_vectorizer_vectors.T.todense(), index = tfidf_vectorizer.get_feature_names()) 
    return df

In [147]:
metal = codee(X_train_copy_metal)
pop = codee(X_train_copy_pop)
hiphop = codee(X_train_copy_hiphop)
country = codee(X_train_copy_country)

In [148]:
pop

Unnamed: 0,0
aa,0.000154
aaa,0.000062
aaaa,0.000023
aaaaa,0.000015
aaaaaa,0.000015
...,...
zuo,0.000015
zuvaloppa,0.000023
zwentendorf,0.000008
zz,0.000008


In [149]:
metal_top20 = metal.sort_values(by=[0],ascending=False).iloc[:30,:].reset_index()
pop_top20 = pop.sort_values(by=[0],ascending=False).iloc[:30,:].reset_index()
hiphop_top20 = hiphop.sort_values(by=[0],ascending=False).iloc[:30,:].reset_index()
country_top20 = country.sort_values(by=[0],ascending=False).iloc[:30,:].reset_index()
all_genre = np.concatenate([np.array(country_top20['index']) , np.array(metal_top20['index']) , np.array(pop_top20['index']) , np.array(hiphop_top20['index'])])

In [151]:
X_train_copy

Unnamed: 0,genre,lyrics
0,Country,"[never, knew, could, see, anoth, woman, cant, ..."
1,Metal,"[wonder, im, aliv, breath, slowli, open, eye, ..."
2,Pop,"[craig, david, dim, chri, ft, rosett, bitsn, p..."
3,Pop,"[tale, old, time, true, bare, even, friend, so..."
4,Pop,"[hampstead, girl, parliament, hill, drink, sit..."
...,...,...
63556,Country,"[long, weve, marri, last, burden, weve, carri,..."
63557,Hip-Hop,"[alright, go, come, e, chapter, vers, read, ti..."
63558,Metal,"[taken, long, time, suddenli, back, sit, frien..."
63559,Country,"[hey, hey, hey, hey, parti, peopl, oh, oh, oh,..."


In [152]:
# df_lyrics_train = X_train_copy.drop(columns = ['genre'])
# lyrics_detokenize_train = df_lyrics_train['lyrics'].apply(TreebankWordDetokenizer().detokenize)

# df_lyrics_test = X_test_copy.drop(columns = ['genre'])
# lyrics_detokenize_test = df_lyrics_test['lyrics'].apply(TreebankWordDetokenizer().detokenize)

In [153]:
## keep only words in top 20
def remove_word(text):
    removed_document = [word for word in text if word in all_genre]
    return removed_document
X_train_copy['lyrics'] = X_train_copy['lyrics'].apply(remove_word)
X_test_copy['lyrics'] = X_test_copy['lyrics'].apply(remove_word)

In [154]:
lyrics_detokenize_xtest = X_test_copy['lyrics'].apply(TreebankWordDetokenizer().detokenize)

lyrics_detokenize_xtrain = X_train_copy['lyrics'].apply(TreebankWordDetokenizer().detokenize)

In [155]:
countVec = CountVectorizer()

In [156]:
lyrics_detokenize_xtrain

0        never see cant caus never get babi got get tak...
1                             im eye see feel know let let
2                                                         
3                  time littl say littl one time time time
4        girl eye see like love back take eye love take...
                               ...                        
63556    love make oh mind live life know love go say w...
63557    go come time get get like caus shit like know ...
63558    time back im feel got need heart night night w...
63559    oh oh oh oh back well go babi come well like n...
63560                                                  day
Name: lyrics, Length: 63561, dtype: object

In [157]:
word_count_vector = countVec.fit_transform(lyrics_detokenize_xtrain)

word_count_vector_test = countVec.fit_transform(lyrics_detokenize_xtest)

In [158]:
dtm_body = pd.DataFrame(word_count_vector.toarray(), columns=countVec.get_feature_names())

dtm_body_test = pd.DataFrame(word_count_vector_test.toarray(), columns=countVec.get_feature_names())

In [159]:
dtm_body

Unnamed: 0,aint,away,babi,back,bitch,cant,caus,come,day,die,...,take,that,time,want,way,well,world,ya,yeah,your
0,0,0,1,2,0,2,2,0,0,0,...,2,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
4,0,0,0,2,0,0,0,0,0,0,...,3,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63556,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,0,0,0,0
63557,2,1,0,1,0,0,2,2,0,0,...,0,1,1,0,0,0,0,1,0,0
63558,0,0,0,2,0,0,0,0,0,0,...,0,0,4,0,1,0,0,0,0,0
63559,3,4,1,1,0,0,0,4,0,0,...,0,1,0,0,0,8,0,0,4,0


In [160]:
X_model = dtm_body[all_genre]

X_mod_test = dtm_body_test[all_genre]

In [161]:
X_model

Unnamed: 0,love,im,know,dont,like,time,go,one,oh,come,...,come.1,want,ya,man,time.1,that,say,girl,never,cant
0,2,0,1,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,3,2
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,4,0,1,0,0,...,0,0,0,0,4,0,1,0,0,0
4,2,0,0,0,2,1,1,0,0,0,...,0,0,0,0,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63556,3,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,1,3,0,0,0
63557,0,0,1,1,4,1,2,0,0,2,...,2,0,1,0,1,1,0,0,0,0
63558,0,1,0,0,0,4,0,0,0,0,...,0,0,0,1,4,0,0,0,0,0
63559,0,0,2,2,1,0,8,0,23,4,...,4,0,0,0,0,1,0,0,0,0


In [162]:
y_train

Unnamed: 0,genre
0,Country
1,Metal
2,Pop
3,Pop
4,Pop
...,...
63556,Country
63557,Hip-Hop
63558,Metal
63559,Country


In [165]:

knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_model,y_train)
y_pred = knn.predict(X_mod_test)
print(accuracy_score(y_test, y_pred))

  knn.fit(X_model,y_train)


0.5669514324821825


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=74f32f54-e450-409c-aa59-e7d6d7d12c9b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>