In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from tqdm import tqdm
from sklearn.utils import resample

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

In [341]:
import nltk
nltk.download('wordnet')

import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')
    
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [342]:
!pip install eng-to-ipa
import eng_to_ipa as ipa



In [343]:
data_path = "/kaggle/input/genius-song-lyrics-with-language-information/song_lyrics.csv"
chunks = pd.read_csv(data_path, iterator=True, chunksize=1000)

In [344]:
new_data = pd.DataFrame()
for data in tqdm(chunks):
    data_filtered = data[data.language == "en"][["title", "artist", "tag", "lyrics"]]
    
    new_data = pd.concat([new_data, data_filtered])
    
    if min(new_data.tag.value_counts()) >= 3500:
        break

283it [00:21, 13.45it/s]


In [345]:
desired_size = min(new_data.tag.value_counts())
new_data.tag.value_counts()

tag
rap        134404
misc        55803
rock        33676
pop          6949
country      5646
rb           3543
Name: count, dtype: int64

In [346]:
undersampled_data = pd.DataFrame()

for tag in set(new_data.tag):
    class_df = new_data[new_data.tag == tag]
    
    undersampled = resample(class_df, replace=False, n_samples=desired_size, random_state=42)
    undersampled_data = pd.concat([undersampled_data, undersampled])

In [347]:
undersampled_data = undersampled_data.sample(frac=1).reset_index(drop=True)
undersampled_data.tag.value_counts()

tag
rock       3543
rap        3543
misc       3543
pop        3543
country    3543
rb         3543
Name: count, dtype: int64

In [348]:
undersampled_data.head()

Unnamed: 0,title,artist,tag,lyrics
0,Love Me Like Im Not Made of Stone,Lykke Li,rock,"[Produced By Björn Yttling, Greg Kurstin & Lyk..."
1,Mudstained Troubadour,Opaque,rap,"[Verse 1: Opaque]\nI walk in mud to my knees, ..."
2,A Day In The Country Full Text,Anton Chekhov,misc,BETWEEN eight and nine o'clock in the morning....
3,My Same,Adele,pop,"[Intro]\nAye, aye, aye-aye\nAye, aye, aye-aye\..."
4,Rosalind Helen And Her Child,Percy Bysshe Shelley,misc,SCENE. THE SHORE OF THE LAKE OF COMO\n\nHELEN:...


# Data Processing

## Convert lowercase, remove extra information provided by data source, lemmatize and remove punctuations

In [349]:
# Convert to lowercase
undersampled_data.lyrics = undersampled_data.lyrics.str.lower()

In [350]:
import re

# rhe source website did not use a single format the demostrade new lines
def handleNewLine(text):
    new_line_idxs = [match.start() for match in re.finditer(r'\n', text)]
    
    lines = []
    for idx in range(0, len(new_line_idxs)-1):
        startIndex = new_line_idxs[idx]
        endIndex = new_line_idxs[idx+1]
        line = text[startIndex:endIndex]
        line = line.split("\n")[1]
        
        if len(line)>0:
            lines.append(line)
    
    return ' \n '.join(lines)

undersampled_data['lyrics'] = undersampled_data['lyrics'].apply(handleNewLine)

In [351]:
# Remove extra notes
undersampled_data['lyrics'] = undersampled_data['lyrics'].str.replace(r'\[.*?\]', '', regex=True)
undersampled_data['lyrics'] = undersampled_data['lyrics'].str.replace(r'\([^)]*\)', '', regex=True)

In [352]:
# Remove punctuations from the lyrics column except new line (\n)
punctuation = re.compile(r'[^\w\s\n]+')
undersampled_data['lyrics'] = undersampled_data['lyrics'].apply(lambda x: punctuation.sub('', x).strip())
undersampled_data['lyrics'] = undersampled_data['lyrics'].str.replace(" \n  \n ", " \n ")

In [353]:
random_song = np.random.randint(0, len(undersampled_data))
undersampled_data.lyrics.iloc[random_song], undersampled_data.tag.iloc[random_song]

('naah yeah rain comin we rollin never stollin still hollin girls fallin nerd ballin \n saw this you on my list \n sho for rollin beats on no \n we gonna make it rain yo \n medic medic fuck lil with static  \n good flow trick yeah bro still rollin  yeah rain comin fuck weed throwin have loli have money what a goodness yeah  yeah freedom what a dream doom oooww scared hahah boom my music goes hard yeah i go hard yeah meeen my men amazin yeah p on board haha \n uhhuuuuh yeaah this is crackhouse i m going back house fuck loss i am boss \n pmugshot \n smokin smokin lickin in the rockin ouw no big slim do show just creat a low no oww saw this law but im gonna creat some bow roll no just wanna get this now kill this bone im sho for this ho oww yoo haha we gonna make it rain yeaaah \n yeaah call brother men and still havin doubt meen i have a fuckin lot of hater know this meeeen i am going to rock our money meeen yeaah rain comin need to know this until you blow with this yeahhhahah haha mama

In [354]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

for index, row in tqdm(undersampled_data.iterrows()):
    text = row['lyrics']
    lemmatized_text = []
    
    # Lemmatize the text
    for word in text.split(" "):
        if word not in stop_words or word == "\n":
            if word == "\n":
                lemmatized_text.append(word)
            else:
                lemmatized_text.append(lemmatizer.lemmatize(word))
    
    txt = ' '.join(lemmatized_text)
    undersampled_data.loc[index, 'lyrics'] = " \n " + txt

21258it [00:37, 563.21it/s]


In [355]:
undersampled_data.reset_index(inplace=True, drop=True)

In [356]:
undersampled_data.lyrics.iloc[random_song], undersampled_data.tag.iloc[random_song]

(' \n naah yeah rain comin rollin never stollin still hollin girl fallin nerd ballin \n saw list \n sho rollin beat \n gonna make rain yo \n medic medic fuck lil static  \n good flow trick yeah bro still rollin  yeah rain comin fuck weed throwin loli money goodness yeah  yeah freedom dream doom oooww scared hahah boom music go hard yeah go hard yeah meeen men amazin yeah p board haha \n uhhuuuuh yeaah crackhouse going back house fuck loss bos \n pmugshot \n smokin smokin lickin rockin ouw big slim show creat low oww saw law im gonna creat bow roll wanna get kill bone im sho ho oww yoo haha gonna make rain yeaaah \n yeaah call brother men still havin doubt meen fuckin lot hater know meeeen going rock money meeen yeaah rain comin need know blow yeahhhahah haha mama fuck killa need cola came give suny cause thang meeeen yeah haha see fuck rain  \n saw list \n sho rollin beat',
 'rap')

## Split each line

In [357]:
def split_lyrics(lyric):
  """Splits a lyric into a list of lines."""
  lines = []
  try:
      splt = lyric.split("\n")
      for line in splt:
        line = line.strip()

        if len(line) > 1:
          lines.append(line)

      return lines
  except:
        return None

undersampled_data["lines"] = undersampled_data.lyrics.apply(split_lyrics)
undersampled_data.reset_index(inplace=True, drop=True)

In [358]:
undersampled_data.head()

Unnamed: 0,title,artist,tag,lyrics,lines
0,Love Me Like Im Not Made of Stone,Lykke Li,rock,\n there heart cannot hide \n there beat cant...,"[there heart cannot hide, there beat cant deny..."
1,Mudstained Troubadour,Opaque,rap,\n walk mud knee cloudy dark \n spark start b...,"[walk mud knee cloudy dark, spark start bark a..."
2,A Day In The Country Full Text,Anton Chekhov,misc,\n dark leadencoloured mass creeping sky towa...,[dark leadencoloured mass creeping sky towards...
3,My Same,Adele,pop,\n aye aye ayeaye \n aye aye ayeaye \n aye ay...,"[aye aye ayeaye, aye aye ayeaye, aye aye ayeay..."
4,Rosalind Helen And Her Child,Percy Bysshe Shelley,misc,\n helen \n come hither sweet rosalind \n ti ...,"[helen, come hither sweet rosalind, ti long si..."


In [359]:
undersampled_data.iloc[random_song]

title                                      Rain ft PMugShot
artist                                             The Last
tag                                                     rap
lyrics     \n naah yeah rain comin rollin never stollin ...
lines     [naah yeah rain comin rollin never stollin sti...
Name: 14505, dtype: object

## Generate list of tokens

In [360]:
def split_tokens(lines):
    tokens = []
    
    try:
        for line in lines:
            tokens.append(word_tokenize(line))

        return tokens
    except:
        return None

undersampled_data["tokens"] = undersampled_data.lines.apply(split_tokens)
undersampled_data.reset_index(inplace=True, drop=True)

# Feature Exctraction

## Textual Features

Total words, unique words, total chars, total lines, avg word per line

In [369]:
for idx, tokens in tqdm(enumerate(list(undersampled_data.tokens))):
    total_lines = len(tokens)
    
    total_word = 0
    unique_words = []
    total_chars = 0
    
    for line in tokens:
        total_word += len(line)
        
        for token in line:
            total_chars += len(token)
            if token not in unique_words:
                unique_words.append(token)
            
    if total_lines != 0:
        
        avg_word_per_line = total_word / total_lines

        undersampled_data.at[idx, "total_lines"] = total_lines
        undersampled_data.at[idx, "total_word"] = total_word
        undersampled_data.at[idx, "unique_words"] = len(unique_words)
        undersampled_data.at[idx, "total_chars"] = total_chars
        undersampled_data.at[idx, "avg_word_per_line"] = avg_word_per_line

21258it [00:16, 1296.71it/s]


In [373]:
undersampled_data = undersampled_data.dropna().reset_index(drop=True)

In [375]:
undersampled_data[["tag", "total_word", "avg_word_per_line", "unique_words", "total_chars"]].groupby("tag").mean()

Unnamed: 0_level_0,total_word,avg_word_per_line,unique_words,total_chars
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
country,107.957933,4.018459,62.681818,511.817617
misc,604.418071,18.393448,321.532249,3550.735038
pop,154.300282,3.466503,67.09435,716.616384
rap,264.762895,5.218872,171.810772,1273.848105
rb,181.554489,3.611197,80.861378,822.165443
rock,111.692633,3.502165,60.471916,536.243297


## Syllable Features
Total syllables, avg syllables per word, avg syllables per line, flesch reading ease 

In [28]:
!pip install syllables
!pip install py-readability-metrics

Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting importlib-metadata<7.0,>=5.1 (from syllables)
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Installing collected packages: importlib-metadata, syllables
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 6.7.0
    Uninstalling importlib-metadata-6.7.0:
      Successfully uninstalled importlib-metadata-6.7.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-api 1.18.0 requires importlib-metadata~=6.0.0, but you have importlib-metadata 5.2.0 which is incompatible.
yapf 0.40.1 requires importlib-metadata>=6.6.0, but you have importlib-metadata 5.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed importlib-metadata-5.2.0 syllables-1.0.9
Collecting py-readability-metrics
  Downloading py_rea

In [376]:
from readability import Readability
import syllables

In [377]:
for idx, line in tqdm(enumerate(list(undersampled_data.lines))):
    total = 0
    
    for sep_line in line:
        syllable_count = syllables.estimate(sep_line)
        total += syllable_count
    
    try:
        full_text = ' '.join(line)
        if len(full_text.split()) >= 100:
            r = Readability(full_text)
            r_score = r.flesch_kincaid().score
        else:
            r_score = 100

        undersampled_data.at[idx, "avg_syllables_per_line"] = total / len(line)
        undersampled_data.at[idx, "avg_syllables_per_word"] = total / len(' '.join(line).split(' '))
        undersampled_data.at[idx, "sum_syllables"] = total
    except:
        r_score = None
        
    undersampled_data.at[idx, "readability"] = r_score

21118it [05:41, 61.77it/s] 


In [378]:
undersampled_data.head()

Unnamed: 0,title,artist,tag,lyrics,lines,tokens,total_lines,total_word,unique_words,total_chars,avg_word_per_line,avg_syllables_per_line,avg_syllables_per_word,sum_syllables,readability
0,Love Me Like Im Not Made of Stone,Lykke Li,rock,\n there heart cannot hide \n there beat cant...,"[there heart cannot hide, there beat cant deny...","[[there, heart, can, not, hide], [there, beat,...",29.0,121.0,48.0,504.0,4.172414,6.551724,1.583333,190.0,44.465
1,Mudstained Troubadour,Opaque,rap,\n walk mud knee cloudy dark \n spark start b...,"[walk mud knee cloudy dark, spark start bark a...","[[walk, mud, knee, cloudy, dark], [spark, star...",55.0,276.0,244.0,1407.0,5.018182,8.581818,1.710145,472.0,107.46029
2,A Day In The Country Full Text,Anton Chekhov,misc,\n dark leadencoloured mass creeping sky towa...,[dark leadencoloured mass creeping sky towards...,"[[dark, leadencoloured, mass, creeping, sky, t...",65.0,1131.0,620.0,6125.0,17.4,30.153846,1.666667,1960.0,433.491503
3,My Same,Adele,pop,\n aye aye ayeaye \n aye aye ayeaye \n aye ay...,"[aye aye ayeaye, aye aye ayeaye, aye aye ayeay...","[[aye, aye, ayeaye], [aye, aye, ayeaye], [aye,...",40.0,164.0,65.0,826.0,4.1,6.725,1.640244,269.0,65.00561
4,Rosalind Helen And Her Child,Percy Bysshe Shelley,misc,\n helen \n come hither sweet rosalind \n ti ...,"[helen, come hither sweet rosalind, ti long si...","[[helen], [come, hither, sweet, rosalind], [ti...",1333.0,4848.0,1762.0,25125.0,3.636909,6.012753,1.65565,8015.0,1841.608343


## Line-Syllable Similarity

In [379]:
undersampled_data["line_similarity"] = None

for idx, line in tqdm(enumerate(list(undersampled_data.tokens))):
    similarity_list = []
    for sep_line_id in range(1, len(line)):
        currentLineLen = len(line[sep_line_id])
        prevLineLen = len(line[sep_line_id-1])
        
        if currentLineLen == 0 or prevLineLen == 0:
            pass
        else:
            similarity = 1 - (abs(currentLineLen - prevLineLen) / max(currentLineLen, prevLineLen))
            similarity_list.append(similarity)
            
    undersampled_data.at[idx, "line_similarity"] = np.mean(similarity_list)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
21118it [00:01, 11669.08it/s]


In [380]:
undersampled_data.head()

Unnamed: 0,title,artist,tag,lyrics,lines,tokens,total_lines,total_word,unique_words,total_chars,avg_word_per_line,avg_syllables_per_line,avg_syllables_per_word,sum_syllables,readability,line_similarity
0,Love Me Like Im Not Made of Stone,Lykke Li,rock,\n there heart cannot hide \n there beat cant...,"[there heart cannot hide, there beat cant deny...","[[there, heart, can, not, hide], [there, beat,...",29.0,121.0,48.0,504.0,4.172414,6.551724,1.583333,190.0,44.465,0.718452
1,Mudstained Troubadour,Opaque,rap,\n walk mud knee cloudy dark \n spark start b...,"[walk mud knee cloudy dark, spark start bark a...","[[walk, mud, knee, cloudy, dark], [spark, star...",55.0,276.0,244.0,1407.0,5.018182,8.581818,1.710145,472.0,107.46029,0.746715
2,A Day In The Country Full Text,Anton Chekhov,misc,\n dark leadencoloured mass creeping sky towa...,[dark leadencoloured mass creeping sky towards...,"[[dark, leadencoloured, mass, creeping, sky, t...",65.0,1131.0,620.0,6125.0,17.4,30.153846,1.666667,1960.0,433.491503,0.458772
3,My Same,Adele,pop,\n aye aye ayeaye \n aye aye ayeaye \n aye ay...,"[aye aye ayeaye, aye aye ayeaye, aye aye ayeay...","[[aye, aye, ayeaye], [aye, aye, ayeaye], [aye,...",40.0,164.0,65.0,826.0,4.1,6.725,1.640244,269.0,65.00561,0.732234
4,Rosalind Helen And Her Child,Percy Bysshe Shelley,misc,\n helen \n come hither sweet rosalind \n ti ...,"[helen, come hither sweet rosalind, ti long si...","[[helen], [come, hither, sweet, rosalind], [ti...",1333.0,4848.0,1762.0,25125.0,3.636909,6.012753,1.65565,8015.0,1841.608343,0.765212


In [381]:
undersampled_data[["tag", "line_similarity"]].groupby("tag").mean()

Unnamed: 0_level_0,line_similarity
tag,Unnamed: 1_level_1
country,0.714037
misc,0.629427
pop,0.701177
rap,0.709364
rb,0.691597
rock,0.709807


## Vocab Features
Tf-idf vectors

In [382]:
from sklearn.feature_extraction.text import TfidfVectorizer

def merge_strings(row):
    return ' '.join(row)

corpus = []
for lyric in undersampled_data.lyrics:
    filtered_words = [word for word in lyric.split() if 3 <= len(word) <= 12]
    
    corpus.append(' '.join(filtered_words))
    
vectorizer = TfidfVectorizer(min_df=50, max_df=0.5)
tfidf_matrix  = vectorizer.fit_transform(corpus)

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df['tag'] = undersampled_data['tag']
avg_tfidf_by_genre = tfidf_df.groupby('tag').mean()

In [383]:
avg_tfidf_by_genre

Unnamed: 0_level_0,100,1st,3rd,911,abandon,abandoned,abide,ability,able,aboard,...,youngest,youre,yous,youth,youthful,youve,yup,zeal,zero,zone
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
country,0.00017,7.6e-05,3.6e-05,2.8e-05,4.3e-05,0.00025,9.6e-05,2.2e-05,0.000558,0.00019,...,0.000105,0.02692,0.000213,0.000413,0.0,0.011927,0.0,0.0,0.000327,0.000199
misc,0.000452,0.000603,0.000411,0.000335,0.000529,0.001109,0.001035,0.00138,0.004913,0.000454,...,0.000398,0.003879,0.000176,0.005351,0.000929,0.00183,0.0,0.000775,0.000446,0.000544
pop,0.000247,0.0,0.0,8.1e-05,0.000254,6.1e-05,2.9e-05,0.0,0.000417,0.000126,...,0.0,0.03932,0.000474,0.000832,0.0,0.010012,0.000181,0.0,0.000676,0.000655
rap,0.001971,0.000713,0.001026,0.00059,0.000286,0.000394,0.000298,0.000476,0.001011,0.000305,...,0.000224,0.016065,0.001182,0.001577,7.5e-05,0.002928,0.001504,9.2e-05,0.000961,0.003907
rb,0.000242,0.000341,0.000174,0.00032,0.000178,0.000117,0.000143,0.000213,0.000605,1.4e-05,...,8.1e-05,0.035421,0.000834,0.00033,3.4e-05,0.008876,0.000244,0.0,0.000436,0.000832
rock,0.000148,0.0,0.000157,2.8e-05,0.000153,0.0005,0.000105,0.000107,0.000355,0.000319,...,7e-05,0.026819,0.000188,0.000797,0.000235,0.011931,5.6e-05,1.6e-05,0.00054,0.000176


In [386]:
top_words_by_genre = {}
for genre in avg_tfidf_by_genre.index:
    top_words_by_genre[genre] = avg_tfidf_by_genre.loc[genre].nlargest(50).index.tolist()

In [387]:
top_words_by_genre["country"][:5], top_words_by_genre["rap"][:5]

(['love', 'time', 'one', 'youre', 'well'],
 ['nigga', 'get', 'got', 'shit', 'aint'])

In [388]:
selected_words = set()
for genre in top_words_by_genre:
    for word in top_words_by_genre[genre]:
        selected_words.add(word)
        
selected_words = list(selected_words)
len(selected_words)

110

In [389]:
data_df = undersampled_data.copy()
binary_df = pd.DataFrame(
    columns = selected_words,
    index=data_df.title
)

for i in range(len(data_df)):
    song_binary_list = [0]*len(selected_words)
    for line_token in data_df["tokens"].iloc[i]:
        for token in line_token:
            if token in selected_words:
                word_idx = selected_words.index(token)
                song_binary_list[word_idx] = 1
    
    song_binary_list = pd.DataFrame(
            np.array(song_binary_list).reshape(1, -1),
            columns=binary_df.columns
        )
    
    binary_df.iloc[i] = song_binary_list;

In [390]:
binary_df.head()

Unnamed: 0_level_0,say,mind,nigga,aint,men,fuck,might,cuz,got,girl,...,there,fucking,ever,yall,way,god,need,cant,night,word
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Love Me Like Im Not Made of Stone,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
Mudstained Troubadour,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,1
A Day In The Country Full Text,1,0,0,0,0,0,1,0,1,1,...,0,0,0,0,1,1,1,1,1,1
My Same,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Rosalind Helen And Her Child,1,1,0,0,1,0,1,0,0,0,...,0,0,1,0,1,1,1,0,1,1


In [391]:
final_df = data_df.merge(
    right=binary_df,
    left_on="title",
    right_index=True
)
final_df.head()

Unnamed: 0,title,artist,tag,lyrics,lines,tokens,total_lines,total_word,unique_words,total_chars,...,there,fucking,ever,yall,way,god,need,cant,night,word
0,Love Me Like Im Not Made of Stone,Lykke Li,rock,\n there heart cannot hide \n there beat cant...,"[there heart cannot hide, there beat cant deny...","[[there, heart, can, not, hide], [there, beat,...",29.0,121.0,48.0,504.0,...,1,0,0,0,0,0,0,1,0,0
1,Mudstained Troubadour,Opaque,rap,\n walk mud knee cloudy dark \n spark start b...,"[walk mud knee cloudy dark, spark start bark a...","[[walk, mud, knee, cloudy, dark], [spark, star...",55.0,276.0,244.0,1407.0,...,0,1,1,0,1,0,0,1,0,1
2,A Day In The Country Full Text,Anton Chekhov,misc,\n dark leadencoloured mass creeping sky towa...,[dark leadencoloured mass creeping sky towards...,"[[dark, leadencoloured, mass, creeping, sky, t...",65.0,1131.0,620.0,6125.0,...,0,0,0,0,1,1,1,1,1,1
3,My Same,Adele,pop,\n aye aye ayeaye \n aye aye ayeaye \n aye ay...,"[aye aye ayeaye, aye aye ayeaye, aye aye ayeay...","[[aye, aye, ayeaye], [aye, aye, ayeaye], [aye,...",40.0,164.0,65.0,826.0,...,0,0,0,0,1,0,0,0,0,0
4,Rosalind Helen And Her Child,Percy Bysshe Shelley,misc,\n helen \n come hither sweet rosalind \n ti ...,"[helen, come hither sweet rosalind, ti long si...","[[helen], [come, hither, sweet, rosalind], [ti...",1333.0,4848.0,1762.0,25125.0,...,0,0,1,0,1,1,1,0,1,1


## Model Training

In [392]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

## Base Model

In [393]:
data = final_df.dropna()
x, y = data.drop(["title", "artist", "tag", "lyrics", "lines", "tokens"], axis=1), data.tag 
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    random_state=42
)

print(x_train.shape, x_test.shape)

pca = PCA(n_components=30)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

model = LogisticRegressionCV(cv=3, max_iter=200, solver="newton-cholesky").fit(x_train, y_train)
preds = model.predict(x_test)

print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds, average="weighted"))

(19821, 120) (6608, 120)
0.5196731234866828
0.5101286764515212


In [395]:
def performGridSearch(x_train, y_train, x_test, y_test, model, param_grid, cv=3):
    grid_search = GridSearchCV(model, param_grid, cv=cv, verbose=2)
    grid_search.fit(x_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    preds = best_model.predict(x_test)
    
    print(best_params)
    return f1_score(y_test, preds, average="weighted")

### Tuning Models

In [149]:
model = RandomForestClassifier()
param_grid = {
    "n_estimators": [50, 100, 200],
    "criterion": ["gini", "log_loss"],
    "max_depth":[None, 3, 10],
    "max_features": ["log2", None]
}

performGridSearch(x_train, y_train, x_test, y_test, model, param_grid, cv=1)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=50; total time=   2.3s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=50; total time=   2.3s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time=   4.5s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time=   4.7s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=200; total time=   9.3s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=200; total time=   9.3s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=50; total time=  13.6s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=50; total time=  13.9s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=100; total time=  27.2s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimato

KeyboardInterrupt: 

In [150]:
model = MLPClassifier()
param_grid = {
    "hidden_layer_sizes": [(50), (100), (200), (100, 100), (200, 200)],
    "alpha": [0.0001, 0.01],
    "learning_rate": ["constant", "invscaling", "adaptive"]
}

performGridSearch(x_train, y_train, x_test, y_test, model, param_grid, cv=2)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=constant; total time=   1.4s
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=constant; total time=   1.6s
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=invscaling; total time=   1.9s
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=invscaling; total time=   1.6s
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive; total time=   1.6s
[CV] END alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive; total time=   1.2s
[CV] END alpha=0.0001, hidden_layer_sizes=100, learning_rate=constant; total time=   1.8s
[CV] END alpha=0.0001, hidden_layer_sizes=100, learning_rate=constant; total time=   2.1s
[CV] END alpha=0.0001, hidden_layer_sizes=100, learning_rate=invscaling; total time=   1.8s
[CV] END alpha=0.0001, hidden_layer_sizes=100, learning_rate=invscaling; total time=   1.1s
[CV] END alpha=0.0001, hidden_layer_s



0.5209727632041241

In [396]:
model = KNeighborsClassifier()
param_grid = {
    "n_neighbors": [3, 5, 10, 20],
    "leaf_size": [10, 30, 100]
}

performGridSearch(x_train, y_train, x_test, y_test, model, param_grid, cv=2)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.6s
[CV] END ........................leaf_size=10, n_neighbors=3; total time=   0.6s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.6s
[CV] END ........................leaf_size=10, n_neighbors=5; total time=   0.6s
[CV] END .......................leaf_size=10, n_neighbors=10; total time=   0.6s
[CV] END .......................leaf_size=10, n_neighbors=10; total time=   0.6s
[CV] END .......................leaf_size=10, n_neighbors=20; total time=   0.6s
[CV] END .......................leaf_size=10, n_neighbors=20; total time=   0.6s
[CV] END ........................leaf_size=30, n_neighbors=3; total time=   0.6s
[CV] END ........................leaf_size=30, n_neighbors=3; total time=   0.6s
[CV] END ........................leaf_size=30, n_neighbors=5; total time=   0.6s
[CV] END ........................leaf_size=30, n

0.5454023199597815

In [152]:
model = SVC()
param_grid = {
    "C": [0.5, 1, 2],
    "gamma": ["scale", "auto"],
    "degree": [1, 3, 5],
    "class_weight": [None, "balanced"]
}

performGridSearch(x_train, y_train, x_test, y_test, model, param_grid, cv=2)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] END ....C=0.5, class_weight=None, degree=1, gamma=scale; total time=  10.6s
[CV] END ....C=0.5, class_weight=None, degree=1, gamma=scale; total time=  10.6s
[CV] END .....C=0.5, class_weight=None, degree=1, gamma=auto; total time=  20.4s
[CV] END .....C=0.5, class_weight=None, degree=1, gamma=auto; total time=  20.6s
[CV] END ....C=0.5, class_weight=None, degree=3, gamma=scale; total time=  10.5s
[CV] END ....C=0.5, class_weight=None, degree=3, gamma=scale; total time=  10.3s
[CV] END .....C=0.5, class_weight=None, degree=3, gamma=auto; total time=  20.3s
[CV] END .....C=0.5, class_weight=None, degree=3, gamma=auto; total time=  20.4s
[CV] END ....C=0.5, class_weight=None, degree=5, gamma=scale; total time=  10.2s
[CV] END ....C=0.5, class_weight=None, degree=5, gamma=scale; total time=  10.4s
[CV] END .....C=0.5, class_weight=None, degree=5, gamma=auto; total time=  20.2s
[CV] END .....C=0.5, class_weight=None, degree=5

0.4043476918884171