In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Read the lyrics CSV (decoded as Latin-1) into the working frame and preview the first rows.
LYRICS_CSV = '../DATA/cleaned_train_lyrics.csv'
df = pd.read_csv(LYRICS_CSV, encoding='latin1')
df.head()

Unnamed: 0.1,Unnamed: 0,Lyric,genre
0,0,"See me, ancient one! Dismal Tuat, Nergal unsaf...",Metal
1,1,Feels like Im covered in lies so turn off the ...,Metal
2,2,"Works of art, painted black Magniloquent, blee...",Metal
3,3,Into the cage like an animal You must survive ...,Metal
4,4,Paralysed in pleasure I hear you call Lost my ...,Metal


In [8]:
import pandas as pd
from collections import Counter
from wordcloud import STOPWORDS
import string
import re
import matplotlib.pyplot as plt

# Deletion table covering every character in string.punctuation. Built once at definition
# time; unlike an unescaped regex character class it also removes the backslash.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_lyrics(lyrics):
    """Lower-case a lyric, strip ASCII punctuation and drop stop words.

    Parameters
    ----------
    lyrics : str or None or NaN
        Raw lyric text. Missing values (None or NaN) are treated as empty text.

    Returns
    -------
    list of str
        Whitespace-separated tokens of the normalised text that are not in
        ``STOPWORDS``, in their original order.
    """
    # NaN is the only value that is unequal to itself; strings never are.
    if lyrics is None or lyrics != lyrics:
        return []
    normalised = lyrics.lower().translate(_PUNCTUATION_TABLE)
    return [word for word in normalised.split() if word not in STOPWORDS]
def remove_custom_words(lyrics, custom_words):
    """Clean a lyric with ``clean_lyrics`` and drop any token found in ``custom_words``.

    Parameters
    ----------
    lyrics : str
        Raw lyric text, passed unchanged to ``clean_lyrics``.
    custom_words : container of str
        Extra tokens to exclude; only membership tests are performed on it.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none remain).
    """
    kept_tokens = []
    for token in clean_lyrics(lyrics):
        if token in custom_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Extra tokens dropped in addition to the STOPWORDS filter applied by clean_lyrics.
# Presumably pronouns, apostrophe-less contractions and lyric-sheet markers
# ("verse", "chorus", "1", "2") -- confirm against the raw data.
custom_words_to_remove = ['he', 'her', 'it', 'and', 'the', 'you', 'i', 'we', 'im', 'dont', 'got', 'verse', 'chorus', 'youre', 'oh', 'ill', '1', '2']

# Hashed copy for the per-token membership test; the list above stays available unchanged.
custom_word_set = frozenset(custom_words_to_remove)

df['cleaned_lyrics'] = df['Lyric'].apply(lambda x: remove_custom_words(x, custom_word_set))
df['word_count'] = df['cleaned_lyrics'].str.split().str.len()

# Flatten every cleaned lyric into one token list. A comprehension replaces the earlier
# side-effecting Series.apply, which built and discarded a full-length Series of None.
cleaned_words = [word for text in df['cleaned_lyrics'] for word in text.split()]

cleaned_word_counts = Counter(cleaned_words)
cleaned_common_words = cleaned_word_counts.most_common(50)

print(cleaned_common_words)


[('know', 580937), ('love', 480302), ('now', 458553), ('time', 390728), ('will', 379832), ('one', 378435), ('see', 373051), ('never', 364395), ('go', 355086), ('cant', 301345), ('back', 292851), ('life', 280511), ('yeah', 273786), ('come', 269401), ('way', 265705), ('cause', 264817), ('take', 262438), ('make', 256300), ('say', 252583), ('let', 244015), ('want', 242396), ('aint', 238216), ('away', 214665), ('feel', 214508), ('man', 210734), ('ive', 210314), ('right', 208098), ('baby', 201397), ('well', 198980), ('thats', 198818), ('day', 189445), ('night', 189391), ('need', 189309), ('world', 186944), ('heart', 184775), ('gonna', 183131), ('tell', 176110), ('still', 172695), ('wanna', 170217), ('us', 167388), ('think', 166142), ('theres', 158045), ('keep', 154695), ('eyes', 154109), ('every', 153406), ('good', 152118), ('mind', 151396), ('give', 149525), ('little', 147629), ('said', 142507)]


In [19]:
# The 50 tokens used as bag-of-words indicator features (same tokens, same order, as the
# most-common-words printout produced from the cleaned lyrics).
common_words = ['know', 'love', 'now', 'time', 'will', 'one', 'see', 'never', 'go', 'cant', 'back', 'life', 'yeah', 
                'come', 'way', 'cause', 'take', 'make', 'say', 'let', 'want', 'aint', 'away', 'feel', 'man', 'ive', 
                'right', 'baby', 'well', 'thats', 'day', 'night', 'need', 'world', 'heart', 'gonna', 'tell', 'still', 
                'wanna', 'us', 'think', 'theres', 'keep', 'eyes', 'every', 'good', 'mind', 'give', 'little', 'said']


def _word_presence_flags(text):
    """Return one 0/1 flag per entry of ``common_words`` for a single cleaned lyric.

    The lyric is split once and turned into a set, so each of the membership
    tests is a hash lookup instead of a fresh split-and-scan of the text.
    """
    tokens = set(text.split())
    return [1 if word in tokens else 0 for word in common_words]


# Tokenise every lyric exactly once (previously each lyric was re-split once per word).
word_flags = pd.DataFrame(
    df['cleaned_lyrics'].apply(_word_presence_flags).tolist(),
    columns=common_words,
)

# Assign by position (plain arrays) so the frame's index never has to be aligned;
# re-running the cell simply overwrites the same columns.
for word in common_words:
    df[word] = word_flags[word].to_numpy()

df.head()

Unnamed: 0.1,Unnamed: 0,Lyric,genre,cleaned_lyrics,word_count,word_count_not_cleaned,know,love,now,time,...,think,theres,keep,eyes,every,good,mind,give,little,said
0,0,"See me, ancient one! Dismal Tuat, Nergal unsaf...",Metal,see ancient one dismal tuat nergal unsafe spre...,37,64,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Feels like Im covered in lies so turn off the ...,Metal,feels covered lies turn light closing eyes fly...,88,179,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2,"Works of art, painted black Magniloquent, blee...",Metal,works art painted black magniloquent bleeding ...,91,121,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Into the cage like an animal You must survive ...,Metal,cage animal must survive kill die learning cri...,93,140,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Paralysed in pleasure I hear you call Lost my ...,Metal,paralysed pleasure hear call lost cognitive co...,100,178,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Model inputs are the per-word 0/1 indicator columns; the target is the genre label.
features = common_words
X, y = df[features], df['genre']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=40, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


Accuracy: 0.4558
              precision    recall  f1-score   support

       Metal       0.46      0.56      0.50     20030
     country       0.51      0.55      0.53     20044
         pop       0.30      0.25      0.28     20069
         rap       0.66      0.69      0.68     19866
        rock       0.28      0.23      0.25     19991

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0.44      0.46      0.45    100000



In [22]:
# 5-fold cross-validation of the random-forest configuration on the word-indicator features.
# Each fold trains a fresh clone of rf_model and uses fold-local names, so the hold-out
# model, split and predictions from the previous cell are left untouched.
from sklearn.base import clone
from sklearn.model_selection import KFold 
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []

for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # clone() yields an unfitted estimator with the same hyper-parameters (and seed).
    fold_model = clone(rf_model)
    fold_model.fit(X_fold_train, y_fold_train)

    fold_pred = fold_model.predict(X_fold_test)

    accuracies.append(accuracy_score(y_fold_test, fold_pred))

    print(f"Fold Classification Report:\n{classification_report(y_fold_test, fold_pred)}\n")

mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy across {kf.n_splits} folds: {mean_accuracy}")


Fold Classification Report:
              precision    recall  f1-score   support

       Metal       0.46      0.56      0.50     20030
     country       0.51      0.55      0.53     20044
         pop       0.31      0.25      0.28     20069
         rap       0.66      0.69      0.68     19866
        rock       0.28      0.23      0.25     19991

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0.44      0.46      0.45    100000


Fold Classification Report:
              precision    recall  f1-score   support

       Metal       0.46      0.57      0.51     20065
     country       0.51      0.55      0.53     19999
         pop       0.30      0.25      0.27     19935
         rap       0.66      0.70      0.68     19905
        rock       0.28      0.22      0.25     20096

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0