In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/labeled_lyrics_cleaned_processed.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,cleaned_lyrics
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,aint ever trap bando oh lord dont get wrong kn...
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,drink go smoke go feel get let go care get los...
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,dont live planet earth find love venus thats w...
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,trippin grigio mobbin light low trippin grigio...
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,see midnight panther gallant brave find find a...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158353 entries, 0 to 158352
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0.1    158353 non-null  int64  
 1   Unnamed: 0      158353 non-null  int64  
 2   artist          158353 non-null  object 
 3   seq             158353 non-null  object 
 4   song            158353 non-null  object 
 5   label           158353 non-null  float64
 6   cleaned_lyrics  158344 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 8.5+ MB


In [4]:
data['cleaned_lyrics'].isna().sum()

9

In [5]:
data['cleaned_lyrics'] = data['cleaned_lyrics'].dropna()

In [6]:
data['cleaned_lyrics'].head(1000)

0      aint ever trap bando oh lord dont get wrong kn...
1      drink go smoke go feel get let go care get los...
2      dont live planet earth find love venus thats w...
3      trippin grigio mobbin light low trippin grigio...
4      see midnight panther gallant brave find find a...
                             ...                        
995    close eye ill kiss tomorrow ill miss remember ...
996    shes get kind love dont let know shes get kind...
997    feel heart beat im alone feel heartbeat like h...
998    hes dog hes dress like sheep get bone backyard...
999    stand straight foot love lift shirt stand stra...
Name: cleaned_lyrics, Length: 1000, dtype: object

In [7]:
# TEST
from sklearn.feature_extraction.text import CountVectorizer

texts = data['cleaned_lyrics']
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(data['cleaned_lyrics'].head(1000).apply(lambda x: np.str_(x)))
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
data.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,label
count,158353.0,158353.0,158353.0
mean,79176.0,79176.0,0.491052
std,45712.717926,45712.717926,0.249619
min,0.0,0.0,0.0
25%,39588.0,39588.0,0.286
50%,79176.0,79176.0,0.483
75%,118764.0,118764.0,0.691
max,158352.0,158352.0,0.998


# Naive Bayes model

In [9]:
# CREATING A BINARY VARIABLE FOR POSITIVE OR NEGATIVE MOOD
positive = data[data['label']>= 0.5].shape[0]
negative = data[data['label']< 0.5].shape[0]
share_positive = positive /(positive+negative)
print (f'Percentage of song that are positive {share_positive}')

def cat_valence(row):
    if row >= 0.5:
        return 'positive mood'
    elif row <0.5:
        return 'negative mood'
    else:
        return 'Nan'
data['mood'] = data['label'].apply(lambda x:cat_valence(x))

Percentage of song that are positive 0.4795614860469963


In [10]:
data['mood'].value_counts()

negative mood    82413
positive mood    75940
Name: mood, dtype: int64

In [11]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score

# Feature/Target
X = data['cleaned_lyrics'].apply(lambda x: np.str_(x))
y = data["mood"]

# Pipeline vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(), 
    MultinomialNB()
)

# Cross-validation
cv_results = cross_validate(pipeline_naive_bayes, X, y, cv = 5, scoring = ["accuracy"])
average_accuracy = cv_results["test_accuracy"].mean()
np.round(average_accuracy,2)

0.65

In [12]:
from sklearn.model_selection import GridSearchCV

# Define the grid of parameters
parameters = {
    'tfidfvectorizer__ngram_range': ((1,2)),
    'tfidfvectorizer__max_df': (0.25, 0.5),
    #'tfidfvectorizer__max_features': (4, 50),
    #'multinomialnb__alpha': (0.1,1)
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    scoring = "accuracy",
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


: 

: 

# LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer()

vectorized_documents = vectorizer.fit_transform(data['cleaned_lyrics'].apply(lambda x: np.str_(x)))

# Instantiate the LDA 
n_components = 2
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fit the LDA on the vectorized documents
lda_model.fit(X)

# Transform the vectorized docs
document_topic_mixture = lda_model.transform(X)
document_topic_mixture

In [None]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(
        lda_model.components_,
        columns = vectorizer.get_feature_names_out()
    )
    
    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]

    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        topic_df = topic_mixture.iloc[topic]\
            .sort_values(ascending = False).head(top_words)
        
        print(round(topic_df,3))

In [None]:
print_topics(lda_model, vectorizer, 5)