# Initial step

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/labeled_lyrics_cleaned_processed.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,cleaned_lyrics
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,aint ever trap bando oh lord dont get wrong kn...
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,drink go smoke go feel get let go care get los...
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,dont live planet earth find love venus thats w...
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,trippin grigio mobbin light low trippin grigio...
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,see midnight panther gallant brave find find a...


In [3]:
data['cleaned_lyrics'].isna().sum()
data['cleaned_lyrics'] = data['cleaned_lyrics'].dropna()

In [4]:
data_sub = data.sample(10000)

In [5]:
def cat_valence(row):
    if row >= 0.5:
        return 1
    elif row <0.5:
        return 0
    else:
        return None
data_sub['mood'] = data_sub['label'].apply(lambda x:cat_valence(x))
data_sub['mood'].value_counts()

0    5220
1    4780
Name: mood, dtype: int64

In [6]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# Feature/Target
X = data_sub['cleaned_lyrics'].apply(lambda x: np.str_(x))
y = data_sub["mood"]

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Baseline using logistic regression

In [7]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

# Pipeline vectorizer + Naive Bayes
pipeline = make_pipeline(
    TfidfVectorizer(), 
    LogisticRegression()
)

# Cross-validation
cv_results = cross_validate(pipeline, X_train, y_train, cv = 5, scoring = ["accuracy"])
average_accuracy = cv_results["test_accuracy"].mean()
baseline_accuracy = np.round(average_accuracy,2)
baseline_accuracy

0.63

In [8]:
from sklearn.model_selection import GridSearchCV

# Define the grid of parameters
parameters = {
    'tfidfvectorizer__ngram_range': ((2,2), (1,2)),
    'tfidfvectorizer__max_df': [0,25, 0.3, 0.35],
    'tfidfvectorizer__max_features': [50],
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring = "accuracy",
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits


10 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/anais/.pyenv/versions/3.10.6/envs/mood_detector/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/anais/.pyenv/versions/3.10.6/envs/mood_detector/lib/python3.10/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/anais/.pyenv/versions/3.10.6/envs/mood_detector/lib/python3.10/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/an

Best Score = 0.6045714285714285
Best params = {'tfidfvectorizer__max_df': 0.3, 'tfidfvectorizer__max_features': 50, 'tfidfvectorizer__ngram_range': (1, 2)}


After different iterations, best params: max_df= 0.35, max_features=50, n_gram=(1,2)

## Preparation for analysis

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.35,max_features=50)

vectorized_documents = vectorizer.fit_transform(X_train)

In [10]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Let's create some mock data
def get_mock_up_data():

    ### Let's tokenize the vocabulary 
    tk = Tokenizer()
    tk.fit_on_texts(X_train)
    vocab_size = len(tk.word_index)
    print(f'There are {vocab_size} different words in your corpus')
    X_token = tk.texts_to_sequences(X_train)

    ### Pad the inputs
    X_pad = pad_sequences(X_token, dtype='float32', padding='post')
    
    return X_pad, vocab_size

X_pad, vocab_size = get_mock_up_data()
print("X_pad.shape", X_pad.shape)
X_pad

2023-06-14 17:10:19.833902: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-14 17:10:19.880044: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-14 17:10:19.880693: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


There are 26752 different words in your corpus
X_pad.shape (7000, 943)


array([[3.640e+02, 4.900e+01, 2.590e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.067e+03, 1.842e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.220e+02, 2.892e+03, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [9.200e+01, 1.326e+03, 9.660e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.600e+01, 4.377e+03, 2.870e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.090e+02, 8.000e+00, 5.740e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00]], dtype=float32)

# DL model using RNN (LSTM)

In [11]:
### Let's build the neural network now
from tensorflow.keras import layers, Sequential

# Size of your embedding space = size of the vector representing each word
embedding_size = 100

model = Sequential()
model.add(layers.Embedding(
    input_dim=vocab_size+1, #+1 for the 0 padding
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

model.add(layers.LSTM(20))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         2675300   
                                                                 
 lstm (LSTM)                 (None, 20)                9680      
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 2,685,001
Trainable params: 2,685,001
Non-trainable params: 0
_________________________________________________________________


In [12]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='loss', patience=1)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


history = model.fit(X_pad, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=1, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


# DL using CNN

In [None]:
embedding_size = 100
model_cnn = Sequential([
    layers.Embedding(input_dim=7000, input_dim=vocab_size+1, output_dim=embedding_size, mask_zero=True),
    layers.Conv1D(20, kernel_size=3),
    layers.Flatten(),
    layers.Dense(1, activation="sigmoid"),
])

es = EarlyStopping(monitor='loss', patience=1)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


history = model.fit(X_pad, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=1, callbacks=[es])

# ML using SVM classifier

In [None]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=10)

In [None]:
# Pipeline vectorizer + Naive Bayes
pipeline_SVM = make_pipeline(
    TfidfVectorizer(), 
    SVC()
)

# Cross-validation
cv_results = cross_validate(pipeline_SVM, X_train, y_train, cv = 5, scoring = ["accuracy"])
SVM_average_accuracy = cv_results["test_accuracy"].mean()
SVM_accuracy = np.round(SVM_average_accuracy,2)
SVM_accuracy

In [None]:
# Define the grid of parameters
parameters = {
    'SVC__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'SVC__C': (0.2, 0.5, 0.7)
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring = "accuracy",
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")